diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 21f15ade458..cd7b8aea6d7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + repos: - repo: https://github.com/PyCQA/isort rev: 5.6.4 @@ -56,20 +58,28 @@ repos: hooks: - id: pydocstyle args: ["--config=python/.flake8"] - - repo: local + exclude: | + (?x)^( + ci| + cpp| + conda| + docs| + java| + notebooks| + python/dask_cudf| + python/cudf_kafka| + python/custreamz| + python/cudf/cudf/tests + ) + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v11.1.0 hooks: - id: clang-format - # Using the pre-commit stage to simplify invocation of all - # other hooks simultaneously (via any other hook stage). This - # can be removed if we also move to running clang-format - # entirely through pre-commit. - stages: [commit] - name: clang-format - description: Format files with ClangFormat. - entry: clang-format -i - language: system files: \.(cu|cuh|h|hpp|cpp|inl)$ - args: ['-fallback-style=none'] + types_or: [file] + args: ['-fallback-style=none', '-style=file', '-i'] + - repo: local + hooks: - id: cmake-format name: cmake-format entry: ./cpp/scripts/run-cmake-format.sh cmake-format @@ -78,7 +88,7 @@ repos: # Note that pre-commit autoupdate does not update the versions # of dependencies, so we'll have to update this manually. additional_dependencies: - - cmake-format==0.6.11 + - cmakelang==0.6.13 - id: cmake-lint name: cmake-lint entry: ./cpp/scripts/run-cmake-format.sh cmake-lint @@ -87,7 +97,7 @@ repos: # Note that pre-commit autoupdate does not update the versions # of dependencies, so we'll have to update this manually. additional_dependencies: - - cmake-format==0.6.11 + - cmakelang==0.6.13 - id: copyright-check name: copyright-check # This hook's use of Git tools appears to conflict with diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6d1c0528832..db8a8d88b99 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,69 +1,79 @@ # Contributing to cuDF -Contributions to cuDF fall into the following three categories. - -1. To report a bug, request a new feature, or report a problem with - documentation, please file an [issue](https://github.com/rapidsai/cudf/issues/new/choose) - describing in detail the problem or new feature. The RAPIDS team evaluates - and triages issues, and schedules them for a release. If you believe the - issue needs priority attention, please comment on the issue to notify the - team. -2. To propose and implement a new Feature, please file a new feature request - [issue](https://github.com/rapidsai/cudf/issues/new/choose). Describe the - intended feature and discuss the design and implementation with the team and - community. Once the team agrees that the plan looks good, go ahead and - implement it, using the [code contributions](#code-contributions) guide below. -3. To implement a feature or bug-fix for an existing outstanding issue, please - Follow the [code contributions](#code-contributions) guide below. If you - need more context on a particular issue, please ask in a comment. - -As contributors and maintainers to this project, -you are expected to abide by cuDF's code of conduct. -More information can be found at: [Contributor Code of Conduct](https://docs.rapids.ai/resources/conduct/). +Contributions to cuDF fall into the following categories: + +1. 
To report a bug, request a new feature, or report a problem with documentation, please file an + [issue](https://github.com/rapidsai/cudf/issues/new/choose) describing the problem or new feature + in detail. The RAPIDS team evaluates and triages issues, and schedules them for a release. If you + believe the issue needs priority attention, please comment on the issue to notify the team. +2. To propose and implement a new feature, please file a new feature request + [issue](https://github.com/rapidsai/cudf/issues/new/choose). Describe the intended feature and + discuss the design and implementation with the team and community. Once the team agrees that the + plan looks good, go ahead and implement it, using the [code contributions](#code-contributions) + guide below. +3. To implement a feature or bug fix for an existing issue, please follow the [code + contributions](#code-contributions) guide below. If you need more context on a particular issue, + please ask in a comment. + +As contributors and maintainers to this project, you are expected to abide by cuDF's code of +conduct. More information can be found at: +[Contributor Code of Conduct](https://docs.rapids.ai/resources/conduct/). ## Code contributions ### Your first issue -1. Follow the guide at the bottom of this page for [Setting Up Your Build Environment](#setting-up-your-build-environment). -2. Find an issue to work on. The best way is to look for the [good first issue](https://github.com/rapidsai/cudf/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) - or [help wanted](https://github.com/rapidsai/cudf/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) labels. +1. Follow the guide at the bottom of this page for + [Setting up your build environment](#setting-up-your-build-environment). +2. Find an issue to work on. The best way is to look for the + [good first issue](https://github.com/rapidsai/cudf/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) + or [help wanted](https://github.com/rapidsai/cudf/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) + labels. 3. Comment on the issue stating that you are going to work on it. -4. Code! Make sure to update unit tests! -5. When done, [create your pull request](https://github.com/rapidsai/cudf/compare). -6. Verify that CI passes all [status checks](https://help.github.com/articles/about-status-checks/). Fix if needed. -7. Wait for other developers to review your code and update code as needed. -8. Once reviewed and approved, a RAPIDS developer will merge your pull request. - -Remember, if you are unsure about anything, don't hesitate to comment on issues -and ask for clarifications! +4. Create a fork of the cudf repository and check out a branch with a name that + describes your planned work. For example, `fix-documentation`. +5. Write code to address the issue or implement the feature. +6. Add unit tests and unit benchmarks. +7. [Create your pull request](https://github.com/rapidsai/cudf/compare). +8. Verify that CI passes all [status checks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks). + Fix if needed. +9. Wait for other developers to review your code and update code as needed. +10. Once reviewed and approved, a RAPIDS developer will merge your pull request. + +If you are unsure about anything, don't hesitate to comment on issues and ask for clarification! 
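The fork-and-branch setup in step 4 condenses to a few git commands. A minimal sketch, assuming the fork was already created through the GitHub UI (`<your-username>` is a placeholder for your GitHub handle, and the `upstream` remote name is just a common convention):

```bash
# Clone your fork and enter the repository
git clone https://github.com/<your-username>/cudf.git
cd cudf

# Track the main repository so you can keep your fork current
git remote add upstream https://github.com/rapidsai/cudf.git

# Create a branch whose name describes the planned work
git checkout -b fix-documentation
```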
### Seasoned developers -Once you have gotten your feet wet and are more comfortable with the code, you -can look at the prioritized issues for our next release in our [project boards](https://github.com/rapidsai/cudf/projects). - -> **Pro Tip:** Always look at the release board with the highest number for -issues to work on. This is where RAPIDS developers also focus their efforts. +Once you have gotten your feet wet and are more comfortable with the code, you can look at the +prioritized issues for our next release in our +[project boards](https://github.com/rapidsai/cudf/projects). -Look at the unassigned issues, and find an issue to which you are comfortable -contributing. Start with _Step 3_ above, commenting on the issue to let -others know you are working on it. If you have any questions related to the -implementation of the issue, ask them in the issue instead of the PR. +**Note:** Always look at the release board that is +[currently under development](https://docs.rapids.ai/maintainers) for issues to work on. This is +where RAPIDS developers also focus their efforts. -## Setting Up Your Build Environment +Look at the unassigned issues, and find an issue to which you are comfortable contributing. Start +with _Step 3_ above, commenting on the issue to let others know you are working on it. If you have +any questions related to the implementation of the issue, ask them in the issue instead of the PR. -The following instructions are for developers and contributors to cuDF OSS development. These instructions are tested on Linux Ubuntu 16.04 & 18.04. Use these instructions to build cuDF from source and contribute to its development. Other operating systems may be compatible, but are not currently tested. +## Setting up your build environment +The following instructions are for developers and contributors to cuDF development. These +instructions are tested on Ubuntu Linux LTS releases. Use these instructions to build cuDF from +source and contribute to its development. Other operating systems may be compatible, but are not +currently tested. +Building cudf with the provided conda environment is recommended for users who wish to enable all +library features. The following instructions are for building with a conda environment. Dependencies +for a minimal build of libcudf without using conda are also listed below. ### General requirements Compilers: -* `gcc` version 9.3+ -* `nvcc` version 11.5+ -* `cmake` version 3.20.1+ +* `gcc` version 9.3+ +* `nvcc` version 11.5+ +* `cmake` version 3.20.1+ CUDA/GPU: @@ -71,127 +81,166 @@ CUDA/GPU: * NVIDIA driver 450.80.02+ * Pascal architecture or better -You can obtain CUDA from [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads). +You can obtain CUDA from +[https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads). -### Create the build Environment +### Create the build environment + +- Clone the repository: -- Clone the repository and submodules ```bash CUDF_HOME=$(pwd)/cudf git clone https://github.com/rapidsai/cudf.git $CUDF_HOME cd $CUDF_HOME -git submodule update --init --remote --recursive ``` + +#### Building with a conda environment + +**Note:** Using a conda environment is the easiest way to satisfy the library's dependencies. +Instructions for a minimal build environment without conda are included below. 
+ - Create the conda development environment `cudf_dev`: + ```bash # create the conda environment (assuming in base `cudf` directory) -# note: RAPIDS currently doesn't support `channel_priority: strict`; use `channel_priority: flexible` instead +# note: RAPIDS currently doesn't support `channel_priority: strict`; +# use `channel_priority: flexible` instead conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda11.5.yml # activate the environment conda activate cudf_dev ``` -- For other CUDA versions, check the corresponding cudf_dev_cuda*.yml file in conda/environments + +- **Note**: the conda environment files are updated frequently, so the + development environment may also need to be updated if dependency versions or + pinnings are changed. + +- For other CUDA versions, check the corresponding `cudf_dev_cuda*.yml` file in + `conda/environments/`. + +#### Building without a conda environment + +- libcudf has the following minimal dependencies (in addition to those listed in the [General + requirements](#general-requirements)). The packages listed below use Ubuntu package names: + + - `build-essential` + - `libssl-dev` + - `libz-dev` + - `libpython3-dev` (required if building cudf) ### Build cuDF from source -- A `build.sh` script is provided in `$CUDF_HOME`. Running the script with no additional arguments will install the `libcudf`, `cudf` and `dask_cudf` libraries. By default, the libraries are installed to the `$CONDA_PREFIX` directory. To install into a different location, set the location in `$INSTALL_PREFIX`. Finally, note that the script depends on the `nvcc` executable being on your path, or defined in `$CUDACXX`. +- A `build.sh` script is provided in `$CUDF_HOME`. Running the script with no additional arguments + will install the `libcudf`, `cudf` and `dask_cudf` libraries. By default, the libraries are + installed to the `$CONDA_PREFIX` directory. To install into a different location, set the location + in `$INSTALL_PREFIX`. Finally, note that the script depends on the `nvcc` executable being on your + path, or defined in `$CUDACXX`. + ```bash cd $CUDF_HOME # Choose one of the following commands, depending on whether -# you want to build and install the libcudf C++ library only, +# you want to build and install the libcudf C++ library only, # or include the cudf and/or dask_cudf Python libraries: ./build.sh # libcudf, cudf and dask_cudf ./build.sh libcudf # libcudf only -./build.sh libcudf cudf # libcudf and cudf only +./build.sh libcudf cudf # libcudf and cudf only ``` -- Other libraries like `cudf-kafka` and `custreamz` can be installed with this script. For the complete list of libraries as well as details about the script usage, run the `help` command: + +- Other libraries like `cudf-kafka` and `custreamz` can be installed with this script. For the + complete list of libraries as well as details about the script usage, run the `help` command: + ```bash -./build.sh --help +./build.sh --help ``` ### Build, install and test cuDF libraries for contributors -The general workflow is provided below. Please, also see the last section about [code formatting](###code-formatting). +The general workflow is provided below. Please also see the last section about +[code formatting](#code-formatting). 
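Before diving into the per-library subsections below, it may help to see the whole contributor loop at a glance. The following is a condensed sketch assembled from the commands documented in the next subsections, assuming the `cudf_dev` conda environment is active:

```bash
# Build libcudf together with its unit tests, then run them
cd $CUDF_HOME
./build.sh libcudf tests
make test

# Build and install the cudf Python package in editable/develop mode
cd $CUDF_HOME/python/cudf
python setup.py build_ext --inplace
python setup.py develop

# Run the cudf Python tests
cd $CUDF_HOME/python
pytest -v cudf/cudf/tests
```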
#### `libcudf` (C++) -If you're only interested in building the library (and not the unit tests): - +- If you're only interested in building the library (and not the unit tests): + ```bash cd $CUDF_HOME ./build.sh libcudf ``` -If, in addition, you want to build tests: + +- If, in addition, you want to build tests: ```bash ./build.sh libcudf tests ``` -To run the tests: + +- To run the tests: ```bash -make test +make test ``` #### `cudf` (Python) - First, build the `libcudf` C++ library following the steps above -- To build and install in edit/develop `cudf` python package: +- To build and install in edit/develop `cudf` Python package: ```bash cd $CUDF_HOME/python/cudf python setup.py build_ext --inplace python setup.py develop ``` -- To run `cudf` tests : +- To run `cudf` tests: ```bash cd $CUDF_HOME/python -py.test -v cudf/cudf/tests +pytest -v cudf/cudf/tests ``` #### `dask-cudf` (Python) - First, build the `libcudf` C++ and `cudf` Python libraries following the steps above -- To install in edit/develop mode the `dask-cudf` python package: +- To install the `dask-cudf` Python package in editable/develop mode: ```bash cd $CUDF_HOME/python/dask_cudf python setup.py build_ext --inplace python setup.py develop ``` -- To run `dask_cudf` tests : +- To run `dask_cudf` tests: ```bash cd $CUDF_HOME/python -py.test -v dask_cudf +pytest -v dask_cudf ``` #### `libcudf_kafka` (C++) -If you're only interested in building the library (and not the unit tests): - +- If you're only interested in building the library (and not the unit tests): + ```bash cd $CUDF_HOME ./build.sh libcudf_kafka ``` -If, in addition, you want to build tests: + +- If, in addition, you want to build tests: ```bash ./build.sh libcudf_kafka tests ``` -To run the tests: + +- To run the tests: ```bash -make test +make test ``` #### `cudf-kafka` (Python) -- First, build the `libcudf` and `libcudf_kafka` following the steps above +- First, build the `libcudf` and `libcudf_kafka` libraries following the steps above + +- To install the `cudf-kafka` Python package in editable/develop mode: -- To install in edit/develop mode the `cudf-kafka` python package: ```bash cd $CUDF_HOME/python/cudf_kafka python setup.py build_ext --inplace @@ -202,7 +251,8 @@ python setup.py develop - First, build `libcudf`, `libcudf_kafka`, and `cudf_kafka` following the steps above -- To install in edit/develop mode the `custreamz` python package: +- To install the `custreamz` Python package in editable/develop mode: + ```bash cd $CUDF_HOME/python/custreamz python setup.py build_ext --inplace @@ -210,40 +260,45 @@ python setup.py develop ``` - To run `custreamz` tests : + ```bash cd $CUDF_HOME/python -py.test -v custreamz +pytest -v custreamz ``` #### `cudf` (Java): - First, build the `libcudf` C++ library following the steps above -- Then, refer to [Java README](https://github.com/rapidsai/cudf/blob/branch-21.10/java/README.md) - +- Then, refer to the [Java README](java/README.md) -Done! You are ready to develop for the cuDF OSS project. But please go to [code formatting](###code-formatting) to ensure that you contributing code follows the expected format. +Done! You are ready to develop for the cuDF project. Please review the project's +[code formatting guidelines](#code-formatting). ## Debugging cuDF -### Building Debug mode from source +### Building in debug mode from source -Follow the [above instructions](####build-cudf-from-source) to build from source and add `-g` to the `./build.sh` command. 
+Follow the instructions to [build from source](#build-cudf-from-source) and add `-g` to the +`./build.sh` command. For example: + ```bash ./build.sh libcudf -g ``` -This builds `libcudf` in Debug mode which enables some `assert` safety checks and includes symbols in the library for debugging. +This builds `libcudf` in debug mode which enables some `assert` safety checks and includes symbols +in the library for debugging. All other steps for installing `libcudf` into your environment are the same. ### Debugging with `cuda-gdb` and `cuda-memcheck` -When you have a debug build of `libcudf` installed, debugging with the `cuda-gdb` and `cuda-memcheck` is easy. +When you have a debug build of `libcudf` installed, debugging with the `cuda-gdb` and +`cuda-memcheck` is easy. -If you are debugging a Python script, simply run the following: +If you are debugging a Python script, run the following: ```bash cuda-gdb -ex r --args python <program_name>.py @@ -255,143 +310,71 @@ cuda-memcheck python <program_name>.py ### Device debug symbols -The device debug symbols are not automatically added with the cmake `Debug` -build type because it causes a runtime delay of several minutes when loading -the libcudf.so library. +The device debug symbols are not automatically added with the cmake `Debug` build type because it +causes a runtime delay of several minutes when loading the libcudf.so library. -Therefore, it is recommended to add device debug symbols only to specific files by -setting the `-G` compile option locally in your `cpp/CMakeLists.txt` for that file. -Here is an example of adding the `-G` option to the compile command for -`src/copying/copy.cu` source file: +Therefore, it is recommended to add device debug symbols only to specific files by setting the `-G` +compile option locally in your `cpp/CMakeLists.txt` for that file. Here is an example of adding the +`-G` option to the compile command for `src/copying/copy.cu` source file: -``` +```cmake set_source_files_properties(src/copying/copy.cu PROPERTIES COMPILE_OPTIONS "-G") ``` -This will add the device debug symbols for this object file in libcudf.so. -You can then use `cuda-dbg` to debug into the kernels in that source file. - -### Building and Testing on a gpuCI image locally - -Before submitting a pull request, you can do a local build and test on your machine that mimics our gpuCI environment using the `ci/local/build.sh` script. -For detailed information on usage of this script, see [here](ci/local/README.md). - +This will add the device debug symbols for this object file in `libcudf.so`. You can then use +`cuda-gdb` to debug into the kernels in that source file. -## Automated Build in Docker Container +## Code Formatting -A Dockerfile is provided with a preconfigured conda environment for building and installing cuDF from source based off of the main branch. +### C++/CUDA -### Prerequisites +cuDF uses [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html). -* Install [nvidia-docker2](https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0)) for Docker + GPU support -* Verify NVIDIA driver is `450.80.02` or higher -* Ensure CUDA 11.0+ is installed - -### Usage +In order to format the C++/CUDA files, navigate to the root (`cudf`) directory and run: -From cudf project root run the following, to build with defaults: -```bash -docker build --tag cudf .
-``` -After the container is built run the container: ```bash -docker run --runtime=nvidia -it cudf bash -``` -Activate the conda environment `cudf` to use the newly built cuDF and libcudf libraries: -``` -root@3f689ba9c842:/# source activate cudf -(cudf) root@3f689ba9c842:/# python -c "import cudf" -(cudf) root@3f689ba9c842:/# +python3 ./cpp/scripts/run-clang-format.py -inplace ``` -### Customizing the Build - -Several build arguments are available to customize the build process of the -container. These are specified by using the Docker [build-arg](https://docs.docker.com/engine/reference/commandline/build/#set-build-time-variables---build-arg) -flag. Below is a list of the available arguments and their purpose: +Additionally, many editors have plugins or extensions that you can set up to automatically run +`clang-format` either manually or on file save. -| Build Argument | Default Value | Other Value(s) | Purpose | -| --- | --- | --- | --- | -| `CUDA_VERSION` | 11.0 | 11.2.2 | set CUDA version | -| `LINUX_VERSION` | ubuntu18.04 | ubuntu20.04 | set Ubuntu version | -| `CC` & `CXX` | 9 | 10 | set gcc/g++ version | -| `CUDF_REPO` | This repo | Forks of cuDF | set git URL to use for `git clone` | -| `CUDF_BRANCH` | main | Any branch name | set git branch to checkout of `CUDF_REPO` | -| `NUMBA_VERSION` | newest | >=0.40.0 | set numba version | -| `NUMPY_VERSION` | newest | >=1.14.3 | set numpy version | -| `PANDAS_VERSION` | newest | >=0.23.4 | set pandas version | -| `PYARROW_VERSION` | 1.0.1 | Not supported | set pyarrow version | -| `CMAKE_VERSION` | newest | >=3.18 | set cmake version | -| `CYTHON_VERSION` | 0.29 | Not supported | set Cython version | -| `PYTHON_VERSION` | 3.7 | 3.8 | set python version | +### Python / Pre-commit hooks +cuDF uses [pre-commit](https://pre-commit.com/) to execute code linters and formatters such as +[Black](https://black.readthedocs.io/en/stable/), [isort](https://pycqa.github.io/isort/), and +[flake8](https://flake8.pycqa.org/en/latest/). These tools ensure a consistent code format +throughout the project. Using pre-commit ensures that linter versions and options are aligned for +all developers. Additionally, there is a CI check in place to enforce that committed code follows +our standards. -### Code Formatting - - -#### Python - -cuDF uses [Black](https://black.readthedocs.io/en/stable/), -[isort](https://readthedocs.org/projects/isort/), and -[flake8](http://flake8.pycqa.org/en/latest/) to ensure a consistent code format -throughout the project. They have been installed during the `cudf_dev` environment creation. - -These tools are used to auto-format the Python code, as well as check the Cython -code in the repository. Additionally, there is a CI check in place to enforce -that committed code follows our standards. You can use the tools to -automatically format your python code by running: +To use `pre-commit`, install via `conda` or `pip`: ```bash -isort --atomic python/**/*.py -black python +conda install -c conda-forge pre-commit ``` -and then check the syntax of your Python and Cython code by running: - ```bash -flake8 python -flake8 --config=python/.flake8.cython -``` - -Additionally, many editors have plugins that will apply `isort` and `Black` as -you edit files, as well as use `flake8` to report any style / syntax issues. 
- -#### C++/CUDA - -cuDF uses [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html) - -In order to format the C++/CUDA files, navigate to the root (`cudf`) directory and run: -``` -python3 ./cpp/scripts/run-clang-format.py -inplace +pip install pre-commit ``` -Additionally, many editors have plugins or extensions that you can set up to automatically run `clang-format` either manually or on file save. - -#### Pre-commit hooks - -Optionally, you may wish to setup [pre-commit hooks](https://pre-commit.com/) -to automatically run `isort`, `Black`, `flake8` and `clang-format` when you make a git commit. -This can be done by installing `pre-commit` via `conda` or `pip`: +Then run pre-commit hooks before committing code: ```bash -conda install -c conda-forge pre_commit +pre-commit run ``` -```bash -pip install pre-commit -``` - -and then running: +Optionally, you may set up the pre-commit hooks to run automatically when you make a git commit. This can be done by running: ```bash pre-commit install ``` -from the root of the cuDF repository. Now `isort`, `Black`, `flake8` and `clang-format` will be -run each time you commit changes. +Now code linters and formatters will be run each time you commit changes. ---- +You can skip these checks with `git commit --no-verify` or with the short version `git commit -n`. ## Attribution + Portions adopted from https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md Portions adopted from https://github.com/dask/dask/blob/master/docs/source/develop.rst diff --git a/build.sh b/build.sh index e1d6df016dd..48182ca1a6f 100755 --- a/build.sh +++ b/build.sh @@ -148,10 +148,8 @@ function buildLibCudfJniInDocker { -DCUDF_CPP_BUILD_DIR=$workspaceRepoDir/java/target/libcudf-cmake-build \ -DCUDA_STATIC_RUNTIME=ON \ -DPER_THREAD_DEFAULT_STREAM=ON \ - -DRMM_LOGGING_LEVEL=OFF \ -DUSE_GDS=ON \ -DGPU_ARCHS=${CUDF_CMAKE_CUDA_ARCHITECTURES} \ - -DCUDF_JNI_ARROW_STATIC=ON \ -DCUDF_JNI_LIBCUDF_STATIC=ON \ -Dtest=*,!CuFileTest" } diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index 6bea7b2623b..15f4bff583e 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -12,7 +12,7 @@ dependencies: - clang-tools=11.1.0 - cupy>=9.5.0,<11.0.0a0 - rmm=22.06.* - - cmake>=3.20.1,<3.23 + - cmake>=3.20.1,!=3.23.0 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 - numba>=0.54 @@ -54,6 +54,10 @@ dependencies: - hypothesis - sphinx-markdown-tables - sphinx-copybutton + - sphinx-autobuild + - myst-nb + - scipy + - dask-cuda - mimesis<4.1 - packaging - protobuf diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 9e77d44c15d..7d7b5d65cce 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -25,7 +25,7 @@ build: requirements: build: - - cmake >=3.20.1,<3.23 + - cmake >=3.20.1,!=3.23.0 host: - python - cython >=0.29,<0.30 diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 397feab067e..b598a157196 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -1,5 +1,5 @@ cmake_version: - - ">=3.20.1,<3.23" + - ">=3.20.1,!=3.23.0" gtest_version: - "=1.10.0" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 0806bb964cf..68008e13897 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -79,6 +79,7 @@ outputs: - test -f 
$PREFIX/include/cudf/detail/calendrical_month_sequence.cuh - test -f $PREFIX/include/cudf/detail/concatenate.hpp - test -f $PREFIX/include/cudf/detail/copy.hpp + - test -f $PREFIX/include/cudf/detail/copy.cuh - test -f $PREFIX/include/cudf/detail/datetime.hpp - test -f $PREFIX/include/cudf/detail/fill.hpp - test -f $PREFIX/include/cudf/detail/gather.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d9422edaa8f..cbe2811afe4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -154,21 +154,28 @@ include(cmake/thirdparty/get_gtest.cmake) # preprocess jitify-able kernels include(cmake/Modules/JitifyPreprocessKernels.cmake) # find cuFile -include(cmake/Modules/FindcuFile.cmake) +include(cmake/thirdparty/get_cufile.cmake) +# find KvikIO +include(cmake/thirdparty/get_kvikio.cmake) # Workaround until https://github.com/rapidsai/rapids-cmake/issues/176 is resolved if(NOT BUILD_SHARED_LIBS) include("${rapids-cmake-dir}/export/find_package_file.cmake") list(APPEND METADATA_KINDS BUILD INSTALL) + list(APPEND dependencies cuco KvikIO ZLIB nvcomp) + if(TARGET cufile::cuFile_interface) + list(APPEND dependencies cuFile) + endif() + foreach(METADATA_KIND IN LISTS METADATA_KINDS) - rapids_export_find_package_file( - ${METADATA_KIND} "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" cudf-exports - ) - rapids_export_package(${METADATA_KIND} cuco cudf-exports) - rapids_export_package(${METADATA_KIND} ZLIB cudf-exports) - rapids_export_package(${METADATA_KIND} cuFile cudf-exports) - rapids_export_package(${METADATA_KIND} nvcomp cudf-exports) + foreach(dep IN LISTS dependencies) + rapids_export_package(${METADATA_KIND} ${dep} cudf-exports) + endforeach() endforeach() + + if(TARGET conda_env) + install(TARGETS conda_env EXPORT cudf-exports) + endif() endif() # ################################################################################################## @@ -231,6 +238,7 @@ add_library( src/copying/gather.cu src/copying/get_element.cu src/copying/pack.cpp + src/copying/purge_nonempty_nulls.cu src/copying/reverse.cu src/copying/sample.cu src/copying/scatter.cu @@ -295,6 +303,8 @@ add_library( src/io/comp/cpu_unbz2.cpp src/io/comp/debrotli.cu src/io/comp/gpuinflate.cu + src/io/comp/nvcomp_adapter.cpp + src/io/comp/nvcomp_adapter.cu src/io/comp/snap.cu src/io/comp/uncomp.cpp src/io/comp/unsnap.cu @@ -344,10 +354,12 @@ add_library( src/join/join.cu src/join/join_utils.cu src/join/mixed_join.cu - src/join/mixed_join_kernels.cu + src/join/mixed_join_kernel.cu + src/join/mixed_join_kernel_nulls.cu src/join/mixed_join_kernels_semi.cu src/join/mixed_join_semi.cu - src/join/mixed_join_size_kernels.cu + src/join/mixed_join_size_kernel.cu + src/join/mixed_join_size_kernel_nulls.cu src/join/mixed_join_size_kernels_semi.cu src/join/semi_join.cu src/lists/contains.cu @@ -584,7 +596,8 @@ add_dependencies(cudf jitify_preprocess_run) target_link_libraries( cudf PUBLIC ${ARROW_LIBRARIES} libcudacxx::libcudacxx cudf::Thrust rmm::rmm - PRIVATE cuco::cuco ZLIB::ZLIB nvcomp::nvcomp + PRIVATE cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio + $ ) # Add Conda library, and include paths if specified @@ -592,11 +605,6 @@ if(TARGET conda_env) target_link_libraries(cudf PRIVATE conda_env) endif() -# Add cuFile interface if available -if(TARGET cuFile::cuFile_interface) - target_link_libraries(cudf PRIVATE cuFile::cuFile_interface) -endif() - if(CUDA_STATIC_RUNTIME) # Tell CMake what CUDA language runtime to use set_target_properties(cudf PROPERTIES CUDA_RUNTIME_LIBRARY Static) diff --git 
a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 26bb10da69f..e93b2bf4f25 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -194,10 +194,18 @@ ConfigureBench(FILL_BENCH filling/repeat.cpp) # ################################################################################################## # * groupby benchmark ----------------------------------------------------------------------------- ConfigureBench( - GROUPBY_BENCH groupby/group_sum.cu groupby/group_nth.cu groupby/group_shift.cu - groupby/group_struct.cu groupby/group_no_requests.cu groupby/group_scan.cu + GROUPBY_BENCH + groupby/group_sum.cu + groupby/group_nth.cu + groupby/group_shift.cu + groupby/group_struct.cu + groupby/group_no_requests.cu + groupby/group_scan.cu + groupby/group_rank_benchmark.cu ) +ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu) + # ################################################################################################## # * hashing benchmark ----------------------------------------------------------------------------- ConfigureBench(HASHING_BENCH hashing/hash.cpp hashing/partition.cpp) diff --git a/cpp/benchmarks/copying/contiguous_split.cu b/cpp/benchmarks/copying/contiguous_split.cu index 6b129a4a435..a61b18df8d1 100644 --- a/cpp/benchmarks/copying/contiguous_split.cu +++ b/cpp/benchmarks/copying/contiguous_split.cu @@ -17,6 +17,7 @@ #include #include #include + #include #include diff --git a/cpp/benchmarks/copying/copy_if_else.cpp b/cpp/benchmarks/copying/copy_if_else.cpp index 6f094aba680..6f355118f49 100644 --- a/cpp/benchmarks/copying/copy_if_else.cpp +++ b/cpp/benchmarks/copying/copy_if_else.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/benchmarks/groupby/group_rank_benchmark.cu b/cpp/benchmarks/groupby/group_rank_benchmark.cu new file mode 100644 index 00000000000..1eeb15debe9 --- /dev/null +++ b/cpp/benchmarks/groupby/group_rank_benchmark.cu @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include + +#include +#include +#include +#include + +#include + +template +static void nvbench_groupby_rank(nvbench::state& state, + nvbench::type_list>) +{ + using namespace cudf; + using type = int64_t; + constexpr auto dtype = type_to_id(); + cudf::rmm_pool_raii pool_raii; + + bool const is_sorted = state.get_int64("is_sorted"); + cudf::size_type const column_size = state.get_int64("data_size"); + constexpr int num_groups = 100; + + data_profile profile; + profile.set_null_frequency(std::nullopt); + profile.set_cardinality(0); + profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, num_groups); + + auto source_table = create_random_table({dtype, dtype}, row_count{column_size}, profile); + + // values to be pre-sorted too for groupby rank + if (is_sorted) source_table = cudf::sort(*source_table); + + table_view keys{{source_table->view().column(0)}}; + column_view order_by{source_table->view().column(1)}; + + auto agg = cudf::make_rank_aggregation(method); + std::vector requests; + requests.emplace_back(groupby::scan_request()); + requests[0].values = order_by; + requests[0].aggregations.push_back(std::move(agg)); + + groupby::groupby gb_obj(keys, null_policy::EXCLUDE, is_sorted ? sorted::YES : sorted::NO); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + // groupby scan uses sort implementation + auto result = gb_obj.scan(requests); + }); +} + +enum class rank_method : int32_t {}; + +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + cudf::rank_method, + [](cudf::rank_method value) { + switch (value) { + case cudf::rank_method::FIRST: return "FIRST"; + case cudf::rank_method::AVERAGE: return "AVERAGE"; + case cudf::rank_method::MIN: return "MIN"; + case cudf::rank_method::MAX: return "MAX"; + case cudf::rank_method::DENSE: return "DENSE"; + default: return "unknown"; + } + }, + [](cudf::rank_method value) { + switch (value) { + case cudf::rank_method::FIRST: return "cudf::rank_method::FIRST"; + case cudf::rank_method::AVERAGE: return "cudf::rank_method::AVERAGE"; + case cudf::rank_method::MIN: return "cudf::rank_method::MIN"; + case cudf::rank_method::MAX: return "cudf::rank_method::MAX"; + case cudf::rank_method::DENSE: return "cudf::rank_method::DENSE"; + default: return "unknown"; + } + }) + +using methods = nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(nvbench_groupby_rank, NVBENCH_TYPE_AXES(methods)) + .set_type_axes_names({"rank_method"}) + .set_name("groupby_rank") + .add_int64_axis("data_size", + { + 1000000, // 1M + 10000000, // 10M + 100000000, // 100M + }) + + .add_int64_axis("is_sorted", {0, 1}); diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index 1110b6fe9ef..9c0ef5b528d 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/benchmarks/io/csv/csv_reader.cpp b/cpp/benchmarks/io/csv/csv_reader.cpp index c50f5220200..b61ba75ce6e 100644 --- a/cpp/benchmarks/io/csv/csv_reader.cpp +++ b/cpp/benchmarks/io/csv/csv_reader.cpp @@ -14,8 +14,6 @@ * limitations under the License. 
*/ -#include - #include #include #include @@ -52,6 +50,7 @@ void BM_csv_read_varying_input(benchmark::State& state) auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::read_csv(read_options); } @@ -98,6 +97,7 @@ void BM_csv_read_varying_options(benchmark::State& state) cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 for (int32_t chunk = 0; chunk < num_chunks; ++chunk) { // only read the header in the first chunk diff --git a/cpp/benchmarks/io/csv/csv_writer.cpp index 65aa31c68dc..079df45b1d8 100644 --- a/cpp/benchmarks/io/csv/csv_writer.cpp +++ b/cpp/benchmarks/io/csv/csv_writer.cpp @@ -14,8 +14,6 @@ * limitations under the License. */ -#include - #include #include #include diff --git a/cpp/benchmarks/io/cuio_common.cpp index afe0cc77a4c..7d356263220 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -141,3 +141,31 @@ std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks, return selected_segments; } + +// Executes the command and returns stderr output +std::string exec_cmd(std::string_view cmd) +{ + // Switch stderr and stdout to only capture stderr + auto const redirected_cmd = std::string{"( "}.append(cmd).append(" 3>&2 2>&1 1>&3) 2>/dev/null"); + std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(redirected_cmd.c_str(), "r"), pclose); + CUDF_EXPECTS(pipe != nullptr, "popen() failed"); + + std::array<char, 128> buffer; + std::string error_out; + while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { + error_out += buffer.data(); + } + return error_out; +} + +void try_drop_l3_cache() +{ + static bool is_drop_cache_enabled = std::getenv("CUDF_BENCHMARK_DROP_CACHE") != nullptr; + if (not is_drop_cache_enabled) { return; } + + std::array drop_cache_cmds{"/sbin/sysctl vm.drop_caches=3", "sudo /sbin/sysctl vm.drop_caches=3"}; + CUDF_EXPECTS(std::any_of(drop_cache_cmds.cbegin(), + drop_cache_cmds.cend(), + [](auto& cmd) { return exec_cmd(cmd).empty(); }), + "Failed to execute the drop cache command"); +} diff --git a/cpp/benchmarks/io/cuio_common.hpp index 2ed534d5333..8ea29684aae 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -16,12 +16,12 @@ #pragma once +#include + #include #include #include -#include - using cudf::io::io_type; #define RD_BENCHMARK_DEFINE_ALL_SOURCES(benchmark, name, type_or_group) \ @@ -132,3 +132,13 @@ std::vector<std::string> select_column_names(std::vector<std::string> const& col * The segments could be Parquet row groups or ORC stripes. */ std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks, int chunk); + +/** + * @brief Drops L3 cache if `CUDF_BENCHMARK_DROP_CACHE` environment variable is set. + * + * Has no effect if the environment variable is not set. + * May require sudo access to run successfully. + * + * @throw cudf::logic_error if the environment variable is set and the command fails + */ +void try_drop_l3_cache(); diff --git a/cpp/benchmarks/io/orc/orc_reader.cpp index 0fc2238a272..7d6eb432b5b 100644 --- a/cpp/benchmarks/io/orc/orc_reader.cpp +++ b/cpp/benchmarks/io/orc/orc_reader.cpp @@ -14,8 +14,6 @@ * limitations under the License.
*/ -#include - #include #include #include @@ -60,6 +58,7 @@ void BM_orc_read_varying_input(benchmark::State& state) auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::read_orc(read_opts); } @@ -117,6 +116,7 @@ void BM_orc_read_varying_options(benchmark::State& state) cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf::size_type rows_read = 0; diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index 525c13af5c0..4e7781b402a 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -14,15 +14,13 @@ * limitations under the License. */ -#include "cudf/io/types.hpp" -#include - #include #include #include #include #include +#include // to enable, run cmake with -DBUILD_BENCHMARKS=ON diff --git a/cpp/benchmarks/io/parquet/parquet_reader.cpp b/cpp/benchmarks/io/parquet/parquet_reader.cpp index 8a97fd35c31..af7121d37dc 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader.cpp @@ -14,8 +14,6 @@ * limitations under the License. */ -#include - #include #include #include @@ -60,6 +58,7 @@ void BM_parq_read_varying_input(benchmark::State& state) auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer const raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::read_parquet(read_opts); } @@ -117,6 +116,7 @@ void BM_parq_read_varying_options(benchmark::State& state) cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf::size_type rows_read = 0; diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index d25fae42d0e..776121028ef 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -14,8 +14,6 @@ * limitations under the License. */ -#include - #include #include #include diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp index 30ed245ed9a..e22696b9c01 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp @@ -14,21 +14,14 @@ * limitations under the License. 
*/ -#include - -#include -#include - -#include -#include -#include - #include #include #include #include +#include #include +#include // to enable, run cmake with -DBUILD_BENCHMARKS=ON diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index ada8856e8e5..d274f79a77c 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -19,10 +19,9 @@ #include #include -#include - #include +#include #include #include #include @@ -38,8 +37,6 @@ #include #include -using cudf::test::fixed_width_column_wrapper; - temp_directory const temp_dir("cudf_gbench"); enum data_chunk_source_type { @@ -137,6 +134,7 @@ static void BM_multibyte_split(benchmark::State& state) auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); auto output = cudf::io::text::multibyte_split(*source, delim); } diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu index 595775ddf00..5eaaec23211 100644 --- a/cpp/benchmarks/iterator/iterator.cu +++ b/cpp/benchmarks/iterator/iterator.cu @@ -14,13 +14,14 @@ * limitations under the License. */ -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" +#include +#include + +#include #include #include #include -#include #include @@ -31,8 +32,6 @@ #include #include -#include - #include template diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index 6ff2543cf7d..a031b4e656d 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -21,10 +21,8 @@ #include #include -#include -#include - #include +#include #include #include #include diff --git a/cpp/benchmarks/merge/merge.cpp b/cpp/benchmarks/merge/merge.cpp index 88354bcc731..82d89233a33 100644 --- a/cpp/benchmarks/merge/merge.cpp +++ b/cpp/benchmarks/merge/merge.cpp @@ -14,18 +14,15 @@ * limitations under the License. */ -#include +#include +#include +#include #include +#include #include #include -#include - -#include -#include -#include - #include #include diff --git a/cpp/benchmarks/null_mask/set_null_mask.cpp b/cpp/benchmarks/null_mask/set_null_mask.cpp index 2057951ff8d..429a62a2bfa 100644 --- a/cpp/benchmarks/null_mask/set_null_mask.cpp +++ b/cpp/benchmarks/null_mask/set_null_mask.cpp @@ -19,8 +19,6 @@ #include -#include - class SetNullmask : public cudf::benchmark { }; diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp index aef4960789a..8c434465795 100644 --- a/cpp/benchmarks/reduction/scan.cpp +++ b/cpp/benchmarks/reduction/scan.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/benchmarks/reduction/segment_reduce.cu b/cpp/benchmarks/reduction/segment_reduce.cu index 3723147d95c..08fc4622b43 100644 --- a/cpp/benchmarks/reduction/segment_reduce.cu +++ b/cpp/benchmarks/reduction/segment_reduce.cu @@ -82,7 +82,7 @@ std::pair, thrust::device_vector> make_test_d thrust::device_vector d_offsets(offset_it, offset_it + num_segments + 1); - return std::make_pair(std::move((input->release())[0]), d_offsets); + return std::pair(std::move((input->release())[0]), d_offsets); } template diff --git a/cpp/benchmarks/replace/clamp.cpp b/cpp/benchmarks/replace/clamp.cpp index d3a7415a478..e9a259d0c7b 100644 --- a/cpp/benchmarks/replace/clamp.cpp +++ b/cpp/benchmarks/replace/clamp.cpp @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include #include diff --git a/cpp/benchmarks/replace/nans.cpp b/cpp/benchmarks/replace/nans.cpp index e1b05bbc337..28ca798ebf0 100644 --- a/cpp/benchmarks/replace/nans.cpp +++ b/cpp/benchmarks/replace/nans.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/benchmarks/sort/rank.cpp b/cpp/benchmarks/sort/rank.cpp index 22acb241f0b..0a5c1844c69 100644 --- a/cpp/benchmarks/sort/rank.cpp +++ b/cpp/benchmarks/sort/rank.cpp @@ -14,20 +14,13 @@ * limitations under the License. */ -#include "cudf/column/column_view.hpp" -#include - -#include -#include -#include -#include -#include - -#include #include #include #include +#include +#include + class Rank : public cudf::benchmark { }; diff --git a/cpp/benchmarks/sort/sort.cpp b/cpp/benchmarks/sort/sort.cpp index 1a42daa5bb0..d7c33e7170e 100644 --- a/cpp/benchmarks/sort/sort.cpp +++ b/cpp/benchmarks/sort/sort.cpp @@ -14,19 +14,12 @@ * limitations under the License. */ -#include - -#include -#include -#include -#include -#include - -#include #include #include #include +#include + template class Sort : public cudf::benchmark { }; diff --git a/cpp/benchmarks/sort/sort_strings.cpp b/cpp/benchmarks/sort/sort_strings.cpp index 30a7aee043b..a58b9a4f6da 100644 --- a/cpp/benchmarks/sort/sort_strings.cpp +++ b/cpp/benchmarks/sort/sort_strings.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/benchmarks/sort/sort_structs.cpp b/cpp/benchmarks/sort/sort_structs.cpp index 81f7ad8a4c1..9b6c32940f5 100644 --- a/cpp/benchmarks/sort/sort_structs.cpp +++ b/cpp/benchmarks/sort/sort_structs.cpp @@ -16,11 +16,10 @@ #include -#include - -#include #include +#include + #include #include diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index 0d74d0a6b7c..daa22d25677 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/benchmarks/string/combine.cpp b/cpp/benchmarks/string/combine.cpp index a0cfcd15fe8..85c48e18ce1 100644 --- a/cpp/benchmarks/string/combine.cpp +++ b/cpp/benchmarks/string/combine.cpp @@ -16,7 +16,6 @@ #include "string_bench_args.hpp" -#include #include #include #include @@ -24,7 +23,6 @@ #include #include #include -#include class StringCombine : public cudf::benchmark { }; diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index 3a89b5646d7..6689e3611d1 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/benchmarks/string/convert_datetime.cpp b/cpp/benchmarks/string/convert_datetime.cpp index 3782fea1e36..488ce95d397 100644 --- a/cpp/benchmarks/string/convert_datetime.cpp +++ b/cpp/benchmarks/string/convert_datetime.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/benchmarks/string/convert_durations.cpp b/cpp/benchmarks/string/convert_durations.cpp index dc9a1e991b2..6e3a9e8faa9 100644 --- a/cpp/benchmarks/string/convert_durations.cpp +++ b/cpp/benchmarks/string/convert_durations.cpp @@ -14,24 +14,19 @@ * limitations under the License. 
*/ -#include +#include +#include +#include + +#include #include #include - -#include -#include -#include -#include +#include #include #include -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" -#include "cudf/column/column_view.hpp" -#include "cudf/wrappers/durations.hpp" - class DurationsToString : public cudf::benchmark { }; template diff --git a/cpp/benchmarks/string/convert_fixed_point.cpp b/cpp/benchmarks/string/convert_fixed_point.cpp index 05b87906eca..88657c409cd 100644 --- a/cpp/benchmarks/string/convert_fixed_point.cpp +++ b/cpp/benchmarks/string/convert_fixed_point.cpp @@ -14,11 +14,9 @@ * limitations under the License. */ -#include -#include - -#include #include +#include +#include #include #include diff --git a/cpp/benchmarks/string/convert_numerics.cpp b/cpp/benchmarks/string/convert_numerics.cpp index 71a23c76829..3025c32b888 100644 --- a/cpp/benchmarks/string/convert_numerics.cpp +++ b/cpp/benchmarks/string/convert_numerics.cpp @@ -14,11 +14,9 @@ * limitations under the License. */ -#include -#include - -#include #include +#include +#include #include #include diff --git a/cpp/benchmarks/string/copy.cu b/cpp/benchmarks/string/copy.cu index a8f9eb111fc..0280322a3a1 100644 --- a/cpp/benchmarks/string/copy.cu +++ b/cpp/benchmarks/string/copy.cu @@ -20,9 +20,9 @@ #include #include +#include #include #include -#include #include #include diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index b8d206386f5..4ff29285482 100644 --- a/cpp/benchmarks/string/extract.cpp +++ b/cpp/benchmarks/string/extract.cpp @@ -20,9 +20,10 @@ #include #include +#include + #include #include -#include #include diff --git a/cpp/benchmarks/string/factory.cu b/cpp/benchmarks/string/factory.cu index 2e0bf4afb36..dde0b7e4424 100644 --- a/cpp/benchmarks/string/factory.cu +++ b/cpp/benchmarks/string/factory.cu @@ -16,14 +16,14 @@ #include "string_bench_args.hpp" -#include #include #include #include +#include + #include #include -#include #include #include diff --git a/cpp/benchmarks/string/filter.cpp b/cpp/benchmarks/string/filter.cpp index b39cf25bc91..064b824619e 100644 --- a/cpp/benchmarks/string/filter.cpp +++ b/cpp/benchmarks/string/filter.cpp @@ -14,17 +14,17 @@ * limitations under the License. */ -#include #include #include #include +#include + #include #include #include #include #include -#include #include #include diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 55eb52c9b30..aaa7bd29b31 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -14,16 +14,16 @@ * limitations under the License. */ -#include #include #include #include +#include + #include #include #include #include -#include #include diff --git a/cpp/benchmarks/string/repeat_strings.cpp b/cpp/benchmarks/string/repeat_strings.cpp index 9044db18522..835a437e3b5 100644 --- a/cpp/benchmarks/string/repeat_strings.cpp +++ b/cpp/benchmarks/string/repeat_strings.cpp @@ -16,7 +16,6 @@ #include "string_bench_args.hpp" -#include #include #include #include diff --git a/cpp/benchmarks/string/replace.cpp b/cpp/benchmarks/string/replace.cpp index 0a3607c64f0..10f6e2a19ed 100644 --- a/cpp/benchmarks/string/replace.cpp +++ b/cpp/benchmarks/string/replace.cpp @@ -14,20 +14,20 @@ * limitations under the License. 
*/ -#include +#include "string_bench_args.hpp" + #include #include #include +#include + #include #include #include -#include #include -#include "string_bench_args.hpp" - class StringReplace : public cudf::benchmark { }; diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index b9d04630837..148cbe678bd 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -16,14 +16,14 @@ #include "string_bench_args.hpp" -#include #include #include #include +#include + #include #include -#include class StringReplace : public cudf::benchmark { }; diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index ad25cfe54de..97eb0ba6dbf 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -14,15 +14,15 @@ * limitations under the License. */ -#include #include #include #include +#include + #include #include #include -#include #include diff --git a/cpp/benchmarks/string/substring.cpp b/cpp/benchmarks/string/substring.cpp index 2195cc56515..a18462385fc 100644 --- a/cpp/benchmarks/string/substring.cpp +++ b/cpp/benchmarks/string/substring.cpp @@ -16,21 +16,20 @@ #include "string_bench_args.hpp" -#include #include #include #include +#include + #include #include #include -#include -#include - -#include #include +#include + class StringSubstring : public cudf::benchmark { }; diff --git a/cpp/benchmarks/string/translate.cpp b/cpp/benchmarks/string/translate.cpp index 38c6ff9c701..2ed0ccceba6 100644 --- a/cpp/benchmarks/string/translate.cpp +++ b/cpp/benchmarks/string/translate.cpp @@ -16,19 +16,19 @@ #include "string_bench_args.hpp" -#include #include #include #include -#include -#include #include -#include +#include +#include #include +#include + class StringTranslate : public cudf::benchmark { }; diff --git a/cpp/benchmarks/string/url_decode.cu b/cpp/benchmarks/string/url_decode.cu index 7971d44536d..40bf2b090d4 100644 --- a/cpp/benchmarks/string/url_decode.cu +++ b/cpp/benchmarks/string/url_decode.cu @@ -14,21 +14,17 @@ * limitations under the License. */ -#include #include #include +#include + #include #include #include #include #include -#include -#include -#include -#include - #include #include #include diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index 157c27ae48a..b1e70517aea 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include @@ -22,7 +21,6 @@ #include #include -#include #include diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 2cc083f4ae8..3b58a7dd187 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -14,15 +14,12 @@ * limitations under the License. */ -#include #include #include #include #include #include -#include -#include #include diff --git a/cpp/benchmarks/text/normalize_spaces.cpp b/cpp/benchmarks/text/normalize_spaces.cpp index 3bd636d4aa9..1fe912e5740 100644 --- a/cpp/benchmarks/text/normalize_spaces.cpp +++ b/cpp/benchmarks/text/normalize_spaces.cpp @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include #include @@ -22,8 +21,6 @@ #include #include -#include -#include #include diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp index 3fbb6054d5c..a093cd767b3 100644 --- a/cpp/benchmarks/text/replace.cpp +++ b/cpp/benchmarks/text/replace.cpp @@ -14,17 +14,18 @@ * limitations under the License. */ -#include #include #include #include -#include -#include #include +#include + #include +#include + class TextReplace : public cudf::benchmark { }; diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp index 150f578a22a..d8357dcf92c 100644 --- a/cpp/benchmarks/text/subword.cpp +++ b/cpp/benchmarks/text/subword.cpp @@ -15,12 +15,13 @@ */ #include -#include -#include -#include #include +#include +#include + +#include #include #include #include @@ -29,7 +30,7 @@ static std::string create_hash_vocab_file() { - std::string dir_template("/tmp"); + std::string dir_template{std::filesystem::temp_directory_path().string()}; if (const char* env_p = std::getenv("WORKSPACE")) dir_template = env_p; std::string hash_file = dir_template + "/hash_vocab.txt"; // create a fake hashed vocab text file for this test diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index 4cb9c9e5271..fea1973c026 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -14,16 +14,15 @@ * limitations under the License. */ -#include #include #include #include #include +#include + #include #include -#include -#include #include #include diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index aba78dad3fe..53dac455b04 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -17,8 +17,6 @@ #include #include -#include - #include #include #include diff --git a/cpp/cmake/Modules/FindcuFile.cmake b/cpp/cmake/Modules/FindcuFile.cmake index e539a6604a8..3661d7d68d6 100644 --- a/cpp/cmake/Modules/FindcuFile.cmake +++ b/cpp/cmake/Modules/FindcuFile.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -20,9 +20,9 @@ Find cuFile headers and libraries. Imported Targets ^^^^^^^^^^^^^^^^ -``cuFile::cuFile`` +``cufile::cuFile`` The cuFile library, if found. -``cuFile::cuFileRDMA`` +``cufile::cuFileRDMA`` The cuFile RDMA library, if found. 
Result Variables @@ -80,29 +80,29 @@ find_package_handle_standard_args( VERSION_VAR cuFile_VERSION ) -if(cuFile_INCLUDE_DIR AND NOT TARGET cuFile::cuFile_interface) - add_library(cuFile::cuFile_interface IMPORTED INTERFACE) +if(cuFile_INCLUDE_DIR AND NOT TARGET cufile::cuFile_interface) + add_library(cufile::cuFile_interface INTERFACE IMPORTED GLOBAL) target_include_directories( - cuFile::cuFile_interface INTERFACE "$" + cufile::cuFile_interface INTERFACE "$" ) - target_compile_options(cuFile::cuFile_interface INTERFACE "${cuFile_COMPILE_OPTIONS}") - target_compile_definitions(cuFile::cuFile_interface INTERFACE CUFILE_FOUND) + target_compile_options(cufile::cuFile_interface INTERFACE "${cuFile_COMPILE_OPTIONS}") + target_compile_definitions(cufile::cuFile_interface INTERFACE CUFILE_FOUND) endif() -if(cuFile_FOUND AND NOT TARGET cuFile::cuFile) - add_library(cuFile::cuFile UNKNOWN IMPORTED) +if(cuFile_FOUND AND NOT TARGET cufile::cuFile) + add_library(cufile::cuFile UNKNOWN IMPORTED GLOBAL) set_target_properties( - cuFile::cuFile + cufile::cuFile PROPERTIES IMPORTED_LOCATION "${cuFile_LIBRARY}" INTERFACE_COMPILE_OPTIONS "${cuFile_COMPILE_OPTIONS}" INTERFACE_INCLUDE_DIRECTORIES "${cuFile_INCLUDE_DIR}" ) endif() -if(cuFile_FOUND AND NOT TARGET cuFile::cuFileRDMA) - add_library(cuFile::cuFileRDMA UNKNOWN IMPORTED) +if(cuFile_FOUND AND NOT TARGET cufile::cuFileRDMA) + add_library(cufile::cuFileRDMA UNKNOWN IMPORTED GLOBAL) set_target_properties( - cuFile::cuFileRDMA + cufile::cuFileRDMA PROPERTIES IMPORTED_LOCATION "${cuFileRDMA_LIBRARY}" INTERFACE_COMPILE_OPTIONS "${cuFile_COMPILE_OPTIONS}" INTERFACE_INCLUDE_DIRECTORIES "${cuFile_INCLUDE_DIR}" diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 2b08e9f2d6c..0b14b812a05 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -80,10 +80,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON") # Arrow's logic to build Boost from source is busted, so we have to get it from the system. list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM") - list(APPEND ARROW_PYTHON_OPTIONS "Thrift_SOURCE BUNDLED") list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") endif() + set(ARROW_PARQUET_OPTIONS "") + if(ENABLE_PARQUET) + # Arrow's logic to build Boost from source is busted, so we have to get it from the system. + list(APPEND ARROW_PARQUET_OPTIONS "BOOST_SOURCE SYSTEM") + list(APPEND ARROW_PARQUET_OPTIONS "Thrift_SOURCE BUNDLED") + list(APPEND ARROW_PARQUET_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") + endif() + # Set this so Arrow correctly finds the CUDA toolkit when the build machine does not have the CUDA # driver installed. This must be an env var. set(ENV{CUDA_LIB_PATH} "${CUDAToolkit_LIBRARY_DIR}/stubs") @@ -106,6 +113,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB "ARROW_S3 ${ENABLE_S3}" "ARROW_ORC ${ENABLE_ORC}" # e.g. 
needed by blazingsql-io
+ ${ARROW_PARQUET_OPTIONS}
 "ARROW_PARQUET ${ENABLE_PARQUET}"
 ${ARROW_PYTHON_OPTIONS}
 # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off
diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 1639655d1e9..5232821d113 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -21,12 +21,14 @@ function(find_and_configure_cucollections)
 cuco 0.0.1
 GLOBAL_TARGETS cuco::cuco
 BUILD_EXPORT_SET cudf-exports
- INSTALL_EXPORT_SET cudf-exports
 CPM_ARGS
 GITHUB_REPOSITORY NVIDIA/cuCollections
 GIT_TAG fb58a38701f1c24ecfe07d8f1f208bbe80930da5
 EXCLUDE_FROM_ALL ${BUILD_SHARED_LIBS}
 OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF"
 )
+ if(NOT BUILD_SHARED_LIBS)
+ rapids_export_package(INSTALL cuco cudf-exports)
+ endif()
 endfunction()
diff --git a/cpp/cmake/thirdparty/get_cufile.cmake b/cpp/cmake/thirdparty/get_cufile.cmake new file mode 100644 index 00000000000..21088f4ec0f --- /dev/null +++ b/cpp/cmake/thirdparty/get_cufile.cmake @@ -0,0 +1,32 @@
+# =============================================================================
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# This function finds cuFile and sets any additional necessary environment variables.
+function(find_and_configure_cufile)
+
+ list(APPEND CMAKE_MODULE_PATH ${CUDF_SOURCE_DIR}/cmake/Modules)
+ rapids_find_package(cuFile QUIET)
+
+ if(cuFile_FOUND AND NOT BUILD_SHARED_LIBS)
+ include("${rapids-cmake-dir}/export/find_package_file.cmake")
+ rapids_export_find_package_file(
+ BUILD "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" cudf-exports
+ )
+ rapids_export_find_package_file(
+ INSTALL "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" cudf-exports
+ )
+ endif()
+endfunction()
+
+find_and_configure_cufile()
diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake new file mode 100644 index 00000000000..e94e024d6c9 --- /dev/null +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -0,0 +1,36 @@
+# =============================================================================
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# This function finds KvikIO
+function(find_and_configure_kvikio VERSION)
+
+ rapids_cpm_find(
+ KvikIO ${VERSION}
+ GLOBAL_TARGETS kvikio::kvikio
+ CPM_ARGS
+ GIT_REPOSITORY https://github.com/rapidsai/kvikio.git
+ GIT_TAG branch-${VERSION}
+ GIT_SHALLOW TRUE SOURCE_SUBDIR cpp
+ OPTIONS "KvikIO_BUILD_EXAMPLES OFF"
+ )
+
+ if(KvikIO_BINARY_DIR)
+ include("${rapids-cmake-dir}/export/find_package_root.cmake")
+ rapids_export_find_package_root(BUILD KvikIO "${KvikIO_BINARY_DIR}" cudf-exports)
+ endif()
+
+endfunction()
+
+set(KVIKIO_MIN_VERSION_cudf "${CUDF_VERSION_MAJOR}.${CUDF_VERSION_MINOR}")
+find_and_configure_kvikio(${KVIKIO_MIN_VERSION_cudf})
diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index 0356725548b..d0007f93628 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -25,6 +25,11 @@ function(find_and_configure_nvcomp VERSION_MIN VERSION_MAX)
 OPTIONS "BUILD_STATIC ON" "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF"
 )
+ if(nvcomp_BINARY_DIR)
+ include("${rapids-cmake-dir}/export/find_package_root.cmake")
+ rapids_export_find_package_root(BUILD nvcomp "${nvcomp_BINARY_DIR}" cudf-exports)
+ endif()
+
 if(NOT TARGET nvcomp::nvcomp)
 add_library(nvcomp::nvcomp ALIAS nvcomp)
 endif()
diff --git a/cpp/docs/BENCHMARKING.md b/cpp/docs/BENCHMARKING.md index 8794c90d1db..270e7a87e85 100644 --- a/cpp/docs/BENCHMARKING.md +++ b/cpp/docs/BENCHMARKING.md @@ -35,6 +35,12 @@
provided in `cpp/benchmarks/synchronization/synchronization.hpp` to help with this. It
can also optionally clear the GPU L2 cache in order to ensure cache hits do not artificially inflate
performance in repeated iterations.

+## Data generation
+
+For generating benchmark input data, helper functions are available at
+[cpp/benchmarks/common/generate_input.hpp](/cpp/benchmarks/common/generate_input.hpp). The input
+data generation happens on the device, in contrast to the `column_wrapper` classes, where data
+generation happens on the host.
+* `create_sequence_table` can generate sequence columns starting with value 0 in the first row and
+ increasing by 1 in subsequent rows.
+* `create_random_table` can generate a table filled with random data. The random data parameters
+ are configurable.
+
 ## What should we benchmark?

 In general, we should benchmark all features over a range of data sizes and types, so that we can
diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md index 1599c81cbe5..84f69f559a8 100644 --- a/cpp/docs/DEVELOPER_GUIDE.md +++ b/cpp/docs/DEVELOPER_GUIDE.md @@ -572,7 +572,7 @@ The preferred style for how inputs are passed in and outputs are returned is the

 Sometimes it is necessary for functions to have multiple outputs. There are a few ways this can be
 done in C++ (including creating a `struct` for the output). One convenient way to do this is
-using `std::tie` and `std::make_pair`. Note that objects passed to `std::make_pair` will invoke
+using `std::tie` and `std::pair`. Note that objects passed to `std::pair` will invoke
 either the copy constructor or the move constructor of the object, and it may be preferable to
 move non-trivially copyable objects (and required for types with deleted copy constructors, like
 `std::unique_ptr`).
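To make the preferred pattern concrete before the guide's own example below, here is a small, self-contained sketch (the `split_evens_odds` function is hypothetical, not part of libcudf) of returning two outputs with `std::pair` and unpacking them with `std::tie`:

```c++
#include <tuple>
#include <utility>
#include <vector>

// Hypothetical function with two outputs, bundled into one std::pair.
// Moving the locals avoids copies and is required for move-only members.
std::pair<std::vector<int>, std::vector<int>> split_evens_odds(std::vector<int> const& in)
{
  std::vector<int> evens;
  std::vector<int> odds;
  for (int v : in) {
    (v % 2 == 0 ? evens : odds).push_back(v);
  }
  // Class template argument deduction picks the member types; no make_pair needed.
  return std::pair(std::move(evens), std::move(odds));
}

int main()
{
  std::vector<int> evens;
  std::vector<int> odds;
  // std::tie unpacks the returned pair into existing variables.
  std::tie(evens, odds) = split_evens_odds({1, 2, 3, 4, 5});
  return (evens.size() == 2 && odds.size() == 3) ? 0 : 1;
}
```

The CTAD form `std::pair(...)` is exactly why this diff replaces `std::make_pair`: since C++17 the deduction guide makes the helper redundant.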
@@ -585,7 +585,7 @@ std::pair return_two_tables(void){
 // Do stuff with out0, out1
 // Return a std::pair of the two outputs
- return std::make_pair(std::move(out0), std::move(out1));
+ return std::pair(std::move(out0), std::move(out1));
 }

 cudf::table out0;
@@ -745,6 +745,30 @@ void isolated_helper_function(...);

[**Anonymous namespaces should *never* be used in a header file.**](https://wiki.sei.cmu.edu/confluence/display/cplusplus/DCL59-CPP.+Do+not+define+an+unnamed+namespace+in+a+header+file)

+# Deprecating and Removing Code
+
+libcudf is constantly evolving to improve performance and better meet our users' needs. As a
+result, we occasionally need to break or entirely remove APIs to respond to new and improved
+understanding of the functionality we provide. Remaining free to do this is essential to making
+libcudf an agile library that can rapidly accommodate our users' needs. As a result, we do not
+always provide a warning or any lead time prior to releasing breaking changes. On a best-effort
+basis, the libcudf team will notify users of changes that we expect to have significant or
+widespread effects.
+
+Where possible, indicate pending API removals using the
+[deprecated](https://en.cppreference.com/w/cpp/language/attributes/deprecated) attribute and
+document them using Doxygen's
+[deprecated](https://www.doxygen.nl/manual/commands.html#cmddeprecated) command prior to removal.
+When a replacement API is available for a deprecated API, mention the replacement in both the
+deprecation message and the deprecation documentation. Pull requests that introduce deprecations
+should be labeled "deprecation" to facilitate discovery and removal in the subsequent release.
+
+Advertise breaking changes by labeling any pull request that breaks or removes an existing API with
+the "breaking" tag. This ensures that the "Breaking" section of the release notes includes a
+description of what has broken from the past release. Label pull requests that contain deprecations
+with the "non-breaking" tag.
+
+
 # Error Handling

 libcudf follows conventions (and provides utilities) enforcing compile-time and run-time
diff --git a/cpp/docs/DOCUMENTATION.md b/cpp/docs/DOCUMENTATION.md index 2382a0eb022..ebb52836577 100644 --- a/cpp/docs/DOCUMENTATION.md +++ b/cpp/docs/DOCUMENTATION.md @@ -369,6 +369,20 @@ Although using 3 backtick marks `` ``` `` for example blocks will work too, they
Do not use the `@example` tag in the comments for a declaration, or doxygen will interpret the entire
source file as example source code. The source file is then published under a separate _Examples_
page in the output.

+### Deprecations
+
+Add a single [@deprecated](https://www.doxygen.nl/manual/commands.html#cmddeprecated) comment line
+to comment blocks for APIs that will be removed in future releases. Mention alternative /
+replacement APIs in the deprecation comment.
+
+```c++
+/**
+ * ...
+ *
+ * @deprecated This function is deprecated. Use another new function instead.
+ */
+```
+
 ## Namespaces

 Doxygen output includes a _Namespaces_ page that shows all the namespaces declared with comment
blocks in the processed files.
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 539a7c04106..5c7513a6c99 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -43,6 +43,32 @@ namespace detail {
 class simple_aggregations_collector;
 class aggregation_finalizer;
 } // namespace detail
+
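Before the aggregation changes, one sketch tying together the two deprecation mechanisms described above: the compiler-visible `[[deprecated]]` attribute and the Doxygen `@deprecated` command. The `old_sum`/`new_sum` functions are hypothetical placeholders:

```c++
/**
 * @brief Sums `size` values starting at `data`.
 *
 * @deprecated Deprecated in favor of new_sum(); scheduled for removal in a
 * future release.
 */
[[deprecated("Use new_sum() instead.")]] int old_sum(int const* data, int size);

/**
 * @brief Sums `size` values starting at `data` (replacement for old_sum()).
 */
int new_sum(int const* data, int size);
```

Pairing the two means callers get a compile-time warning naming the replacement, and readers of the generated docs see the same guidance.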
+/**
+ * @brief Tie-breaker method to use for ranking the column.
+ *
+ * @see cudf::make_rank_aggregation for more details.
+ * @ingroup column_sort
+ */
+enum class rank_method : int32_t {
+ FIRST, ///< stable sort order ranking (no ties)
+ AVERAGE, ///< mean of the ranks of all tied rows
+ MIN, ///< lowest rank of all tied rows
+ MAX, ///< highest rank of all tied rows
+ DENSE ///< rank always increases by 1 between groups
+};
+
+/**
+ * @brief Whether returned ranks should be converted to percentages, and the
+ * type of percentage normalization to apply.
+ *
+ */
+enum class rank_percentage : int32_t {
+ NONE, ///< rank
+ ZERO_NORMALIZED, ///< rank / count
+ ONE_NORMALIZED ///< (rank - 1) / (count - 1)
+};
+
 /**
 * @brief Abstract base class for specifying the desired aggregation in an
 * `aggregation_request`.
@@ -77,9 +103,7 @@ class aggregation {
 NUNIQUE, ///< count number of unique elements
 NTH_ELEMENT, ///< get the nth element
 ROW_NUMBER, ///< get row-number of current index (relative to rolling window)
- RANK, ///< get rank of current index
- DENSE_RANK, ///< get dense rank of current index
- PERCENT_RANK, ///< get percent (i.e. fractional) rank of current index
+ RANK, ///< get rank of current index
 COLLECT_LIST, ///< collect values into a list
 COLLECT_SET, ///< collect values into a list without duplicate entries
 LEAD, ///< window function, accesses row at specified offset following current row
@@ -323,9 +347,11 @@ std::unique_ptr make_row_number_aggregation();
 /**
 * @brief Factory to create a RANK aggregation
 *
- * `RANK` returns a non-nullable column of size_type "ranks": the number of rows preceding or
- * equal to the current row plus one. As a result, ranks are not unique and gaps will appear in
- * the ranking sequence.
+ * `RANK` returns a column of size_type or double "ranks" (see note 3 below for how the
+ * data type is determined) for a given rank method and column order.
+ * If nulls are excluded, the rank will be null for those rows; otherwise a non-nullable column is
+ * returned. A double precision column is returned only when percentage != NONE or when the rank
+ * method is AVERAGE.
 *
 * This aggregation only works with "scan" algorithms. The input column into the group or
 * ungrouped scan is an orderby column that orders the rows that the aggregate function ranks.
 * If rows are ordered by more than one column, the orderby input column should be a struct
 * column containing the ordering columns.
 *
 * Note:
- * 1. This method requires that the rows are presorted by the group keys and order_by columns.
- * 2. `RANK` aggregations will return a fully valid column regardless of null_handling policy
- * specified in the scan.
- * 3. `RANK` aggregations are not compatible with exclusive scans.
+ * 1. This method can work faster when the rows are presorted by the group keys and order_by
+ * columns. Although the groupby object does not require the order_by column to be sorted, the
+ * groupby rank scan aggregation does require the order_by column to be sorted if the keys are sorted.
+ * 2. `RANK` aggregations are not compatible with exclusive scans.
+ * 3. All rank methods return a size_type column, except when the rank method is AVERAGE or
+ * percentage != NONE, in which case a double column is returned.
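For readers who want to see the consolidated factory in action before the pseudo-code example that follows, here is a sketch of a grouped MIN-rank scan. The factory signature comes from this diff; the surrounding groupby plumbing (`scan_request` and friends) follows cudf's public headers and should be treated as illustrative rather than authoritative:

```c++
#include <cudf/aggregation.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>
#include <vector>

// Grouped MIN-rank scan: rank `time` within each `venue` group, as in the
// motor-racing example. Assumes both columns are presorted, per note 1.
std::unique_ptr<cudf::column> min_rank_by_venue(cudf::column_view const& venue,
                                                cudf::column_view const& time)
{
  cudf::groupby::groupby gb(cudf::table_view({venue}));

  std::vector<cudf::groupby::scan_request> requests(1);
  requests[0].values = time;
  requests[0].aggregations.push_back(
    cudf::make_rank_aggregation<cudf::groupby_scan_aggregation>(
      cudf::rank_method::MIN));  // remaining parameters keep their defaults

  auto [group_keys, results] = gb.scan(requests);
  return std::move(results[0].results[0]);
}
```

With the defaults (`ASCENDING`, `EXCLUDE`, `AFTER`, `NONE`), this should produce the "min" row of the example below.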
 *
 * @code{.pseudo}
 * Example: Consider a motor-racing statistics dataset, containing the following columns:
 * 1. venue: (STRING) Location of the race event
 * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters)
 * 3. time: (INT32) Time taken to complete the circuit
 *
 * For the following presorted data:
 *
 * [ // venue, driver, time
 * { "silverstone", "HAM" ("hamilton"), 15823},
 * { "silverstone", "LEC" ("leclerc"), 15827},
 * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place.
 * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place.
 * { "silverstone", "RIC" ("ricciardo"), 15905},
 * { "monza", "RIC" ("ricciardo"), 12154},
 * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place.
 * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place.
 * { "monza", "LEC" ("leclerc"), 12201},
 * { "monza", "PER" ("perez"), 12203}
 * ]
 *
 * A grouped rank aggregation scan with:
 * groupby column : venue
 * input orderby column: time
- * Produces the following rank column:
- * { 1, 2, 3, 3, 5, 1, 2, 2, 4, 5}
- * (This corresponds to the following grouping and `driver` rows:)
- * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" }
- * <----------silverstone----------->|<-------------monza-------------->
- * @endcode
- */
-template
-std::unique_ptr make_rank_aggregation();
-
-/**
- * @brief Factory to create a DENSE_RANK aggregation
- *
- * `DENSE_RANK` returns a non-nullable column of size_type "dense ranks": the preceding unique
- * value's rank plus one. As a result, ranks are not unique but there are no gaps in the ranking
- * sequence (unlike RANK aggregations).
- *
- * This aggregation only works with "scan" algorithms. The input column into the group or
- * ungrouped scan is an orderby column that orders the rows that the aggregate function ranks.
- * If rows are ordered by more than one column, the orderby input column should be a struct
- * column containing the ordering columns.
- *
- * Note:
- * 1. This method requires that the rows are presorted by the group keys and order_by columns.
- * 2. `DENSE_RANK` aggregations will return a fully valid column regardless of null_handling
- * policy specified in the scan.
- * 3. `DENSE_RANK` aggregations are not compatible with exclusive scans.
- *
- * @code{.pseudo}
- * Example: Consider a motor-racing statistics dataset, containing the following columns:
- * 1. venue: (STRING) Location of the race event
- * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters)
- * 3. time: (INT32) Time taken to complete the circuit
- *
- * For the following presorted data:
+ * Produces the following rank column for each method:
+ * first: { 1, 2, 3, 4, 5, 1, 2, 3, 4, 5}
+ * average: { 1, 2, 3.5, 3.5, 5, 1, 2.5, 2.5, 4, 5}
+ * min: { 1, 2, 3, 3, 5, 1, 2, 2, 4, 5}
+ * max: { 1, 2, 4, 4, 5, 1, 3, 3, 4, 5}
+ * dense: { 1, 2, 3, 3, 4, 1, 2, 2, 3, 4}
+ * This corresponds to the following grouping and `driver` rows:
+ * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" }
+ * <----------silverstone----------->|<-------------monza-------------->
+ *
+ * min rank for each percentage type:
+ * NONE: { 1, 2, 3, 3, 5, 1, 2, 2, 4, 5 }
+ * ZERO_NORMALIZED : { 0.16, 0.33, 0.50, 0.50, 0.83, 0.16, 0.33, 0.33, 0.66, 0.83 }
+ * ONE_NORMALIZED: { 0.00, 0.25, 0.50, 0.50, 1.00, 0.00, 0.25, 0.25, 0.75, 1.00 }
+ * where count corresponds to the number of rows in the group. @see cudf::rank_percentage
- *
- * [ // venue, driver, time
- * { "silverstone", "HAM" ("hamilton"), 15823},
- * { "silverstone", "LEC" ("leclerc"), 15827},
- * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place.
- * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place.
- * { "silverstone", "RIC" ("ricciardo"), 15905},
- * { "monza", "RIC" ("ricciardo"), 12154},
- * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place.
- * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place.
- * { "monza", "LEC" ("leclerc"), 12201}, - * { "monza", "PER" ("perez"), 12203} - * ] - * - * A grouped dense rank aggregation scan with: - * groupby column : venue - * input orderby column: time - * Produces the following dense rank column: - * { 1, 2, 3, 3, 4, 1, 2, 2, 3, 4} - * (This corresponds to the following grouping and `driver` rows:) - * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } - * <----------silverstone----------->|<-------------monza--------------> * @endcode - */ -template -std::unique_ptr make_dense_rank_aggregation(); - -/** - * @brief Factory to create a PERCENT_RANK aggregation * - * `PERCENT_RANK` returns a non-nullable column of double precision "fractional" ranks. - * For row index `i`, the percent rank of row `i` is defined as: - * percent_rank = (rank - 1) / (group_row_count - 1) - * where, - * 1. rank is the `RANK` of the row within the group - * 2. group_row_count is the number of rows in the group - * - * This aggregation only works with "scan" algorithms. The input to the grouped or - * ungrouped scan is an orderby column that orders the rows that the aggregate function ranks. - * If rows are ordered by more than one column, the orderby input column should be a struct - * column containing the ordering columns. - * - * Note: - * 1. This method requires that the rows are presorted by the group keys and order_by columns. - * 2. `PERCENT_RANK` aggregations will return a fully valid column regardless of null_handling - * policy specified in the scan. - * 3. `PERCENT_RANK` aggregations are not compatible with exclusive scans. - * - * @code{.pseudo} - * Example: Consider a motor-racing statistics dataset, containing the following columns: - * 1. venue: (STRING) Location of the race event - * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters) - * 3. time: (INT32) Time taken to complete the circuit - * - * For the following presorted data: - * - * [ // venue, driver, time - * { "silverstone", "HAM" ("hamilton"), 15823}, - * { "silverstone", "LEC" ("leclerc"), 15827}, - * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place. - * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place. - * { "silverstone", "RIC" ("ricciardo"), 15905}, - * { "monza", "RIC" ("ricciardo"), 12154}, - * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place. - * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place. - * { "monza", "LEC" ("leclerc"), 12201}, - * { "monza", "PER" ("perez"), 12203} - * ] - * - * A grouped percent rank aggregation scan with: - * groupby column : venue - * input orderby column: time - * Produces the following percent rank column: - * { 0.00, 0.25, 0.50, 0.50, 1.00, 0.00, 0.25, 0.25, 0.75, 1.00 } - * - * (This corresponds to the following grouping and `driver` rows:) - * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } - * <----------silverstone----------->|<-------------monza--------------> - * @endcode + * @param method The ranking method used for tie breaking (same values). + * @param column_order The desired sort order for ranking + * @param null_handling flag to include nulls during ranking. If nulls are not included, + * the corresponding rank will be null. 
+ * @param null_precedence The desired order of null compared to other elements for column
+ * @param percentage enum to denote the type of conversion of ranks to percentage in range (0,1]
 */
 template
-std::unique_ptr make_percent_rank_aggregation();
+std::unique_ptr make_rank_aggregation(rank_method method,
+ order column_order = order::ASCENDING,
+ null_policy null_handling = null_policy::EXCLUDE,
+ null_order null_precedence = null_order::AFTER,
+ rank_percentage percentage = rank_percentage::NONE);

 /**
 * @brief Factory to create a COLLECT_LIST aggregation
diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index eb98e0e0bee..96c99e054a5 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -21,6 +21,8 @@
 #include
 #include
+#include
+
 namespace cudf {
 namespace ast {
@@ -53,7 +55,7 @@ struct expression {
 /**
 * @brief Enum of supported operators.
 */
-enum class ast_operator {
+enum class ast_operator : int32_t {
 // Binary operators
 ADD, ///< operator +
 SUB, ///< operator -
diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 2e559afef4f..8f1ad7da9b6 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -17,7 +17,10 @@
 #pragma once

 #include
+#include
 #include
+#include
+#include

 #include
 #include
@@ -939,5 +942,155 @@ std::unique_ptr sample(
 int64_t const seed = 0,
 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

+/**
+ * @brief Checks if a column or its descendants have non-empty null rows
+ *
+ * @note This function is exact. If it returns `true`, there exists one or more
+ * non-empty null elements.
+ *
+ * A LIST or STRING column might have non-empty rows that are marked as null.
+ * A STRUCT or LIST column might have child columns that have non-empty null rows.
+ * Other types of columns are deemed incapable of having non-empty null rows.
+ * E.g. Fixed width columns have no concept of an "empty" row.
+ *
+ * @param input The column which is (and whose descendants are) to be checked for
+ * non-empty null rows.
+ * @return true If either the column or its descendants have non-empty null rows.
+ * @return false If neither the column nor its descendants have non-empty null rows.
+ */
+bool has_nonempty_nulls(column_view const& input);
+
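The exact check above pairs naturally with the approximate check declared just below it; a sketch of the intended call pattern (the `needs_purge` helper is hypothetical):

```c++
#include <cudf/column/column_view.hpp>
#include <cudf/copying.hpp>

// Gate the exact check behind the cheap approximate one: a `false` from
// may_have_nonempty_nulls() is definitive (no false negatives), so only
// the "maybe" path pays for the exact verification.
bool needs_purge(cudf::column_view const& col)
{
  if (!cudf::may_have_nonempty_nulls(col)) { return false; }
  return cudf::has_nonempty_nulls(col);
}
```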
+/**
+ * @brief Approximates if a column or its descendants *may* have non-empty null elements
+ *
+ * @note This function is approximate.
+ * - `true`: Non-empty null elements could exist
+ * - `false`: Non-empty null elements definitely do not exist
+ *
+ * False positives are possible, but false negatives are not.
+ *
+ * Compared to the exact `has_nonempty_nulls()` function, this function is typically
+ * more efficient.
+ *
+ * Complexity:
+ * - Best case: `O(count_descendants(input))`
+ * - Worst case: `O(count_descendants(input)) * m`, where `m` is the number of rows in the largest
+ * descendant
+ *
+ * @param input The column which is (and whose descendants are) to be checked for
+ * non-empty null rows
+ * @return true If either the column or its descendants may have non-empty null rows
+ * @return false If neither the column nor its descendants have non-empty null rows
+ */
+bool may_have_nonempty_nulls(column_view const& input);
+
+/**
+ * @brief Copies `input`, purging any non-empty null rows in the column or its descendants
+ *
+ * LIST columns may have non-empty null rows.
+ * For example:
+ * @code{.pseudo}
+ *
+ * auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} }.release();
+ * cudf::detail::set_null_mask(lists->null_mask(), 1, 2, false);
+ *
+ * lists[1] is now null, but the lists child column still stores `{2,3}`.
+ * The lists column contents will be:
+ * Validity: 101
+ * Offsets: [0, 2, 4, 6]
+ * Child: [0, 1, 2, 3, 4, 5]
+ *
+ * After purging the contents of the list's null rows, the column's contents
+ * will be:
+ * Validity: 101
+ * Offsets: [0, 2, 2, 4]
+ * Child: [0, 1, 4, 5]
+ * @endcode
+ *
+ * The purge operation only applies directly to LIST and STRING columns, but it
+ * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns
+ * may have child/descendant columns that are LIST or STRING.
+ *
+ * @param input The column whose null rows are to be checked and purged
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return std::unique_ptr Column with equivalent contents to `input`, but with
+ * the contents of null rows purged
+ */
+std::unique_ptr purge_nonempty_nulls(
+ lists_column_view const& input,
+ rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Copies `input`, purging any non-empty null rows in the column or its descendants
+ *
+ * STRING columns may have non-empty null rows.
+ * For example:
+ * @code{.pseudo}
+ *
+ * auto const strings = strings_column_wrapper{ "AB", "CD", "EF" }.release();
+ * cudf::detail::set_null_mask(strings->null_mask(), 1, 2, false);
+ *
+ * strings[1] is now null, but the strings column still stores `"CD"`.
+ * The strings column contents will be:
+ * Validity: 101
+ * Offsets: [0, 2, 4, 6]
+ * Child: [A, B, C, D, E, F]
+ *
+ * After purging the contents of the strings column's null rows, the column's contents
+ * will be:
+ * Validity: 101
+ * Offsets: [0, 2, 2, 4]
+ * Child: [A, B, E, F]
+ * @endcode
+ *
+ * The purge operation only applies directly to LIST and STRING columns, but it
+ * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns
+ * may have child/descendant columns that are LIST or STRING.
+ *
+ * @param input The column whose null rows are to be checked and purged
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return std::unique_ptr Column with equivalent contents to `input`, but with
+ * the contents of null rows purged
+ */
+std::unique_ptr purge_nonempty_nulls(
+ strings_column_view const& input,
+ rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
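As a concrete companion to these declarations, a sketch of purging a lists column, built with the `cudf_test` wrapper utilities used in the doc comments. This is illustrative only; a real test would also refresh the column's cached null count after mutating the mask directly:

```c++
#include <cudf/copying.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/null_mask.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <memory>

// Reproduce the doc-comment scenario: build {{0,1}, {2,3}, {4,5}}, null out
// row 1 (leaving its child data {2,3} in place), then purge the stale bytes.
std::unique_ptr<cudf::column> purge_sketch()
{
  auto lists =
    cudf::test::lists_column_wrapper<int32_t>{{0, 1}, {2, 3}, {4, 5}}.release();
  // Mark row 1 invalid; the child column still holds the now-orphaned {2,3}.
  cudf::set_null_mask(lists->mutable_view().null_mask(), 1, 2, false);

  // Returns a copy whose offsets and child data skip the null row's contents.
  return cudf::purge_nonempty_nulls(cudf::lists_column_view{lists->view()});
}
```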
+/**
+ * @brief Copies `input`, purging any non-empty null rows in the column or its descendants
+ *
+ * STRUCT columns may have null rows, with non-empty child rows.
+ * For example:
+ * @code{.pseudo}
+ *
+ * auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} };
+ * auto const structs = structs_column_wrapper{ {lists}, null_at(1) };
+ *
+ * structs[1].child is now null, but the lists column still stores `{2,3}`.
+ * The lists column contents will be:
+ * Validity: 101
+ * Offsets: [0, 2, 4, 6]
+ * Child: [0, 1, 2, 3, 4, 5]
+ *
+ * After purging the contents of the list's null rows, the column's contents
+ * will be:
+ * Validity: 101
+ * Offsets: [0, 2, 2, 4]
+ * Child: [0, 1, 4, 5]
+ * @endcode
+ *
+ * The purge operation only applies directly to LIST and STRING columns, but it
+ * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns
+ * may have child/descendant columns that are LIST or STRING.
+ *
+ * @param input The column whose null rows are to be checked and purged
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return std::unique_ptr Column with equivalent contents to `input`, but with
+ * the contents of null rows purged
+ */
+std::unique_ptr purge_nonempty_nulls(
+ structs_column_view const& input,
+ rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */
 } // namespace cudf
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 886151fb9d6..8ca49dd7d5f 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -75,10 +75,6 @@ class simple_aggregations_collector { // Declares the interface for the simple
 class row_number_aggregation const& agg);
 virtual std::vector> visit(data_type col_type,
 class rank_aggregation const& agg);
- virtual std::vector> visit(data_type col_type,
- class dense_rank_aggregation const& agg);
- virtual std::vector> visit(
- data_type col_type, class percent_rank_aggregation const& agg);
 virtual std::vector> visit(
 data_type col_type, class collect_list_aggregation const& agg);
 virtual std::vector> visit(data_type col_type,
@@ -127,8 +123,6 @@ class aggregation_finalizer { // Declares the interface for the finalizer
 virtual void visit(class nth_element_aggregation const& agg);
 virtual void visit(class row_number_aggregation const& agg);
 virtual void visit(class rank_aggregation const& agg);
- virtual void visit(class dense_rank_aggregation const& agg);
- virtual void visit(class percent_rank_aggregation const& agg);
 virtual void visit(class collect_list_aggregation const& agg);
 virtual void visit(class collect_set_aggregation const& agg);
 virtual void visit(class lead_lag_aggregation const& agg);
@@ -642,32 +636,42 @@ class rank_aggregation final : public rolling_aggregation,
 public groupby_scan_aggregation,
 public scan_aggregation {
 public:
- rank_aggregation() : aggregation{RANK} {}
-
- [[nodiscard]] std::unique_ptr clone() const override
+ rank_aggregation(rank_method method,
+ order column_order,
+ null_policy null_handling,
+ null_order null_precedence,
+ rank_percentage percentage)
+ : aggregation{RANK},
+ _method{method},
+ _column_order{column_order},
+ _null_handling{null_handling},
+ _null_precedence{null_precedence},
+ _percentage(percentage)
 {
- return std::make_unique(*this);
 }
- std::vector> get_simple_aggregations(
- data_type col_type, simple_aggregations_collector& collector) const override
+ rank_method const _method; ///< rank method
+ order const _column_order; ///< order of the column to rank
+ null_policy const _null_handling; ///< include or
exclude nulls in ranks
+ null_order const _null_precedence; ///< order of nulls in ranks
+ rank_percentage const _percentage; ///< whether to return percentage ranks
+
+ [[nodiscard]] bool is_equal(aggregation const& _other) const override
 {
- return collector.visit(col_type, *this);
+ if (!this->aggregation::is_equal(_other)) { return false; }
+ auto const& other = dynamic_cast(_other);
+ return _method == other._method and _null_handling == other._null_handling and
+ _column_order == other._column_order and _null_precedence == other._null_precedence and
+ _percentage == other._percentage;
 }
- void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
-};
-/**
- * @brief Derived class for specifying a dense rank aggregation
- */
-class dense_rank_aggregation final : public rolling_aggregation,
- public groupby_scan_aggregation,
- public scan_aggregation {
- public:
- dense_rank_aggregation() : aggregation{DENSE_RANK} {}
+ [[nodiscard]] size_t do_hash() const override
+ {
+ return this->aggregation::do_hash() ^ hash_impl();
+ }
 [[nodiscard]] std::unique_ptr clone() const override
 {
- return std::make_unique(*this);
+ return std::make_unique(*this);
 }
 std::vector> get_simple_aggregations(
 data_type col_type, simple_aggregations_collector& collector) const override
@@ -675,24 +679,16 @@ class dense_rank_aggregation final : public rolling_aggregation,
 return collector.visit(col_type, *this);
 }
 void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
-};
-
-class percent_rank_aggregation final : public rolling_aggregation,
- public groupby_scan_aggregation,
- public scan_aggregation {
- public:
- percent_rank_aggregation() : aggregation{PERCENT_RANK} {}
- [[nodiscard]] std::unique_ptr clone() const override
- {
- return std::make_unique(*this);
- }
- std::vector> get_simple_aggregations(
- data_type col_type, simple_aggregations_collector& collector) const override
+ private:
+ [[nodiscard]] size_t hash_impl() const
 {
- return collector.visit(col_type, *this);
+ return std::hash{}(static_cast(_method)) ^
+ std::hash{}(static_cast(_column_order)) ^
+ std::hash{}(static_cast(_null_handling)) ^
+ std::hash{}(static_cast(_null_precedence)) ^
+ std::hash{}(static_cast(_percentage));
 }
- void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
};

/**
@@ -1278,19 +1274,7 @@ struct target_type_impl {
// Always use size_type accumulator for RANK
template
struct target_type_impl {
- using type = size_type;
-};
-
-// Always use size_type accumulator for DENSE_RANK
-template
-struct target_type_impl {
- using type = size_type;
-};
-
-// Always use double for PERCENT_RANK
-template
-struct target_type_impl {
- using type = double;
+ using type = size_type; // double when the rank method is AVERAGE or percentage != NONE.
};

// Always use list for COLLECT_LIST
@@ -1453,10 +1437,6 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind
 return f.template operator()(std::forward(args)...);
 case aggregation::RANK:
 return f.template operator()(std::forward(args)...);
- case aggregation::DENSE_RANK:
- return f.template operator()(std::forward(args)...);
- case aggregation::PERCENT_RANK:
- return f.template operator()(std::forward(args)...);
 case aggregation::COLLECT_LIST:
 return f.template operator()(std::forward(args)...);
 case aggregation::COLLECT_SET:
diff --git a/cpp/include/cudf/detail/copy.cuh b/cpp/include/cudf/detail/copy.cuh new file mode 100644 index 00000000000..773bce7131f --- /dev/null +++ b/cpp/include/cudf/detail/copy.cuh @@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+namespace cudf::detail {
+
+/**
+ * @copydoc cudf::purge_nonempty_nulls(structs_column_view const&, rmm::mr::device_memory_resource*)
+ *
+ * @tparam ColumnViewT View type (lists_column_view, strings_column_view, or structs_column_view)
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+template
+std::unique_ptr purge_nonempty_nulls(ColumnViewT const& input,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr)
+{
+ // Implement via identity gather.
+ auto const input_column = input.parent();
+ auto const gather_begin = thrust::counting_iterator(0);
+ auto const gather_end = gather_begin + input_column.size();
+
+ auto gathered_table = cudf::detail::gather(table_view{{input_column}},
+ gather_begin,
+ gather_end,
+ out_of_bounds_policy::DONT_CHECK,
+ stream,
+ mr);
+ return std::move(gathered_table->release()[0]);
+}
+
+} // namespace cudf::detail
diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index 50157d16876..abd14fbda89 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -299,5 +299,22 @@ std::unique_ptr get_element(
 size_type index,
 rmm::cuda_stream_view stream = rmm::cuda_stream_default,
 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @copydoc cudf::has_nonempty_nulls
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+bool has_nonempty_nulls(column_view const& input,
+ rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+
+/**
+ * @copydoc cudf::may_have_nonempty_nulls
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */ +bool may_have_nonempty_nulls(column_view const& input, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp new file mode 100644 index 00000000000..12e4aaa03fd --- /dev/null +++ b/cpp/include/cudf/detail/join.hpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include + +// Forward declaration +template +class default_allocator; + +namespace cudf { +namespace detail { + +constexpr int DEFAULT_JOIN_CG_SIZE = 2; + +enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; + +/** + * @brief Hash join that builds hash table in creation and probes results in subsequent `*_join` + * member functions. + * + * User-defined hash function can be passed via the template parameter `Hasher` + * + * @tparam Hasher Unary callable type + */ +template +struct hash_join { + public: + using map_type = + cuco::static_multimap>, + cuco::double_hashing>; + + hash_join() = delete; + ~hash_join() = default; + hash_join(hash_join const&) = delete; + hash_join(hash_join&&) = delete; + hash_join& operator=(hash_join const&) = delete; + hash_join& operator=(hash_join&&) = delete; + + private: + bool const _is_empty; ///< true if `_hash_table` is empty + cudf::null_equality const _nulls_equal; ///< whether to consider nulls as equal + cudf::table_view _build; ///< input table to build the hash map + cudf::structs::detail::flattened_table + _flattened_build_table; ///< flattened data structures for `_build` + map_type _hash_table; ///< hash table built on `_build` + + public: + /** + * @brief Constructor that internally builds the hash table based on the given `build` table. + * + * @throw cudf::logic_error if the number of columns in `build` table is 0. + * @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE. + * + * @param build The build table, from which the hash table is built. + * @param compare_nulls Controls whether null join-key values should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */
+ hash_join(cudf::table_view const& build,
+ cudf::null_equality compare_nulls,
+ rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+
+ /**
+ * @copydoc cudf::hash_join::inner_join
+ */
+ std::pair>,
+ std::unique_ptr>>
+ inner_join(cudf::table_view const& probe,
+ std::optional output_size,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr) const;
+
+ /**
+ * @copydoc cudf::hash_join::left_join
+ */
+ std::pair>,
+ std::unique_ptr>>
+ left_join(cudf::table_view const& probe,
+ std::optional output_size,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr) const;
+
+ /**
+ * @copydoc cudf::hash_join::full_join
+ */
+ std::pair>,
+ std::unique_ptr>>
+ full_join(cudf::table_view const& probe,
+ std::optional output_size,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr) const;
+
+ /**
+ * @copydoc cudf::hash_join::inner_join_size
+ */
+ [[nodiscard]] std::size_t inner_join_size(cudf::table_view const& probe,
+ rmm::cuda_stream_view stream) const;
+
+ /**
+ * @copydoc cudf::hash_join::left_join_size
+ */
+ [[nodiscard]] std::size_t left_join_size(cudf::table_view const& probe,
+ rmm::cuda_stream_view stream) const;
+
+ /**
+ * @copydoc cudf::hash_join::full_join_size
+ */
+ std::size_t full_join_size(cudf::table_view const& probe,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr) const;
+
+ private:
+ /**
+ * @brief Probes the `_hash_table` built from `_build` for tuples in `probe_table`,
+ * and returns the output indices of `build_table` and `probe_table` as a combined table,
+ * i.e., if a full join is specified as the join type, then a left join is performed. Behavior
+ * is undefined if the provided `output_size` is smaller than the actual output size.
+ *
+ * @throw cudf::logic_error if build table is empty and `JoinKind == INNER_JOIN`.
+ *
+ * @tparam JoinKind The type of join to be performed.
+ *
+ * @param probe_table Table of probe side columns to join.
+ * @param output_size Optional value which allows users to specify the exact output size.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned vectors.
+ *
+ * @return Join output indices vector pair.
+ */
+ template
+ std::pair>,
+ std::unique_ptr>>
+ probe_join_indices(cudf::table_view const& probe_table,
+ std::optional output_size,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr) const;
+
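For orientation, the public `cudf::hash_join` wrapper that this detail class backs is typically driven as in the following sketch; building once and probing repeatedly is the point of the class. Treat the defaulted parameters and the ordering of the returned index vectors as illustrative:

```c++
#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>

// Build the hash table once over `build`, then probe it with several tables.
void probe_twice(cudf::table_view const& build,
                 cudf::table_view const& probe_a,
                 cudf::table_view const& probe_b)
{
  cudf::hash_join joiner(build, cudf::null_equality::EQUAL);

  // Each call yields a pair of device vectors of matching row indices,
  // one vector per side of the join; the build-side table is reused.
  auto [a_indices, a_build_indices] = joiner.inner_join(probe_a);
  auto [b_indices, b_build_indices] = joiner.inner_join(probe_b);
}
```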
+ /**
+ * @copydoc cudf::detail::hash_join::probe_join_indices
+ *
+ * @throw cudf::logic_error if probe table is empty.
+ * @throw cudf::logic_error if the size of probe table exceeds `MAX_JOIN_SIZE`.
+ * @throw cudf::logic_error if the number of columns in build table and probe table do not match.
+ * @throw cudf::logic_error if the column data types in build table and probe table do not match.
+ */
+ template
+ std::pair>,
+ std::unique_ptr>>
+ compute_hash_join(cudf::table_view const& probe,
+ std::optional output_size,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr) const;
+};
+} // namespace detail
+} // namespace cudf
diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 7aec56fdc51..6a6cdd43004 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -133,7 +133,7 @@ std::pair bitmask_binop(
 stream,
 mr);

- return std::make_pair(std::move(dest_mask), null_count);
+ return std::pair(std::move(dest_mask), null_count);
 }

 /**
diff --git a/cpp/include/cudf/detail/reduction_functions.hpp b/cpp/include/cudf/detail/reduction_functions.hpp index 3a6113e66ce..317e4d0cf47 100644 --- a/cpp/include/cudf/detail/reduction_functions.hpp +++ b/cpp/include/cudf/detail/reduction_functions.hpp @@ -17,9 +17,9 @@
 #pragma once

 #include
+#include
 #include

-#include "cudf/lists/lists_column_view.hpp"
 #include

 namespace cudf {
diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp index fc829617c2d..13dddd3b0c8 100644 --- a/cpp/include/cudf/detail/scan.hpp +++ b/cpp/include/cudf/detail/scan.hpp @@ -103,16 +103,17 @@ std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by,
 rmm::mr::device_memory_resource* mr);

 /**
- * @brief Generate row percent ranks for a column.
+ * @brief Generate row ONE_NORMALIZED percent ranks for a column.
+ * Also known as ANSI SQL PERCENT_RANK.
+ * Calculated as (rank - 1) / (count - 1).
 *
 * @param order_by Input column to generate ranks for.
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource used to allocate the returned column's device memory.
 * @return rank values.
 */
-std::unique_ptr inclusive_percent_rank_scan(column_view const& order_by,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr);
+std::unique_ptr inclusive_one_normalized_percent_rank_scan(
+ column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr);

 } // namespace detail
 } // namespace cudf
diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 09d94d10e79..2c5434b63d2 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -31,11 +31,21 @@
 #include
 #include

-using hash_value_type = uint32_t;
-
 namespace cudf {
 namespace detail {

+/**
+ * Normalization of floating point NaNs, passthrough for all other values.
+ */
+template
+T __device__ inline normalize_nans(T const& key)
+{
+ if constexpr (cudf::is_floating_point()) {
+ if (std::isnan(key)) { return std::numeric_limits::quiet_NaN(); }
+ }
+ return key;
+}
+
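Why hoist NaN handling into `normalize_nans`? Different NaN bit patterns must hash to the same value. A standalone sketch of the idea (not the cudf implementation; payload behavior of `std::nan` is platform-dependent):

```c++
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>

// Map every NaN bit pattern to the canonical quiet NaN so all NaNs hash
// identically; non-NaN values pass through untouched.
inline double normalize_nans_sketch(double key)
{
  return std::isnan(key) ? std::numeric_limits<double>::quiet_NaN() : key;
}

// Hashers consume raw bytes, so compare bit patterns rather than values.
inline std::uint64_t bits_of(double v)
{
  std::uint64_t b;
  std::memcpy(&b, &v, sizeof(b));
  return b;
}

int main()
{
  // On typical platforms these two NaNs carry different payload bits,
  // yet normalization maps both to the same bit pattern.
  double const a = std::nan("1");
  double const b = std::nan("2");
  bool const same_after = bits_of(normalize_nans_sketch(a)) ==
                          bits_of(normalize_nans_sketch(b));
  return same_after ? 0 : 1;
}
```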
 /**
 * Normalization of floating point NaNs and zeros, passthrough for all other values.
 */
@@ -43,13 +53,9 @@ template
T __device__ inline normalize_nans_and_zeros(T const& key)
{
 if constexpr (cudf::is_floating_point()) {
- if (std::isnan(key)) {
- return std::numeric_limits::quiet_NaN();
- } else if (key == T{0.0}) {
- return T{0.0};
- }
+ if (key == T{0.0}) { return T{0.0}; }
 }
- return key;
+ return normalize_nans(key);
}

__device__ inline uint32_t rotate_bits_left(uint32_t x, uint32_t r)
@@ -176,9 +182,6 @@ void __device__ inline uint32ToLowercaseHexString(uint32_t num, char* destinatio
 std::memcpy(destination, reinterpret_cast(&x), 8);
}

-} // namespace detail
-} // namespace cudf
-
// MurmurHash3_32 implementation from
// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
//-----------------------------------------------------------------------------
@@ -192,7 +195,7 @@ template
struct MurmurHash3_32 {
 using result_type = hash_value_type;

- MurmurHash3_32() = default;
+ constexpr MurmurHash3_32() = default;
 constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {}

 [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const
@@ -214,24 +217,9 @@
 return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24);
 }

- // TODO Do we need this operator() and/or compute? Probably not both.
 [[nodiscard]] result_type __device__ inline operator()(Key const& key) const
 {
- return compute(key);
- }
-
- // compute wrapper for floating point types
- template >* = nullptr>
- hash_value_type __device__ inline compute_floating_point(T const& key) const
- {
- if (key == T{0.0}) {
- return compute(T{0.0});
- } else if (std::isnan(key)) {
- T nan = std::numeric_limits::quiet_NaN();
- return compute(nan);
- } else {
- return compute(key);
- }
+ return compute(detail::normalize_nans_and_zeros(key));
 }

 template
@@ -240,17 +228,32 @@
 return compute_bytes(reinterpret_cast(&key), sizeof(T));
 }

+ result_type __device__ inline compute_remaining_bytes(std::byte const* data,
+ cudf::size_type len,
+ cudf::size_type tail_offset,
+ result_type h) const
+ {
+ // Process remaining bytes that do not fill a four-byte chunk.
+ uint32_t k1 = 0;
+ switch (len % 4) {
+ case 3: k1 ^= std::to_integer(data[tail_offset + 2]) << 16; [[fallthrough]];
+ case 2: k1 ^= std::to_integer(data[tail_offset + 1]) << 8; [[fallthrough]];
+ case 1:
+ k1 ^= std::to_integer(data[tail_offset]);
+ k1 *= c1;
+ k1 = cudf::detail::rotate_bits_left(k1, rot_c1);
+ k1 *= c2;
+ h ^= k1;
+ };
+ return h;
+ }
+
 result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const
 {
 constexpr cudf::size_type BLOCK_SIZE = 4;
 cudf::size_type const nblocks = len / BLOCK_SIZE;
 cudf::size_type const tail_offset = nblocks * BLOCK_SIZE;
- result_type h1 = m_seed;
- constexpr uint32_t c1 = 0xcc9e2d51;
- constexpr uint32_t c2 = 0x1b873593;
- constexpr uint32_t c3 = 0xe6546b64;
- constexpr uint32_t rot_c1 = 15;
- constexpr uint32_t rot_c2 = 13;
+ result_type h = m_seed;

 // Process all four-byte chunks.
 for (cudf::size_type i = 0; i < nblocks; i++) {
@@ -258,50 +261,44 @@
 k1 *= c1;
 k1 = cudf::detail::rotate_bits_left(k1, rot_c1);
 k1 *= c2;
- h1 ^= k1;
- h1 = cudf::detail::rotate_bits_left(h1, rot_c2);
- h1 = h1 * 5 + c3;
+ h ^= k1;
+ h = cudf::detail::rotate_bits_left(h, rot_c2);
+ h = h * 5 + c3;
 }

- // Process remaining bytes that do not fill a four-byte chunk.
-    uint32_t k1 = 0;
-    switch (len % 4) {
-      case 3: k1 ^= std::to_integer<uint8_t>(data[tail_offset + 2]) << 16;
-      case 2: k1 ^= std::to_integer<uint8_t>(data[tail_offset + 1]) << 8;
-      case 1:
-        k1 ^= std::to_integer<uint8_t>(data[tail_offset]);
-        k1 *= c1;
-        k1 = cudf::detail::rotate_bits_left(k1, rot_c1);
-        k1 *= c2;
-        h1 ^= k1;
-    };
+    h = compute_remaining_bytes(data, len, tail_offset, h);

     // Finalize hash.
-    h1 ^= len;
-    h1 = fmix32(h1);
-    return h1;
+    h ^= len;
+    h = fmix32(h);
+    return h;
   }

  private:
   uint32_t m_seed{cudf::DEFAULT_HASH_SEED};
+  static constexpr uint32_t c1     = 0xcc9e2d51;
+  static constexpr uint32_t c2     = 0x1b873593;
+  static constexpr uint32_t c3     = 0xe6546b64;
+  static constexpr uint32_t rot_c1 = 15;
+  static constexpr uint32_t rot_c2 = 13;
 };

 template <>
 hash_value_type __device__ inline MurmurHash3_32<bool>::operator()(bool const& key) const
 {
-  return this->compute(static_cast<uint8_t>(key));
+  return compute(static_cast<uint8_t>(key));
 }

 template <>
 hash_value_type __device__ inline MurmurHash3_32<float>::operator()(float const& key) const
 {
-  return this->compute_floating_point(key);
+  return compute(detail::normalize_nans_and_zeros(key));
 }

 template <>
 hash_value_type __device__ inline MurmurHash3_32<double>::operator()(double const& key) const
 {
-  return this->compute_floating_point(key);
+  return compute(detail::normalize_nans_and_zeros(key));
 }

 template <>
@@ -310,28 +307,28 @@ hash_value_type __device__ inline MurmurHash3_32<cudf::string_view>::operator()(
 {
   auto const data = reinterpret_cast<std::byte const*>(key.data());
   auto const len  = key.size_bytes();
-  return this->compute_bytes(data, len);
+  return compute_bytes(data, len);
 }

 template <>
 hash_value_type __device__ inline MurmurHash3_32<numeric::decimal32>::operator()(
   numeric::decimal32 const& key) const
 {
-  return this->compute(key.value());
+  return compute(key.value());
 }

 template <>
 hash_value_type __device__ inline MurmurHash3_32<numeric::decimal64>::operator()(
   numeric::decimal64 const& key) const
 {
-  return this->compute(key.value());
+  return compute(key.value());
 }

 template <>
 hash_value_type __device__ inline MurmurHash3_32<numeric::decimal128>::operator()(
   numeric::decimal128 const& key) const
 {
-  return this->compute(key.value());
+  return compute(key.value());
 }

 template <>
@@ -352,10 +349,10 @@ template <typename Key>
 struct SparkMurmurHash3_32 {
   using result_type = hash_value_type;

-  SparkMurmurHash3_32() = default;
+  constexpr SparkMurmurHash3_32() = default;
   constexpr SparkMurmurHash3_32(uint32_t seed) : m_seed(seed) {}

-  __device__ inline uint32_t fmix32(uint32_t h) const
+  [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const
   {
     h ^= h >> 16;
     h *= 0x85ebca6b;
@@ -365,18 +362,18 @@ struct SparkMurmurHash3_32 {
     return h;
   }

-  result_type __device__ inline operator()(Key const& key) const { return compute(key); }
+  [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data,
+                                                      cudf::size_type offset) const
+  {
+    // Read a 4-byte value from the data pointer as individual bytes for safe
+    // unaligned access (very likely for string types).
+    auto block = reinterpret_cast<uint8_t const*>(data + offset);
+    return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24);
+  }

-  // compute wrapper for floating point types
-  template <typename T, std::enable_if_t<std::is_floating_point_v<T>>* = nullptr>
-  hash_value_type __device__ inline compute_floating_point(T const& key) const
+  [[nodiscard]] result_type __device__ inline operator()(Key const& key) const
   {
-    if (std::isnan(key)) {
-      T nan = std::numeric_limits<T>::quiet_NaN();
-      return compute(nan);
-    } else {
-      return compute(key);
-    }
+    return compute(key);
   }

   template <typename T>
@@ -385,24 +382,35 @@ struct SparkMurmurHash3_32 {
     return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(T));
   }

-  [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data,
-                                                      cudf::size_type offset) const
+  result_type __device__ inline compute_remaining_bytes(std::byte const* data,
+                                                        cudf::size_type len,
+                                                        cudf::size_type tail_offset,
+                                                        result_type h) const
   {
-    // Individual byte reads for unaligned accesses (very likely for strings)
-    auto block = reinterpret_cast<uint8_t const*>(data + offset);
-    return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24);
+    // Process remaining bytes that do not fill a four-byte chunk using Spark's approach
+    // (does not conform to normal MurmurHash3).
+    for (auto i = tail_offset; i < len; i++) {
+      // We require a two-step cast to get the k1 value from the byte. First,
+      // we must cast to a signed int8_t. Then, the sign bit is preserved when
+      // casting to uint32_t under 2's complement. Java preserves the sign when
+      // casting byte-to-int, but C++ does not.
+      uint32_t k1 = static_cast<uint32_t>(std::to_integer<int8_t>(data[i]));
+      k1 *= c1;
+      k1 = cudf::detail::rotate_bits_left(k1, rot_c1);
+      k1 *= c2;
+      h ^= k1;
+      h = cudf::detail::rotate_bits_left(h, rot_c2);
+      h = h * 5 + c3;
+    }
+    return h;
   }

   result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const
   {
     constexpr cudf::size_type BLOCK_SIZE = 4;
     cudf::size_type const nblocks        = len / BLOCK_SIZE;
-    result_type h1                       = m_seed;
-    constexpr uint32_t c1                = 0xcc9e2d51;
-    constexpr uint32_t c2                = 0x1b873593;
-    constexpr uint32_t c3                = 0xe6546b64;
-    constexpr uint32_t rot_c1            = 15;
-    constexpr uint32_t rot_c2            = 13;
+    cudf::size_type const tail_offset    = nblocks * BLOCK_SIZE;
+    result_type h                        = m_seed;

     // Process all four-byte chunks.
     for (cudf::size_type i = 0; i < nblocks; i++) {
@@ -410,78 +418,69 @@ struct SparkMurmurHash3_32 {
       k1 *= c1;
       k1 = cudf::detail::rotate_bits_left(k1, rot_c1);
       k1 *= c2;
-      h1 ^= k1;
-      h1 = cudf::detail::rotate_bits_left(h1, rot_c2);
-      h1 = h1 * 5 + c3;
+      h ^= k1;
+      h = cudf::detail::rotate_bits_left(h, rot_c2);
+      h = h * 5 + c3;
     }

-    // Process remaining bytes that do not fill a four-byte chunk using Spark's approach
-    // (does not conform to normal MurmurHash3).
-    for (cudf::size_type i = nblocks * 4; i < len; i++) {
-      // We require a two-step cast to get the k1 value from the byte. First,
-      // we must cast to a signed int8_t. Then, the sign bit is preserved when
-      // casting to uint32_t under 2's complement. Java preserves the
-      // signedness when casting byte-to-int, but C++ does not.
-      uint32_t k1 = static_cast<uint32_t>(std::to_integer<int8_t>(data[i]));
-      k1 *= c1;
-      k1 = cudf::detail::rotate_bits_left(k1, rot_c1);
-      k1 *= c2;
-      h1 ^= k1;
-      h1 = cudf::detail::rotate_bits_left(h1, rot_c2);
-      h1 = h1 * 5 + c3;
-    }
+    h = compute_remaining_bytes(data, len, tail_offset, h);

     // Finalize hash.
-    h1 ^= len;
-    h1 = fmix32(h1);
-    return h1;
+    h ^= len;
+    h = fmix32(h);
+    return h;
   }

  private:
   uint32_t m_seed{cudf::DEFAULT_HASH_SEED};
+  static constexpr uint32_t c1     = 0xcc9e2d51;
+  static constexpr uint32_t c2     = 0x1b873593;
+  static constexpr uint32_t c3     = 0xe6546b64;
+  static constexpr uint32_t rot_c1 = 15;
+  static constexpr uint32_t rot_c2 = 13;
 };

 template <>
 hash_value_type __device__ inline SparkMurmurHash3_32<bool>::operator()(bool const& key) const
 {
-  return this->compute(key);
+  return compute(key);
 }

 template <>
 hash_value_type __device__ inline SparkMurmurHash3_32<int8_t>::operator()(int8_t const& key) const
 {
-  return this->compute(key);
+  return compute(key);
 }

 template <>
 hash_value_type __device__ inline SparkMurmurHash3_32<uint8_t>::operator()(uint8_t const& key) const
 {
-  return this->compute(key);
+  return compute(key);
 }

 template <>
 hash_value_type __device__ inline SparkMurmurHash3_32<int16_t>::operator()(int16_t const& key) const
 {
-  return this->compute(key);
+  return compute(key);
 }

 template <>
 hash_value_type __device__ inline SparkMurmurHash3_32<uint16_t>::operator()(
   uint16_t const& key) const
 {
-  return this->compute(key);
+  return compute(key);
 }

 template <>
 hash_value_type __device__ inline SparkMurmurHash3_32<float>::operator()(float const& key) const
 {
-  return this->compute_floating_point(key);
+  return compute(detail::normalize_nans(key));
 }

 template <>
 hash_value_type __device__ inline SparkMurmurHash3_32<double>::operator()(double const& key) const
 {
-  return this->compute_floating_point(key);
+  return compute(detail::normalize_nans(key));
 }

 template <>
@@ -490,21 +489,21 @@ hash_value_type __device__ inline SparkMurmurHash3_32<cudf::string_view>::operat
 {
   auto const data = reinterpret_cast<std::byte const*>(key.data());
   auto const len  = key.size_bytes();
-  return this->compute_bytes(data, len);
+  return compute_bytes(data, len);
 }

 template <>
 hash_value_type __device__ inline SparkMurmurHash3_32<numeric::decimal32>::operator()(
   numeric::decimal32 const& key) const
 {
-  return this->compute(key.value());
+  return compute(key.value());
 }

 template <>
 hash_value_type __device__ inline SparkMurmurHash3_32<numeric::decimal64>::operator()(
   numeric::decimal64 const& key) const
 {
-  return this->compute(key.value());
+  return compute(key.value());
 }

 template <>
@@ -546,7 +545,7 @@ hash_value_type __device__ inline SparkMurmurHash3_32<numeric::decimal128>::oper
   __int128_t big_endian_value = 0;
   auto big_endian_data        = reinterpret_cast<std::byte*>(&big_endian_value);
   thrust::reverse_copy(thrust::seq, data, data + length, big_endian_data);
-  return this->compute_bytes(big_endian_data, length);
+  return compute_bytes(big_endian_data, length);
 }

 template <>
@@ -593,3 +592,6 @@ struct IdentityHash {

 template <typename Key>
 using default_hash = MurmurHash3_32<Key>;
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index aa4421bb4ed..f91f51b2161 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -110,7 +110,7 @@ std::pair<rmm::device_buffer, size_type> valid_if(
     null_count = size - valid_count.value(stream);
   }

-  return std::make_pair(std::move(null_mask), null_count);
+  return std::pair(std::move(null_mask), null_count);
 }

 /**
diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp
index e973c585410..bbff304e547 100644
--- a/cpp/include/cudf/hashing.hpp
+++ b/cpp/include/cudf/hashing.hpp
@@ -19,6 +19,9 @@
 #include

 namespace cudf {
+
+using hash_value_type = uint32_t;
+
 /**
  * @addtogroup column_hash
  * @{
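
The Spark-compatible hash above processes tail bytes one at a time with a sign-extending two-step cast, unlike standard MurmurHash3. A minimal host-side sketch (not part of this patch; plain C++17, all names illustrative) of why the two-step cast matters:

// why_two_step_cast.cpp -- standalone illustration, not libcudf code.
// Matching Java's byte-to-int conversion requires sign extension, which a
// direct byte -> uint32_t cast does not provide.
#include <cstdint>
#include <cstdio>

int main()
{
  unsigned char byte = 0x9C;  // Java reads this byte as -100

  // Direct cast: zero-extends to 0x0000009C.
  uint32_t zero_extended = static_cast<uint32_t>(byte);

  // Two-step cast: int8_t first, so the value is sign-extended to
  // 0xFFFFFF9C under two's complement, matching Java's behavior.
  uint32_t sign_extended = static_cast<uint32_t>(static_cast<int8_t>(byte));

  std::printf("zero-extended: 0x%08X\n", (unsigned)zero_extended);
  std::printf("sign-extended: 0x%08X\n", (unsigned)sign_extended);
  return 0;
}

Feeding a different k1 into the multiply/rotate mixing steps for any "negative" byte would change every subsequent hash value, which is why this detail is load-bearing for Spark compatibility.
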
diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 7e4ab5b8d9d..23ed0153f3f 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -244,6 +244,7 @@ class column_in_metadata {
   bool _use_int96_timestamp = false;
   // bool _output_as_binary = false;
   thrust::optional<uint8_t> _decimal_precision;
+  thrust::optional<int32_t> _parquet_field_id;
   std::vector<column_in_metadata> children;

  public:
@@ -324,6 +325,18 @@ class column_in_metadata {
     return *this;
   }

+  /**
+   * @brief Set the parquet field id of this column.
+   *
+   * @param field_id The parquet field id to set
+   * @return this for chaining
+   */
+  column_in_metadata& set_parquet_field_id(int32_t field_id)
+  {
+    _parquet_field_id = field_id;
+    return *this;
+  }
+
   /**
    * @brief Get reference to a child of this column
    *
@@ -379,6 +392,18 @@ class column_in_metadata {
    */
   [[nodiscard]] uint8_t get_decimal_precision() const { return _decimal_precision.value(); }

+  /**
+   * @brief Get whether parquet field id has been set for this column.
+   */
+  [[nodiscard]] bool is_parquet_field_id_set() const { return _parquet_field_id.has_value(); }
+
+  /**
+   * @brief Get the parquet field id that was set for this column.
+   * @throws If parquet field id was not set for this column.
+   *         Check using `is_parquet_field_id_set()` first.
+   */
+  [[nodiscard]] int32_t get_parquet_field_id() const { return _parquet_field_id.value(); }
+
   /**
    * @brief Get the number of children of this column
    */
diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp
index d56f8f0e904..f48f8a83e9a 100644
--- a/cpp/include/cudf/join.hpp
+++ b/cpp/include/cudf/join.hpp
@@ -17,6 +17,7 @@
 #pragma once

 #include
+#include
 #include
 #include
 #include
@@ -29,6 +30,16 @@
 #include

 namespace cudf {
+
+// forward declaration
+namespace detail {
+template <typename T>
+class MurmurHash3_32;
+
+template <typename T>
+class hash_join;
+}  // namespace detail
+
 /**
  * @addtogroup column_join
  * @{
@@ -503,6 +514,9 @@ std::unique_ptr<cudf::table> cross_join(
 */
 class hash_join {
  public:
+  using impl_type =
+    typename cudf::detail::hash_join<cudf::detail::MurmurHash3_32<cudf::hash_value_type>>;
+
   hash_join() = delete;
   ~hash_join();
   hash_join(hash_join const&) = delete;
@@ -634,8 +648,7 @@ class hash_join {
     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;

  private:
-  struct hash_join_impl;
-  const std::unique_ptr<const hash_join_impl> impl;
+  const std::unique_ptr<const impl_type> _impl;
 };

 /**
diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh
index c637ad041ba..7df36be2385 100644
--- a/cpp/include/cudf/lists/detail/gather.cuh
+++ b/cpp/include/cudf/lists/detail/gather.cuh
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -82,6 +83,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column,
   auto dst_offsets_c = cudf::make_fixed_width_column(
     data_type{type_id::INT32}, offset_count, mask_state::UNALLOCATED, stream, mr);
   mutable_column_view dst_offsets_v = dst_offsets_c->mutable_view();
+  auto const source_column_nullmask = source_column.null_mask();

   // generate the compacted outgoing offsets.
   auto count_iter = thrust::make_counting_iterator<int32_t>(0);
@@ -90,12 +92,23 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column,
     count_iter,
     count_iter + offset_count,
     dst_offsets_v.begin<int32_t>(),
-    [gather_map, output_count, src_offsets, src_size] __device__(int32_t index) -> int32_t {
+    [source_column_nullmask,
+     source_column_offset = source_column.offset(),
+     gather_map,
+     output_count,
+     src_offsets,
+     src_size] __device__(int32_t index) -> int32_t {
       int32_t offset_index = index < output_count ? gather_map[index] : 0;

       // if this is an invalid index, this will be a NULL list
       if (NullifyOutOfBounds && ((offset_index < 0) || (offset_index >= src_size))) { return 0; }

+      // If the source row is null, the output row size must be 0.
+      if (source_column_nullmask != nullptr &&
+          not cudf::bit_is_set(source_column_nullmask, source_column_offset + offset_index)) {
+        return 0;
+      }
+
       // the length of this list
       return src_offsets[offset_index + 1] - src_offsets[offset_index];
     },
@@ -110,15 +123,27 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column,
   // generate the base offsets
   rmm::device_uvector<int32_t> base_offsets = rmm::device_uvector<int32_t>(output_count, stream);
-  thrust::transform(rmm::exec_policy(stream),
-                    gather_map,
-                    gather_map + output_count,
-                    base_offsets.data(),
-                    [src_offsets, src_size, shift] __device__(int32_t index) {
-                      // if this is an invalid index, this will be a NULL list
-                      if (NullifyOutOfBounds && ((index < 0) || (index >= src_size))) { return 0; }
-                      return src_offsets[index] - shift;
-                    });
+  thrust::transform(
+    rmm::exec_policy(stream),
+    gather_map,
+    gather_map + output_count,
+    base_offsets.data(),
+    [source_column_nullmask,
+     source_column_offset = source_column.offset(),
+     src_offsets,
+     src_size,
+     shift] __device__(int32_t index) {
+      // if this is an invalid index, this will be a NULL list
+      if (NullifyOutOfBounds && ((index < 0) || (index >= src_size))) { return 0; }
+
+      // If the source row is null, the output row size must be 0.
+      if (source_column_nullmask != nullptr &&
+          not cudf::bit_is_set(source_column_nullmask, source_column_offset + index)) {
+        return 0;
+      }
+
+      return src_offsets[index] - shift;
+    });

   // Retrieve size of the resulting gather map for level N+1 (the last offset)
   size_type child_gather_map_size =
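
Both lambdas above add the same rule: a gathered list row contributes size 0 when its index is out of bounds or the source row is null. A host-side sketch of that sizing rule (illustrative only; the bitmask check stands in for cudf::bit_is_set, plain C++17):

// gather_null_sizes.cpp -- standalone illustration, not libcudf code.
#include <cstdint>
#include <cstdio>
#include <vector>

bool bit_is_set(std::vector<uint32_t> const& mask, int i)
{
  return (mask[i / 32] >> (i % 32)) & 1;
}

int main()
{
  std::vector<int> offsets     = {0, 2, 5, 8, 9};  // 4 source list rows
  std::vector<uint32_t> nulls  = {0b1011};         // row 2 is null (bit unset)
  std::vector<int> gather_map  = {3, 2, 0, 7};     // index 7 is out of bounds

  for (int out = 0; out < (int)gather_map.size(); ++out) {
    int const idx = gather_map[out];
    int size      = 0;
    // out-of-bounds and null rows both produce a zero-length output row
    if (idx >= 0 && idx < 4 && bit_is_set(nulls, idx)) {
      size = offsets[idx + 1] - offsets[idx];
    }
    std::printf("output row %d -> size %d\n", out, size);
  }
  return 0;
}

Note that source row 2 spans three bytes of offsets (5..8) but is null, so without the new check it would have leaked a non-zero size into the output.
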
diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp
index ff334b9ee85..b7e915650dc 100644
--- a/cpp/include/cudf/sorting.hpp
+++ b/cpp/include/cudf/sorting.hpp
@@ -16,6 +16,7 @@

 #pragma once

+#include
 #include
 #include

@@ -23,19 +24,6 @@
 namespace cudf {

-/**
- * @brief Tie-breaker method to use for ranking the column.
- *
- * @ingroup column_sort
- */
-enum class rank_method {
-  FIRST,    ///< stable sort order ranking (no ties)
-  AVERAGE,  ///< mean of first in the group
-  MIN,      ///< min of first in the group
-  MAX,      ///< max of first in the group
-  DENSE     ///< rank always increases by 1 between groups
-};
-
 /**
  * @addtogroup column_sort
  * @{
@@ -198,7 +186,7 @@ std::unique_ptr<column> stable_sort_by_key(
 *            included, corresponding rank will be null.
 * @param null_precedence The desired order of null compared to other elements
 *            for column
- * @param percentage flag to convert ranks to percentage in range (0,1}
+ * @param percentage flag to convert ranks to percentage in range (0,1]
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return std::unique_ptr<column> A column of containing the rank of the each
 * element of the column of `input`. The output column type will be `size_type`
diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index 1b10c70d6d6..d46ab3a91a1 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -303,14 +303,17 @@ std::unique_ptr<cudf::column> gather(
     data_type{type_id::INT32}, output_count + 1, mask_state::UNALLOCATED, stream, mr);
   auto const d_out_offsets = out_offsets_column->mutable_view().template data<int32_t>();
   auto const d_in_offsets  = (strings_count > 0) ? strings.offsets_begin() : nullptr;
-  thrust::transform(rmm::exec_policy(stream),
-                    begin,
-                    end,
-                    d_out_offsets,
-                    [d_in_offsets, strings_count] __device__(size_type in_idx) {
-                      if (NullifyOutOfBounds && (in_idx < 0 || in_idx >= strings_count)) return 0;
-                      return d_in_offsets[in_idx + 1] - d_in_offsets[in_idx];
-                    });
+  auto const d_strings = column_device_view::create(strings.parent(), stream);
+  thrust::transform(
+    rmm::exec_policy(stream),
+    begin,
+    end,
+    d_out_offsets,
+    [d_strings = *d_strings, d_in_offsets, strings_count] __device__(size_type in_idx) {
+      if (NullifyOutOfBounds && (in_idx < 0 || in_idx >= strings_count)) return 0;
+      if (not d_strings.is_valid(in_idx)) return 0;
+      return d_in_offsets[in_idx + 1] - d_in_offsets[in_idx];
+    });

   // check total size is not too large
   size_t const total_bytes = thrust::transform_reduce(
@@ -329,7 +332,6 @@ std::unique_ptr<cudf::column> gather(

   // build chars column
   cudf::device_span<int32_t const> const d_out_offsets_span(d_out_offsets, output_count + 1);
-  auto const d_strings  = column_device_view::create(strings.parent(), stream);
   auto out_chars_column = gather_chars(d_strings->begin<string_view>(),
                                        begin,
                                        end,
diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh
index b6aa22cc316..cfede60c771 100644
--- a/cpp/include/cudf/strings/detail/scatter.cuh
+++ b/cpp/include/cudf/strings/detail/scatter.cuh
@@ -15,14 +15,13 @@
 */
 #pragma once

-#include
-#include
-#include
-#include
+#include
 #include
 #include
+#include
 #include
+#include
 #include
 #include
@@ -68,20 +67,17 @@ std::unique_ptr<column> scatter(
   // create vector of string_view's to scatter into
   rmm::device_uvector<string_view> target_vector = create_string_vector_from_column(target, stream);

-  // do the scatter
-  thrust::scatter(rmm::exec_policy(stream), begin, end, scatter_map, target_vector.begin());
+  // this ensures empty strings are not mapped to nulls in the make_strings_column function
+  auto const size = thrust::distance(begin, end);
+  auto itr        = thrust::make_transform_iterator(
+    begin, [] __device__(string_view const sv) { return sv.empty() ? string_view{} : sv; });

-  // build offsets column
-  auto offsets_column = child_offsets_from_string_vector(target_vector, stream, mr);
-  // build chars column
-  auto chars_column =
-    child_chars_from_string_vector(target_vector, offsets_column->view(), stream, mr);
+  // do the scatter
+  thrust::scatter(rmm::exec_policy(stream), itr, itr + size, scatter_map, target_vector.begin());

-  return make_strings_column(target.size(),
-                             std::move(offsets_column),
-                             std::move(chars_column),
-                             UNKNOWN_NULL_COUNT,
-                             cudf::detail::copy_bitmask(target.parent(), stream, mr));
+  // build the output column
+  auto sv_span = cudf::device_span<string_view const>(target_vector);
+  return make_strings_column(sv_span, string_view{nullptr, 0}, stream, mr);
 }

 }  // namespace detail
diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh
index b9ea2d9ecff..e6dba5147b5 100644
--- a/cpp/include/cudf/strings/detail/utilities.cuh
+++ b/cpp/include/cudf/strings/detail/utilities.cuh
@@ -71,28 +71,6 @@ std::unique_ptr<column> make_offsets_child_column(
   return offsets_column;
 }

-/**
- * @brief Creates an offsets column from a string_view iterator, and size.
- *
- * @tparam Iter Iterator type that returns string_view instances
- * @param strings_begin Iterator to the beginning of the string_view sequence
- * @param num_strings The number of string_view instances in the sequence
- * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return Child offsets column
- */
-template <typename Iter>
-std::unique_ptr<cudf::column> child_offsets_from_string_iterator(
-  Iter strings_begin,
-  cudf::size_type num_strings,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
-{
-  auto transformer = [] __device__(string_view v) { return v.size_bytes(); };
-  auto begin       = thrust::make_transform_iterator(strings_begin, transformer);
-  return make_offsets_child_column(begin, begin + num_strings, stream, mr);
-}
-
 /**
 * @brief Copies input string data into a buffer and increments the pointer by the number of bytes
 * copied.
@@ -178,7 +156,7 @@ auto make_strings_children(
     for_each_fn(size_and_exec_fn);
   }

-  return std::make_pair(std::move(offsets_column), std::move(chars_column));
+  return std::pair(std::move(offsets_column), std::move(chars_column));
 }

 /**
diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp
index 6424841ba86..c4f9e547148 100644
--- a/cpp/include/cudf/strings/detail/utilities.hpp
+++ b/cpp/include/cudf/strings/detail/utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -45,36 +45,11 @@ std::unique_ptr<column> create_chars_child_column(
 *
 * @param strings Strings column instance.
 * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned vector's device memory.
 * @return Device vector of string_views
 */
 rmm::device_uvector<string_view> create_string_vector_from_column(
-  cudf::strings_column_view const strings, rmm::cuda_stream_view stream = rmm::cuda_stream_default);
-
-/**
- * @brief Creates an offsets column from a string_view vector.
- *
- * @param strings Strings input data
- * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return Child offsets column
- */
-std::unique_ptr<cudf::column> child_offsets_from_string_vector(
-  cudf::device_span<string_view const> strings,
-  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
-/**
- * @brief Creates a chars column from a string_view vector.
- *
- * @param strings Strings input data
- * @param d_offsets Offsets vector for placing strings into column's memory.
- * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return Child chars column
- */
-std::unique_ptr<cudf::column> child_chars_from_string_vector(
-  cudf::device_span<string_view const> strings,
-  column_view const& offsets,
+  cudf::strings_column_view const strings,
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
diff --git a/cpp/include/cudf/strings/string.cuh b/cpp/include/cudf/strings/string.cuh
index 0cfcaeb913e..d20080cc0e5 100644
--- a/cpp/include/cudf/strings/string.cuh
+++ b/cpp/include/cudf/strings/string.cuh
@@ -23,7 +23,6 @@
 namespace cudf {
 namespace strings {
-namespace string {
 /**
  * @addtogroup strings_classes
  * @{
@@ -150,6 +149,5 @@ inline __device__ bool is_float(string_view const& d_str)
 }

 /** @} */  // end of group
-}  // namespace string
 }  // namespace strings
 }  // namespace cudf
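
Several of the string utilities above build an offsets child column from per-row byte counts. A host-side sketch of that pattern (illustrative only; assumes int32 offsets and no nulls, plain C++17):

// offsets_from_sizes.cpp -- standalone illustration, not libcudf code.
// An offsets column is an inclusive scan of row sizes with a leading zero,
// so offsets[i+1] - offsets[i] recovers each row's length.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <string>
#include <vector>

int main()
{
  std::vector<std::string> rows = {"cudf", "", "rapids"};

  std::vector<int> offsets(rows.size() + 1, 0);
  std::transform(rows.begin(), rows.end(), offsets.begin() + 1,
                 [](std::string const& s) { return (int)s.size(); });
  std::partial_sum(offsets.begin(), offsets.end(), offsets.begin());

  for (int o : offsets) std::printf("%d ", o);  // prints: 0 4 4 10
  std::printf("\n");
  return 0;
}
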
diff --git a/cpp/include/cudf/structs/structs_column_view.hpp b/cpp/include/cudf/structs/structs_column_view.hpp
index 329c24cfe0a..ca866d8555e 100644
--- a/cpp/include/cudf/structs/structs_column_view.hpp
+++ b/cpp/include/cudf/structs/structs_column_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -41,6 +41,11 @@ class structs_column_view : public column_view {

   explicit structs_column_view(column_view const& rhs);

+  /**
+   * @brief Returns the parent column.
+   */
+  [[nodiscard]] column_view parent() const;
+
   using column_view::child_begin;
   using column_view::child_end;
   using column_view::has_nulls;
diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh
index 88e31744fdf..32b71e660ac 100644
--- a/cpp/include/cudf/table/experimental/row_operators.cuh
+++ b/cpp/include/cudf/table/experimental/row_operators.cuh
@@ -165,14 +165,13 @@ class device_row_comparator {
       bool const rhs_is_null{_rhs.is_null(rhs_element_index)};

       if (lhs_is_null or rhs_is_null) {  // at least one is null
-        return cuda::std::make_pair(null_compare(lhs_is_null, rhs_is_null, _null_precedence),
-                                    _depth);
+        return cuda::std::pair(null_compare(lhs_is_null, rhs_is_null, _null_precedence), _depth);
       }
     }

-    return cuda::std::make_pair(relational_compare(_lhs.element<Element>(lhs_element_index),
-                                                   _rhs.element<Element>(rhs_element_index)),
-                                std::numeric_limits<int>::max());
+    return cuda::std::pair(relational_compare(_lhs.element<Element>(lhs_element_index),
+                                              _rhs.element<Element>(rhs_element_index)),
+                           std::numeric_limits<int>::max());
   }

   template <typename Element,
   template <typename StringsIterator>
   strings_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{}
   {
-    std::vector<char> chars;
-    std::vector<size_type> offsets;
-    auto all_valid           = thrust::make_constant_iterator(true);
-    std::tie(chars, offsets) = detail::make_chars_and_offsets(begin, end, all_valid);
-    auto d_chars             = cudf::detail::make_device_uvector_sync(chars);
-    auto d_offsets           = cudf::detail::make_device_uvector_sync(offsets);
-    wrapped                  = cudf::make_strings_column(d_chars, d_offsets);
+    auto all_valid        = thrust::make_constant_iterator(true);
+    auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid);
+    auto d_chars          = cudf::detail::make_device_uvector_sync(chars);
+    auto d_offsets        = cudf::detail::make_device_uvector_sync(offsets);
+    wrapped               = cudf::make_strings_column(d_chars, d_offsets);
   }

 /**
@@ -744,14 +742,12 @@ class strings_column_wrapper : public detail::column_wrapper {
     : column_wrapper{}
   {
     size_type num_strings = std::distance(begin, end);
-    std::vector<char> chars;
-    std::vector<size_type> offsets;
-    std::tie(chars, offsets) = detail::make_chars_and_offsets(begin, end, v);
-    auto null_mask           = detail::make_null_mask_vector(v, v + num_strings);
-    auto d_chars             = cudf::detail::make_device_uvector_sync(chars);
-    auto d_offsets           = cudf::detail::make_device_uvector_sync(offsets);
-    auto d_bitmask           = cudf::detail::make_device_uvector_sync(null_mask);
-    wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask);
+    auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v);
+    auto null_mask        = detail::make_null_mask_vector(v, v + num_strings);
+    auto d_chars          = cudf::detail::make_device_uvector_sync(chars);
+    auto d_offsets        = cudf::detail::make_device_uvector_sync(offsets);
+    auto d_bitmask        = cudf::detail::make_device_uvector_sync(null_mask);
+    wrapped               = cudf::make_strings_column(d_chars, d_offsets, d_bitmask);
   }

 /**
@@ -1468,20 +1464,18 @@ class lists_column_wrapper : public detail::column_wrapper {
       0, [&v](auto i) { return v.empty() ? true : v[i]; });

     // compute the expected hierarchy and depth
-    auto const hierarchy_and_depth = std::accumulate(
-      elements.begin(),
-      elements.end(),
-      std::pair<column_view, int32_t>{{}, -1},
-      [](auto acc, lists_column_wrapper const& lcw) {
-        return lcw.depth > acc.second ? std::make_pair(lcw.get_view(), lcw.depth) : acc;
-      });
+    auto const hierarchy_and_depth =
+      std::accumulate(elements.begin(),
+                      elements.end(),
+                      std::pair<column_view, int32_t>{{}, -1},
+                      [](auto acc, lists_column_wrapper const& lcw) {
+                        return lcw.depth > acc.second ? std::pair(lcw.get_view(), lcw.depth) : acc;
+                      });
     column_view expected_hierarchy = hierarchy_and_depth.first;
     int32_t const expected_depth   = hierarchy_and_depth.second;

     // preprocess columns so that every column_view in 'cols' is an equivalent hierarchy
-    std::vector<std::unique_ptr<column>> stubs;
-    std::vector<column_view> cols;
-    std::tie(cols, stubs) = preprocess_columns(elements, expected_hierarchy, expected_depth);
+    auto [cols, stubs] = preprocess_columns(elements, expected_hierarchy, expected_depth);

     // generate offsets
     size_type count = 0;
diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp
index d078bf90a8a..7bd704a288d 100644
--- a/cpp/include/cudf_test/cudf_gtest.hpp
+++ b/cpp/include/cudf_test/cudf_gtest.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -120,6 +120,9 @@ struct TypeList<Types<TYPES...>> {
 #define CUDA_EXPECT_THROW_MESSAGE(x, msg) \
   EXPECT_THROW_MESSAGE(x, cudf::cuda_error, "CUDA error encountered at:", msg)

+#define FATAL_CUDA_EXPECT_THROW_MESSAGE(x, msg) \
+  EXPECT_THROW_MESSAGE(x, cudf::fatal_cuda_error, "Fatal CUDA error encountered at:", msg)
+
 /**
 * @brief test macro to be expected as no exception.
 * The testing is same with EXPECT_NO_THROW() in gtest.
diff --git a/cpp/include/cudf_test/file_utilities.hpp b/cpp/include/cudf_test/file_utilities.hpp
index 4df7b6a69c8..d722b836674 100644
--- a/cpp/include/cudf_test/file_utilities.hpp
+++ b/cpp/include/cudf_test/file_utilities.hpp
@@ -18,6 +18,7 @@

 #include
 #include
+#include <filesystem>
 #include
 #include
@@ -34,17 +35,14 @@ class temp_directory {
  public:
  temp_directory(const std::string& base_name)
  {
-    std::string dir_template("/tmp");
-    if (const char* env_p = std::getenv("WORKSPACE")) dir_template = env_p;
+    std::string dir_template{std::filesystem::temp_directory_path().string()};
+    if (auto env_p = std::getenv("WORKSPACE")) dir_template = env_p;
+
     dir_template += "/" + base_name + ".XXXXXX";
     auto const tmpdirptr = mkdtemp(const_cast<char*>(dir_template.data()));
-    if (tmpdirptr == nullptr) CUDF_FAIL("Temporary directory creation failure: " + dir_template);
-    _path = dir_template + "/";
-  }
+    CUDF_EXPECTS(tmpdirptr != nullptr, "Temporary directory creation failure: " + dir_template);

-  static int rm_files(const char* pathname, const struct stat* sbuf, int type, struct FTW* ftwb)
-  {
-    return std::remove(pathname);
+    _path = dir_template + "/";
   }

   temp_directory& operator=(temp_directory const&) = delete;
@@ -52,11 +50,7 @@ class temp_directory {
   temp_directory& operator=(temp_directory&&) = default;
   temp_directory(temp_directory&&) = default;

-  ~temp_directory()
-  {
-    // TODO: should use std::filesystem instead, once C++17 support added
-    nftw(_path.c_str(), rm_files, 10, FTW_DEPTH | FTW_MOUNT | FTW_PHYS);
-  }
+  ~temp_directory() { std::filesystem::remove_all(std::filesystem::path{_path}); }

   /**
   * @brief Returns the path of the temporary directory
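
The change above replaces the hand-rolled nftw() cleanup with std::filesystem::remove_all. A minimal RAII sketch of the same pattern (not the libcudf class; POSIX + C++17, names illustrative):

// scoped_temp_dir.cpp -- standalone illustration, not libcudf code.
#include <stdlib.h>     // mkdtemp (POSIX)
#include <filesystem>
#include <stdexcept>
#include <string>

class scoped_temp_dir {
 public:
  scoped_temp_dir()
  {
    // mkdtemp mutates the template buffer in place, replacing the X's.
    std::string tmpl = std::filesystem::temp_directory_path().string() + "/demo.XXXXXX";
    if (mkdtemp(tmpl.data()) == nullptr) {
      throw std::runtime_error("mkdtemp failed: " + tmpl);
    }
    _path = tmpl;
  }
  // remove_all deletes the directory tree recursively, which is what the
  // nftw(FTW_DEPTH) callback used to do by hand.
  ~scoped_temp_dir() { std::filesystem::remove_all(_path); }

  std::string const& path() const { return _path; }

 private:
  std::string _path;
};
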
 cudf_kafka
-  PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN" # set target compile options
-             CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON
+  PROPERTIES BUILD_RPATH "\$ORIGIN"
+             INSTALL_RPATH "\$ORIGIN" # set target compile options
+             CXX_STANDARD 17
+             CXX_STANDARD_REQUIRED ON
 )

 # ##################################################################################################
diff --git a/cpp/libcudf_kafka/src/kafka_callback.cpp b/cpp/libcudf_kafka/src/kafka_callback.cpp
index 6b98747c145..79a40640627 100644
--- a/cpp/libcudf_kafka/src/kafka_callback.cpp
+++ b/cpp/libcudf_kafka/src/kafka_callback.cpp
@@ -13,7 +13,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#include "cudf_kafka/kafka_callback.hpp"
+#include <cudf_kafka/kafka_callback.hpp>

 #include
diff --git a/cpp/libcudf_kafka/src/kafka_consumer.cpp b/cpp/libcudf_kafka/src/kafka_consumer.cpp
index 49e89a56e60..2ddaa9892da 100644
--- a/cpp/libcudf_kafka/src/kafka_consumer.cpp
+++ b/cpp/libcudf_kafka/src/kafka_consumer.cpp
@@ -13,7 +13,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#include "cudf_kafka/kafka_consumer.hpp"
+#include <cudf_kafka/kafka_consumer.hpp>

 #include
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index 8fedf641c8f..27732b25401 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -154,18 +154,6 @@ std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
   return visit(col_type, static_cast<aggregation const&>(agg));
 }

-std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
-  data_type col_type, dense_rank_aggregation const& agg)
-{
-  return visit(col_type, static_cast<aggregation const&>(agg));
-}
-
-std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
-  data_type col_type, percent_rank_aggregation const& agg)
-{
-  return visit(col_type, static_cast<aggregation const&>(agg));
-}
-
 std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
   data_type col_type, collect_list_aggregation const& agg)
 {
@@ -334,16 +322,6 @@ void aggregation_finalizer::visit(rank_aggregation const& agg)
   visit(static_cast<aggregation const&>(agg));
 }

-void aggregation_finalizer::visit(dense_rank_aggregation const& agg)
-{
-  visit(static_cast<aggregation const&>(agg));
-}
-
-void aggregation_finalizer::visit(percent_rank_aggregation const& agg)
-{
-  visit(static_cast<aggregation const&>(agg));
-}
-
 void aggregation_finalizer::visit(collect_list_aggregation const& agg)
 {
   visit(static_cast<aggregation const&>(agg));
@@ -644,36 +622,33 @@ template std::unique_ptr<aggregation> make_row_number_aggregation
-template <typename Base>
-std::unique_ptr<Base> make_rank_aggregation()
-{
-  return std::make_unique<detail::rank_aggregation>();
-}
-template std::unique_ptr<aggregation> make_rank_aggregation<aggregation>();
-template std::unique_ptr<groupby_scan_aggregation>
-make_rank_aggregation<groupby_scan_aggregation>();
-template std::unique_ptr<scan_aggregation> make_rank_aggregation<scan_aggregation>();
-
-/// Factory to create a DENSE_RANK aggregation
-template <typename Base>
-std::unique_ptr<Base> make_dense_rank_aggregation()
-{
-  return std::make_unique<detail::dense_rank_aggregation>();
-}
-template std::unique_ptr<aggregation> make_dense_rank_aggregation<aggregation>();
-template std::unique_ptr<groupby_scan_aggregation>
-make_dense_rank_aggregation<groupby_scan_aggregation>();
-template std::unique_ptr<scan_aggregation> make_dense_rank_aggregation<scan_aggregation>();
-
-/// Factory to create a PERCENT_RANK aggregation
-template <typename Base>
-std::unique_ptr<Base> make_percent_rank_aggregation()
-{
-  return std::make_unique<detail::percent_rank_aggregation>();
-}
-template std::unique_ptr<aggregation> make_percent_rank_aggregation<aggregation>();
-template std::unique_ptr<groupby_scan_aggregation>
-make_percent_rank_aggregation<groupby_scan_aggregation>();
-template std::unique_ptr<scan_aggregation> make_percent_rank_aggregation<scan_aggregation>();
+template <typename Base>
+std::unique_ptr<Base> make_rank_aggregation(rank_method method,
+                                            order column_order,
+                                            null_policy null_handling,
+                                            null_order null_precedence,
+                                            rank_percentage percentage)
+{
+  return std::make_unique<detail::rank_aggregation>(
+    method, column_order, null_handling, null_precedence, percentage);
+}
+template std::unique_ptr<aggregation> make_rank_aggregation<aggregation>(
+  rank_method method,
+  order column_order,
+  null_policy null_handling,
+  null_order null_precedence,
+  rank_percentage percentage);
+template std::unique_ptr<groupby_scan_aggregation> make_rank_aggregation<groupby_scan_aggregation>(
+  rank_method method,
+  order column_order,
+  null_policy null_handling,
+  null_order null_precedence,
+  rank_percentage percentage);
+template std::unique_ptr<scan_aggregation> make_rank_aggregation<scan_aggregation>(
+  rank_method method,
+  order column_order,
+  null_policy null_handling,
+  null_order null_precedence,
+  rank_percentage percentage);

 /// Factory to create a COLLECT_LIST aggregation
 template
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index 756cf3421c9..ec14f8e6ded 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -445,7 +445,7 @@ std::pair<rmm::device_buffer, size_type> bitmask_and(table_view const& view,
   CUDF_FUNC_RANGE();
   rmm::device_buffer null_mask{0, stream, mr};
   if (view.num_rows() == 0 or view.num_columns() == 0) {
-    return std::make_pair(std::move(null_mask), 0);
+    return std::pair(std::move(null_mask), 0);
   }

   std::vector<bitmask_type const*> masks;
@@ -467,7 +467,7 @@ std::pair<rmm::device_buffer, size_type> bitmask_and(table_view const& view,
       mr);
   }

-  return std::make_pair(std::move(null_mask), 0);
+  return std::pair(std::move(null_mask), 0);
 }

 // Returns the bitwise OR of the null masks of all columns in the table view
@@ -478,7 +478,7 @@ std::pair<rmm::device_buffer, size_type> bitmask_or(table_view const& view,
   CUDF_FUNC_RANGE();
   rmm::device_buffer null_mask{0, stream, mr};
   if (view.num_rows() == 0 or view.num_columns() == 0) {
-    return std::make_pair(std::move(null_mask), 0);
+    return std::pair(std::move(null_mask), 0);
   }

   std::vector<bitmask_type const*> masks;
@@ -500,7 +500,7 @@ std::pair<rmm::device_buffer, size_type> bitmask_or(table_view const& view,
       mr);
   }

-  return std::make_pair(std::move(null_mask), 0);
+  return std::pair(std::move(null_mask), 0);
 }

 }  // namespace detail
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 514374d450d..35e7eba974f 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -688,9 +688,9 @@ BufInfo build_output_columns(InputIter begin,
                      ? 0
                      : (current_info->num_rows - current_info->valid_count);
       ++current_info;
-      return std::make_pair(ptr, null_count);
+      return std::pair(ptr, null_count);
     }
-    return std::make_pair(static_cast<bitmask_type const*>(nullptr), 0);
+    return std::pair(static_cast<bitmask_type const*>(nullptr), 0);
   }();

   // size/data pointer for the column
diff --git a/cpp/src/copying/purge_nonempty_nulls.cu b/cpp/src/copying/purge_nonempty_nulls.cu
new file mode 100644
index 00000000000..778d6c4df55
--- /dev/null
+++ b/cpp/src/copying/purge_nonempty_nulls.cu
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include
+#include
+
+#include
+
+namespace cudf {
+namespace detail {
+
+using cudf::type_id;
+
+namespace {
+
+/// Check if nonempty-null checks can be skipped for a given type.
+bool type_may_have_nonempty_nulls(cudf::type_id const& type)
+{
+  return type == type_id::STRING || type == type_id::LIST || type == type_id::STRUCT;
+}
+
+/// Check if the (STRING/LIST) column has any null rows with non-zero length.
+bool has_nonempty_null_rows(cudf::column_view const& input, rmm::cuda_stream_view stream)
+{
+  if (not input.has_nulls()) { return false; }  // No nulls => no dirty rows.
+
+  // Cross-reference nullmask and offsets.
+  auto const type    = input.type().id();
+  auto const offsets = (type == type_id::STRING) ? (strings_column_view{input}).offsets()
+                                                 : (lists_column_view{input}).offsets();
+  auto const d_input      = cudf::column_device_view::create(input);
+  auto const is_dirty_row = [d_input = *d_input, offsets = offsets.begin<size_type>()] __device__(
+                              size_type const& row_idx) {
+    return d_input.is_null_nocheck(row_idx) && (offsets[row_idx] != offsets[row_idx + 1]);
+  };
+
+  auto const row_begin = thrust::counting_iterator<cudf::size_type>(0);
+  auto const row_end   = row_begin + input.size();
+  return thrust::count_if(rmm::exec_policy(stream), row_begin, row_end, is_dirty_row) > 0;
+}
+
+}  // namespace
+
+/**
+ * @copydoc cudf::detail::has_nonempty_nulls
+ */
+bool has_nonempty_nulls(cudf::column_view const& input, rmm::cuda_stream_view stream)
+{
+  auto const type = input.type().id();
+
+  if (not type_may_have_nonempty_nulls(type)) { return false; }
+
+  // For types with variable-length rows, check if any rows are "dirty".
+  // A dirty row is a null row with non-zero length.
+  if ((type == type_id::STRING || type == type_id::LIST) && has_nonempty_null_rows(input, stream)) {
+    return true;
+  }
+
+  // For complex types, check if child columns need purging.
+  if ((type == type_id::STRUCT || type == type_id::LIST) &&
+      std::any_of(input.child_begin(), input.child_end(), [stream](auto const& child) {
+        return cudf::detail::has_nonempty_nulls(child, stream);
+      })) {
+    return true;
+  }
+
+  return false;
+}
+}  // namespace detail
+
+/**
+ * @copydoc cudf::may_have_nonempty_nulls
+ */
+bool may_have_nonempty_nulls(column_view const& input)
+{
+  auto const type = input.type().id();
+
+  if (not detail::type_may_have_nonempty_nulls(type)) { return false; }
+
+  if ((type == type_id::STRING || type == type_id::LIST) && input.has_nulls()) { return true; }
+
+  if ((type == type_id::STRUCT || type == type_id::LIST) &&
+      std::any_of(input.child_begin(), input.child_end(), may_have_nonempty_nulls)) {
+    return true;
+  }
+
+  return false;
+}
+
+/**
+ * @copydoc cudf::has_nonempty_nulls
+ */
+bool has_nonempty_nulls(column_view const& input) { return detail::has_nonempty_nulls(input); }
+
+/**
+ * @copydoc cudf::purge_nonempty_nulls(lists_column_view const&, rmm::mr::device_memory_resource*)
+ */
+std::unique_ptr<cudf::column> purge_nonempty_nulls(lists_column_view const& input,
+                                                   rmm::mr::device_memory_resource* mr)
+{
+  return detail::purge_nonempty_nulls(input, rmm::cuda_stream_default, mr);
+}
+
+/**
+ * @copydoc cudf::purge_nonempty_nulls(structs_column_view const&, rmm::mr::device_memory_resource*)
+ */
+std::unique_ptr<cudf::column> purge_nonempty_nulls(structs_column_view const& input,
+                                                   rmm::mr::device_memory_resource* mr)
+{
+  return detail::purge_nonempty_nulls(input, rmm::cuda_stream_default, mr);
+}
+
+/**
+ * @copydoc cudf::purge_nonempty_nulls(strings_column_view const&, rmm::mr::device_memory_resource*)
+ */
+std::unique_ptr<cudf::column> purge_nonempty_nulls(strings_column_view const& input,
+                                                   rmm::mr::device_memory_resource* mr)
+{
+  return detail::purge_nonempty_nulls(input, rmm::cuda_stream_default, mr);
+}
+
+}  // namespace cudf
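
The "dirty row" concept in the new file above is easy to see with concrete data: a row can be marked null in the validity mask while its offsets still span a non-zero byte range. A host-side sketch of the same test (illustrative only; plain C++17 stand-ins for the offsets child and validity bitmask):

// dirty_rows.cpp -- standalone illustration, not libcudf code.
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  // STRING-like column: rows "ab", "cd" (nulled later without sanitizing
  // offsets), "e". Row 1 is null but still owns bytes 2..4.
  std::vector<int> offsets      = {0, 2, 4, 5};
  std::vector<uint32_t> validity = {0b101};  // bit per row; row 1 unset

  for (int row = 0; row < 3; ++row) {
    bool const is_null  = ((validity[row / 32] >> (row % 32)) & 1) == 0;
    bool const nonempty = offsets[row + 1] != offsets[row];
    if (is_null && nonempty) {
      std::printf("row %d is dirty: null but %d bytes long\n",
                  row, offsets[row + 1] - offsets[row]);
    }
  }
  return 0;
}
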
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index 57bb222aaa0..a002b0bb744 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -83,8 +83,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::disp
                "Unsupported groupby key type does not support equality comparison");
   auto [grouped_keys, results] =
     detail::hash::groupby(flattened_keys, requests, _include_null_keys, stream, mr);
-  return std::make_pair(unflatten_nested_columns(std::move(grouped_keys), _keys),
-                        std::move(results));
+  return std::pair(unflatten_nested_columns(std::move(grouped_keys), _keys), std::move(results));
 } else {
   return sort_aggregate(requests, stream, mr);
 }
@@ -102,9 +101,12 @@ namespace {
 * Adds special handling for COLLECT_LIST/COLLECT_SET, because:
 * 1. `make_empty_column()` does not support construction of nested columns.
 * 2. Empty lists need empty child columns, to persist type information.
+ * Adds special handling for RANK, because it needs to return a double-typed column when
+ * rank_method is AVERAGE or percentage is not NONE.
 */
 struct empty_column_constructor {
   column_view values;
+  aggregation const& agg;

   template <aggregation::Kind k>
   std::unique_ptr<cudf::column> operator()() const
   {
@@ -117,6 +119,14 @@
       0, make_empty_column(type_to_id<offset_type>()), empty_like(values), 0, {});
   }

+  if constexpr (k == aggregation::Kind::RANK) {
+    auto const& rank_agg = dynamic_cast<cudf::detail::rank_aggregation const&>(agg);
+    if (rank_agg._method == cudf::rank_method::AVERAGE or
+        rank_agg._percentage != rank_percentage::NONE)
+      return make_empty_column(type_to_id<double>());
+    return make_empty_column(target_type(values.type(), k));
+  }
+
   // If `values` is LIST typed, and the aggregation results match the type,
   // construct empty results based on `values`.
   // Most generally, this applies if input type matches output type.
@@ -149,7 +159,7 @@ auto empty_results(host_span<aggregation_request const> requests)
                  std::back_inserter(results),
                  [&request](auto const& agg) {
                    return cudf::detail::dispatch_type_and_aggregation(
-                     request.values.type(), agg->kind, empty_column_constructor{request.values});
+                     request.values.type(), agg->kind, empty_column_constructor{request.values, *agg});
                  });

   return aggregation_result{std::move(results)};
@@ -193,7 +203,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::aggr

   verify_valid_requests(requests);

-  if (_keys.num_rows() == 0) { return std::make_pair(empty_like(_keys), empty_results(requests)); }
+  if (_keys.num_rows() == 0) { return std::pair(empty_like(_keys), empty_results(requests)); }

   return dispatch_aggregation(requests, rmm::cuda_stream_default, mr);
 }
@@ -211,7 +221,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::scan

   verify_valid_requests(requests);

-  if (_keys.num_rows() == 0) { return std::make_pair(empty_like(_keys), empty_results(requests)); }
+  if (_keys.num_rows() == 0) { return std::pair(empty_like(_keys), empty_results(requests)); }

   return sort_scan(requests, rmm::cuda_stream_default, mr);
 }
@@ -250,7 +260,7 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::replace_nulls
   CUDF_EXPECTS(static_cast<cudf::size_type>(replace_policies.size()) == values.num_columns(),
                "Size mismatch between num_columns and replace_policies.");

-  if (values.is_empty()) { return std::make_pair(empty_like(_keys), empty_like(values)); }
+  if (values.is_empty()) { return std::pair(empty_like(_keys), empty_like(values)); }

   auto const stream = rmm::cuda_stream_default;
   auto const& group_labels = helper().group_labels(stream);
@@ -269,8 +279,8 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::replace_nulls
                    : std::move(grouped_values);
   });

-  return std::make_pair(std::move(helper().sorted_keys(stream, mr)),
-                        std::make_unique<table>(std::move(results)));
+  return std::pair(std::move(helper().sorted_keys(stream, mr)),
+                   std::make_unique<table>(std::move(results)));
 }

 // Get the sort helper object
@@ -310,8 +320,8 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::shift(
     grouped_values->view(), group_offsets, offsets[i], fill_values[i].get(), stream, mr);
   });

-  return std::make_pair(helper().sorted_keys(stream, mr),
-                        std::make_unique<table>(std::move(results)));
+  return std::pair(helper().sorted_keys(stream, mr),
+                   std::make_unique<table>(std::move(results)));
 }
 }  // namespace groupby
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 44df981f5bf..e22b3a4f3a4 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -429,17 +429,19 @@ auto create_hash_map(table_device_view const& d_keys,
   size_type constexpr unused_key{std::numeric_limits<size_type>::max()};
   size_type constexpr unused_value{std::numeric_limits<size_type>::max()};

-  using map_type = concurrent_unordered_map<cudf::size_type,
-                                            cudf::size_type,
-                                            row_hasher<default_hash, nullate::DYNAMIC>,
-                                            row_equality_comparator<nullate::DYNAMIC>>;
+  using map_type =
+    concurrent_unordered_map<cudf::size_type,
+                             cudf::size_type,
+                             row_hasher<cudf::detail::default_hash, nullate::DYNAMIC>,
+                             row_equality_comparator<nullate::DYNAMIC>>;

   using allocator_type = typename map_type::allocator_type;

   auto const null_keys_are_equal =
     include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL;

-  row_hasher hasher{nullate::DYNAMIC{keys_have_nulls}, d_keys};
+  row_hasher<cudf::detail::default_hash, nullate::DYNAMIC> hasher{nullate::DYNAMIC{keys_have_nulls},
+                                                                  d_keys};
   row_equality_comparator rows_equal{
     nullate::DYNAMIC{keys_have_nulls}, d_keys, d_keys, null_keys_are_equal};
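
The map created above drives the hash-based groupby: each key row is hashed (MurmurHash3-based row hashing) into a concurrent map slot, and aggregations accumulate per slot. A host-side analogue of the idea (illustrative only; std::unordered_map stands in for the GPU concurrent map):

// hash_groupby_analogue.cpp -- standalone illustration, not libcudf code.
#include <cstdio>
#include <unordered_map>
#include <vector>

int main()
{
  std::vector<int> keys   = {1, 2, 1, 3, 2};
  std::vector<int> values = {10, 20, 30, 40, 50};

  // key -> running SUM aggregation; on the GPU this insert/accumulate step
  // happens concurrently across threads.
  std::unordered_map<int, int> sums;
  for (size_t i = 0; i < keys.size(); ++i) sums[keys[i]] += values[i];

  for (auto const& [k, s] : sums) std::printf("key %d -> sum %d\n", k, s);
  return 0;
}
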
@@ -670,7 +672,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby(
   std::unique_ptr<table> unique_keys =
     groupby(keys, requests, &cache, has_nulls(keys), include_null_keys, stream, mr);

-  return std::make_pair(std::move(unique_keys), extract_results(requests, cache, stream, mr));
+  return std::pair(std::move(unique_keys), extract_results(requests, cache, stream, mr));
 }
 }  // namespace hash
 }  // namespace detail
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index 4904aa42723..02036ff0bbf 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -778,7 +778,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::sort
   auto results = detail::extract_results(requests, cache, stream, mr);

-  return std::make_pair(helper().unique_keys(stream, mr), std::move(results));
+  return std::pair(helper().unique_keys(stream, mr), std::move(results));
 }
 }  // namespace groupby
 }  // namespace cudf
diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp
index fa3d19bdcfd..748e34a583d 100644
--- a/cpp/src/groupby/sort/functors.hpp
+++ b/cpp/src/groupby/sort/functors.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -87,7 +87,6 @@ struct store_result_functor {
  */
 column_view get_sorted_values()
 {
-  if (is_presorted()) { return values; }
   return sorted_values ? sorted_values->view()
                        : (sorted_values = helper.sorted_values(values, stream))->view();
 };
diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu
index 8b8a03f35a5..000a595ea2f 100644
--- a/cpp/src/groupby/sort/group_collect.cu
+++ b/cpp/src/groupby/sort/group_collect.cu
@@ -82,8 +82,7 @@ std::pair<std::unique_ptr<column>, std::unique_ptr<column>> purge_null_entries(
   auto null_purged_offsets = strings::detail::make_offsets_child_column(
     null_purged_sizes.cbegin(), null_purged_sizes.cend(), stream, mr);

-  return std::make_pair<std::unique_ptr<column>, std::unique_ptr<column>>(
-    std::move(null_purged_values), std::move(null_purged_offsets));
+  return std::pair(std::move(null_purged_values), std::move(null_purged_offsets));
 }

 std::unique_ptr<column> group_collect(column_view const& values,
@@ -109,8 +108,8 @@ std::unique_ptr<column> group_collect(column_view const& values,
       return cudf::groupby::detail::purge_null_entries(
         values, offsets_column->view(), num_groups, stream, mr);
     } else {
-      return std::make_pair(std::make_unique<column>(values, stream, mr),
-                            std::move(offsets_column));
+      return std::pair(std::make_unique<column>(values, stream, mr),
+                       std::move(offsets_column));
     }
   }();
diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu
index 77d68edaa3a..0b25ab9a33d 100644
--- a/cpp/src/groupby/sort/group_rank_scan.cu
+++ b/cpp/src/groupby/sort/group_rank_scan.cu
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -27,6 +28,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -35,23 +37,59 @@ namespace cudf {
 namespace groupby {
 namespace detail {
 namespace {
+
+/**
+ * @brief Functor to compare two rows of a table in a given permutation order.
+ *
+ * This is useful for identifying unique elements in a sorted-order table when the permutation
+ * order is the sorted order of the table.
+ */
+template <typename Iterator>
+struct permuted_comparator {
+  /**
+   * @brief Comparator object which compares two rows of the table in a given permutation order
+   *
+   * @param device_table Device table to compare
+   * @param permutation The permutation order, an integer-typed column
+   * @param has_nulls whether the table has nulls
+   */
+  permuted_comparator(table_device_view device_table, Iterator const permutation, bool has_nulls)
+    : comparator(nullate::DYNAMIC{has_nulls}, device_table, device_table, null_equality::EQUAL),
+      permutation(permutation)
+  {
+  }
+  __device__ bool operator()(size_type index1, size_type index2) const
+  {
+    return comparator(permutation[index1], permutation[index2]);
+  };
+
+ private:
+  row_equality_comparator<nullate::DYNAMIC> comparator;
+  Iterator const permutation;
+};
+
 /**
 * @brief Generate grouped row ranks or dense ranks using a row comparison, then scan the results
 *
+ * @tparam forward true if the rank scan computation should use forward iterator traversal
+ *         (default), else reverse iterator traversal
 * @tparam value_resolver flag value resolver function with boolean first and row number arguments
 * @tparam scan_operator scan function ran on the flag values
- * @param order_by input column to generate ranks for
+ * @param grouped_values input column to generate ranks for
+ * @param value_order column of type INT32 that contains the order of the values in the
+ *        grouped_values column
 * @param group_labels ID of group that the corresponding value belongs to
 * @param group_offsets group index offsets with group ID indices
 * @param resolver flag value resolver
 * @param scan_op scan operation ran on the flag results
- * @param has_nulls true if nulls are included in the `order_by` column
+ * @param has_nulls true if nulls are included in the `grouped_values` column
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return std::unique_ptr<column> rank values
 */
-template <typename value_resolver, typename scan_operator>
-std::unique_ptr<column> rank_generator(column_view const& order_by,
+template <bool forward, typename value_resolver, typename scan_operator>
+std::unique_ptr<column> rank_generator(column_view const& grouped_values,
+                                       column_view const& value_order,
                                        device_span<size_type const> group_labels,
                                        device_span<size_type const> group_offsets,
                                        value_resolver resolver,
                                        scan_operator scan_op,
                                        bool has_nulls,
                                        rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
   auto const flattened = cudf::structs::detail::flatten_nested_columns(
-    table_view{{order_by}}, {}, {}, structs::detail::column_nullability::MATCH_INCOMING);
+    table_view{{grouped_values}}, {}, {}, structs::detail::column_nullability::MATCH_INCOMING);
   auto const d_flat_order = table_device_view::create(flattened, stream);
-  row_equality_comparator comparator(
-    nullate::DYNAMIC{has_nulls}, *d_flat_order, *d_flat_order, null_equality::EQUAL);
+  auto sorted_index_order = value_order.begin<size_type>();
+  auto comparator         = permuted_comparator(*d_flat_order, sorted_index_order, has_nulls);
+
   auto ranks = make_fixed_width_column(data_type{type_to_id<size_type>()},
                                        flattened.flattened_columns().num_rows(),
                                        mask_state::UNALLOCATED,
                                        stream,
                                        mr);
   auto mutable_ranks = ranks->mutable_view();

-  thrust::tabulate(
-    rmm::exec_policy(stream),
-    mutable_ranks.begin<size_type>(),
-    mutable_ranks.end<size_type>(),
-    [comparator, resolver, labels = group_labels.data(), offsets = group_offsets.data()] __device__(
-      size_type row_index) {
-      auto group_start = offsets[labels[row_index]];
+  auto unique_identifier = [labels  = group_labels.begin(),
+                            offsets = group_offsets.begin(),
+                            comparator,
+                            resolver] __device__(size_type row_index) {
+    auto const group_start = offsets[labels[row_index]];
+    if constexpr (forward) {
+      // First value of equal values is 1.
       return resolver(row_index == group_start || !comparator(row_index, row_index - 1),
                       row_index - group_start);
-    });
+    } else {
+      auto const group_end = offsets[labels[row_index] + 1];
+      // Last value of equal values is 1.
+      return resolver(row_index + 1 == group_end || !comparator(row_index, row_index + 1),
+                      row_index - group_start);
+    }
+  };
+  thrust::tabulate(rmm::exec_policy(stream),
+                   mutable_ranks.begin<size_type>(),
+                   mutable_ranks.end<size_type>(),
+                   unique_identifier);
+
+  auto [group_labels_begin, mutable_rank_begin] = [&]() {
+    if constexpr (forward) {
+      return thrust::pair{group_labels.begin(), mutable_ranks.begin<size_type>()};
+    } else {
+      return thrust::pair{thrust::reverse_iterator(group_labels.end()),
+                          thrust::reverse_iterator(mutable_ranks.end<size_type>())};
+    }
+  }();
   thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
-                                group_labels.begin(),
-                                group_labels.end(),
-                                mutable_ranks.begin<size_type>(),
-                                mutable_ranks.begin<size_type>(),
+                                group_labels_begin,
+                                group_labels_begin + group_labels.size(),
+                                mutable_rank_begin,
+                                mutable_rank_begin,
                                 thrust::equal_to{},
                                 scan_op);
   return ranks;
 }
 }  // namespace

-std::unique_ptr<column> rank_scan(column_view const& order_by,
-                                  device_span<size_type const> group_labels,
-                                  device_span<size_type const> group_offsets,
-                                  rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> min_rank_scan(column_view const& grouped_values,
+                                      column_view const& value_order,
+                                      device_span<size_type const> group_labels,
+                                      device_span<size_type const> group_offsets,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::mr::device_memory_resource* mr)
 {
-  return rank_generator(
-    order_by,
+  return rank_generator<true>(
+    grouped_values,
+    value_order,
     group_labels,
     group_offsets,
     [] __device__(bool unequal, auto row_index_in_group) {
       return unequal ? row_index_in_group + 1 : 0;
     },
     DeviceMax{},
-    has_nested_nulls(table_view{{order_by}}),
+    has_nested_nulls(table_view{{grouped_values}}),
     stream,
     mr);
 }

-std::unique_ptr<column> dense_rank_scan(column_view const& order_by,
-                                        device_span<size_type const> group_labels,
-                                        device_span<size_type const> group_offsets,
-                                        rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> max_rank_scan(column_view const& grouped_values,
+                                      column_view const& value_order,
+                                      device_span<size_type const> group_labels,
+                                      device_span<size_type const> group_offsets,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::mr::device_memory_resource* mr)
 {
-  return rank_generator(
-    order_by,
+  return rank_generator<false>(
+    grouped_values,
+    value_order,
     group_labels,
     group_offsets,
-    [] __device__(bool const unequal, size_type const) { return unequal ? 1 : 0; },
-    DeviceSum{},
-    has_nested_nulls(table_view{{order_by}}),
+    [] __device__(bool unequal, auto row_index_in_group) {
+      return unequal ? row_index_in_group + 1 : std::numeric_limits<size_type>::max();
+    },
+    DeviceMin{},
+    has_nested_nulls(table_view{{grouped_values}}),
     stream,
     mr);
 }

-std::unique_ptr<column> percent_rank_scan(column_view const& order_by,
+std::unique_ptr<column> first_rank_scan(column_view const& grouped_values,
+                                        column_view const&,
+                                        device_span<size_type const> group_labels,
+                                        device_span<size_type const> group_offsets,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  auto ranks = make_fixed_width_column(
+    data_type{type_to_id<size_type>()}, group_labels.size(), mask_state::UNALLOCATED, stream, mr);
+  auto mutable_ranks = ranks->mutable_view();
+  thrust::tabulate(rmm::exec_policy(stream),
+                   mutable_ranks.begin<size_type>(),
+                   mutable_ranks.end<size_type>(),
+                   [labels  = group_labels.begin(),
+                    offsets = group_offsets.begin()] __device__(size_type row_index) {
+                     auto group_start = offsets[labels[row_index]];
+                     return row_index - group_start + 1;
+                   });
+  return ranks;
+}
+
+std::unique_ptr<column> average_rank_scan(column_view const& grouped_values,
+                                          column_view const& value_order,
                                           device_span<size_type const> group_labels,
                                           device_span<size_type const> group_offsets,
                                           rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr)
 {
-  auto const rank_column = rank_scan(
-    order_by, group_labels, group_offsets, stream, rmm::mr::get_current_device_resource());
-  auto const rank_view = rank_column->view();
-  auto const group_size_iter = cudf::detail::make_counting_transform_iterator(
-    0,
-    [labels  = group_labels.begin(),
-     offsets = group_offsets.begin()] __device__(size_type row_index) {
-      auto const group_label = labels[row_index];
-      auto const group_start = offsets[group_label];
-      auto const group_end   = offsets[group_label + 1];
-      return group_end - group_start;
-    });
-
-  // Result type for PERCENT_RANK is independent of input type.
-  using result_type = cudf::detail::target_type_t<int32_t, aggregation::Kind::PERCENT_RANK>;
-
-  auto percent_rank_result = cudf::make_fixed_width_column(
-    data_type{type_to_id<result_type>()}, rank_view.size(), mask_state::UNALLOCATED, stream, mr);
-
+  auto max_rank = max_rank_scan(grouped_values,
+                                value_order,
+                                group_labels,
+                                group_offsets,
+                                stream,
+                                rmm::mr::get_current_device_resource());
+  auto min_rank = min_rank_scan(grouped_values,
+                                value_order,
+                                group_labels,
+                                group_offsets,
+                                stream,
+                                rmm::mr::get_current_device_resource());
+  auto ranks = make_fixed_width_column(
+    data_type{type_to_id<double>()}, group_labels.size(), mask_state::UNALLOCATED, stream, mr);
+  auto mutable_ranks = ranks->mutable_view();
   thrust::transform(rmm::exec_policy(stream),
-                    rank_view.begin<size_type>(),
-                    rank_view.end<size_type>(),
-                    group_size_iter,
-                    percent_rank_result->mutable_view().begin<result_type>(),
-                    [] __device__(auto const rank, auto const group_size) {
-                      return group_size == 1 ? 0.0 : ((rank - 1.0) / (group_size - 1));
+                    max_rank->view().begin<size_type>(),
+                    max_rank->view().end<size_type>(),
+                    min_rank->view().begin<size_type>(),
+                    mutable_ranks.begin<double>(),
+                    [] __device__(auto max_rank, auto min_rank) -> double {
+                      return min_rank + (max_rank - min_rank) / 2.0;
                     });
+  return ranks;
+}

-  return percent_rank_result;
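
A worked reference for the rank variants implemented above, for one already-sorted group (host-side sketch, illustrative only; group semantics as documented in this file):

// rank_reference.cpp -- standalone illustration, not libcudf code.
// For input {30, 30, 50, 60, 60, 60}:
//   FIRST:   1 2 3 4 5 6      (row number within the group)
//   MIN:     1 1 3 4 4 4      (first row of each tie run)
//   MAX:     2 2 3 6 6 6      (last row of each tie run)
//   DENSE:   1 1 2 3 3 3      (increments by 1 per distinct value)
//   AVERAGE: 1.5 1.5 3 5 5 5  ((MIN + MAX) / 2, hence the double result type)
#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> v = {30, 30, 50, 60, 60, 60};
  int const n        = (int)v.size();
  for (int i = 0; i < n; ++i) {
    int lo = i, hi = i;
    while (lo > 0 && v[lo - 1] == v[i]) --lo;      // start of tie run
    while (hi + 1 < n && v[hi + 1] == v[i]) ++hi;  // end of tie run
    double const avg = ((lo + 1) + (hi + 1)) / 2.0;
    std::printf("value %d: min %d max %d avg %.1f\n", v[i], lo + 1, hi + 1, avg);
  }
  return 0;
}

This also makes the forward/reverse scan split concrete: MIN ranks flag the first row of a tie run and propagate forward (DeviceMax), while MAX ranks flag the last row and propagate backward (DeviceMin over reverse iterators).
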
1 : 0; }, + DeviceSum{}, + has_nested_nulls(table_view{{grouped_values}}), + stream, + mr); +} + +std::unique_ptr group_rank_to_percentage(rank_method const method, + rank_percentage const percentage, + column_view const& rank, + column_view const& count, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(percentage != rank_percentage::NONE, "Percentage cannot be NONE"); + auto ranks = make_fixed_width_column( + data_type{type_to_id()}, group_labels.size(), mask_state::UNALLOCATED, stream, mr); + ranks->set_null_mask(copy_bitmask(rank, stream, mr)); + auto mutable_ranks = ranks->mutable_view(); + + auto one_normalized = [] __device__(auto const rank, auto const group_size) { + return group_size == 1 ? 0.0 : ((rank - 1.0) / (group_size - 1)); + }; + if (method == rank_method::DENSE) { + thrust::tabulate(rmm::exec_policy(stream), + mutable_ranks.begin(), + mutable_ranks.end(), + [percentage, + one_normalized, + is_double = rank.type().id() == type_id::FLOAT64, + dcount = count.begin(), + labels = group_labels.begin(), + offsets = group_offsets.begin(), + d_rank = rank.begin(), + s_rank = rank.begin()] __device__(size_type row_index) -> double { + double const r = is_double ? d_rank[row_index] : s_rank[row_index]; + auto const count = dcount[labels[row_index]]; + size_type const last_rank_index = offsets[labels[row_index]] + count - 1; + auto const last_rank = s_rank[last_rank_index]; + return percentage == rank_percentage::ZERO_NORMALIZED + ? r / last_rank + : one_normalized(r, last_rank); + }); + } else { + thrust::tabulate(rmm::exec_policy(stream), + mutable_ranks.begin(), + mutable_ranks.end(), + [percentage, + one_normalized, + is_double = rank.type().id() == type_id::FLOAT64, + dcount = count.begin(), + labels = group_labels.begin(), + d_rank = rank.begin(), + s_rank = rank.begin()] __device__(size_type row_index) -> double { + double const r = is_double ? d_rank[row_index] : s_rank[row_index]; + auto const count = dcount[labels[row_index]]; + return percentage == rank_percentage::ZERO_NORMALIZED + ? 
r / count + : one_normalized(r, count); + }); + } + return ranks; } } // namespace detail diff --git a/cpp/src/groupby/sort/group_scan.hpp b/cpp/src/groupby/sort/group_scan.hpp index 76a7f3f73c7..dc0eb691748 100644 --- a/cpp/src/groupby/sort/group_scan.hpp +++ b/cpp/src/groupby/sort/group_scan.hpp @@ -85,52 +85,115 @@ std::unique_ptr count_scan(device_span group_labels, rmm::mr::device_memory_resource* mr); /** - * @brief Internal API to calculate groupwise rank value + * @brief Internal API to calculate groupwise min rank value * - * @param order_by column or struct column that rows within a group are sorted by + * @param grouped_values column or struct column that rows within a group are sorted by + * @param value_order column of type INT32 that contains the order of the values in the + * grouped_values column * @param group_labels ID of group that the corresponding value belongs to * @param group_offsets group index offsets with group ID indices * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Column of type size_type of rank values */ -std::unique_ptr rank_scan(column_view const& order_by, - device_span group_labels, - device_span group_offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr min_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to calculate groupwise max rank value + * + * @details @copydetails min_rank_scan(column_view const& grouped_values, + * column_view const& value_order, + * device_span group_labels, + * device_span group_offsets, + * rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) + */ +std::unique_ptr max_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to calculate groupwise first rank value + * + * @details @copydetails min_rank_scan(column_view const& grouped_values, + * column_view const& value_order, + * device_span group_labels, + * device_span group_offsets, + * rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) + */ +std::unique_ptr first_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to calculate groupwise average rank value + * + * @details @copydetails min_rank_scan(column_view const& grouped_values, + * column_view const& value_order, + * device_span group_labels, + * device_span group_offsets, + * rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) + */ +std::unique_ptr average_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate groupwise dense rank value * - * @param order_by column or struct column that rows within a group are sorted by + * @param grouped_values column or struct column that rows within a group are sorted by * @param group_labels ID of group that 
 * @param group_offsets group index offsets with group ID indices
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return Column of type size_type of dense rank values
 */
-std::unique_ptr<column> dense_rank_scan(column_view const& order_by,
+std::unique_ptr<column> dense_rank_scan(column_view const& grouped_values,
+                                        column_view const& value_order,
                                         device_span<size_type const> group_labels,
                                         device_span<size_type const> group_offsets,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr);

 /**
- * @brief Internal API to calculate groupwise percent rank value
+ * @brief Convert groupwise rank to groupwise percentage rank
  *
- * @param order_by column or struct column by which the rows within a group are sorted
- * @param group_labels ID of group to which the row belongs
+ * @param method rank method
+ * @param percentage enum denoting how ranks should be converted to percentages in range (0,1]
+ * @param rank Groupwise rank column
+ * @param count Groupwise count column
+ * @param group_labels ID of group that the corresponding value belongs to
  * @param group_offsets group index offsets with group ID indices
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
- * @return Column of type `double` of percent rank values
+ * @return Column of type double of percentage rank values
  */
-std::unique_ptr<column> percent_rank_scan(column_view const& order_by,
-                                          device_span<size_type const> group_labels,
-                                          device_span<size_type const> group_offsets,
-                                          rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr);
+std::unique_ptr<column> group_rank_to_percentage(rank_method const method,
+                                                 rank_percentage const percentage,
+                                                 column_view const& rank,
+                                                 column_view const& count,
+                                                 device_span<size_type const> group_labels,
+                                                 device_span<size_type const> group_offsets,
+                                                 rmm::cuda_stream_view stream,
+                                                 rmm::mr::device_memory_resource* mr);
+
 }  // namespace detail
 }  // namespace groupby
 }  // namespace cudf
diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp
index 8c4959da35b..5d345273782 100644
--- a/cpp/src/groupby/sort/scan.cpp
+++ b/cpp/src/groupby/sort/scan.cpp
@@ -16,14 +16,20 @@

 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
+#include
+#include
+#include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -115,51 +121,70 @@ template <>
 void scan_result_functor::operator()<aggregation::RANK>(aggregation const& agg)
 {
   if (cache.has_result(values, agg)) return;
-  CUDF_EXPECTS(helper.is_presorted(),
-               "Rank aggregate in groupby scan requires the keys to be presorted");
-  auto const order_by = get_grouped_values();
-  CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(order_by),
-               "Unsupported list type in grouped rank scan.");
-
-  cache.add_result(
-    values,
-    agg,
-    detail::rank_scan(
-      order_by, helper.group_labels(stream), helper.group_offsets(stream), stream, mr));
-}
-
-template <>
-void scan_result_functor::operator()<aggregation::DENSE_RANK>(aggregation const& agg)
-{
-  if (cache.has_result(values, agg)) return;
-  CUDF_EXPECTS(helper.is_presorted(),
-               "Dense rank aggregate in groupby scan requires the keys to be presorted");
-  auto const order_by = get_grouped_values();
-  CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(order_by),
-               "Unsupported list type in grouped dense_rank scan.");
-  cache.add_result(
-    values,
-    agg,
-    detail::dense_rank_scan(
-      order_by, helper.group_labels(stream), helper.group_offsets(stream), stream, mr));
-}
-
-template <>
-void scan_result_functor::operator()<aggregation::PERCENT_RANK>(aggregation const& agg)
-{
-  if (cache.has_result(values, agg)) return;
-  CUDF_EXPECTS(helper.is_presorted(),
-               "Percent rank aggregate in groupby scan requires the keys to be presorted");
-  auto const order_by = get_grouped_values();
-  CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(order_by),
-               "Unsupported list type in grouped percent_rank scan.");
-
-  cache.add_result(
-    values,
-    agg,
-    detail::percent_rank_scan(
-      order_by, helper.group_labels(stream), helper.group_offsets(stream), stream, mr));
+  CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(values),
+               "Unsupported list type in grouped rank scan.");
+  auto const& rank_agg         = dynamic_cast<cudf::detail::rank_aggregation const&>(agg);
+  auto const& group_labels     = helper.group_labels(stream);
+  auto const group_labels_view = column_view(cudf::device_span<size_type const>(group_labels));
+  auto const gather_map        = [&]() {
+    if (is_presorted()) {  // assumes both keys and values are sorted, Spark does this.
+      return cudf::detail::sequence(
+        group_labels.size(), *cudf::make_fixed_width_scalar(size_type{0}, stream), stream);
+    } else {
+      auto sort_order = (rank_agg._method == rank_method::FIRST ? cudf::detail::stable_sorted_order
+                                                                : cudf::detail::sorted_order);
+      return sort_order(table_view({group_labels_view, get_grouped_values()}),
+                        {order::ASCENDING, rank_agg._column_order},
+                        {null_order::AFTER, rank_agg._null_precedence},
+                        stream,
+                        rmm::mr::get_current_device_resource());
+    }
+  }();
+
+  auto rank_scan = [&]() {
+    switch (rank_agg._method) {
+      case rank_method::FIRST: return detail::first_rank_scan;
+      case rank_method::AVERAGE: return detail::average_rank_scan;
+      case rank_method::DENSE: return detail::dense_rank_scan;
+      case rank_method::MIN: return detail::min_rank_scan;
+      case rank_method::MAX: return detail::max_rank_scan;
+      default: CUDF_FAIL("Unsupported rank method in groupby scan");
+    }
+  }();
+  auto result = rank_scan(get_grouped_values(),
+                          *gather_map,
+                          helper.group_labels(stream),
+                          helper.group_offsets(stream),
+                          stream,
+                          rmm::mr::get_current_device_resource());
+  if (rank_agg._percentage != rank_percentage::NONE) {
+    auto count = get_grouped_values().nullable() and rank_agg._null_handling == null_policy::EXCLUDE
+                   ? detail::group_count_valid(get_grouped_values(),
+                                               helper.group_labels(stream),
+                                               helper.num_groups(stream),
+                                               stream,
+                                               rmm::mr::get_current_device_resource())
+                   : detail::group_count_all(helper.group_offsets(stream),
+                                             helper.num_groups(stream),
+                                             stream,
+                                             rmm::mr::get_current_device_resource());
+    result = detail::group_rank_to_percentage(rank_agg._method,
+                                              rank_agg._percentage,
+                                              *result,
+                                              *count,
+                                              helper.group_labels(stream),
+                                              helper.group_offsets(stream),
+                                              stream,
+                                              mr);
+  }
+  result = std::move(cudf::detail::scatter(
+                       table_view{{*result}}, *gather_map, table_view{{*result}}, false, stream, mr)
+                       ->release()[0]);
+  if (rank_agg._null_handling == null_policy::EXCLUDE) {
+    result->set_null_mask(cudf::detail::copy_bitmask(get_grouped_values(), stream, mr));
+  }
+  cache.add_result(values, agg, std::move(result));
 }

 }  // namespace detail
@@ -185,7 +210,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::sort
   auto results = detail::extract_results(requests, cache, stream, mr);

-  return std::make_pair(helper().sorted_keys(stream, mr), std::move(results));
+  return std::pair(helper().sorted_keys(stream, mr), std::move(results));
 }
 }  // namespace groupby
 }  // namespace cudf
diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh
index 76f3fba4689..9136410a03d 100644
--- a/cpp/src/hash/concurrent_unordered_map.cuh
+++ b/cpp/src/hash/concurrent_unordered_map.cuh
@@ -113,7 +113,7 @@ union pair_packer<pair_type, std::enable_if_t<is_packable<pair_type>()>> {
  */
 template <typename Key,
           typename Element,
-          typename Hasher    = default_hash<Key>,
+          typename Hasher    = cudf::detail::default_hash<Key>,
           typename Equality  = equal_to<Key>,
           typename Allocator = default_allocator<thrust::pair<Key, Element>>>
 class concurrent_unordered_map {
diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu
index 5885b61b35b..556ca6b9d80 100644
--- a/cpp/src/io/avro/reader_impl.cu
+++ b/cpp/src/io/avro/reader_impl.cu
@@ -162,62 +162,66 @@ rmm::device_buffer decompress_data(datasource& source,
                                    rmm::cuda_stream_view stream)
 {
   if (meta.codec == "deflate") {
-    size_t uncompressed_data_size = 0;
+    auto inflate_in    = hostdevice_vector<device_span<uint8_t const>>(meta.block_list.size(), stream);
+    auto inflate_out   = hostdevice_vector<device_span<uint8_t>>(meta.block_list.size(), stream);
+    auto inflate_stats = hostdevice_vector<decompress_status>(meta.block_list.size(), stream);

-    auto inflate_in  = hostdevice_vector<gpu_inflate_input_s>(meta.block_list.size(), stream);
-    auto inflate_out = hostdevice_vector<gpu_inflate_status_s>(meta.block_list.size(), stream);
+    // Guess an initial maximum uncompressed block size. We estimate the compression factor to be
+    // two and round up to the next multiple of 4096 bytes.
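For reference, the removed line below used the classic power-of-two round-up idiom `(x + 0xfff) & ~0xfff`. A minimal standalone sketch (not cuDF code) of why that expression rounds up to the next 4096-byte boundary, next to the equivalent divide-and-multiply form:

```cpp
#include <cstdint>

// (x + 0xfff) & ~0xfff: adding 4095 carries x past the next 4 KiB boundary,
// and masking off the low 12 bits snaps the result back down to that boundary.
constexpr uint32_t round_up_bitmask(uint32_t x) { return (x + 0xfffu) & ~0xfffu; }
constexpr uint32_t round_up_div(uint32_t x) { return ((x + 4095u) / 4096u) * 4096u; }

static_assert(round_up_bitmask(1) == 4096u);
static_assert(round_up_bitmask(4096u) == 4096u);
static_assert(round_up_bitmask(4097u) == round_up_div(4097u));
```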
+ uint32_t const initial_blk_len = meta.max_block_size * 2 + (meta.max_block_size * 2) % 4096; + size_t const uncomp_size = initial_blk_len * meta.block_list.size(); - // Guess an initial maximum uncompressed block size - uint32_t initial_blk_len = (meta.max_block_size * 2 + 0xfff) & ~0xfff; - uncompressed_data_size = initial_blk_len * meta.block_list.size(); - for (size_t i = 0; i < inflate_in.size(); ++i) { - inflate_in[i].dstSize = initial_blk_len; - } - - rmm::device_buffer decomp_block_data(uncompressed_data_size, stream); + rmm::device_buffer decomp_block_data(uncomp_size, stream); auto const base_offset = meta.block_list[0].offset; for (size_t i = 0, dst_pos = 0; i < meta.block_list.size(); i++) { auto const src_pos = meta.block_list[i].offset - base_offset; - inflate_in[i].srcDevice = static_cast(comp_block_data.data()) + src_pos; - inflate_in[i].srcSize = meta.block_list[i].size; - inflate_in[i].dstDevice = static_cast(decomp_block_data.data()) + dst_pos; + inflate_in[i] = {static_cast(comp_block_data.data()) + src_pos, + meta.block_list[i].size}; + inflate_out[i] = {static_cast(decomp_block_data.data()) + dst_pos, initial_blk_len}; // Update blocks offsets & sizes to refer to uncompressed data meta.block_list[i].offset = dst_pos; - meta.block_list[i].size = static_cast(inflate_in[i].dstSize); + meta.block_list[i].size = static_cast(inflate_out[i].size()); dst_pos += meta.block_list[i].size; } + inflate_in.host_to_device(stream); for (int loop_cnt = 0; loop_cnt < 2; loop_cnt++) { - inflate_in.host_to_device(stream); - CUDF_CUDA_TRY( - cudaMemsetAsync(inflate_out.device_ptr(), 0, inflate_out.memory_size(), stream.value())); - CUDF_CUDA_TRY(gpuinflate( - inflate_in.device_ptr(), inflate_out.device_ptr(), inflate_in.size(), 0, stream)); - inflate_out.device_to_host(stream, true); + inflate_out.host_to_device(stream); + CUDF_CUDA_TRY(cudaMemsetAsync( + inflate_stats.device_ptr(), 0, inflate_stats.memory_size(), stream.value())); + gpuinflate(inflate_in, inflate_out, inflate_stats, gzip_header_included::NO, stream); + inflate_stats.device_to_host(stream, true); // Check if larger output is required, as it's not known ahead of time if (loop_cnt == 0) { - size_t actual_uncompressed_size = 0; - for (size_t i = 0; i < meta.block_list.size(); i++) { - // If error status is 1 (buffer too small), the `bytes_written` field - // is actually contains the uncompressed data size - if (inflate_out[i].status == 1 && inflate_out[i].bytes_written > inflate_in[i].dstSize) { - inflate_in[i].dstSize = inflate_out[i].bytes_written; - } - actual_uncompressed_size += inflate_in[i].dstSize; - } - if (actual_uncompressed_size > uncompressed_data_size) { - decomp_block_data.resize(actual_uncompressed_size, stream); - for (size_t i = 0, dst_pos = 0; i < meta.block_list.size(); i++) { - auto dst_base = static_cast(decomp_block_data.data()); - inflate_in[i].dstDevice = dst_base + dst_pos; - - meta.block_list[i].offset = dst_pos; - meta.block_list[i].size = static_cast(inflate_in[i].dstSize); - dst_pos += meta.block_list[i].size; + std::vector actual_uncomp_sizes; + actual_uncomp_sizes.reserve(inflate_out.size()); + std::transform(inflate_out.begin(), + inflate_out.end(), + inflate_stats.begin(), + std::back_inserter(actual_uncomp_sizes), + [](auto const& inf_out, auto const& inf_stats) { + // If error status is 1 (buffer too small), the `bytes_written` field + // actually contains the uncompressed data size + return inf_stats.status == 1 + ? 
std::max(inf_out.size(), inf_stats.bytes_written) + : inf_out.size(); + }); + auto const total_actual_uncomp_size = + std::accumulate(actual_uncomp_sizes.cbegin(), actual_uncomp_sizes.cend(), 0ul); + if (total_actual_uncomp_size > uncomp_size) { + decomp_block_data.resize(total_actual_uncomp_size, stream); + for (size_t i = 0; i < meta.block_list.size(); ++i) { + meta.block_list[i].offset = + i > 0 ? (meta.block_list[i - 1].size + meta.block_list[i - 1].offset) : 0; + meta.block_list[i].size = static_cast(actual_uncomp_sizes[i]); + + inflate_out[i] = { + static_cast(decomp_block_data.data()) + meta.block_list[i].offset, + meta.block_list[i].size}; } } else { break; diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 631cf19b2aa..cf4d1b0e0f4 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -1904,41 +1904,42 @@ static __device__ void ProcessCommands(debrotli_state_s* s, const brotli_diction * * blockDim = {block_size,1,1} * - * @param[in] inputs Source/Destination buffer information per block - * @param[out] outputs Decompressor status per block + * @param[in] inputs Source buffer per block + * @param[out] outputs Destination buffer per block + * @param[out] statuses Decompressor status per block * @param scratch Intermediate device memory heap space (will be dynamically shared between blocks) * @param scratch_size Size of scratch heap space (smaller sizes may result in serialization between - *blocks) - * @param count Number of blocks to decompress + * blocks) */ -extern "C" __global__ void __launch_bounds__(block_size, 2) - gpu_debrotli_kernel(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, +__global__ void __launch_bounds__(block_size, 2) + gpu_debrotli_kernel(device_span const> inputs, + device_span const> outputs, + device_span statuses, uint8_t* scratch, - uint32_t scratch_size, - uint32_t count) + uint32_t scratch_size) { __shared__ __align__(16) debrotli_state_s state_g; int t = threadIdx.x; - int z = blockIdx.x; + auto const block_id = blockIdx.x; debrotli_state_s* const s = &state_g; - if (z >= count) { return; } + if (block_id >= inputs.size()) { return; } // Thread0: initializes shared state and decode stream header if (!t) { - auto const* src = static_cast(inputs[z].srcDevice); - size_t src_size = inputs[z].srcSize; + auto const src = inputs[block_id].data(); + auto const src_size = inputs[block_id].size(); if (src && src_size >= 8) { - s->error = 0; - s->out = s->outbase = static_cast(inputs[z].dstDevice); - s->bytes_left = inputs[z].dstSize; - s->mtf_upper_bound = 63; - s->dist_rb[0] = 16; - s->dist_rb[1] = 15; - s->dist_rb[2] = 11; - s->dist_rb[3] = 4; - s->dist_rb_idx = 0; + s->error = 0; + s->out = outputs[block_id].data(); + s->outbase = s->out; + s->bytes_left = outputs[block_id].size(); + s->mtf_upper_bound = 63; + s->dist_rb[0] = 16; + s->dist_rb[1] = 15; + s->dist_rb[2] = 11; + s->dist_rb[3] = 4; + s->dist_rb_idx = 0; s->p1 = s->p2 = 0; initbits(s, src, src_size); DecodeStreamHeader(s); @@ -2015,9 +2016,10 @@ extern "C" __global__ void __launch_bounds__(block_size, 2) __syncthreads(); // Output decompression status if (!t) { - outputs[z].bytes_written = s->out - s->outbase; - outputs[z].status = s->error; - outputs[z].reserved = s->fb_size; // Return ext heap used by last block (statistics) + statuses[block_id].bytes_written = s->out - s->outbase; + statuses[block_id].status = s->error; + // Return ext heap used by last block (statistics) + statuses[block_id].reserved = s->fb_size; } } @@ -2075,20 
+2077,21 @@ size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs) #include #endif -cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - void* scratch, - size_t scratch_size, - int count, - rmm::cuda_stream_view stream) +void gpu_debrotli(device_span const> inputs, + device_span const> outputs, + device_span statuses, + void* scratch, + size_t scratch_size, + rmm::cuda_stream_view stream) { - uint32_t count32 = (count > 0) ? count : 0; + auto const count = inputs.size(); uint32_t fb_heap_size; auto* scratch_u8 = static_cast(scratch); dim3 dim_block(block_size, 1); - dim3 dim_grid(count32, 1); // TODO: Check max grid dimensions vs max expected count + dim3 dim_grid(count, 1); // TODO: Check max grid dimensions vs max expected count - if (scratch_size < sizeof(brotli_dictionary_s)) { return cudaErrorLaunchOutOfResources; } + CUDF_EXPECTS(scratch_size >= sizeof(brotli_dictionary_s), + "Insufficient scratch space for debrotli"); scratch_size = min(scratch_size, (size_t)0xffffffffu); fb_heap_size = (uint32_t)((scratch_size - sizeof(brotli_dictionary_s)) & ~0xf); @@ -2101,7 +2104,7 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs, cudaMemcpyHostToDevice, stream.value())); gpu_debrotli_kernel<<>>( - inputs, outputs, scratch_u8, fb_heap_size, count32); + inputs, outputs, statuses, scratch_u8, fb_heap_size); #if DUMP_FB_HEAP uint32_t dump[2]; uint32_t cur = 0; @@ -2114,8 +2117,6 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs, cur = (dump[0] > cur) ? dump[0] : 0xffffffffu; } #endif - - return cudaSuccess; } } // namespace io diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 508e960430d..0d33158da2b 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -124,8 +124,8 @@ struct inflate_state_s { uint8_t* outbase; ///< start of output buffer uint8_t* outend; ///< end of output buffer // Input state - uint8_t* cur; ///< input buffer - uint8_t* end; ///< end of input buffer + uint8_t const* cur; ///< input buffer + uint8_t const* end; ///< end of input buffer uint2 bitbuf; ///< bit buffer (64-bit) uint32_t bitpos; ///< position in bit buffer @@ -180,10 +180,10 @@ inline __device__ void skipbits(inflate_state_s* s, uint32_t n) { uint32_t bitpos = s->bitpos + n; if (bitpos >= 32) { - uint8_t* cur = s->cur + 8; - s->bitbuf.x = s->bitbuf.y; - s->bitbuf.y = (cur < s->end) ? *reinterpret_cast(cur) : 0; - s->cur = cur - 4; + auto cur = s->cur + 8; + s->bitbuf.x = s->bitbuf.y; + s->bitbuf.y = (cur < s->end) ? 
*reinterpret_cast(cur) : 0; + s->cur = cur - 4; bitpos &= 0x1f; } s->bitpos = bitpos; @@ -510,8 +510,8 @@ __device__ void decode_symbols(inflate_state_s* s) { uint32_t bitpos = s->bitpos; uint2 bitbuf = s->bitbuf; - uint8_t* cur = s->cur; - uint8_t* end = s->end; + auto cur = s->cur; + auto end = s->end; int32_t batch = 0; int32_t sym, batch_len; @@ -871,13 +871,11 @@ __device__ int init_stored(inflate_state_s* s) /// Copy bytes from stored block to destination __device__ void copy_stored(inflate_state_s* s, int t) { - int len = s->stored_blk_len; - uint8_t* cur = s->cur + (s->bitpos >> 3); - uint8_t* out = s->out; - uint8_t* outend = s->outend; - uint8_t* cur4; - int slow_bytes = min(len, (int)((16 - (size_t)out) & 0xf)); - int fast_bytes, bitpos; + auto len = s->stored_blk_len; + auto cur = s->cur + s->bitpos / 8; + auto out = s->out; + auto outend = s->outend; + auto const slow_bytes = min(len, (int)((16 - reinterpret_cast(out)) % 16)); // Slow copy until output is 16B aligned if (slow_bytes) { @@ -890,11 +888,11 @@ __device__ void copy_stored(inflate_state_s* s, int t) out += slow_bytes; len -= slow_bytes; } - fast_bytes = len; + auto fast_bytes = len; if (out < outend) { fast_bytes = (int)min((size_t)fast_bytes, (outend - out)); } fast_bytes &= ~0xf; - bitpos = ((int)(3 & (size_t)cur)) << 3; - cur4 = cur - (bitpos >> 3); + auto bitpos = ((int)((size_t)cur % 4)) * 8; + auto cur4 = cur - (bitpos / 8); if (out < outend) { // Fast copy 16 bytes at a time for (int i = t * 16; i < fast_bytes; i += blockDim.x * 16) { @@ -926,13 +924,13 @@ __device__ void copy_stored(inflate_state_s* s, int t) __syncthreads(); if (t == 0) { // Reset bitstream to end of block - uint8_t* p = cur + len; + auto p = cur + len; auto prefix_bytes = (uint32_t)(((size_t)p) & 3); p -= prefix_bytes; s->cur = p; - s->bitbuf.x = (p < s->end) ? *reinterpret_cast(p) : 0; + s->bitbuf.x = (p < s->end) ? *reinterpret_cast(p) : 0; p += 4; - s->bitbuf.y = (p < s->end) ? *reinterpret_cast(p) : 0; + s->bitbuf.y = (p < s->end) ? *reinterpret_cast(p) : 0; s->bitpos = prefix_bytes * 8; s->out = out; } @@ -1021,12 +1019,16 @@ __device__ int parse_gzip_header(const uint8_t* src, size_t src_size) * * @tparam block_size Thread block dimension for this call * @param inputs Source and destination buffer information per block - * @param outputs Decompression status buffer per block + * @param outputs Destination buffer information per block + * @param statuses Decompression status buffer per block * @param parse_hdr If nonzero, indicates that the compressed bitstream includes a GZIP header */ template __global__ void __launch_bounds__(block_size) - inflate_kernel(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs, int parse_hdr) + inflate_kernel(device_span const> inputs, + device_span const> outputs, + device_span statuses, + gzip_header_included parse_hdr) { __shared__ __align__(16) inflate_state_s state_g; @@ -1035,12 +1037,11 @@ __global__ void __launch_bounds__(block_size) inflate_state_s* state = &state_g; if (!t) { - auto* p = const_cast(static_cast(inputs[z].srcDevice)); - size_t src_size = inputs[z].srcSize; - uint32_t prefix_bytes; + auto p = inputs[z].data(); + auto src_size = inputs[z].size(); // Parse header if needed state->err = 0; - if (parse_hdr) { + if (parse_hdr == gzip_header_included::YES) { int hdr_len = parse_gzip_header(p, src_size); src_size = (src_size >= 8) ? 
src_size - 8 : 0; // ignore footer if (hdr_len >= 0) { @@ -1051,16 +1052,16 @@ __global__ void __launch_bounds__(block_size) } } // Initialize shared state - state->out = const_cast(static_cast(inputs[z].dstDevice)); - state->outbase = state->out; - state->outend = state->out + inputs[z].dstSize; - state->end = p + src_size; - prefix_bytes = (uint32_t)(((size_t)p) & 3); + state->out = outputs[z].data(); + state->outbase = state->out; + state->outend = state->out + outputs[z].size(); + state->end = p + src_size; + auto const prefix_bytes = (uint32_t)(((size_t)p) & 3); p -= prefix_bytes; state->cur = p; - state->bitbuf.x = (p < state->end) ? *reinterpret_cast(p) : 0; + state->bitbuf.x = (p < state->end) ? *reinterpret_cast(p) : 0; p += 4; - state->bitbuf.y = (p < state->end) ? *reinterpret_cast(p) : 0; + state->bitbuf.y = (p < state->end) ? *reinterpret_cast(p) : 0; state->bitpos = prefix_bytes * 8; } __syncthreads(); @@ -1132,9 +1133,9 @@ __global__ void __launch_bounds__(block_size) // Output buffer too small state->err = 1; } - outputs[z].bytes_written = state->out - state->outbase; - outputs[z].status = state->err; - outputs[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes + statuses[z].bytes_written = state->out - state->outbase; + statuses[z].status = state->err; + statuses[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes } } @@ -1145,7 +1146,9 @@ __global__ void __launch_bounds__(block_size) * * @param inputs Source and destination information per block */ -__global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_input_s* inputs) +__global__ void __launch_bounds__(1024) + copy_uncompressed_kernel(device_span const> inputs, + device_span const> outputs) { __shared__ const uint8_t* volatile src_g; __shared__ uint8_t* volatile dst_g; @@ -1158,9 +1161,9 @@ __global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_inp uint32_t len, src_align_bytes, src_align_bits, dst_align_bytes; if (!t) { - src = static_cast(inputs[z].srcDevice); - dst = static_cast(inputs[z].dstDevice); - len = min((uint32_t)inputs[z].srcSize, (uint32_t)inputs[z].dstSize); + src = inputs[z].data(); + dst = outputs[z].data(); + len = static_cast(min(inputs[z].size(), outputs[z].size())); src_g = src; dst_g = dst; copy_len_g = len; @@ -1195,26 +1198,26 @@ __global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_inp if (t < len) { dst[t] = src[t]; } } -cudaError_t __host__ gpuinflate(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - int parse_hdr, - rmm::cuda_stream_view stream) +void gpuinflate(device_span const> inputs, + device_span const> outputs, + device_span statuses, + gzip_header_included parse_hdr, + rmm::cuda_stream_view stream) { constexpr int block_size = 128; // Threads per block - if (count > 0) { + if (inputs.size() > 0) { inflate_kernel - <<>>(inputs, outputs, parse_hdr); + <<>>(inputs, outputs, statuses, parse_hdr); } - return cudaSuccess; } -cudaError_t __host__ gpu_copy_uncompressed_blocks(gpu_inflate_input_s* inputs, - int count, - rmm::cuda_stream_view stream) +void gpu_copy_uncompressed_blocks(device_span const> inputs, + device_span const> outputs, + rmm::cuda_stream_view stream) { - if (count > 0) { copy_uncompressed_kernel<<>>(inputs); } - return cudaSuccess; + if (inputs.size() > 0) { + copy_uncompressed_kernel<<>>(inputs, outputs); + } } } // namespace io diff --git a/cpp/src/io/comp/gpuinflate.h b/cpp/src/io/comp/gpuinflate.h index 
29856bcd3f3..3870b2ac3b3 100644 --- a/cpp/src/io/comp/gpuinflate.h +++ b/cpp/src/io/comp/gpuinflate.h @@ -16,75 +16,70 @@ #pragma once -#include +#include #include +#include + namespace cudf { namespace io { -/** - * @brief Input parameters for the decompression interface - */ -struct gpu_inflate_input_s { - const void* srcDevice; - uint64_t srcSize; - void* dstDevice; - uint64_t dstSize; -}; /** * @brief Output parameters for the decompression interface */ -struct gpu_inflate_status_s { +struct decompress_status { uint64_t bytes_written; uint32_t status; uint32_t reserved; }; +enum class gzip_header_included { NO, YES }; + /** * @brief Interface for decompressing GZIP-compressed data * * Multiple, independent chunks of compressed data can be decompressed by using - * separate gpu_inflate_input_s/gpu_inflate_status_s pairs for each chunk. + * separate input/output/status for each chunk. * - * @param[in] inputs List of input argument structures - * @param[out] outputs List of output status structures - * @param[in] count Number of input/output structures + * @param[in] inputs List of input buffers + * @param[out] outputs List of output buffers + * @param[out] statuses List of output status structures * @param[in] parse_hdr Whether or not to parse GZIP header * @param[in] stream CUDA stream to use */ -cudaError_t gpuinflate(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - int parse_hdr, - rmm::cuda_stream_view stream); +void gpuinflate(device_span const> inputs, + device_span const> outputs, + device_span statuses, + gzip_header_included parse_hdr, + rmm::cuda_stream_view stream); /** * @brief Interface for copying uncompressed byte blocks * - * @param[in] inputs List of input argument structures - * @param[in] count Number of input structures + * @param[in] inputs List of input buffers + * @param[out] outputs List of output buffers * @param[in] stream CUDA stream to use */ -cudaError_t gpu_copy_uncompressed_blocks(gpu_inflate_input_s* inputs, - int count, - rmm::cuda_stream_view stream); +void gpu_copy_uncompressed_blocks(device_span const> inputs, + device_span const> outputs, + rmm::cuda_stream_view stream); /** * @brief Interface for decompressing Snappy-compressed data * * Multiple, independent chunks of compressed data can be decompressed by using - * separate gpu_inflate_input_s/gpu_inflate_status_s pairs for each chunk. + * separate input/output/status for each chunk. * - * @param[in] inputs List of input argument structures - * @param[out] outputs List of output status structures - * @param[in] count Number of input/output structures + * @param[in] inputs List of input buffers + * @param[out] outputs List of output buffers + * @param[out] statuses List of output status structures * @param[in] stream CUDA stream to use */ -cudaError_t gpu_unsnap(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - rmm::cuda_stream_view stream); +void gpu_unsnap(device_span const> inputs, + device_span const> outputs, + device_span statuses, + rmm::cuda_stream_view stream); /** * @brief Computes the size of temporary memory for Brotli decompression @@ -99,37 +94,37 @@ size_t get_gpu_debrotli_scratch_size(int max_num_inputs = 0); * @brief Interface for decompressing Brotli-compressed data * * Multiple, independent chunks of compressed data can be decompressed by using - * separate gpu_inflate_input_s/gpu_inflate_status_s pairs for each chunk. + * separate input/output/status pairs for each chunk. 
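All of these batched interfaces report per-chunk results through `decompress_status`. As the avro change earlier in this diff notes, a `status` of 1 means the output buffer was too small, and `bytes_written` then carries the size the block actually needed. A hedged host-side sketch of that retry contract; `needs_retry` and the host vectors are illustrative, not cuIO API:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct decompress_status {  // mirrors the struct declared above
  uint64_t bytes_written;
  uint32_t status;  // 0 = success, 1 = output buffer too small
  uint32_t reserved;
};

// Returns true if any block needs a second pass with a larger output buffer.
bool needs_retry(std::vector<decompress_status> const& stats,
                 std::vector<size_t>& required_sizes)
{
  bool retry = false;
  for (size_t i = 0; i < stats.size(); ++i) {
    if (stats[i].status == 1) {
      // On "buffer too small", bytes_written holds the required output size
      required_sizes[i] = std::max<size_t>(required_sizes[i], stats[i].bytes_written);
      retry             = true;  // caller reallocates and runs a second pass
    }
  }
  return retry;
}
```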
* - * @param[in] inputs List of input argument structures - * @param[out] outputs List of output status structures + * @param[in] inputs List of input buffers + * @param[out] outputs List of output buffers + * @param[out] statuses List of output status structures * @param[in] scratch Temporary memory for intermediate work * @param[in] scratch_size Size in bytes of the temporary memory - * @param[in] count Number of input/output structures * @param[in] stream CUDA stream to use */ -cudaError_t gpu_debrotli(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - void* scratch, - size_t scratch_size, - int count, - rmm::cuda_stream_view stream); +void gpu_debrotli(device_span const> inputs, + device_span const> outputs, + device_span statuses, + void* scratch, + size_t scratch_size, + rmm::cuda_stream_view stream); /** * @brief Interface for compressing data with Snappy * * Multiple, independent chunks of compressed data can be compressed by using - * separate gpu_inflate_input_s/gpu_inflate_status_s pairs for each chunk. + * separate input/output/status for each chunk. * - * @param[in] inputs List of input argument structures - * @param[out] outputs List of output status structures - * @param[in] count Number of input/output structures + * @param[in] inputs List of input buffers + * @param[out] outputs List of output buffers + * @param[out] statuses List of output status structures * @param[in] stream CUDA stream to use */ -cudaError_t gpu_snap(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - rmm::cuda_stream_view stream); +void gpu_snap(device_span const> inputs, + device_span const> outputs, + device_span statuses, + rmm::cuda_stream_view stream); } // namespace io } // namespace cudf diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp new file mode 100644 index 00000000000..b2e6f07b80b --- /dev/null +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "nvcomp_adapter.hpp" +#include "nvcomp_adapter.cuh" + +#include + +#include + +namespace cudf::io::nvcomp { + +template +auto batched_decompress_get_temp_size(compression_type type, Args&&... args) +{ + switch (type) { + case compression_type::SNAPPY: + return nvcompBatchedSnappyDecompressGetTempSize(std::forward(args)...); + default: CUDF_FAIL("Unsupported compression type"); + } +}; + +template +auto batched_decompress_async(compression_type type, Args&&... 
args) +{ + switch (type) { + case compression_type::SNAPPY: + return nvcompBatchedSnappyDecompressAsync(std::forward(args)...); + default: CUDF_FAIL("Unsupported compression type"); + } +}; + +size_t get_temp_size(compression_type type, size_t num_chunks, size_t max_uncomp_chunk_size) +{ + size_t temp_size = 0; + nvcompStatus_t nvcomp_status = + batched_decompress_get_temp_size(type, num_chunks, max_uncomp_chunk_size, &temp_size); + CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, + "Unable to get scratch size for decompression"); + + return temp_size; +} + +void batched_decompress(compression_type type, + device_span const> inputs, + device_span const> outputs, + device_span statuses, + size_t max_uncomp_chunk_size, + rmm::cuda_stream_view stream) +{ + auto const num_chunks = inputs.size(); + + // cuDF inflate inputs converted to nvcomp inputs + auto const nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); + rmm::device_uvector actual_uncompressed_data_sizes(num_chunks, stream); + rmm::device_uvector nvcomp_statuses(num_chunks, stream); + // Temporary space required for decompression + rmm::device_buffer scratch(get_temp_size(type, num_chunks, max_uncomp_chunk_size), stream); + auto const nvcomp_status = batched_decompress_async(type, + nvcomp_args.compressed_data_ptrs.data(), + nvcomp_args.compressed_data_sizes.data(), + nvcomp_args.uncompressed_data_sizes.data(), + actual_uncompressed_data_sizes.data(), + num_chunks, + scratch.data(), + scratch.size(), + nvcomp_args.uncompressed_data_ptrs.data(), + nvcomp_statuses.data(), + stream.value()); + CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "unable to perform decompression"); + + convert_status(nvcomp_statuses, actual_uncompressed_data_sizes, statuses, stream); +} +} // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu new file mode 100644 index 00000000000..ce294cc9b00 --- /dev/null +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
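Taken together, `get_temp_size` and `batched_decompress` above hide the nvcomp call sequence (query scratch size, allocate, launch the async decompress, convert statuses) behind a single call. A hedged sketch of a call site, assuming the chunk spans are already populated on the device; `decompress_snappy_chunks` is a hypothetical wrapper, not cuDF API:

```cpp
#include "nvcomp_adapter.hpp"

#include <rmm/cuda_stream_view.hpp>

#include <cstddef>
#include <cstdint>

void decompress_snappy_chunks(
  cudf::device_span<cudf::device_span<uint8_t const> const> inputs,
  cudf::device_span<cudf::device_span<uint8_t> const> outputs,
  cudf::device_span<cudf::io::decompress_status> statuses,
  size_t max_uncomp_chunk_size,
  rmm::cuda_stream_view stream)
{
  // One call performs scratch sizing/allocation, the async batched decompress,
  // and the nvcompStatus_t -> decompress_status conversion.
  cudf::io::nvcomp::batched_decompress(cudf::io::nvcomp::compression_type::SNAPPY,
                                       inputs,
                                       outputs,
                                       statuses,
                                       max_uncomp_chunk_size,
                                       stream);
}
```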
+ */ +#include "nvcomp_adapter.cuh" + +#include + +#include + +namespace cudf::io::nvcomp { + +batched_args create_batched_nvcomp_args(device_span const> inputs, + device_span const> outputs, + rmm::cuda_stream_view stream) +{ + size_t num_comp_pages = inputs.size(); + rmm::device_uvector compressed_data_ptrs(num_comp_pages, stream); + rmm::device_uvector compressed_data_sizes(num_comp_pages, stream); + rmm::device_uvector uncompressed_data_ptrs(num_comp_pages, stream); + rmm::device_uvector uncompressed_data_sizes(num_comp_pages, stream); + + // Prepare the input vectors + auto ins_it = + thrust::make_zip_iterator(compressed_data_ptrs.begin(), compressed_data_sizes.begin()); + thrust::transform( + rmm::exec_policy(stream), inputs.begin(), inputs.end(), ins_it, [] __device__(auto const& in) { + return thrust::make_tuple(in.data(), in.size()); + }); + + // Prepare the output vectors + auto outs_it = + thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), uncompressed_data_sizes.begin()); + thrust::transform( + rmm::exec_policy(stream), + outputs.begin(), + outputs.end(), + outs_it, + [] __device__(auto const& out) { return thrust::make_tuple(out.data(), out.size()); }); + + return {std::move(compressed_data_ptrs), + std::move(compressed_data_sizes), + std::move(uncompressed_data_ptrs), + std::move(uncompressed_data_sizes)}; +} + +void convert_status(device_span nvcomp_stats, + device_span actual_uncompressed_sizes, + device_span cudf_stats, + rmm::cuda_stream_view stream) +{ + thrust::transform( + rmm::exec_policy(stream), + nvcomp_stats.begin(), + nvcomp_stats.end(), + actual_uncompressed_sizes.begin(), + cudf_stats.begin(), + [] __device__(auto const& status, auto const& size) { + return decompress_status{size, status == nvcompStatus_t::nvcompSuccess ? 0u : 1u}; + }); +} +} // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/nvcomp_adapter.cuh b/cpp/src/io/comp/nvcomp_adapter.cuh new file mode 100644 index 00000000000..a76ddcf6813 --- /dev/null +++ b/cpp/src/io/comp/nvcomp_adapter.cuh @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "gpuinflate.h" + +#include + +#include + +#include +#include + +namespace cudf::io::nvcomp { + +struct batched_args { + rmm::device_uvector compressed_data_ptrs; + rmm::device_uvector compressed_data_sizes; + rmm::device_uvector uncompressed_data_ptrs; + rmm::device_uvector uncompressed_data_sizes; +}; + +/** + * @brief Split lists of src/dst device spans into lists of pointers/sizes. + * + * @param[in] inputs List of input buffers + * @param[in] outputs List of output buffers + * @param[in] stream CUDA stream to use + */ +batched_args create_batched_nvcomp_args(device_span const> inputs, + device_span const> outputs, + rmm::cuda_stream_view stream); + +/** + * @brief Convert nvcomp statuses into cuIO compression statuses. 
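`create_batched_nvcomp_args` above is an array-of-structs to struct-of-arrays split: a single `thrust::transform` writes through a zip iterator into two parallel device vectors. A generic, self-contained sketch of that pattern (`span_like` stands in for `device_span`; the device lambda requires nvcc's extended-lambda mode):

```cpp
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform.h>
#include <thrust/tuple.h>

#include <cstddef>

struct span_like {  // stand-in for a device span: pointer plus length
  unsigned char const* ptr;
  size_t len;
};

void split_to_soa(thrust::device_vector<span_like> const& in,
                  thrust::device_vector<unsigned char const*>& ptrs,
                  thrust::device_vector<size_t>& sizes)
{
  // Zipping the two outputs lets one pass fill both parallel arrays.
  auto out = thrust::make_zip_iterator(thrust::make_tuple(ptrs.begin(), sizes.begin()));
  thrust::transform(
    thrust::device, in.begin(), in.end(), out, [] __device__(span_like const& s) {
      return thrust::make_tuple(s.ptr, s.len);
    });
}
```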
+ */ +void convert_status(device_span nvcomp_stats, + device_span actual_uncompressed_sizes, + device_span cudf_stats, + rmm::cuda_stream_view stream); +} // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp new file mode 100644 index 00000000000..a0eb6bc4fbf --- /dev/null +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "gpuinflate.h" + +#include + +#include + +namespace cudf::io::nvcomp { + +enum class compression_type { SNAPPY }; + +/** + * @brief Device batch decompression of given type. + * + * @param[in] type Compression type + * @param[in] inputs List of input buffers + * @param[out] outputs List of output buffers + * @param[out] statuses List of output status structures + * @param[in] max_uncomp_page_size maximum size of uncompressed block + * @param[in] stream CUDA stream to use + */ +void batched_decompress(compression_type type, + device_span const> inputs, + device_span const> outputs, + device_span statuses, + size_t max_uncomp_page_size, + rmm::cuda_stream_view stream); +} // namespace cudf::io::nvcomp diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index 9f0a610f8f7..d64eea06631 100644 --- a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
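One small pattern recurs across these signatures: the bare `int parse_hdr` flag became the `gzip_header_included` enum class, so call sites are self-describing. A minimal sketch of the before/after, assuming nothing beyond the enum itself:

```cpp
enum class gzip_header_included { NO, YES };

// Before: inflate(..., 0);  // is 0 "no header" or "don't validate"?
// After: the intent is visible at the call site.
void inflate_example(gzip_header_included parse_hdr)
{
  if (parse_hdr == gzip_header_included::YES) {
    // skip/validate the GZIP header before the DEFLATE stream
  }
}
```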
@@ -258,7 +258,9 @@ static __device__ uint32_t Match60(const uint8_t* src1, * @param[in] count Number of blocks to compress */ __global__ void __launch_bounds__(128) - snap_kernel(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs, int count) + snap_kernel(device_span const> inputs, + device_span const> outputs, + device_span statuses) { __shared__ __align__(16) snap_state_s state_g; @@ -268,15 +270,15 @@ __global__ void __launch_bounds__(128) const uint8_t* src; if (!t) { - const auto* src = static_cast(inputs[blockIdx.x].srcDevice); - auto src_len = static_cast(inputs[blockIdx.x].srcSize); - auto* dst = static_cast(inputs[blockIdx.x].dstDevice); - auto dst_len = static_cast(inputs[blockIdx.x].dstSize); - uint8_t* end = dst + dst_len; - s->src = src; - s->src_len = src_len; - s->dst_base = dst; - s->end = end; + auto const src = inputs[blockIdx.x].data(); + auto src_len = static_cast(inputs[blockIdx.x].size()); + auto dst = outputs[blockIdx.x].data(); + auto const dst_len = static_cast(outputs[blockIdx.x].size()); + auto const end = dst + dst_len; + s->src = src; + s->src_len = src_len; + s->dst_base = dst; + s->end = end; while (src_len > 0x7f) { if (dst < end) { dst[0] = src_len | 0x80; } dst++; @@ -335,23 +337,22 @@ __global__ void __launch_bounds__(128) } __syncthreads(); if (!t) { - outputs[blockIdx.x].bytes_written = s->dst - s->dst_base; - outputs[blockIdx.x].status = (s->dst > s->end) ? 1 : 0; - outputs[blockIdx.x].reserved = 0; + statuses[blockIdx.x].bytes_written = s->dst - s->dst_base; + statuses[blockIdx.x].status = (s->dst > s->end) ? 1 : 0; + statuses[blockIdx.x].reserved = 0; } } -cudaError_t __host__ gpu_snap(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - rmm::cuda_stream_view stream) +void gpu_snap(device_span const> inputs, + device_span const> outputs, + device_span statuses, + rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block - dim3 dim_grid(count, 1); - if (count > 0) { - snap_kernel<<>>(inputs, outputs, count); + dim3 dim_grid(inputs.size(), 1); + if (inputs.size() > 0) { + snap_kernel<<>>(inputs, outputs, statuses); } - return cudaSuccess; } } // namespace io diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index 791a16bc912..dc44b9fcd59 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
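The `gpu_snap`/`gpu_unsnap` rewrites in this diff all follow one recipe: drop the `count` parameter, size the grid from `inputs.size()`, and guard inside the kernel with the span's own size. A hedged sketch of that shape with a placeholder body; this is an illustration of the pattern, not the actual snappy kernels:

```cpp
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <cstdint>

using cudf::device_span;

__global__ void batch_kernel(device_span<device_span<uint8_t const> const> inputs,
                             device_span<device_span<uint8_t> const> outputs)
{
  auto const block_id = blockIdx.x;
  if (block_id >= inputs.size()) { return; }  // the span carries the batch count
  // ... one block processes inputs[block_id] into outputs[block_id] ...
}

void launch_batch(device_span<device_span<uint8_t const> const> inputs,
                  device_span<device_span<uint8_t> const> outputs,
                  rmm::cuda_stream_view stream)
{
  if (inputs.size() > 0) {
    batch_kernel<<<inputs.size(), 128, 0, stream.value()>>>(inputs, outputs);
  }
}
```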
@@ -64,14 +64,15 @@ struct unsnap_queue_s { * @brief snappy decompression state */ struct unsnap_state_s { - const uint8_t* base; ///< base ptr of compressed stream - const uint8_t* end; ///< end of compressed stream - uint32_t uncompressed_size; ///< uncompressed stream size - uint32_t bytes_left; ///< bytes to uncompressed remaining - int32_t error; ///< current error status - uint32_t tstart; ///< start time for perf logging - volatile unsnap_queue_s q; ///< queue for cross-warp communication - gpu_inflate_input_s in; ///< input parameters for current block + const uint8_t* base; ///< base ptr of compressed stream + const uint8_t* end; ///< end of compressed stream + uint32_t uncompressed_size; ///< uncompressed stream size + uint32_t bytes_left; ///< remaining bytes to decompress + int32_t error; ///< current error status + uint32_t tstart; ///< start time for perf logging + volatile unsnap_queue_s q; ///< queue for cross-warp communication + device_span src; ///< input for current block + device_span dst; ///< output for current block }; inline __device__ volatile uint8_t& byte_access(unsnap_state_s* s, uint32_t pos) @@ -497,9 +498,9 @@ __device__ void snappy_decode_symbols(unsnap_state_s* s, uint32_t t) template __device__ void snappy_process_symbols(unsnap_state_s* s, int t, Storage& temp_storage) { - const uint8_t* literal_base = s->base; - auto* out = static_cast(s->in.dstDevice); - int batch = 0; + auto const literal_base = s->base; + auto out = s->dst.data(); + int batch = 0; do { volatile unsnap_batch_s* b = &s->q.batch[batch * batch_size]; @@ -624,7 +625,9 @@ __device__ void snappy_process_symbols(unsnap_state_s* s, int t, Storage& temp_s */ template __global__ void __launch_bounds__(block_size) - unsnap_kernel(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs) + unsnap_kernel(device_span const> inputs, + device_span const> outputs, + device_span statuses) { __shared__ __align__(16) unsnap_state_s state_g; __shared__ cub::WarpReduce::TempStorage temp_storage; @@ -632,16 +635,14 @@ __global__ void __launch_bounds__(block_size) unsnap_state_s* s = &state_g; int strm_id = blockIdx.x; - if (t < sizeof(gpu_inflate_input_s) / sizeof(uint32_t)) { - reinterpret_cast(&s->in)[t] = reinterpret_cast(&inputs[strm_id])[t]; - __threadfence_block(); - } if (t < batch_count) { s->q.batch_len[t] = 0; } __syncthreads(); if (!t) { - const auto* cur = static_cast(s->in.srcDevice); - const uint8_t* end = cur + s->in.srcSize; - s->error = 0; + s->src = inputs[strm_id]; + s->dst = outputs[strm_id]; + auto cur = s->src.begin(); + auto const end = s->src.end(); + s->error = 0; if (log_cyclecount) { s->tstart = clock(); } if (cur < end) { // Read uncompressed size (varint), limited to 32-bit @@ -672,7 +673,7 @@ __global__ void __launch_bounds__(block_size) s->bytes_left = uncompressed_size; s->base = cur; s->end = end; - if ((cur >= end && uncompressed_size != 0) || (uncompressed_size > s->in.dstSize)) { + if ((cur >= end && uncompressed_size != 0) || (uncompressed_size > s->dst.size())) { s->error = -1; } } else { @@ -697,28 +698,25 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); } if (!t) { - outputs[strm_id].bytes_written = s->uncompressed_size - s->bytes_left; - outputs[strm_id].status = s->error; + statuses[strm_id].bytes_written = s->uncompressed_size - s->bytes_left; + statuses[strm_id].status = s->error; if (log_cyclecount) { - outputs[strm_id].reserved = clock() - s->tstart; + statuses[strm_id].reserved = clock() - s->tstart; } else { - outputs[strm_id].reserved = 0; 
+ statuses[strm_id].reserved = 0; } } } -cudaError_t __host__ gpu_unsnap(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - rmm::cuda_stream_view stream) +void gpu_unsnap(device_span const> inputs, + device_span const> outputs, + device_span statuses, + rmm::cuda_stream_view stream) { - uint32_t count32 = (count > 0) ? count : 0; - dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block - dim3 dim_grid(count32, 1); // TODO: Check max grid dimensions vs max expected count - - unsnap_kernel<128><<>>(inputs, outputs); + dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block + dim3 dim_grid(inputs.size(), 1); // TODO: Check max grid dimensions vs max expected count - return cudaSuccess; + unsnap_kernel<128><<>>(inputs, outputs, statuses); } } // namespace io diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 56a00191ae4..43411157319 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -356,7 +356,7 @@ __device__ field_descriptor next_field_descriptor(const char* begin, ? field_descriptor{field_idx, begin, cudf::io::gpu::seek_field_end(begin, end, opts, true)} : [&]() { auto const key_range = get_next_key(begin, end, opts.quotechar); - auto const key_hash = MurmurHash3_32{}( + auto const key_hash = cudf::detail::MurmurHash3_32{}( cudf::string_view(key_range.first, key_range.second - key_range.first)); auto const hash_col = col_map.find(key_hash); // Fall back to field index if not found (parsing error) @@ -667,7 +667,8 @@ __global__ void collect_keys_info_kernel(parse_options_view const options, keys_info->column(0).element(idx) = field_range.key_begin - data.begin(); keys_info->column(1).element(idx) = len; keys_info->column(2).element(idx) = - MurmurHash3_32{}(cudf::string_view(field_range.key_begin, len)); + cudf::detail::MurmurHash3_32{}( + cudf::string_view(field_range.key_begin, len)); } } } diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index a4ae9999a19..47244279599 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -177,7 +177,7 @@ std::vector aggregate_orc_metadata::select_stri per_file_metadata[src_file_idx].ff.stripes.size()), "Invalid stripe index"); stripe_infos.push_back( - std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); + std::pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); row_count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; } selected_stripes_mapping.push_back({static_cast(src_file_idx), stripe_infos}); @@ -206,7 +206,7 @@ std::vector aggregate_orc_metadata::select_stri count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; if (count > row_start || count == 0) { stripe_infos.push_back( - std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); + std::pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); } else { stripe_skip_rows = count; } diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index d94aa00c7b9..837fd03a112 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
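Another mechanical change running through this diff is `std::make_pair` giving way to plain `std::pair`, which relies on C++17 class template argument deduction. A one-line comparison:

```cpp
#include <utility>

auto a = std::make_pair(1, 2.0);  // pre-C++17: a helper function deduces the types
auto b = std::pair(1, 2.0);       // C++17 CTAD deduces std::pair<int, double> directly
```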
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,9 +43,10 @@ struct CompressedStreamInfo { : compressed_data(compressed_data_), uncompressed_data(nullptr), compressed_data_size(compressed_size_), - decctl(nullptr), - decstatus(nullptr), - copyctl(nullptr), + dec_in_ctl(nullptr), + dec_out_ctl(nullptr), + copy_in_ctl(nullptr), + copy_out_ctl(nullptr), num_compressed_blocks(0), num_uncompressed_blocks(0), max_uncompressed_size(0), @@ -54,14 +55,15 @@ struct CompressedStreamInfo { } const uint8_t* compressed_data; // [in] base ptr to compressed stream data uint8_t* uncompressed_data; // [in] base ptr to uncompressed stream data or NULL if not known yet - size_t compressed_data_size; // [in] compressed data size for this stream - gpu_inflate_input_s* decctl; // [in] base ptr to decompression structure to be filled - gpu_inflate_status_s* decstatus; // [in] results of decompression - gpu_inflate_input_s* - copyctl; // [in] base ptr to copy structure to be filled for uncompressed blocks + size_t compressed_data_size; // [in] compressed data size for this stream + device_span* dec_in_ctl; // [in] input buffer to decompress + device_span* dec_out_ctl; // [in] output buffer to decompress into + device_span decstatus; // [in] results of decompression + device_span* copy_in_ctl; // [out] input buffer to copy + device_span* copy_out_ctl; // [out] output buffer to copy to uint32_t num_compressed_blocks; // [in,out] number of entries in decctl(in), number of compressed // blocks(out) - uint32_t num_uncompressed_blocks; // [in,out] number of entries in copyctl(in), number of + uint32_t num_uncompressed_blocks; // [in,out] number of entries in dec_in_ctl(in), number of // uncompressed blocks(out) uint64_t max_uncompressed_size; // [out] maximum uncompressed data size of stream uint32_t max_uncompressed_block_size; // [out] maximum uncompressed size of any block in stream @@ -345,8 +347,9 @@ void CompactOrcDataStreams(device_2dspan strm_desc, * @param[in] max_comp_blk_size Max size of any block after compression * @param[in,out] strm_desc StripeStream device array [stripe][stream] * @param[in,out] enc_streams chunk streams device array [column][rowgroup] - * @param[out] comp_in Per-block compression input parameters - * @param[out] comp_out Per-block compression status + * @param[out] comp_in Per-block compression input buffers + * @param[out] comp_out Per-block compression output buffers + * @param[out] comp_stat Per-block compression status * @param[in] stream CUDA stream used for device memory operations and kernel launches */ void CompressOrcDataStreams(uint8_t* compressed_data, @@ -356,8 +359,9 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - device_span comp_in, - device_span comp_out, + device_span> comp_in, + device_span> comp_out, + device_span comp_stat, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 83c23774362..139eb28d1a1 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -25,6 +25,7 @@ #include "timezone.cuh" #include +#include #include #include @@ -40,8 +41,6 @@ #include #include -#include - #include #include #include @@ -108,20 +107,20 @@ constexpr std::pair get_index_type_and_pos( case orc::DATA: skip_count += 1; skip_count |= (skip_count & 0xff) << 8; - return 
std::make_pair(gpu::CI_DATA, skip_count); + return std::pair(gpu::CI_DATA, skip_count); case orc::LENGTH: case orc::SECONDARY: skip_count += 1; skip_count |= (skip_count & 0xff) << 16; - return std::make_pair(gpu::CI_DATA2, skip_count); - case orc::DICTIONARY_DATA: return std::make_pair(gpu::CI_DICTIONARY, skip_count); + return std::pair(gpu::CI_DATA2, skip_count); + case orc::DICTIONARY_DATA: return std::pair(gpu::CI_DICTIONARY, skip_count); case orc::PRESENT: skip_count += (non_child ? 1 : 0); - return std::make_pair(gpu::CI_PRESENT, skip_count); - case orc::ROW_INDEX: return std::make_pair(gpu::CI_INDEX, skip_count); + return std::pair(gpu::CI_PRESENT, skip_count); + case orc::ROW_INDEX: return std::pair(gpu::CI_INDEX, skip_count); default: // Skip this stream as it's not strictly required - return std::make_pair(gpu::CI_NUM_STREAMS, 0); + return std::pair(gpu::CI_NUM_STREAMS, 0); } } @@ -262,7 +261,7 @@ auto decimal_column_type(std::vector const& decimal128_columns, } // namespace -__global__ void decompress_check_kernel(device_span stats, +__global__ void decompress_check_kernel(device_span stats, bool* any_block_failure) { auto tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -273,7 +272,7 @@ __global__ void decompress_check_kernel(device_span } } -void decompress_check(device_span stats, +void decompress_check(device_span stats, bool* any_block_failure, rmm::cuda_stream_view stream) { @@ -284,74 +283,6 @@ void decompress_check(device_span stats, decompress_check_kernel<<>>(stats, any_block_failure); } -__global__ void convert_nvcomp_status(device_span nvcomp_stats, - device_span actual_uncompressed_sizes, - device_span stats) -{ - auto tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < stats.size()) { - stats[tid].status = nvcomp_stats[tid] == nvcompStatus_t::nvcompSuccess ? 
0 : 1; - stats[tid].bytes_written = actual_uncompressed_sizes[tid]; - } -} - -void snappy_decompress(device_span comp_in, - device_span comp_stat, - size_t max_uncomp_page_size, - rmm::cuda_stream_view stream) -{ - size_t num_blocks = comp_in.size(); - size_t temp_size; - - auto status = - nvcompBatchedSnappyDecompressGetTempSize(num_blocks, max_uncomp_page_size, &temp_size); - CUDF_EXPECTS(nvcompStatus_t::nvcompSuccess == status, - "Unable to get scratch size for snappy decompression"); - - rmm::device_buffer scratch(temp_size, stream); - rmm::device_uvector compressed_data_ptrs(num_blocks, stream); - rmm::device_uvector compressed_data_sizes(num_blocks, stream); - rmm::device_uvector uncompressed_data_ptrs(num_blocks, stream); - rmm::device_uvector uncompressed_data_sizes(num_blocks, stream); - - rmm::device_uvector actual_uncompressed_data_sizes(num_blocks, stream); - rmm::device_uvector statuses(num_blocks, stream); - - device_span actual_uncompressed_sizes_span(actual_uncompressed_data_sizes.data(), - actual_uncompressed_data_sizes.size()); - device_span statuses_span(statuses.data(), statuses.size()); - - // Prepare the vectors - auto comp_it = thrust::make_zip_iterator(compressed_data_ptrs.begin(), - compressed_data_sizes.begin(), - uncompressed_data_ptrs.begin(), - uncompressed_data_sizes.data()); - thrust::transform(rmm::exec_policy(stream), - comp_in.begin(), - comp_in.end(), - comp_it, - [] __device__(gpu_inflate_input_s in) { - return thrust::make_tuple(in.srcDevice, in.srcSize, in.dstDevice, in.dstSize); - }); - - status = nvcompBatchedSnappyDecompressAsync(compressed_data_ptrs.data(), - compressed_data_sizes.data(), - uncompressed_data_sizes.data(), - actual_uncompressed_data_sizes.data(), - num_blocks, - scratch.data(), - scratch.size(), - uncompressed_data_ptrs.data(), - statuses.data(), - stream.value()); - CUDF_EXPECTS(nvcompStatus_t::nvcompSuccess == status, "unable to perform snappy decompression"); - - dim3 block(128); - dim3 grid(cudf::util::div_rounding_up_safe(num_blocks, static_cast(block.x))); - convert_nvcomp_status<<>>( - statuses_span, actual_uncompressed_sizes_span, comp_stat); -} - rmm::device_buffer reader::impl::decompress_stripe_data( cudf::detail::hostdevice_2dvector& chunks, const std::vector& stripe_data, @@ -396,9 +327,11 @@ rmm::device_buffer reader::impl::decompress_stripe_data( CUDF_EXPECTS(total_decomp_size > 0, "No decompressible data found"); rmm::device_buffer decomp_data(total_decomp_size, stream); - rmm::device_uvector inflate_in( + rmm::device_uvector> inflate_in( + num_compressed_blocks + num_uncompressed_blocks, stream); + rmm::device_uvector> inflate_out( num_compressed_blocks + num_uncompressed_blocks, stream); - rmm::device_uvector inflate_out(num_compressed_blocks, stream); + rmm::device_uvector inflate_stats(num_compressed_blocks, stream); // Parse again to populate the decompression input/output buffers size_t decomp_offset = 0; @@ -408,9 +341,11 @@ rmm::device_buffer reader::impl::decompress_stripe_data( for (size_t i = 0; i < compinfo.size(); ++i) { auto dst_base = static_cast(decomp_data.data()); compinfo[i].uncompressed_data = dst_base + decomp_offset; - compinfo[i].decctl = inflate_in.data() + start_pos; - compinfo[i].decstatus = inflate_out.data() + start_pos; - compinfo[i].copyctl = inflate_in.data() + start_pos_uncomp; + compinfo[i].dec_in_ctl = inflate_in.data() + start_pos; + compinfo[i].dec_out_ctl = inflate_out.data() + start_pos; + compinfo[i].decstatus = {inflate_stats.data() + start_pos, 
compinfo[i].num_compressed_blocks}; + compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp; + compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp; stream_info[i].dst_pos = decomp_offset; decomp_offset += compinfo[i].max_uncompressed_size; @@ -428,29 +363,36 @@ rmm::device_buffer reader::impl::decompress_stripe_data( // Dispatch batches of blocks to decompress if (num_compressed_blocks > 0) { - device_span inflate_out_view(inflate_out.data(), num_compressed_blocks); + device_span> inflate_in_view{inflate_in.data(), + num_compressed_blocks}; + device_span> inflate_out_view{inflate_out.data(), num_compressed_blocks}; switch (decompressor->GetKind()) { case orc::ZLIB: - CUDF_CUDA_TRY( - gpuinflate(inflate_in.data(), inflate_out.data(), num_compressed_blocks, 0, stream)); + gpuinflate( + inflate_in_view, inflate_out_view, inflate_stats, gzip_header_included::NO, stream); break; case orc::SNAPPY: if (nvcomp_integration::is_stable_enabled()) { - device_span inflate_in_view{inflate_in.data(), - num_compressed_blocks}; - snappy_decompress(inflate_in_view, inflate_out_view, max_uncomp_block_size, stream); + nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, + inflate_in_view, + inflate_out_view, + inflate_stats, + max_uncomp_block_size, + stream); } else { - CUDF_CUDA_TRY( - gpu_unsnap(inflate_in.data(), inflate_out.data(), num_compressed_blocks, stream)); + gpu_unsnap(inflate_in_view, inflate_out_view, inflate_stats, stream); } break; default: CUDF_FAIL("Unexpected decompression dispatch"); break; } - decompress_check(inflate_out_view, any_block_failure.device_ptr(), stream); + decompress_check(inflate_stats, any_block_failure.device_ptr(), stream); } if (num_uncompressed_blocks > 0) { - CUDF_CUDA_TRY(gpu_copy_uncompressed_blocks( - inflate_in.data() + num_compressed_blocks, num_uncompressed_blocks, stream)); + device_span> copy_in_view{inflate_in.data() + num_compressed_blocks, + num_uncompressed_blocks}; + device_span> copy_out_view{inflate_out.data() + num_compressed_blocks, + num_uncompressed_blocks}; + gpu_copy_uncompressed_blocks(copy_in_view, copy_out_view, stream); } gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); @@ -1120,9 +1062,9 @@ table_with_metadata reader::impl::read(size_type skip_rows, if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] .source->is_device_read_preferred(len)) { read_tasks.push_back( - std::make_pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->device_read_async(offset, len, d_dst, stream), - len)); + std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] + .source->device_read_async(offset, len, d_dst, stream), + len)); } else { const auto buffer = diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index b377a2e7076..3ddfebfbb24 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -81,8 +81,7 @@ __global__ void __launch_bounds__(block_size, 1) uint32_t stats_len = 0, stats_pos; uint32_t idx = start + t; if (idx < statistics_count) { - const stats_column_desc* col = groups[idx].col; - statistics_dtype dtype = col->stats_dtype; + statistics_dtype const dtype = groups[idx].stats_dtype; switch (dtype) { case dtype_bool: stats_len = pb_fldlen_common + pb_fld_hdrlen + pb_fldlen_bucket1; break; case dtype_int8: @@ -126,7 +125,7 @@ struct stats_state_s { uint8_t* end; ///< Output buffer end statistics_chunk chunk; statistics_merge_group group; - stats_column_desc col; + statistics_dtype stats_dtype; //!< 
Statistics data type for this column // ORC stats uint64_t numberOfValues; uint8_t hasNull; @@ -231,12 +230,12 @@ __global__ void __launch_bounds__(encode_threads_per_block) if (idx < statistics_count && t == 0) { s->chunk = chunks[idx]; s->group = groups[idx]; - s->col = *(s->group.col); + s->stats_dtype = s->group.stats_dtype; s->base = blob_bfr + s->group.start_chunk; s->end = blob_bfr + s->group.start_chunk + s->group.num_chunks; uint8_t* cur = pb_put_uint(s->base, 1, s->chunk.non_nulls); uint8_t* fld_start = cur; - switch (s->col.stats_dtype) { + switch (s->stats_dtype) { case dtype_int8: case dtype_int16: case dtype_int32: diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index f1d524058d2..3fe623be5b1 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1141,8 +1141,9 @@ __global__ void __launch_bounds__(1024) * * @param[in] strm_desc StripeStream device array [stripe][stream] * @param[in] chunks EncChunk device array [rowgroup][column] - * @param[out] comp_in Per-block compression input parameters - * @param[out] comp_out Per-block compression status + * @param[out] inputs Per-block compression input buffers + * @param[out] outputs Per-block compression output buffers + * @param[out] statuses Per-block compression status * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size * @param[in] max_comp_blk_size Max size of any block after compression @@ -1151,8 +1152,9 @@ __global__ void __launch_bounds__(1024) __global__ void __launch_bounds__(256) gpuInitCompressionBlocks(device_2dspan strm_desc, device_2dspan streams, // const? - device_span comp_in, - device_span comp_out, + device_span> inputs, + device_span> outputs, + device_span statuses, uint8_t* compressed_bfr, uint32_t comp_blk_size, uint32_t max_comp_blk_size) @@ -1175,16 +1177,11 @@ __global__ void __launch_bounds__(256) dst = compressed_bfr + ss.bfr_offset; num_blocks = (ss.stream_size > 0) ? 
(ss.stream_size - 1) / comp_blk_size + 1 : 1;
   for (uint32_t b = t; b < num_blocks; b += 256) {
-    gpu_inflate_input_s* blk_in   = &comp_in[ss.first_block + b];
-    gpu_inflate_status_s* blk_out = &comp_out[ss.first_block + b];
     uint32_t blk_size = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size));
-    blk_in->srcDevice = src + b * comp_blk_size;
-    blk_in->srcSize   = blk_size;
-    blk_in->dstDevice = dst + b * (BLOCK_HEADER_SIZE + max_comp_blk_size) + BLOCK_HEADER_SIZE;
-    blk_in->dstSize   = max_comp_blk_size;
-    blk_out->bytes_written = blk_size;
-    blk_out->status        = 1;
-    blk_out->reserved      = 0;
+    inputs[ss.first_block + b]  = {src + b * comp_blk_size, blk_size};
+    outputs[ss.first_block + b] = {
+      dst + b * (BLOCK_HEADER_SIZE + max_comp_blk_size) + BLOCK_HEADER_SIZE, max_comp_blk_size};
+    statuses[ss.first_block + b] = {blk_size, 1, 0};
   }
 }

@@ -1194,8 +1191,9 @@
  *
  * @param[in,out] strm_desc StripeStream device array [stripe][stream]
  * @param[in] chunks EncChunk device array [rowgroup][column]
- * @param[in] comp_in Per-block compression input parameters
- * @param[in] comp_out Per-block compression status
+ * @param[in] inputs Per-block compression input buffers
+ * @param[in] outputs Per-block compression output buffers
+ * @param[in,out] statuses Per-block compression status
  * @param[in] compressed_bfr Compression output buffer
  * @param[in] comp_blk_size Compression block size
  * @param[in] max_comp_blk_size Max size of any block after compression
@@ -1203,8 +1201,9 @@
 // blockDim {1024,1,1}
 __global__ void __launch_bounds__(1024)
   gpuCompactCompressedBlocks(device_2dspan<StripeStream> strm_desc,
-                             device_span<gpu_inflate_input_s> comp_in,
-                             device_span<gpu_inflate_status_s> comp_out,
+                             device_span<device_span<uint8_t const> const> inputs,
+                             device_span<device_span<uint8_t> const> outputs,
+                             device_span<decompress_status> statuses,
                              uint8_t* compressed_bfr,
                              uint32_t comp_blk_size,
                              uint32_t max_comp_blk_size)
@@ -1228,21 +1227,21 @@
   b = 0;
   do {
     if (t == 0) {
-      gpu_inflate_input_s* blk_in   = &comp_in[ss.first_block + b];
-      gpu_inflate_status_s* blk_out = &comp_out[ss.first_block + b];
-      uint32_t src_len =
+      auto const src_len =
         min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size));
-      uint32_t dst_len = (blk_out->status == 0) ? blk_out->bytes_written : src_len;
-      uint32_t blk_size24;
+      auto dst_len = (statuses[ss.first_block + b].status == 0)
+                       ?
statuses[ss.first_block + b].bytes_written + : src_len; + uint32_t blk_size24{}; if (dst_len >= src_len) { // Copy from uncompressed source - src = static_cast(blk_in->srcDevice); - blk_out->bytes_written = src_len; - dst_len = src_len; - blk_size24 = dst_len * 2 + 1; + src = inputs[ss.first_block + b].data(); + statuses[ss.first_block + b].bytes_written = src_len; + dst_len = src_len; + blk_size24 = dst_len * 2 + 1; } else { // Compressed block - src = static_cast(blk_in->dstDevice); + src = outputs[ss.first_block + b].data(); blk_size24 = dst_len * 2 + 0; } dst[0] = static_cast(blk_size24 >> 0); @@ -1311,14 +1310,21 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - device_span comp_in, - device_span comp_out, + device_span> comp_in, + device_span> comp_out, + device_span comp_stat, rmm::cuda_stream_view stream) { dim3 dim_block_init(256, 1); dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); - gpuInitCompressionBlocks<<>>( - strm_desc, enc_streams, comp_in, comp_out, compressed_data, comp_blk_size, max_comp_blk_size); + gpuInitCompressionBlocks<<>>(strm_desc, + enc_streams, + comp_in, + comp_out, + comp_stat, + compressed_data, + comp_blk_size, + max_comp_blk_size); if (compression == SNAPPY) { if (detail::nvcomp_integration::is_stable_enabled()) { try { @@ -1336,15 +1342,18 @@ void CompressOrcDataStreams(uint8_t* compressed_data, rmm::device_uvector compressed_bytes_written(num_compressed_blocks, stream); auto comp_it = thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), - uncompressed_data_sizes.begin(), - compressed_data_ptrs.begin()); + uncompressed_data_sizes.begin()); + thrust::transform( + rmm::exec_policy(stream), + comp_in.begin(), + comp_in.end(), + comp_it, + [] __device__(auto const& in) { return thrust::make_tuple(in.data(), in.size()); }); thrust::transform(rmm::exec_policy(stream), - comp_in.begin(), - comp_in.end(), - comp_it, - [] __device__(gpu_inflate_input_s in) { - return thrust::make_tuple(in.srcDevice, in.srcSize, in.dstDevice); - }); + comp_out.begin(), + comp_out.end(), + compressed_data_ptrs.begin(), + [] __device__(auto const& out) { return out.data(); }); nvcomp_status = nvcompBatchedSnappyCompressAsync(uncompressed_data_ptrs.data(), uncompressed_data_sizes.data(), max_comp_blk_size, @@ -1361,9 +1370,9 @@ void CompressOrcDataStreams(uint8_t* compressed_data, thrust::transform(rmm::exec_policy(stream), compressed_bytes_written.begin(), compressed_bytes_written.end(), - comp_out.begin(), + comp_stat.begin(), [] __device__(size_t size) { - gpu_inflate_status_s status{}; + decompress_status status{}; status.bytes_written = size; return status; }); @@ -1371,18 +1380,18 @@ void CompressOrcDataStreams(uint8_t* compressed_data, // If we reach this then there was an error in compressing so set an error status for each // block thrust::for_each(rmm::exec_policy(stream), - comp_out.begin(), - comp_out.end(), - [] __device__(gpu_inflate_status_s & stat) { stat.status = 1; }); + comp_stat.begin(), + comp_stat.end(), + [] __device__(decompress_status & stat) { stat.status = 1; }); }; } else { - gpu_snap(comp_in.data(), comp_out.data(), num_compressed_blocks, stream); + gpu_snap(comp_in, comp_out, comp_stat, stream); } } dim3 dim_block_compact(1024, 1); gpuCompactCompressedBlocks<<>>( - strm_desc, comp_in, comp_out, compressed_data, comp_blk_size, max_comp_blk_size); + strm_desc, comp_in, comp_out, comp_stat, compressed_data, comp_blk_size, max_comp_blk_size); } } // 
namespace gpu diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 276a1f49abf..e44ca10922f 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -26,9 +26,16 @@ namespace cudf { namespace io { namespace orc { namespace gpu { + +struct comp_in_out { + uint8_t const* in_ptr; + size_t in_size; + uint8_t* out_ptr; + size_t out_size; +}; struct compressed_stream_s { CompressedStreamInfo info; - gpu_inflate_input_s ctl; + comp_in_out ctl; }; // blockDim {128,1,1} @@ -57,7 +64,8 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat uint32_t block_len = shuffle((lane_id == 0) ? cur[0] | (cur[1] << 8) | (cur[2] << 16) : 0); uint32_t is_uncompressed = block_len & 1; uint32_t uncompressed_size; - gpu_inflate_input_s* init_ctl = nullptr; + device_span* init_in_ctl = nullptr; + device_span* init_out_ctl = nullptr; block_len >>= 1; cur += BLOCK_HEADER_SIZE; if (block_len > block_size || cur + block_len > end) { @@ -82,27 +90,34 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat uncompressed[max_uncompressed_size + lane_id] = cur[lane_id]; } } else { - init_ctl = s->info.copyctl; - init_ctl = (init_ctl && num_uncompressed_blocks < s->info.num_uncompressed_blocks) - ? &init_ctl[num_uncompressed_blocks] - : nullptr; + init_in_ctl = + (s->info.copy_in_ctl && num_uncompressed_blocks < s->info.num_uncompressed_blocks) + ? &s->info.copy_in_ctl[num_uncompressed_blocks] + : nullptr; + init_out_ctl = + (s->info.copy_out_ctl && num_uncompressed_blocks < s->info.num_uncompressed_blocks) + ? &s->info.copy_out_ctl[num_uncompressed_blocks] + : nullptr; num_uncompressed_blocks++; } } else { - init_ctl = s->info.decctl; - init_ctl = (init_ctl && num_compressed_blocks < s->info.num_compressed_blocks) - ? &init_ctl[num_compressed_blocks] - : nullptr; + init_in_ctl = (s->info.dec_in_ctl && num_compressed_blocks < s->info.num_compressed_blocks) + ? &s->info.dec_in_ctl[num_compressed_blocks] + : nullptr; + init_out_ctl = + (s->info.dec_out_ctl && num_compressed_blocks < s->info.num_compressed_blocks) + ? 
&s->info.dec_out_ctl[num_compressed_blocks] + : nullptr; num_compressed_blocks++; } - if (!lane_id && init_ctl) { - s->ctl.srcDevice = const_cast(cur); - s->ctl.srcSize = block_len; - s->ctl.dstDevice = uncompressed + max_uncompressed_size; - s->ctl.dstSize = uncompressed_size; + if (!lane_id && init_in_ctl) { + s->ctl = {cur, block_len, uncompressed + max_uncompressed_size, uncompressed_size}; } __syncwarp(); - if (init_ctl && lane_id == 0) *init_ctl = s->ctl; + if (init_in_ctl && lane_id == 0) { + *init_in_ctl = {s->ctl.in_ptr, s->ctl.in_size}; + *init_out_ctl = {s->ctl.out_ptr, s->ctl.out_size}; + } cur += block_len; max_uncompressed_size += uncompressed_size; max_uncompressed_block_size = max(max_uncompressed_block_size, uncompressed_size); @@ -137,14 +152,14 @@ extern "C" __global__ void __launch_bounds__(128, 8) s->info.num_compressed_blocks + s->info.num_uncompressed_blocks > 0 && s->info.max_uncompressed_size > 0) { // Walk through the compressed blocks - const uint8_t* cur = s->info.compressed_data; - const uint8_t* end = cur + s->info.compressed_data_size; - const gpu_inflate_input_s* dec_in = s->info.decctl; - const gpu_inflate_status_s* dec_out = s->info.decstatus; - uint8_t* uncompressed_actual = s->info.uncompressed_data; - uint8_t* uncompressed_estimated = uncompressed_actual; - uint32_t num_compressed_blocks = 0; - uint32_t max_compressed_blocks = s->info.num_compressed_blocks; + const uint8_t* cur = s->info.compressed_data; + const uint8_t* end = cur + s->info.compressed_data_size; + auto dec_out = s->info.dec_out_ctl; + auto dec_status = s->info.decstatus; + uint8_t* uncompressed_actual = s->info.uncompressed_data; + uint8_t* uncompressed_estimated = uncompressed_actual; + uint32_t num_compressed_blocks = 0; + uint32_t max_compressed_blocks = s->info.num_compressed_blocks; while (cur + BLOCK_HEADER_SIZE < end) { uint32_t block_len = shuffle((lane_id == 0) ? cur[0] | (cur[1] << 8) | (cur[2] << 16) : 0); @@ -158,14 +173,14 @@ extern "C" __global__ void __launch_bounds__(128, 8) uncompressed_size_actual = block_len; } else { if (num_compressed_blocks > max_compressed_blocks) { break; } - if (shuffle((lane_id == 0) ? dec_out[num_compressed_blocks].status : 0) != 0) { + if (shuffle((lane_id == 0) ? dec_status[num_compressed_blocks].status : 0) != 0) { // Decompression failed, not much point in doing anything else break; } - uncompressed_size_est = - shuffle((lane_id == 0) ? *(const uint32_t*)&dec_in[num_compressed_blocks].dstSize : 0); - uncompressed_size_actual = shuffle( - (lane_id == 0) ? *(const uint32_t*)&dec_out[num_compressed_blocks].bytes_written : 0); + uint32_t const dst_size = dec_out[num_compressed_blocks].size(); + uncompressed_size_est = shuffle((lane_id == 0) ? dst_size : 0); + uint32_t const bytes_written = dec_status[num_compressed_blocks].bytes_written; + uncompressed_size_actual = shuffle((lane_id == 0) ? bytes_written : 0); } // In practice, this should never happen with a well-behaved writer, as we would expect the // uncompressed size to always be equal to the compression block size except for the last @@ -360,11 +375,11 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s* s, if (strm_len > 0) { int32_t compressed_offset = (t < num_rowgroups) ? 
s->compressed_offset[t][ci_id] : 0; if (compressed_offset > 0) { - const uint8_t* start = s->strm_info[ci_id].compressed_data; - const uint8_t* cur = start; - const uint8_t* end = cur + s->strm_info[ci_id].compressed_data_size; - gpu_inflate_status_s* decstatus = s->strm_info[ci_id].decstatus; - uint32_t uncomp_offset = 0; + const uint8_t* start = s->strm_info[ci_id].compressed_data; + const uint8_t* cur = start; + const uint8_t* end = cur + s->strm_info[ci_id].compressed_data_size; + auto decstatus = s->strm_info[ci_id].decstatus.data(); + uint32_t uncomp_offset = 0; for (;;) { uint32_t block_len, is_uncompressed; diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index d0c1cea97a8..ecd2d6f6ec0 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -190,6 +190,7 @@ class orc_column_view { void add_child(uint32_t child_idx) { children.emplace_back(child_idx); } + auto type() const noexcept { return cudf_column.type(); } auto is_string() const noexcept { return cudf_column.type().id() == type_id::STRING; } void set_dict_stride(size_t stride) noexcept { _dict_stride = stride; } [[nodiscard]] auto dict_stride() const noexcept { return _dict_stride; } @@ -1073,23 +1074,47 @@ void set_stat_desc_leaf_cols(device_span columns, [=] __device__(auto idx) { stat_desc[idx].leaf_column = &columns[idx]; }); } -writer::impl::encoded_statistics writer::impl::gather_statistic_blobs( - statistics_freq stats_freq, +hostdevice_vector allocate_and_encode_blobs( + hostdevice_vector& stats_merge_groups, + rmm::device_uvector& stat_chunks, + int num_stat_blobs, + rmm::cuda_stream_view stream) +{ + // figure out the buffer size needed for protobuf format + gpu::orc_init_statistics_buffersize( + stats_merge_groups.device_ptr(), stat_chunks.data(), num_stat_blobs, stream); + auto max_blobs = stats_merge_groups.element(num_stat_blobs - 1, stream); + + hostdevice_vector blobs(max_blobs.start_chunk + max_blobs.num_chunks, stream); + gpu::orc_encode_statistics(blobs.device_ptr(), + stats_merge_groups.device_ptr(), + stat_chunks.data(), + num_stat_blobs, + stream); + stats_merge_groups.device_to_host(stream); + blobs.device_to_host(stream, true); + return blobs; +} + +writer::impl::intermediate_statistics writer::impl::gather_statistic_blobs( + statistics_freq const stats_freq, orc_table_view const& orc_table, file_segmentation const& segmentation) { auto const num_rowgroup_blobs = segmentation.rowgroups.count(); auto const num_stripe_blobs = segmentation.num_stripes() * orc_table.num_columns(); - auto const num_file_blobs = orc_table.num_columns(); - auto const num_stat_blobs = num_rowgroup_blobs + num_stripe_blobs + num_file_blobs; auto const are_statistics_enabled = stats_freq != statistics_freq::STATISTICS_NONE; - if (not are_statistics_enabled or num_stat_blobs == 0) { return {}; } + if (not are_statistics_enabled or num_rowgroup_blobs + num_stripe_blobs == 0) { + return writer::impl::intermediate_statistics{stream}; + } hostdevice_vector stat_desc(orc_table.num_columns(), stream); - hostdevice_vector stat_merge(num_stat_blobs, stream); - auto rowgroup_stat_merge = stat_merge.host_ptr(); - auto stripe_stat_merge = rowgroup_stat_merge + num_rowgroup_blobs; - auto file_stat_merge = stripe_stat_merge + num_stripe_blobs; + hostdevice_vector rowgroup_merge(num_rowgroup_blobs, stream); + hostdevice_vector stripe_merge(num_stripe_blobs, stream); + std::vector col_stats_dtypes; + std::vector col_types; + auto rowgroup_stat_merge = rowgroup_merge.host_ptr(); + auto 
stripe_stat_merge = stripe_merge.host_ptr(); for (auto const& column : orc_table.columns) { stats_column_desc* desc = &stat_desc[column.index()]; @@ -1121,82 +1146,148 @@ writer::impl::encoded_statistics writer::impl::gather_statistic_blobs( } else { desc->ts_scale = 0; } + col_stats_dtypes.push_back(desc->stats_dtype); + col_types.push_back(column.type()); for (auto const& stripe : segmentation.stripes) { - auto& grp = stripe_stat_merge[column.index() * segmentation.num_stripes() + stripe.id]; - grp.col = stat_desc.device_ptr(column.index()); + auto& grp = stripe_stat_merge[column.index() * segmentation.num_stripes() + stripe.id]; + grp.col_dtype = column.type(); + grp.stats_dtype = desc->stats_dtype; grp.start_chunk = static_cast(column.index() * segmentation.num_rowgroups() + stripe.first); grp.num_chunks = stripe.size; - for (auto rg_idx_it = stripe.cbegin(); rg_idx_it < stripe.cend(); ++rg_idx_it) { + for (auto rg_idx_it = stripe.cbegin(); rg_idx_it != stripe.cend(); ++rg_idx_it) { auto& rg_grp = rowgroup_stat_merge[column.index() * segmentation.num_rowgroups() + *rg_idx_it]; - rg_grp.col = stat_desc.device_ptr(column.index()); + rg_grp.col_dtype = column.type(); + rg_grp.stats_dtype = desc->stats_dtype; rg_grp.start_chunk = *rg_idx_it; rg_grp.num_chunks = 1; } } - auto col_stats = &file_stat_merge[column.index()]; - col_stats->col = stat_desc.device_ptr(column.index()); - col_stats->start_chunk = static_cast(column.index() * segmentation.num_stripes()); - col_stats->num_chunks = static_cast(segmentation.num_stripes()); } stat_desc.host_to_device(stream); - stat_merge.host_to_device(stream); + rowgroup_merge.host_to_device(stream); + stripe_merge.host_to_device(stream); set_stat_desc_leaf_cols(orc_table.d_columns, stat_desc, stream); - rmm::device_uvector stat_chunks(num_stat_blobs, stream); - auto rowgroup_stat_chunks = stat_chunks.data(); - auto stripe_stat_chunks = rowgroup_stat_chunks + num_rowgroup_blobs; - auto file_stat_chunks = stripe_stat_chunks + num_stripe_blobs; + // The rowgroup stat chunks are written out in each stripe. The stripe and file-level chunks are + // written in the footer. To prevent persisting the rowgroup stat chunks across multiple write + // calls in a chunked write situation, these allocations are split up so stripe data can persist + // until the footer is written and rowgroup data can be freed after being written to the stripe. 
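A minimal sketch of the allocation split described in the comment above, assuming simplified stand-in types (plain std::vector in place of rmm::device_uvector and hostdevice_vector, and a dummy stat_chunk struct); this is illustrative, not the writer's actual code:

    #include <utility>
    #include <vector>

    struct stat_chunk {  // stand-in for statistics_chunk
      int min, max, count;
    };

    // Products of one write() call: rowgroup stats are encoded into the stripe
    // immediately; stripe stats must outlive the call.
    struct per_write_stats {
      std::vector<stat_chunk> rowgroup_chunks;  // freed once the stripe is written
      std::vector<stat_chunk> stripe_chunks;    // persisted until the footer is written
    };

    // Accumulated across chunked write() calls, consumed at close().
    struct persisted_stats {
      std::vector<std::vector<stat_chunk>> stripe_chunks;
    };

    int main()
    {
      persisted_stats persisted;
      for (int write_call = 0; write_call < 3; ++write_call) {
        per_write_stats stats{std::vector<stat_chunk>(8), std::vector<stat_chunk>(2)};
        // ... rowgroup_chunks are encoded into the stripe here ...
        persisted.stripe_chunks.emplace_back(std::move(stats.stripe_chunks));
      }  // stats.rowgroup_chunks is freed here, at the end of each write
      // at close(): merge persisted.stripe_chunks into the file-level statistics
      return 0;
    }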
+ rmm::device_uvector rowgroup_chunks(num_rowgroup_blobs, stream); + rmm::device_uvector stripe_chunks(num_stripe_blobs, stream); + auto rowgroup_stat_chunks = rowgroup_chunks.data(); + auto stripe_stat_chunks = stripe_chunks.data(); - rmm::device_uvector stat_groups(num_rowgroup_blobs, stream); + rmm::device_uvector rowgroup_groups(num_rowgroup_blobs, stream); gpu::orc_init_statistics_groups( - stat_groups.data(), stat_desc.device_ptr(), segmentation.rowgroups, stream); + rowgroup_groups.data(), stat_desc.device_ptr(), segmentation.rowgroups, stream); detail::calculate_group_statistics( - stat_chunks.data(), stat_groups.data(), num_rowgroup_blobs, stream); + rowgroup_chunks.data(), rowgroup_groups.data(), num_rowgroup_blobs, stream); detail::merge_group_statistics( - stripe_stat_chunks, - rowgroup_stat_chunks, - stat_merge.device_ptr(num_rowgroup_blobs), - num_stripe_blobs, - stream); + stripe_stat_chunks, rowgroup_stat_chunks, stripe_merge.device_ptr(), num_stripe_blobs, stream); - detail::merge_group_statistics( - file_stat_chunks, - stripe_stat_chunks, - stat_merge.device_ptr(num_rowgroup_blobs + num_stripe_blobs), - num_file_blobs, - stream); - gpu::orc_init_statistics_buffersize( - stat_merge.device_ptr(), stat_chunks.data(), num_stat_blobs, stream); - stat_merge.device_to_host(stream, true); + // With chunked writes, the orc table can be deallocated between write calls. + // This forces our hand to encode row groups and stripes only in this stage and further + // we have to persist any data from the table that we need later. The + // minimum and maximum string inside the `str_val` structure inside `statistics_val` in + // `statistic_chunk` that are copies of the largest and smallest strings in the row group, + // or stripe need to be persisted between write calls. We write rowgroup data with each + // stripe and then save each stripe's stats until the end where we merge those all together + // to get the file-level stats. - hostdevice_vector blobs( - stat_merge[num_stat_blobs - 1].start_chunk + stat_merge[num_stat_blobs - 1].num_chunks, stream); // Skip rowgroup blobs when encoding, if chosen granularity is coarser than "ROW_GROUP". auto const is_granularity_rowgroup = stats_freq == ORC_STATISTICS_ROW_GROUP; - auto const num_skip = is_granularity_rowgroup ? 
0 : num_rowgroup_blobs; - gpu::orc_encode_statistics(blobs.device_ptr(), - stat_merge.device_ptr(num_skip), - stat_chunks.data() + num_skip, - num_stat_blobs - num_skip, - stream); - stat_merge.device_to_host(stream); - blobs.device_to_host(stream, true); - + // we have to encode the row groups now IF they are being written out auto rowgroup_blobs = [&]() -> std::vector { if (not is_granularity_rowgroup) { return {}; } + + hostdevice_vector blobs = + allocate_and_encode_blobs(rowgroup_merge, rowgroup_chunks, num_rowgroup_blobs, stream); + std::vector rowgroup_blobs(num_rowgroup_blobs); for (size_t i = 0; i < num_rowgroup_blobs; i++) { - auto const stat_begin = blobs.host_ptr(rowgroup_stat_merge[i].start_chunk); - auto const stat_end = stat_begin + rowgroup_stat_merge[i].num_chunks; + auto const stat_begin = blobs.host_ptr(rowgroup_merge[i].start_chunk); + auto const stat_end = stat_begin + rowgroup_merge[i].num_chunks; rowgroup_blobs[i].assign(stat_begin, stat_end); } return rowgroup_blobs; }(); + return {std::move(rowgroup_blobs), + std::move(stripe_chunks), + std::move(stripe_merge), + std::move(col_stats_dtypes), + std::move(col_types)}; +} + +writer::impl::encoded_footer_statistics writer::impl::finish_statistic_blobs( + int num_stripes, writer::impl::persisted_statistics& per_chunk_stats) +{ + auto stripe_size_iter = thrust::make_transform_iterator(per_chunk_stats.stripe_stat_merge.begin(), + [](auto const& i) { return i.size(); }); + + auto const num_columns = per_chunk_stats.col_types.size(); + auto const num_stripe_blobs = + thrust::reduce(stripe_size_iter, stripe_size_iter + per_chunk_stats.stripe_stat_merge.size()); + auto const num_file_blobs = num_columns; + auto const num_blobs = single_write_mode ? static_cast(num_stripe_blobs + num_file_blobs) + : static_cast(num_stripe_blobs); + + if (num_stripe_blobs == 0) { return {}; } + + // merge the stripe persisted data and add file data + rmm::device_uvector stat_chunks(num_blobs, stream); + hostdevice_vector stats_merge(num_blobs, stream); + + size_t chunk_offset = 0; + size_t merge_offset = 0; + for (size_t i = 0; i < per_chunk_stats.stripe_stat_chunks.size(); ++i) { + auto chunk_bytes = per_chunk_stats.stripe_stat_chunks[i].size() * sizeof(statistics_chunk); + auto merge_bytes = per_chunk_stats.stripe_stat_merge[i].size() * sizeof(statistics_merge_group); + cudaMemcpyAsync(stat_chunks.data() + chunk_offset, + per_chunk_stats.stripe_stat_chunks[i].data(), + chunk_bytes, + cudaMemcpyDeviceToDevice, + stream); + cudaMemcpyAsync(stats_merge.device_ptr() + merge_offset, + per_chunk_stats.stripe_stat_merge[i].device_ptr(), + merge_bytes, + cudaMemcpyDeviceToDevice, + stream); + chunk_offset += per_chunk_stats.stripe_stat_chunks[i].size(); + merge_offset += per_chunk_stats.stripe_stat_merge[i].size(); + } + + if (single_write_mode) { + std::vector file_stats_merge(num_file_blobs); + for (auto i = 0u; i < num_file_blobs; ++i) { + auto col_stats = &file_stats_merge[i]; + col_stats->col_dtype = per_chunk_stats.col_types[i]; + col_stats->stats_dtype = per_chunk_stats.stats_dtypes[i]; + col_stats->start_chunk = static_cast(i * num_stripes); + col_stats->num_chunks = static_cast(num_stripes); + } + + auto d_file_stats_merge = stats_merge.device_ptr(num_stripe_blobs); + cudaMemcpyAsync(d_file_stats_merge, + file_stats_merge.data(), + num_file_blobs * sizeof(statistics_merge_group), + cudaMemcpyHostToDevice, + stream); + + auto file_stat_chunks = stat_chunks.data() + num_stripe_blobs; + detail::merge_group_statistics( + file_stat_chunks, 
stat_chunks.data(), d_file_stats_merge, num_file_blobs, stream); + } + + hostdevice_vector blobs = + allocate_and_encode_blobs(stats_merge, stat_chunks, num_blobs, stream); + + auto stripe_stat_merge = stats_merge.host_ptr(); + std::vector stripe_blobs(num_stripe_blobs); for (size_t i = 0; i < num_stripe_blobs; i++) { auto const stat_begin = blobs.host_ptr(stripe_stat_merge[i].start_chunk); @@ -1204,13 +1295,17 @@ writer::impl::encoded_statistics writer::impl::gather_statistic_blobs( stripe_blobs[i].assign(stat_begin, stat_end); } - std::vector file_blobs(num_file_blobs); - for (size_t i = 0; i < num_file_blobs; i++) { - auto const stat_begin = blobs.host_ptr(file_stat_merge[i].start_chunk); - auto const stat_end = stat_begin + file_stat_merge[i].num_chunks; - file_blobs[i].assign(stat_begin, stat_end); + std::vector file_blobs(single_write_mode ? num_file_blobs : 0); + if (single_write_mode) { + auto file_stat_merge = stats_merge.host_ptr(num_stripe_blobs); + for (auto i = 0u; i < num_file_blobs; i++) { + auto const stat_begin = blobs.host_ptr(file_stat_merge[i].start_chunk); + auto const stat_end = stat_begin + file_stat_merge[i].num_chunks; + file_blobs[i].assign(stat_begin, stat_end); + } } - return {std::move(rowgroup_blobs), std::move(stripe_blobs), std::move(file_blobs)}; + + return {std::move(stripe_blobs), std::move(file_blobs)}; } void writer::impl::write_index_stream(int32_t stripe_id, @@ -1219,7 +1314,7 @@ void writer::impl::write_index_stream(int32_t stripe_id, file_segmentation const& segmentation, host_2dspan enc_streams, host_2dspan strm_desc, - host_span comp_out, + host_span comp_out, std::vector const& rg_stats, StripeInformation* stripe, orc_streams* streams, @@ -1955,8 +2050,9 @@ void writer::impl::write(table_view const& table) // Compress the data streams rmm::device_buffer compressed_data(compressed_bfr_size, stream); - hostdevice_vector comp_out(num_compressed_blocks, stream); - hostdevice_vector comp_in(num_compressed_blocks, stream); + hostdevice_vector> comp_in(num_compressed_blocks, stream); + hostdevice_vector> comp_out(num_compressed_blocks, stream); + hostdevice_vector comp_stats(num_compressed_blocks, stream); if (compression_kind_ != NONE) { strm_descs.host_to_device(stream); gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), @@ -1968,14 +2064,25 @@ void writer::impl::write(table_view const& table) enc_data.streams, comp_in, comp_out, + comp_stats, stream); strm_descs.device_to_host(stream); - comp_out.device_to_host(stream, true); + comp_stats.device_to_host(stream, true); } ProtobufWriter pbw_(&buffer_); - auto const statistics = gather_statistic_blobs(stats_freq_, orc_table, segmentation); + auto intermediate_stats = gather_statistic_blobs(stats_freq_, orc_table, segmentation); + + if (intermediate_stats.stripe_stat_chunks.size() > 0) { + persisted_stripe_statistics.stripe_stat_chunks.emplace_back( + std::move(intermediate_stats.stripe_stat_chunks)); + persisted_stripe_statistics.stripe_stat_merge.emplace_back( + std::move(intermediate_stats.stripe_stat_merge)); + persisted_stripe_statistics.stats_dtypes = std::move(intermediate_stats.stats_dtypes); + persisted_stripe_statistics.col_types = std::move(intermediate_stats.col_types); + persisted_stripe_statistics.num_rows = orc_table.num_rows(); + } // Write stripes std::vector> write_tasks; @@ -1992,8 +2099,8 @@ void writer::impl::write(table_view const& table) segmentation, enc_data.streams, strm_descs, - comp_out, - statistics.rowgroup_level, + comp_stats, + 
intermediate_stats.rowgroup_blobs, &stripe, &streams, &pbw_); @@ -2034,41 +2141,10 @@ void writer::impl::write(table_view const& table) } out_sink_->host_write(buffer_.data(), buffer_.size()); } + for (auto const& task : write_tasks) { task.wait(); } - - // File-level statistics - // NOTE: Excluded from chunked write mode to avoid the need for merging stats across calls - if (single_write_mode and not statistics.file_level.empty()) { - // First entry contains total number of rows - buffer_.resize(0); - pbw_.put_uint(encode_field_number(1)); - pbw_.put_uint(num_rows); - ff.statistics.reserve(1 + orc_table.num_columns()); - ff.statistics.emplace_back(std::move(buffer_)); - // Add file stats, stored after stripe stats in `column_stats` - ff.statistics.insert(ff.statistics.end(), - std::make_move_iterator(statistics.file_level.begin()), - std::make_move_iterator(statistics.file_level.end())); - } - // Stripe-level statistics - if (not statistics.stripe_level.empty()) { - size_t first_stripe = md.stripeStats.size(); - md.stripeStats.resize(first_stripe + stripes.size()); - for (size_t stripe_id = 0; stripe_id < stripes.size(); stripe_id++) { - md.stripeStats[first_stripe + stripe_id].colStats.resize(1 + orc_table.num_columns()); - buffer_.resize(0); - pbw_.put_uint(encode_field_number(1)); - pbw_.put_uint(stripes[stripe_id].numberOfRows); - md.stripeStats[first_stripe + stripe_id].colStats[0] = std::move(buffer_); - for (size_t col_idx = 0; col_idx < orc_table.num_columns(); col_idx++) { - size_t idx = stripes.size() * col_idx + stripe_id; - md.stripeStats[first_stripe + stripe_id].colStats[1 + col_idx] = - std::move(statistics.stripe_level[idx]); - } - } - } } if (ff.headerLength == 0) { // First call @@ -2125,6 +2201,40 @@ void writer::impl::close() ProtobufWriter pbw_(&buffer_); PostScript ps; + auto const statistics = finish_statistic_blobs(ff.stripes.size(), persisted_stripe_statistics); + + // File-level statistics + if (single_write_mode and not statistics.file_level.empty()) { + buffer_.resize(0); + pbw_.put_uint(encode_field_number(1)); + pbw_.put_uint(persisted_stripe_statistics.num_rows); + // First entry contains total number of rows + ff.statistics.reserve(ff.types.size()); + ff.statistics.emplace_back(std::move(buffer_)); + // Add file stats, stored after stripe stats in `column_stats` + ff.statistics.insert(ff.statistics.end(), + std::make_move_iterator(statistics.file_level.begin()), + std::make_move_iterator(statistics.file_level.end())); + } + + // Stripe-level statistics + if (not statistics.stripe_level.empty()) { + md.stripeStats.resize(ff.stripes.size()); + for (size_t stripe_id = 0; stripe_id < ff.stripes.size(); stripe_id++) { + md.stripeStats[stripe_id].colStats.resize(ff.types.size()); + buffer_.resize(0); + pbw_.put_uint(encode_field_number(1)); + pbw_.put_uint(ff.stripes[stripe_id].numberOfRows); + md.stripeStats[stripe_id].colStats[0] = std::move(buffer_); + for (size_t col_idx = 0; col_idx < ff.types.size() - 1; col_idx++) { + size_t idx = ff.stripes.size() * col_idx + stripe_id; + md.stripeStats[stripe_id].colStats[1 + col_idx] = std::move(statistics.stripe_level[idx]); + } + } + } + + persisted_stripe_statistics.clear(); + ff.contentLength = out_sink_->bytes_written(); std::transform( kv_meta.begin(), kv_meta.end(), std::back_inserter(ff.metadata), [&](auto const& udata) { diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index b3662bf309f..d823c73007f 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ 
-286,24 +286,84 @@ class writer::impl { hostdevice_2dvector* enc_streams, hostdevice_2dvector* strm_desc); - struct encoded_statistics { - std::vector rowgroup_level; + /** + * @brief Statistics data stored between calls to write for chunked writes + * + */ + struct intermediate_statistics { + explicit intermediate_statistics(rmm::cuda_stream_view stream) + : stripe_stat_chunks(0, stream){}; + intermediate_statistics(std::vector rb, + rmm::device_uvector sc, + hostdevice_vector smg, + std::vector sdt, + std::vector sct) + : rowgroup_blobs(std::move(rb)), + stripe_stat_chunks(std::move(sc)), + stripe_stat_merge(std::move(smg)), + stats_dtypes(std::move(sdt)), + col_types(std::move(sct)){}; + + // blobs for the rowgroups and stripes. Not persisted + std::vector rowgroup_blobs; + + rmm::device_uvector stripe_stat_chunks; + hostdevice_vector stripe_stat_merge; + std::vector stats_dtypes; + std::vector col_types; + }; + + /** + * @brief used for chunked writes to persist data between calls to write. + * + */ + struct persisted_statistics { + void clear() + { + stripe_stat_chunks.clear(); + stripe_stat_merge.clear(); + stats_dtypes.clear(); + col_types.clear(); + num_rows = 0; + } + + std::vector> stripe_stat_chunks; + std::vector> stripe_stat_merge; + std::vector stats_dtypes; + std::vector col_types; + int num_rows = 0; + }; + + /** + * @brief Protobuf encoded statistics created at file close + * + */ + struct encoded_footer_statistics { std::vector stripe_level; std::vector file_level; }; /** - * @brief Returns column statistics encoded in ORC protobuf format. + * @brief Returns column statistics in an intermediate format. * * @param statistics_freq Frequency of statistics to be included in the output file * @param orc_table Table information to be written - * @param columns List of columns * @param segmentation stripe and rowgroup ranges - * @return The statistic blobs + * @return The statistic information + */ + intermediate_statistics gather_statistic_blobs(statistics_freq const statistics_freq, + orc_table_view const& orc_table, + file_segmentation const& segmentation); + + /** + * @brief Returns column statistics encoded in ORC protobuf format stored in the footer. + * + * @param num_stripes number of stripes in the data + * @param incoming_stats intermediate statistics returned from `gather_statistic_blobs` + * @return The encoded statistic blobs */ - encoded_statistics gather_statistic_blobs(statistics_freq statistics_freq, - orc_table_view const& orc_table, - file_segmentation const& segmentation); + encoded_footer_statistics finish_statistic_blobs( + int num_stripes, writer::impl::persisted_statistics& incoming_stats); /** * @brief Writes the specified column's row index stream. 
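Taken together, gather_statistic_blobs and finish_statistic_blobs give chunked writes the call shape sketched below (a schematic, self-contained model; the string "blobs" and the gather/finish helper names are illustrative stand-ins for the real device-side chunks and protobuf encoding):

    #include <string>
    #include <utility>
    #include <vector>

    struct intermediate_stats {  // modeled on the result of gather_statistic_blobs()
      std::vector<std::string> rowgroup_blobs;  // written out with the current stripes
      std::vector<std::string> stripe_state;    // saved for the footer
    };

    struct footer_stats {  // modeled on the result of finish_statistic_blobs()
      std::vector<std::string> stripe_level, file_level;
    };

    intermediate_stats gather(int num_rowgroups, int num_stripes)
    {
      return {std::vector<std::string>(num_rowgroups, "rg-blob"),
              std::vector<std::string>(num_stripes, "stripe-state")};
    }

    footer_stats finish(std::vector<std::vector<std::string>> const& persisted)
    {
      footer_stats out;
      for (auto const& write : persisted)
        out.stripe_level.insert(out.stripe_level.end(), write.begin(), write.end());
      out.file_level.push_back("file stats merged from all stripe states");
      return out;
    }

    int main()
    {
      std::vector<std::vector<std::string>> persisted_stripe_statistics;
      for (int i = 0; i < 2; ++i) {  // each chunked write() call
        auto stats = gather(/*num_rowgroups=*/4, /*num_stripes=*/1);
        // rowgroup_blobs are emitted into the stripes here ...
        persisted_stripe_statistics.emplace_back(std::move(stats.stripe_state));
      }
      auto footer = finish(persisted_stripe_statistics);  // at close()
      return 0;
    }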
@@ -326,7 +386,7 @@ class writer::impl { file_segmentation const& segmentation, host_2dspan enc_streams, host_2dspan strm_desc, - host_span comp_out, + host_span comp_out, std::vector const& rg_stats, StripeInformation* stripe, orc_streams* streams, @@ -384,6 +444,8 @@ class writer::impl { std::map kv_meta; // to track if the output has been written to sink bool closed = false; + // statistics data saved between calls to write before a close writes out the statistics + persisted_statistics persisted_stripe_statistics; std::vector buffer_; std::unique_ptr out_sink_; diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index f61cfa83579..93e76a6ac23 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -22,19 +22,24 @@ #include +#include + namespace cudf { namespace io { namespace parquet { namespace gpu { +namespace { +constexpr int DEFAULT_BLOCK_SIZE = 256; +} template -__global__ void __launch_bounds__(block_size, 1) +__global__ void __launch_bounds__(block_size) initialize_chunk_hash_maps_kernel(device_span chunks) { auto chunk = chunks[blockIdx.x]; auto t = threadIdx.x; // fut: Now that per-chunk dict is same size as ck.num_values, try to not use one block per chunk - for (size_t i = 0; i < chunk.dict_map_size; i += block_size) { + for (size_type i = 0; i < chunk.dict_map_size; i += block_size) { if (t + i < chunk.dict_map_size) { new (&chunk.dict_map_slots[t + i].first) map_type::atomic_key_type{KEY_SENTINEL}; new (&chunk.dict_map_slots[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL}; @@ -55,7 +60,10 @@ struct equality_functor { template struct hash_functor { column_device_view const& col; - __device__ auto operator()(size_type idx) { return MurmurHash3_32{}(col.element(idx)); } + __device__ auto operator()(size_type idx) const + { + return cudf::detail::MurmurHash3_32{}(col.element(idx)); + } }; struct map_insert_fn { @@ -67,7 +75,7 @@ struct map_insert_fn { if constexpr (column_device_view::has_element_accessor()) { auto hash_fn = hash_functor{col}; auto equality_fn = equality_functor{col}; - return map.insert(std::make_pair(i, i), hash_fn, equality_fn); + return map.insert(std::pair(i, i), hash_fn, equality_fn); } else { CUDF_UNREACHABLE("Unsupported type to insert in map"); } @@ -91,9 +99,8 @@ struct map_find_fn { }; template -__global__ void __launch_bounds__(block_size, 1) - populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan chunks, - cudf::detail::device_2dspan frags) +__global__ void __launch_bounds__(block_size) + populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -102,70 +109,57 @@ __global__ void __launch_bounds__(block_size, 1) auto chunk = frag.chunk; auto col = chunk->col_desc; - size_type start_row = frag.start_row; - size_type end_row = frag.start_row + frag.num_rows; + if (not chunk->use_dictionary) { return; } - __shared__ size_type s_start_value_idx; - __shared__ size_type s_num_values; + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage reduce_storage; - if (not chunk->use_dictionary) { return; } + size_type start_row = frag.start_row; + size_type end_row = frag.start_row + frag.num_rows; - if (t == 0) { - // Find the bounds of values in leaf column to be inserted into the map for current chunk - auto cudf_col = *(col->parent_column); - s_start_value_idx = row_to_value_idx(start_row, cudf_col); - auto end_value_idx = row_to_value_idx(end_row, cudf_col); - s_num_values = 
end_value_idx - s_start_value_idx; - } - __syncthreads(); + // Find the bounds of values in leaf column to be inserted into the map for current chunk + auto const cudf_col = *(col->parent_column); + size_type const s_start_value_idx = row_to_value_idx(start_row, cudf_col); + size_type const end_value_idx = row_to_value_idx(end_row, cudf_col); column_device_view const& data_col = *col->leaf_column; - using block_reduce = cub::BlockReduce; - __shared__ typename block_reduce::TempStorage reduce_storage; // Make a view of the hash map auto hash_map_mutable = map_type::device_mutable_view( chunk->dict_map_slots, chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); - auto hash_map = map_type::device_view( - chunk->dict_map_slots, chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); - __shared__ int total_num_dict_entries; - for (size_type i = 0; i < s_num_values; i += block_size) { - // add the value to hash map - size_type val_idx = i + t + s_start_value_idx; - bool is_valid = - (i + t < s_num_values && val_idx < data_col.size()) and data_col.is_valid(val_idx); + __shared__ size_type total_num_dict_entries; + size_type val_idx = s_start_value_idx + t; + while (val_idx - block_size < end_value_idx) { + auto const is_valid = + val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx); // insert element at val_idx to hash map and count successful insertions size_type is_unique = 0; size_type uniq_elem_size = 0; if (is_valid) { - auto found_slot = type_dispatcher(data_col.type(), map_find_fn{hash_map}, data_col, val_idx); - if (found_slot == hash_map.end()) { - is_unique = - type_dispatcher(data_col.type(), map_insert_fn{hash_map_mutable}, data_col, val_idx); - uniq_elem_size = [&]() -> size_type { - if (not is_unique) { return 0; } - switch (col->physical_type) { - case Type::INT32: return 4; - case Type::INT64: return 8; - case Type::INT96: return 12; - case Type::FLOAT: return 4; - case Type::DOUBLE: return 8; - case Type::BYTE_ARRAY: - if (data_col.type().id() == type_id::STRING) { - // Strings are stored as 4 byte length + string bytes - return 4 + data_col.element(val_idx).size_bytes(); - } - case Type::FIXED_LEN_BYTE_ARRAY: - if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } - default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); - } - }(); - } + is_unique = + type_dispatcher(data_col.type(), map_insert_fn{hash_map_mutable}, data_col, val_idx); + uniq_elem_size = [&]() -> size_type { + if (not is_unique) { return 0; } + switch (col->physical_type) { + case Type::INT32: return 4; + case Type::INT64: return 8; + case Type::INT96: return 12; + case Type::FLOAT: return 4; + case Type::DOUBLE: return 8; + case Type::BYTE_ARRAY: + if (data_col.type().id() == type_id::STRING) { + // Strings are stored as 4 byte length + string bytes + return 4 + data_col.element(val_idx).size_bytes(); + } + case Type::FIXED_LEN_BYTE_ARRAY: + if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); } + default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding"); + } + }(); } - __syncthreads(); auto num_unique = block_reduce(reduce_storage).Sum(is_unique); __syncthreads(); auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); @@ -178,11 +172,13 @@ __global__ void __launch_bounds__(block_size, 1) // Check if the num unique values in chunk has already exceeded max dict size and early exit if (total_num_dict_entries > MAX_DICT_SIZE) { return; } - } + + val_idx += block_size; + } // while } template 
-__global__ void __launch_bounds__(block_size, 1) +__global__ void __launch_bounds__(block_size) collect_map_entries_kernel(device_span chunks) { auto& chunk = chunks[blockIdx.x]; @@ -192,31 +188,30 @@ __global__ void __launch_bounds__(block_size, 1) auto map = map_type::device_view(chunk.dict_map_slots, chunk.dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); - __shared__ size_type counter; - if (t == 0) counter = 0; + __shared__ cuda::atomic counter; + using cuda::std::memory_order_relaxed; + if (t == 0) { new (&counter) cuda::atomic{0}; } __syncthreads(); - for (size_t i = 0; i < chunk.dict_map_size; i += block_size) { + for (size_type i = 0; i < chunk.dict_map_size; i += block_size) { if (t + i < chunk.dict_map_size) { - auto slot = map.begin_slot() + t + i; - auto key = static_cast(slot->first); + auto* slot = reinterpret_cast(map.begin_slot() + t + i); + auto key = slot->first; if (key != KEY_SENTINEL) { - auto loc = atomicAdd(&counter, 1); + auto loc = counter.fetch_add(1, memory_order_relaxed); cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size"); chunk.dict_data[loc] = key; // If sorting dict page ever becomes a hard requirement, enable the following statement and // add a dict sorting step before storing into the slot's second field. // chunk.dict_data_idx[loc] = t + i; - slot->second.store(loc); - // TODO: ^ This doesn't need to be atomic. Try casting to value_type ptr and just writing. + slot->second = loc; } } } } template -__global__ void __launch_bounds__(block_size, 1) - get_dictionary_indices_kernel(cudf::detail::device_2dspan chunks, - cudf::detail::device_2dspan frags) +__global__ void __launch_bounds__(block_size) + get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -225,47 +220,38 @@ __global__ void __launch_bounds__(block_size, 1) auto chunk = frag.chunk; auto col = chunk->col_desc; + if (not chunk->use_dictionary) { return; } + size_type start_row = frag.start_row; size_type end_row = frag.start_row + frag.num_rows; - __shared__ size_type s_start_value_idx; - __shared__ size_type s_ck_start_val_idx; - __shared__ size_type s_num_values; - - if (t == 0) { - // Find the bounds of values in leaf column to be searched in the map for current chunk - auto cudf_col = *(col->parent_column); - s_start_value_idx = row_to_value_idx(start_row, cudf_col); - s_ck_start_val_idx = row_to_value_idx(chunk->start_row, cudf_col); - auto end_value_idx = row_to_value_idx(end_row, cudf_col); - s_num_values = end_value_idx - s_start_value_idx; - } - __syncthreads(); - - if (not chunk->use_dictionary) { return; } + // Find the bounds of values in leaf column to be searched in the map for current chunk + auto const cudf_col = *(col->parent_column); + auto const s_start_value_idx = row_to_value_idx(start_row, cudf_col); + auto const s_ck_start_val_idx = row_to_value_idx(chunk->start_row, cudf_col); + auto const end_value_idx = row_to_value_idx(end_row, cudf_col); column_device_view const& data_col = *col->leaf_column; auto map = map_type::device_view( chunk->dict_map_slots, chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); - for (size_t i = 0; i < s_num_values; i += block_size) { - if (t + i < s_num_values) { - auto val_idx = s_start_value_idx + t + i; - bool is_valid = - (i + t < s_num_values && val_idx < data_col.size()) ? 
data_col.is_valid(val_idx) : false; - - if (is_valid) { - auto found_slot = type_dispatcher(data_col.type(), map_find_fn{map}, data_col, val_idx); - cudf_assert(found_slot != map.end() && - "Unable to find value in map in dictionary index construction"); - if (found_slot != map.end()) { - // No need for atomic as this is not going to be modified by any other thread - auto* val_ptr = reinterpret_cast(&found_slot->second); - chunk->dict_index[val_idx - s_ck_start_val_idx] = *val_ptr; - } + auto val_idx = s_start_value_idx + t; + while (val_idx < end_value_idx) { + auto const is_valid = val_idx < data_col.size() and data_col.is_valid(val_idx); + + if (is_valid) { + auto found_slot = type_dispatcher(data_col.type(), map_find_fn{map}, data_col, val_idx); + cudf_assert(found_slot != map.end() && + "Unable to find value in map in dictionary index construction"); + if (found_slot != map.end()) { + // No need for atomic as this is not going to be modified by any other thread + auto* val_ptr = reinterpret_cast(&found_slot->second); + chunk->dict_index[val_idx - s_ck_start_val_idx] = *val_ptr; } } + + val_idx += block_size; } } @@ -276,15 +262,12 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st <<>>(chunks); } -void populate_chunk_hash_maps(cudf::detail::device_2dspan chunks, - cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { - constexpr int block_size = 256; dim3 const dim_grid(frags.size().second, frags.size().first); - - populate_chunk_hash_maps_kernel - <<>>(chunks, frags); + populate_chunk_hash_maps_kernel + <<>>(frags); } void collect_map_entries(device_span chunks, rmm::cuda_stream_view stream) @@ -293,15 +276,12 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi collect_map_entries_kernel<<>>(chunks); } -void get_dictionary_indices(cudf::detail::device_2dspan chunks, - cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { - constexpr int block_size = 256; dim3 const dim_grid(frags.size().second, frags.size().first); - - get_dictionary_indices_kernel - <<>>(chunks, frags); + get_dictionary_indices_kernel + <<>>(frags); } } // namespace gpu } // namespace parquet diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 7feaa8e61b4..a1fc2edb0bb 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -156,6 +156,7 @@ bool CompactProtocolReader::read(SchemaElement* s) ParquetFieldEnum(6, s->converted_type), ParquetFieldInt32(7, s->decimal_scale), ParquetFieldInt32(8, s->decimal_precision), + ParquetFieldOptionalInt32(9, s->field_id), ParquetFieldStruct(10, s->logical_type)); return function_builder(this, op); } diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index ba48f7b127f..ddca6c37e08 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -18,6 +18,8 @@ #include "parquet.hpp" +#include + #include #include #include @@ -137,6 +139,7 @@ class CompactProtocolReader { friend class ParquetFieldBool; friend class ParquetFieldInt8; friend class ParquetFieldInt32; + friend class ParquetFieldOptionalInt32; friend class ParquetFieldInt64; template friend class ParquetFieldStructListFunctor; @@ -216,6 +219,27 @@ class ParquetFieldInt32 { int field() { return field_val; } }; 
+/**
+ * @brief Functor to set value to optional 32 bit integer read from CompactProtocolReader
+ *
+ * @return True if field type is not int32
+ */
+class ParquetFieldOptionalInt32 {
+  int field_val;
+  thrust::optional<int32_t>& val;
+
+ public:
+  ParquetFieldOptionalInt32(int f, thrust::optional<int32_t>& v) : field_val(f), val(v) {}
+
+  inline bool operator()(CompactProtocolReader* cpr, int field_type)
+  {
+    val = cpr->get_i32();
+    return (field_type != ST_FLD_I32);
+  }
+
+  int field() { return field_val; }
+};
+
 /**
  * @brief Functor to set value to 64 bit integer read from CompactProtocolReader
  *
diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp index 927844cb1c2..176ecb6a572 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.cpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.cpp
@@ -144,6 +144,7 @@ size_t CompactProtocolWriter::write(const SchemaElement& s)
       c.field_int(8, s.decimal_precision);
     }
   }
+  if (s.field_id) { c.field_int(9, s.field_id.value()); }
   auto const isset = s.logical_type.isset;
   // TODO: add handling for all logical types
   // if (isset.STRING or isset.MAP or isset.LIST or isset.ENUM or isset.DECIMAL or isset.DATE or
diff --git a/cpp/src/io/parquet/page_enc.cu index da671d4c665..f05f0af2a79 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -81,8 +81,6 @@ struct page_enc_state_s {
   EncPage page;
   EncColumnChunk ck;
   parquet_column_device_view col;
-  gpu_inflate_input_s comp_in;
-  gpu_inflate_status_s comp_stat;
   uint16_t vals[rle_buffer_size];
 };

@@ -278,7 +276,8 @@ __global__ void __launch_bounds__(128)
   uint32_t max_stats_len = 0;

   if (!t) {
-    pagestats_g.col = &col_desc[blockIdx.x];
+    pagestats_g.col_dtype   = col_g.leaf_column->type();
+    pagestats_g.stats_dtype = col_g.stats_dtype;
     pagestats_g.start_chunk = ck_g.first_fragment;
     pagestats_g.num_chunks  = 0;
   }
@@ -749,8 +748,9 @@ static __device__ std::pair convert_nanoseconds(timesta
 template <int block_size>
 __global__ void __launch_bounds__(128, 8)
   gpuEncodePages(device_span<gpu::EncPage> pages,
-                 device_span<gpu_inflate_input_s> comp_in,
-                 device_span<gpu_inflate_status_s> comp_stat)
+                 device_span<device_span<uint8_t const>> comp_in,
+                 device_span<device_span<uint8_t>> comp_out,
+                 device_span<decompress_status> comp_stats)
 {
   __shared__ __align__(8) page_enc_state_s state_g;
   using block_scan = cub::BlockScan<uint32_t, block_size>;
@@ -760,6 +760,7 @@
   uint32_t t = threadIdx.x;

   if (t == 0) {
+    state_g = page_enc_state_s{};
     s->page = pages[blockIdx.x];
     s->ck   = *s->page.chunk;
     s->col  = *s->ck.col_desc;
@@ -1084,21 +1085,14 @@
     auto actual_data_size        = static_cast<uint32_t>(s->cur - base);
     uint32_t compressed_bfr_size = GetMaxCompressedBfrSize(actual_data_size);
     s->page.max_data_size        = actual_data_size;
-    s->comp_in.srcDevice         = base;
-    s->comp_in.srcSize           = actual_data_size;
-    s->comp_in.dstDevice         = s->page.compressed_data + s->page.max_hdr_size;
-    s->comp_in.dstSize           = compressed_bfr_size;
-    s->comp_stat.bytes_written   = 0;
-    s->comp_stat.status          = ~0;
-    s->comp_stat.reserved        = 0;
-  }
-  __syncthreads();
-  if (t == 0) {
+    if (not comp_in.empty()) {
+      comp_in[blockIdx.x]  = {base, actual_data_size};
+      comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size, compressed_bfr_size};
+    }
     pages[blockIdx.x] = s->page;
-    if (not comp_in.empty()) comp_in[blockIdx.x] = s->comp_in;
-    if (not comp_stat.empty()) {
-      comp_stat[blockIdx.x]       = s->comp_stat;
-      pages[blockIdx.x].comp_stat = &comp_stat[blockIdx.x];
+    if (not comp_stats.empty()) {
+      comp_stats[blockIdx.x]      = {0, ~0u};
+      pages[blockIdx.x].comp_stat =
&comp_stats[blockIdx.x];
     }
   }
 }

@@ -1316,7 +1310,7 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start,
 // blockDim(128, 1, 1)
 __global__ void __launch_bounds__(128)
   gpuEncodePageHeaders(device_span<EncPage> pages,
-                       device_span<gpu_inflate_status_s const> comp_stat,
+                       device_span<decompress_status const> comp_stat,
                        device_span<statistics_chunk const> page_stats,
                        const statistics_chunk* chunk_stats)
 {
@@ -1662,9 +1656,7 @@ dremel_data get_dremel_data(column_view h_col,
     }
   }

-  std::unique_ptr<rmm::device_buffer> device_view_owners;
-  column_device_view* d_nesting_levels;
-  std::tie(device_view_owners, d_nesting_levels) =
+  auto [device_view_owners, d_nesting_levels] =
     contiguous_copy_column_device_views<column_device_view>(nesting_levels, stream);

   thrust::exclusive_scan(
@@ -1734,10 +1726,7 @@ dremel_data get_dremel_data(column_view h_col,
     auto offset_size_at_level = column_ends[level] - column_offsets[level] + 1;

     // Get empties at this level
-    rmm::device_uvector<size_type> empties(0, stream);
-    rmm::device_uvector<size_type> empties_idx(0, stream);
-    size_t empties_size;
-    std::tie(empties, empties_idx, empties_size) =
+    auto [empties, empties_idx, empties_size] =
       get_empties(nesting_levels[level], column_offsets[level], column_ends[level]);

     // Merge empty at deepest parent level with the rep, def level vals at leaf level
@@ -1818,10 +1807,7 @@ dremel_data get_dremel_data(column_view h_col,
     auto offset_size_at_level = column_ends[level] - column_offsets[level] + 1;

     // Get empties at this level
-    rmm::device_uvector<size_type> empties(0, stream);
-    rmm::device_uvector<size_type> empties_idx(0, stream);
-    size_t empties_size;
-    std::tie(empties, empties_idx, empties_size) =
+    auto [empties, empties_idx, empties_size] =
       get_empties(nesting_levels[level], column_offsets[level], column_ends[level]);

     auto offset_transformer = [new_child_offsets = new_offsets.data(),
@@ -1953,14 +1939,15 @@ void InitEncoderPages(device_2dspan<EncColumnChunk> chunks,
 }

 void EncodePages(device_span<gpu::EncPage> pages,
-                 device_span<gpu_inflate_input_s> comp_in,
-                 device_span<gpu_inflate_status_s> comp_stat,
+                 device_span<device_span<uint8_t const>> comp_in,
+                 device_span<device_span<uint8_t>> comp_out,
+                 device_span<decompress_status> comp_stats,
                  rmm::cuda_stream_view stream)
 {
   auto num_pages = pages.size();
   // A page is part of one column. This is launching 1 block per page. 1 block will exclusively
   // deal with one datatype.
-  gpuEncodePages<128><<<num_pages, 128, 0, stream.value()>>>(pages, comp_in, comp_stat);
+  gpuEncodePages<128><<<num_pages, 128, 0, stream.value()>>>(pages, comp_in, comp_out, comp_stats);
 }

 void DecideCompression(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream)
 {
@@ -1969,7 +1956,7 @@
 }

 void EncodePageHeaders(device_span<EncPage> pages,
-                       device_span<gpu_inflate_status_s> comp_stat,
+                       device_span<decompress_status> comp_stats,
                        device_span<statistics_chunk const> page_stats,
                        const statistics_chunk* chunk_stats,
                        rmm::cuda_stream_view stream)
 {
@@ -1977,7 +1964,7 @@
   // TODO: single thread task. No need for 128 threads/block.
Earlier it used to employ the rest of the // threads to coop load structs gpuEncodePageHeaders<<>>( - pages, comp_stat, page_stats, chunk_stats); + pages, comp_stats, page_stats, chunk_stats); } void GatherPages(device_span chunks, diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index b1800640c91..ccaf3485bdf 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -18,6 +18,8 @@ #include "parquet_common.hpp" +#include + #include #include #include @@ -145,6 +147,7 @@ struct SchemaElement { int32_t num_children = 0; int32_t decimal_scale = 0; int32_t decimal_precision = 0; + thrust::optional field_id = thrust::nullopt; // The following fields are filled in later during schema initialization int max_definition_level = 0; @@ -157,7 +160,8 @@ struct SchemaElement { return type == other.type && converted_type == other.converted_type && type_length == other.type_length && repetition_type == other.repetition_type && name == other.name && num_children == other.num_children && - decimal_scale == other.decimal_scale && decimal_precision == other.decimal_precision; + decimal_scale == other.decimal_scale && decimal_precision == other.decimal_precision && + field_id == other.field_id; } // the parquet format is a little squishy when it comes to interpreting diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 8d0aa8881c3..057b9a87214 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -378,7 +378,7 @@ struct EncPage { uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. Includes null/empty elements in //!< non-leaf levels - gpu_inflate_status_s* comp_stat; //!< Ptr to compression status + decompress_status* comp_stat; //!< Ptr to compression status }; /** @@ -529,12 +529,10 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st /** * @brief Insert chunk values into their respective hash maps * - * @param chunks Column chunks [rowgroup][column] * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(cudf::detail::device_2dspan chunks, - cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -554,12 +552,10 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi * Since dict_data itself contains indices into the original cudf column, this means that * col[row] == col[dict_data[dict_index[row - chunk.start_row]]] * - * @param chunks Column chunks [rowgroup][column] * @param frags Column fragments * @param stream CUDA stream to use */ -void get_dictionary_indices(cudf::detail::device_2dspan chunks, - cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -588,13 +584,15 @@ void InitEncoderPages(cudf::detail::device_2dspan chunks, * @brief Launches kernel for packing column data into parquet pages * * @param[in,out] pages Device array of EncPages (unordered) - * @param[out] comp_in Optionally initializes compressor input params - * @param[out] comp_out Optionally initializes compressor output params + * @param[out] comp_in Compressor input buffers + * @param[out] comp_out Compressor output buffers + * @param[out] comp_stats Compressor statuses + * @param[in] stream CUDA stream to use, default 0 */ void EncodePages(device_span pages, -
device_span comp_in, - device_span comp_out, + device_span> comp_in, + device_span> comp_out, + device_span comp_stats, rmm::cuda_stream_view stream); /** @@ -609,13 +607,13 @@ void DecideCompression(device_span chunks, rmm::cuda_stream_view * @brief Launches kernel to encode page headers * * @param[in,out] pages Device array of EncPages - * @param[in] comp_out Compressor status or nullptr if no compression + * @param[in] comp_stats Compressor status * @param[in] page_stats Optional page-level statistics to be included in page header * @param[in] chunk_stats Optional chunk-level statistics to be encoded * @param[in] stream CUDA stream to use, default 0 */ void EncodePageHeaders(device_span pages, - device_span comp_out, + device_span comp_stats, device_span page_stats, const statistics_chunk* chunk_stats, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 46b3206f731..a40993ee2dd 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -24,6 +24,7 @@ #include "compact_protocol_reader.hpp" #include +#include #include #include @@ -38,10 +39,9 @@ #include #include -#include - #include #include +#include #include #include @@ -1050,96 +1050,13 @@ void reader::impl::decode_page_headers(hostdevice_vector& pages.device_to_host(stream, true); } -__global__ void decompress_check_kernel(device_span stats, - bool* any_block_failure) -{ - auto tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < stats.size()) { - if (stats[tid].status != 0) { - *any_block_failure = true; // Doesn't need to be atomic - } - } -} - -void decompress_check(device_span stats, - bool* any_block_failure, - rmm::cuda_stream_view stream) -{ - if (stats.empty()) { return; } // early exit for empty stats - - dim3 block(128); - dim3 grid(cudf::util::div_rounding_up_safe(stats.size(), static_cast(block.x))); - decompress_check_kernel<<>>(stats, any_block_failure); -} - -__global__ void convert_nvcomp_status(device_span nvcomp_stats, - device_span stats) -{ - auto tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < stats.size()) { - stats[tid].status = nvcomp_stats[tid] == nvcompStatus_t::nvcompSuccess ? 
0 : 1; - } -} - -void snappy_decompress(device_span comp_in, - device_span comp_stat, - size_t max_uncomp_page_size, - rmm::cuda_stream_view stream) +void decompress_check(device_span stats, rmm::cuda_stream_view stream) { - size_t num_comp_pages = comp_in.size(); - size_t temp_size; - - nvcompStatus_t nvcomp_status = - nvcompBatchedSnappyDecompressGetTempSize(num_comp_pages, max_uncomp_page_size, &temp_size); - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, - "Unable to get scratch size for snappy decompression"); - - // Not needed now but nvcomp API makes no promises about future - rmm::device_buffer scratch(temp_size, stream); - // Analogous to comp_in.srcDevice - rmm::device_uvector compressed_data_ptrs(num_comp_pages, stream); - // Analogous to comp_in.srcSize - rmm::device_uvector compressed_data_sizes(num_comp_pages, stream); - // Analogous to comp_in.dstDevice - rmm::device_uvector uncompressed_data_ptrs(num_comp_pages, stream); - // Analogous to comp_in.dstSize - rmm::device_uvector uncompressed_data_sizes(num_comp_pages, stream); - - // Analogous to comp_stat.bytes_written - rmm::device_uvector actual_uncompressed_data_sizes(num_comp_pages, stream); - // Convertible to comp_stat.status - rmm::device_uvector statuses(num_comp_pages, stream); - device_span statuses_span(statuses.data(), statuses.size()); - - // Prepare the vectors - auto comp_it = thrust::make_zip_iterator(compressed_data_ptrs.begin(), - compressed_data_sizes.begin(), - uncompressed_data_ptrs.begin(), - uncompressed_data_sizes.data()); - thrust::transform(rmm::exec_policy(stream), - comp_in.begin(), - comp_in.end(), - comp_it, - [] __device__(gpu_inflate_input_s in) { - return thrust::make_tuple(in.srcDevice, in.srcSize, in.dstDevice, in.dstSize); - }); - - nvcomp_status = nvcompBatchedSnappyDecompressAsync(compressed_data_ptrs.data(), - compressed_data_sizes.data(), - uncompressed_data_sizes.data(), - actual_uncompressed_data_sizes.data(), - num_comp_pages, - scratch.data(), - scratch.size(), - uncompressed_data_ptrs.data(), - statuses.data(), - stream.value()); - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, - "unable to perform snappy decompression"); - - dim3 block(128); - dim3 grid(cudf::util::div_rounding_up_safe(num_comp_pages, static_cast(block.x))); - convert_nvcomp_status<<>>(statuses_span, comp_stat); + CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), + stats.begin(), + stats.end(), + [] __device__(auto const& stat) { return stat.status == 0; }), + "Error during decompression"); } /** @@ -1175,9 +1092,9 @@ rmm::device_buffer reader::impl::decompress_page_data( int32_t max_decompressed_size; }; - std::array codecs{codec_stats{parquet::GZIP, 0, 0}, - codec_stats{parquet::SNAPPY, 0, 0}, - codec_stats{parquet::BROTLI, 0, 0}}; + std::array codecs{codec_stats{parquet::GZIP, 0, 0}, + codec_stats{parquet::SNAPPY, 0, 0}, + codec_stats{parquet::BROTLI, 0, 0}}; auto is_codec_supported = [&codecs](int8_t codec) { if (codec == parquet::UNCOMPRESSED) return true; @@ -1207,91 +1124,73 @@ rmm::device_buffer reader::impl::decompress_page_data( // Dispatch batches of pages to decompress for each codec rmm::device_buffer decomp_pages(total_decomp_size, stream); - hostdevice_vector inflate_in(0, num_comp_pages, stream); - hostdevice_vector inflate_out(0, num_comp_pages, stream); - hostdevice_vector any_block_failure(1, stream); - any_block_failure[0] = false; - any_block_failure.host_to_device(stream); + std::vector> comp_in; + comp_in.reserve(num_comp_pages); + std::vector> comp_out; + 
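The thrust::all_of reduction in the new decompress_check above collapses the old two-step check (a custom kernel writing a device-side failure flag, then a copy back to the host) into a single device-side predicate scan. A self-contained sketch of the same pattern, with a plain int status array standing in for the decompress_status structs (illustration only, not part of the patch):

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/logical.h>

// Returns true only if every page reports status == 0 (success). Compiling the
// device lambda requires nvcc's --extended-lambda flag.
bool all_pages_decompressed(thrust::device_vector<int> const& status_codes)
{
  return thrust::all_of(thrust::device,
                        status_codes.begin(),
                        status_codes.end(),
                        [] __device__(int s) { return s == 0; });
}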
comp_out.reserve(num_comp_pages); - device_span inflate_in_view(inflate_in.device_ptr(), inflate_in.size()); - device_span inflate_out_view(inflate_out.device_ptr(), inflate_out.size()); + rmm::device_uvector comp_stats(num_comp_pages, stream); + thrust::fill(rmm::exec_policy(stream), + comp_stats.begin(), + comp_stats.end(), + decompress_status{0, static_cast(-1000), 0}); size_t decomp_offset = 0; - int32_t argc = 0; + int32_t start_pos = 0; for (const auto& codec : codecs) { - if (codec.num_pages > 0) { - int32_t start_pos = argc; - - for_each_codec_page(codec.compression_type, [&](size_t page) { - auto dst_base = static_cast(decomp_pages.data()); - inflate_in[argc].srcDevice = pages[page].page_data; - inflate_in[argc].srcSize = pages[page].compressed_page_size; - inflate_in[argc].dstDevice = dst_base + decomp_offset; - inflate_in[argc].dstSize = pages[page].uncompressed_page_size; - - inflate_out[argc].bytes_written = 0; - inflate_out[argc].status = static_cast(-1000); - inflate_out[argc].reserved = 0; - - pages[page].page_data = static_cast(inflate_in[argc].dstDevice); - decomp_offset += inflate_in[argc].dstSize; - argc++; - }); + if (codec.num_pages == 0) { continue; } - CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_in.device_ptr(start_pos), - inflate_in.host_ptr(start_pos), - sizeof(decltype(inflate_in)::value_type) * (argc - start_pos), - cudaMemcpyHostToDevice, - stream.value())); - CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_out.device_ptr(start_pos), - inflate_out.host_ptr(start_pos), - sizeof(decltype(inflate_out)::value_type) * (argc - start_pos), - cudaMemcpyHostToDevice, - stream.value())); - - switch (codec.compression_type) { - case parquet::GZIP: - CUDF_CUDA_TRY(gpuinflate(inflate_in.device_ptr(start_pos), - inflate_out.device_ptr(start_pos), - argc - start_pos, - 1, - stream)) - break; - case parquet::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { - snappy_decompress(inflate_in_view.subspan(start_pos, argc - start_pos), - inflate_out_view.subspan(start_pos, argc - start_pos), - codec.max_decompressed_size, - stream); - } else { - CUDF_CUDA_TRY(gpu_unsnap(inflate_in.device_ptr(start_pos), - inflate_out.device_ptr(start_pos), - argc - start_pos, - stream)); - } - break; - case parquet::BROTLI: - CUDF_CUDA_TRY(gpu_debrotli(inflate_in.device_ptr(start_pos), - inflate_out.device_ptr(start_pos), - debrotli_scratch.data(), - debrotli_scratch.size(), - argc - start_pos, - stream)); - break; - default: CUDF_FAIL("Unexpected decompression dispatch"); break; - } - CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_out.host_ptr(start_pos), - inflate_out.device_ptr(start_pos), - sizeof(decltype(inflate_out)::value_type) * (argc - start_pos), - cudaMemcpyDeviceToHost, - stream.value())); + for_each_codec_page(codec.compression_type, [&](size_t page) { + auto dst_base = static_cast(decomp_pages.data()); + comp_in.emplace_back(pages[page].page_data, + static_cast(pages[page].compressed_page_size)); + comp_out.emplace_back(dst_base + decomp_offset, + static_cast(pages[page].uncompressed_page_size)); + + pages[page].page_data = static_cast(comp_out.back().data()); + decomp_offset += comp_out.back().size(); + }); + + host_span const> comp_in_view{comp_in.data() + start_pos, + codec.num_pages}; + auto const d_comp_in = cudf::detail::make_device_uvector_async(comp_in_view, stream); + host_span const> comp_out_view(comp_out.data() + start_pos, + codec.num_pages); + auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, stream); + device_span d_comp_stats_view(comp_stats.data() + 
start_pos, + codec.num_pages); + + switch (codec.compression_type) { + case parquet::GZIP: + gpuinflate(d_comp_in, d_comp_out, d_comp_stats_view, gzip_header_included::YES, stream); + break; + case parquet::SNAPPY: + if (nvcomp_integration::is_stable_enabled()) { + nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, + d_comp_in, + d_comp_out, + d_comp_stats_view, + codec.max_decompressed_size, + stream); + } else { + gpu_unsnap(d_comp_in, d_comp_out, d_comp_stats_view, stream); + } + break; + case parquet::BROTLI: + gpu_debrotli(d_comp_in, + d_comp_out, + d_comp_stats_view, + debrotli_scratch.data(), + debrotli_scratch.size(), + stream); + break; + default: CUDF_FAIL("Unexpected decompression dispatch"); break; } + start_pos += codec.num_pages; } - decompress_check(inflate_out_view, any_block_failure.device_ptr(), stream); - any_block_failure.device_to_host(stream, true); // synchronizes stream - CUDF_EXPECTS(not any_block_failure[0], "Error during decompression"); + decompress_check(comp_stats, stream); // Update the page information in device memory with the updated value of // page_data; it now points to the uncompressed data buffer @@ -1729,11 +1628,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, continue; } - int32_t type_width; - int32_t clock_rate; - int8_t converted_type; - - std::tie(type_width, clock_rate, converted_type) = + auto [type_width, clock_rate, converted_type] = conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), _timestamp_type.id(), schema.type, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index cb1acb4d9ec..dbbd39fb508 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -491,6 +491,13 @@ std::vector construct_schema_tree( [&](cudf::detail::LinkedColPtr const& col, column_in_metadata& col_meta, size_t parent_idx) { bool col_nullable = is_col_nullable(col, col_meta, single_write_mode); + auto set_field_id = [&schema, parent_idx](schema_tree_node& s, + column_in_metadata const& col_meta) { + if (schema[parent_idx].name != "list" and col_meta.is_parquet_field_id_set()) { + s.field_id = col_meta.get_parquet_field_id(); + } + }; + if (col->type().id() == type_id::STRUCT) { // if struct, add current and recursively call for all children schema_tree_node struct_schema{}; @@ -500,6 +507,7 @@ std::vector construct_schema_tree( struct_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); struct_schema.num_children = col->children.size(); struct_schema.parent_idx = parent_idx; + set_field_id(struct_schema, col_meta); schema.push_back(std::move(struct_schema)); auto struct_node_index = schema.size() - 1; @@ -524,6 +532,7 @@ std::vector construct_schema_tree( list_schema_1.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); list_schema_1.num_children = 1; list_schema_1.parent_idx = parent_idx; + set_field_id(list_schema_1, col_meta); schema.push_back(std::move(list_schema_1)); schema_tree_node list_schema_2{}; @@ -555,7 +564,10 @@ std::vector construct_schema_tree( map_schema.converted_type = ConvertedType::MAP; map_schema.repetition_type = col_nullable ? 
FieldRepetitionType::OPTIONAL : FieldRepetitionType::REQUIRED; - map_schema.name = col_meta.get_name(); + map_schema.name = col_meta.get_name(); + if (col_meta.is_parquet_field_id_set()) { + map_schema.field_id = col_meta.get_parquet_field_id(); + } map_schema.num_children = 1; map_schema.parent_idx = parent_idx; schema.push_back(std::move(map_schema)); @@ -612,6 +624,7 @@ std::vector construct_schema_tree( col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); col_schema.parent_idx = parent_idx; col_schema.leaf_column = col; + set_field_id(col_schema, col_meta); schema.push_back(col_schema); } }; @@ -863,7 +876,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, std::vector> dict_data; std::vector> dict_index; - if (h_chunks.size() == 0) { return std::make_pair(std::move(dict_data), std::move(dict_index)); } + if (h_chunks.size() == 0) { return std::pair(std::move(dict_data), std::move(dict_index)); } // Allocate slots for each chunk std::vector> hash_maps_storage; @@ -882,7 +895,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.host_to_device(stream); gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); - gpu::populate_chunk_hash_maps(chunks, frags, stream); + gpu::populate_chunk_hash_maps(frags, stream); chunks.device_to_host(stream, true); @@ -899,7 +912,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, // We don't use dictionary if the indices are > 16 bits because that's the maximum bitpacking // bitsize we efficiently support - if (nbits > 16) { return std::make_pair(false, 0); } + if (nbits > 16) { return std::pair(false, 0); } // Only these bit sizes are allowed for RLE encoding because it's compute optimized constexpr auto allowed_bitsizes = std::array{1, 2, 4, 8, 12, 16}; @@ -912,7 +925,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, bool use_dict = (ck.plain_data_size > dict_enc_size); if (not use_dict) { rle_bits = 0; } - return std::make_pair(use_dict, rle_bits); + return std::pair(use_dict, rle_bits); }(); } @@ -931,9 +944,9 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, } chunks.host_to_device(stream); gpu::collect_map_entries(chunks.device_view().flat_view(), stream); - gpu::get_dictionary_indices(chunks.device_view(), frags, stream); + gpu::get_dictionary_indices(frags, stream); - return std::make_pair(std::move(dict_data), std::move(dict_index)); + return std::pair(std::move(dict_data), std::move(dict_index)); } void writer::impl::init_encoder_pages(hostdevice_2dvector& chunks, @@ -971,8 +984,9 @@ void writer::impl::init_encoder_pages(hostdevice_2dvector& stream.synchronize(); } -void snappy_compress(device_span comp_in, - device_span comp_stat, +void snappy_compress(device_span const> comp_in, + device_span const> comp_out, + device_span comp_stats, size_t max_page_uncomp_data_size, rmm::cuda_stream_view stream) { @@ -999,16 +1013,20 @@ void snappy_compress(device_span comp_in, // the space allocated unless one uses the API nvcompBatchedSnappyCompressGetOutputSize() // Prepare the vectors - auto comp_it = thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), - uncompressed_data_sizes.begin(), - compressed_data_ptrs.begin()); + auto comp_it = + thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), uncompressed_data_sizes.begin()); + thrust::transform( + rmm::exec_policy(stream), + comp_in.begin(), + comp_in.end(), + comp_it, + [] __device__(auto const& in) { return thrust::make_tuple(in.data(), in.size()); }); + 
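// The pair of transforms in snappy_compress adapts the span-based arguments to nvcomp's
// structure-of-arrays interface: the zip-iterator transform above unpacks each input
// device_span into parallel pointer and size arrays, while the transform just below keeps
// only the destination pointers, since the batched snappy API reports compressed sizes
// through a separate device array (compressed_bytes_written).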
thrust::transform(rmm::exec_policy(stream), - comp_in.begin(), - comp_in.end(), - comp_it, - [] __device__(gpu_inflate_input_s in) { - return thrust::make_tuple(in.srcDevice, in.srcSize, in.dstDevice); - }); + comp_out.begin(), + comp_out.end(), + compressed_data_ptrs.begin(), + [] __device__(auto const& out) { return out.data(); }); nvcomp_status = nvcompBatchedSnappyCompressAsync(uncompressed_data_ptrs.data(), uncompressed_data_sizes.data(), max_page_uncomp_data_size, @@ -1028,9 +1046,9 @@ void snappy_compress(device_span comp_in, thrust::transform(rmm::exec_policy(stream), compressed_bytes_written.begin(), compressed_bytes_written.end(), - comp_stat.begin(), + comp_stats.begin(), [] __device__(size_t size) { - gpu_inflate_status_s status{}; + decompress_status status{}; status.bytes_written = size; return status; }); @@ -1038,9 +1056,9 @@ void snappy_compress(device_span comp_in, } catch (...) { // If we reach this then there was an error in compressing so set an error status for each page thrust::for_each(rmm::exec_policy(stream), - comp_stat.begin(), - comp_stat.end(), - [] __device__(gpu_inflate_status_s & stat) { stat.status = 1; }); + comp_stats.begin(), + comp_stats.end(), + [] __device__(decompress_status & stat) { stat.status = 1; }); }; } @@ -1064,19 +1082,17 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks uint32_t max_comp_pages = (compression_ != parquet::Compression::UNCOMPRESSED) ? pages_in_batch : 0; - rmm::device_uvector compression_input(max_comp_pages, stream); - rmm::device_uvector compression_status(max_comp_pages, stream); - - device_span comp_in{compression_input.data(), compression_input.size()}; - device_span comp_stat{compression_status.data(), compression_status.size()}; + rmm::device_uvector> comp_in(max_comp_pages, stream); + rmm::device_uvector> comp_out(max_comp_pages, stream); + rmm::device_uvector comp_stats(max_comp_pages, stream); - gpu::EncodePages(batch_pages, comp_in, comp_stat, stream); + gpu::EncodePages(batch_pages, comp_in, comp_out, comp_stats, stream); switch (compression_) { case parquet::Compression::SNAPPY: if (nvcomp_integration::is_stable_enabled()) { - snappy_compress(comp_in, comp_stat, max_page_uncomp_data_size, stream); + snappy_compress(comp_in, comp_out, comp_stats, max_page_uncomp_data_size, stream); } else { - CUDF_CUDA_TRY(gpu_snap(comp_in.data(), comp_stat.data(), pages_in_batch, stream)); + gpu_snap(comp_in, comp_out, comp_stats, stream); } break; default: break; @@ -1085,7 +1101,7 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks // chunk-level auto d_chunks_in_batch = chunks.device_view().subspan(first_rowgroup, rowgroups_in_batch); DecideCompression(d_chunks_in_batch.flat_view(), stream); - EncodePageHeaders(batch_pages, comp_stat, batch_pages_stats, chunk_stats, stream); + EncodePageHeaders(batch_pages, comp_stats, batch_pages_stats, chunk_stats, stream); GatherPages(d_chunks_in_batch.flat_view(), pages, stream); auto h_chunks_in_batch = chunks.host_view().subspan(first_rowgroup, rowgroups_in_batch); diff --git a/cpp/src/io/statistics/column_statistics.cuh b/cpp/src/io/statistics/column_statistics.cuh index 9be19979c50..9ba54ec550c 100644 --- a/cpp/src/io/statistics/column_statistics.cuh +++ b/cpp/src/io/statistics/column_statistics.cuh @@ -332,10 +332,8 @@ __global__ void __launch_bounds__(block_size, 1) cooperative_load(state.group, &groups[blockIdx.x]); __syncthreads(); - cooperative_load(state.col, state.group.col); - __syncthreads(); - type_dispatcher(state.col.leaf_column->type(), + 
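// Storing col_dtype and stats_dtype directly in statistics_merge_group (see the
// statistics.cuh hunk below) lets the merge kernel dispatch on the column's type without
// the second cooperative_load of the full stats_column_desc, and without dereferencing
// leaf_column, as was needed before.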
type_dispatcher(state.group.col_dtype, merge_group_statistics_functor(storage), state, chunks_in + state.group.start_chunk, diff --git a/cpp/src/io/statistics/statistics.cuh b/cpp/src/io/statistics/statistics.cuh index 87d92beb595..bb3c3ee152c 100644 --- a/cpp/src/io/statistics/statistics.cuh +++ b/cpp/src/io/statistics/statistics.cuh @@ -107,7 +107,8 @@ struct statistics_group { }; struct statistics_merge_group { - const stats_column_desc* col; //!< Column information + data_type col_dtype; //!< Column data type + statistics_dtype stats_dtype; //!< Statistics data type for this column uint32_t start_chunk; //!< Start chunk of this group uint32_t num_chunks; //!< Number of chunks in group }; diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index ed8c3d6e1e3..08b5914cb19 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -35,7 +35,7 @@ namespace { /** * @brief Defines which cuFile usage to enable. */ -enum class usage_policy : uint8_t { OFF, GDS, ALWAYS }; +enum class usage_policy : uint8_t { OFF, GDS, ALWAYS, KVIKIO }; /** * @brief Get the current usage policy. @@ -46,6 +46,7 @@ usage_policy get_env_policy() if (env_val == "OFF") return usage_policy::OFF; if (env_val == "GDS") return usage_policy::GDS; if (env_val == "ALWAYS") return usage_policy::ALWAYS; + if (env_val == "KVIKIO") return usage_policy::KVIKIO; CUDF_FAIL("Invalid LIBCUDF_CUFILE_POLICY value: " + env_val); } } // namespace @@ -54,6 +55,8 @@ bool is_always_enabled() { return get_env_policy() == usage_policy::ALWAYS; } bool is_gds_enabled() { return is_always_enabled() or get_env_policy() == usage_policy::GDS; } +bool is_kvikio_enabled() { return get_env_policy() == usage_policy::KVIKIO; } + } // namespace cufile_integration namespace nvcomp_integration { diff --git a/cpp/src/io/utilities/config_utils.hpp b/cpp/src/io/utilities/config_utils.hpp index 80c20529687..4f6a14091cf 100644 --- a/cpp/src/io/utilities/config_utils.hpp +++ b/cpp/src/io/utilities/config_utils.hpp @@ -48,6 +48,11 @@ bool is_always_enabled(); */ bool is_gds_enabled(); +/** + * @brief Returns true if KvikIO is enabled. + */ +bool is_kvikio_enabled(); + } // namespace cufile_integration namespace nvcomp_integration { diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 63d0103ddec..042afc01253 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
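For reference, the KVIKIO policy added above is selected purely through the LIBCUDF_CUFILE_POLICY environment variable. A simplified restatement of the parsing logic (the real code lives in get_env_policy() shown in the config_utils.cpp hunk; the default value here is an assumption, and CUDF_FAIL is replaced by a plain exception to keep the sketch self-contained):

#include <cstdint>
#include <cstdlib>
#include <stdexcept>
#include <string>

enum class usage_policy : uint8_t { OFF, GDS, ALWAYS, KVIKIO };

usage_policy policy_from_env()
{
  char const* raw = std::getenv("LIBCUDF_CUFILE_POLICY");
  std::string const env_val = raw != nullptr ? raw : "GDS";  // assumed default
  if (env_val == "OFF") return usage_policy::OFF;
  if (env_val == "GDS") return usage_policy::GDS;
  if (env_val == "ALWAYS") return usage_policy::ALWAYS;
  if (env_val == "KVIKIO") return usage_policy::KVIKIO;
  throw std::invalid_argument("Invalid LIBCUDF_CUFILE_POLICY value: " + env_val);
}

With this in place, running an application as LIBCUDF_CUFILE_POLICY=KVIKIO ./app routes device reads and writes through kvikio::FileHandle, while OFF/GDS/ALWAYS keep the existing cuFile paths.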
@@ -19,7 +19,9 @@ #include "file_io_utilities.hpp" #include #include +#include +#include #include namespace cudf { namespace io { class file_sink : public data_sink { public: explicit file_sink(std::string const& filepath) - : _cufile_out(detail::make_cufile_output(filepath)) { _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); CUDF_EXPECTS(_output_stream.is_open(), "Cannot open output file"); + + if (detail::cufile_integration::is_kvikio_enabled()) { + _kvikio_file = kvikio::FileHandle(filepath, "w"); + } else { + _cufile_out = detail::make_cufile_output(filepath); + } } virtual ~file_sink() { flush(); } @@ -49,19 +56,15 @@ class file_sink : public data_sink { size_t bytes_written() override { return _bytes_written; } - [[nodiscard]] bool supports_device_write() const override { return _cufile_out != nullptr; } - - [[nodiscard]] bool is_device_write_preferred(size_t size) const override + [[nodiscard]] bool supports_device_write() const override { - return _cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size); + return !_kvikio_file.closed() || _cufile_out != nullptr; } - void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override + [[nodiscard]] bool is_device_write_preferred(size_t size) const override { - if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); - - _cufile_out->write(gpu_data, _bytes_written, size); - _bytes_written += size; + return !_kvikio_file.closed() || + (_cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size)); } std::future device_write_async(void const* gpu_data, @@ -70,15 +73,30 @@ { if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); - auto result = _cufile_out->write_async(gpu_data, _bytes_written, size); + size_t offset = _bytes_written; _bytes_written += size; - return result; + + if (!_kvikio_file.closed()) { + // KvikIO's `pwrite()` returns a `std::future<std::size_t>` so we convert it + // to `std::future<void>` + return std::async(std::launch::deferred, [this, gpu_data, size, offset] { + _kvikio_file.pwrite(gpu_data, size, offset).get(); + }); + } + return _cufile_out->write_async(gpu_data, offset, size); + } + + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override + { + if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); + return device_write_async(gpu_data, size, stream).get(); + } private: std::ofstream _output_stream; size_t _bytes_written = 0; std::unique_ptr _cufile_out; + kvikio::FileHandle _kvikio_file; }; /** diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 6f864ab509f..80e07f31dd9 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
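The std::async wrapper in file_sink::device_write_async above exists only to adapt future types; the same trick in isolation (function name hypothetical):

#include <cstddef>
#include <future>

// kvikio::FileHandle::pwrite() yields a std::future carrying the byte count, but the
// data_sink interface promises a std::future<void>. A deferred task that consumes the
// value adapts one to the other without blocking at call time; the write is driven to
// completion when the caller eventually calls .get() or .wait() on the returned future.
std::future<void> discard_result(std::future<std::size_t> bytes_written)
{
  return std::async(std::launch::deferred,
                    [f = std::move(bytes_written)]() mutable { f.get(); });
}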
@@ -20,6 +20,9 @@ #include #include +#include +#include + #include #include #include @@ -33,28 +36,38 @@ namespace { */ class file_source : public datasource { public: - explicit file_source(const char* filepath) - : _file(filepath, O_RDONLY), _cufile_in(detail::make_cufile_input(filepath)) + explicit file_source(const char* filepath) : _file(filepath, O_RDONLY) { + if (detail::cufile_integration::is_kvikio_enabled()) { + _kvikio_file = kvikio::FileHandle(filepath); + } else { + _cufile_in = detail::make_cufile_input(filepath); + } } virtual ~file_source() = default; - [[nodiscard]] bool supports_device_read() const override { return _cufile_in != nullptr; } + [[nodiscard]] bool supports_device_read() const override + { + return !_kvikio_file.closed() || _cufile_in != nullptr; + } [[nodiscard]] bool is_device_read_preferred(size_t size) const override { - return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size); + return !_kvikio_file.closed() || + (_cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size)); } - std::unique_ptr device_read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) override + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); auto const read_size = std::min(size, _file.size() - offset); - return _cufile_in->read(offset, read_size, stream); + if (!_kvikio_file.closed()) { return _kvikio_file.pread(dst, read_size, offset); } + return _cufile_in->read_async(offset, read_size, dst, stream); } size_t device_read(size_t offset, @@ -62,21 +75,17 @@ class file_source : public datasource { uint8_t* dst, rmm::cuda_stream_view stream) override { - CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - - auto const read_size = std::min(size, _file.size() - offset); - return _cufile_in->read(offset, read_size, dst, stream); + return device_read_async(offset, size, dst, stream).get(); } - std::future device_read_async(size_t offset, - size_t size, - uint8_t* dst, - rmm::cuda_stream_view stream) override + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override { - CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - - auto const read_size = std::min(size, _file.size() - offset); - return _cufile_in->read_async(offset, read_size, dst, stream); + rmm::device_buffer out_data(size, stream); + size_t read = device_read(offset, size, reinterpret_cast(out_data.data()), stream); + out_data.resize(read, stream); + return datasource::buffer::create(std::move(out_data)); } [[nodiscard]] size_t size() const override { return _file.size(); } @@ -86,6 +95,7 @@ class file_source : public datasource { private: std::unique_ptr _cufile_in; + kvikio::FileHandle _kvikio_file; }; /** diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index f7e250f1d3f..c0dd85702e2 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -176,16 +176,6 @@ cufile_input_impl::cufile_input_impl(std::string const& filepath) pool.sleep_duration = 10; } -std::unique_ptr cufile_input_impl::read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) -{ - rmm::device_buffer out_data(size, stream); - auto read_size = read(offset, size, reinterpret_cast(out_data.data()), stream); - out_data.resize(read_size, stream); - 
return datasource::buffer::create(std::move(out_data)); -} - namespace { template cufile_input_impl::read_async(size_t offset, return std::async(std::launch::deferred, waiter, std::move(slice_tasks)); } -size_t cufile_input_impl::read(size_t offset, - size_t size, - uint8_t* dst, - rmm::cuda_stream_view stream) -{ - auto result = read_async(offset, size, dst, stream); - return result.get(); -} - cufile_output_impl::cufile_output_impl(std::string const& filepath) : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 0664), @@ -250,11 +231,6 @@ cufile_output_impl::cufile_output_impl(std::string const& filepath) { } -void cufile_output_impl::write(void const* data, size_t offset, size_t size) -{ - write_async(data, offset, size).wait(); -} - std::future cufile_output_impl::write_async(void const* data, size_t offset, size_t size) { int device; diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index be3ecc49ab0..704ee77de8a 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -80,35 +80,6 @@ class cufile_io_base { */ class cufile_input : public cufile_io_base { public: - /** - * @brief Reads into a new device buffer. - * - * @throws cudf::logic_error on cuFile error - * - * @param offset Number of bytes from the start - * @param size Number of bytes to read - * @param stream CUDA stream to use - * - * @return The data buffer in the device memory - */ - virtual std::unique_ptr read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) = 0; - - /** - * @brief Reads into existing device memory. - * - * @throws cudf::logic_error on cuFile error - * - * @param offset Number of bytes from the start - * @param size Number of bytes to read - * @param dst Address of the existing device memory - * @param stream CUDA stream to use - * - * @return The number of bytes read - */ - virtual size_t read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) = 0; - /** * @brief Asynchronously reads into existing device memory. * @@ -132,17 +103,6 @@ class cufile_input : public cufile_io_base { */ class cufile_output : public cufile_io_base { public: - /** - * @brief Writes the data from a device buffer into a file. - * - * @throws cudf::logic_error on cuFile error - * - * @param data Pointer to the buffer to be written into the output file - * @param offset Number of bytes from the start - * @param size Number of bytes to write - */ - virtual void write(void const* data, size_t offset, size_t size) = 0; - /** * @brief Asynchronously writes the data from a device buffer into a file. 
* @@ -203,12 +163,6 @@ class cufile_input_impl final : public cufile_input { public: cufile_input_impl(std::string const& filepath); - std::unique_ptr read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) override; - - size_t read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) override; - std::future read_async(size_t offset, size_t size, uint8_t* dst, @@ -229,7 +183,6 @@ class cufile_output_impl final : public cufile_output { public: cufile_output_impl(std::string const& filepath); - void write(void const* data, size_t offset, size_t size) override; std::future write_async(void const* data, size_t offset, size_t size) override; private: @@ -241,18 +194,6 @@ class cufile_output_impl final : public cufile_output { class cufile_input_impl final : public cufile_input { public: - std::unique_ptr read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) override - { - CUDF_FAIL("Only used to compile without cufile library, should not be called"); - } - - size_t read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) override - { - CUDF_FAIL("Only used to compile without cufile library, should not be called"); - } - std::future read_async(size_t offset, size_t size, uint8_t* dst, @@ -264,10 +205,6 @@ class cufile_input_impl final : public cufile_input { class cufile_output_impl final : public cufile_output { public: - void write(void const* data, size_t offset, size_t size) override - { - CUDF_FAIL("Only used to compile without cufile library, should not be called"); - } std::future write_async(void const* data, size_t offset, size_t size) override { CUDF_FAIL("Only used to compile without cufile library, should not be called"); diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 5c73cf31428..30c7b6ec326 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -51,10 +51,10 @@ class hostdevice_vector { } explicit hostdevice_vector(size_t initial_size, size_t max_size, rmm::cuda_stream_view stream) - : num_elements(initial_size), max_elements(max_size) + : max_elements(max_size), num_elements(initial_size) { if (max_elements != 0) { - CUDF_CUDA_TRY(cudaMallocHost(&h_data, sizeof(T) * max_elements)); + CUDF_CUDA_TRY(cudaMallocHost(reinterpret_cast(&h_data), sizeof(T) * max_elements)); d_data.resize(sizeof(T) * max_elements, stream); } } @@ -62,7 +62,7 @@ class hostdevice_vector { ~hostdevice_vector() { if (max_elements != 0) { - auto const free_result = cudaFreeHost(h_data); + [[maybe_unused]] auto const free_result = cudaFreeHost(h_data); assert(free_result == cudaSuccess); } } @@ -93,6 +93,31 @@ class hostdevice_vector { return reinterpret_cast(d_data.data()) + offset; } + /** + * @brief Returns the specified element from device memory + * + * @note This function incurs a device to host memcpy and should be used sparingly. + * @note This function synchronizes `stream`. 
+ * + * @throws rmm::out_of_range exception if `element_index >= size()` + * + * @param element_index Index of the desired element + * @param stream The stream on which to perform the copy + * @return The value of the specified element + */ + [[nodiscard]] T element(std::size_t element_index, rmm::cuda_stream_view stream) const + { + CUDF_EXPECTS(element_index < size(), "Attempt to access out of bounds element."); + T value; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + reinterpret_cast(d_data.data()) + element_index, + sizeof(value), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + return value; + } + operator cudf::device_span() { return {device_ptr(), max_elements}; } operator cudf::device_span() const { return {device_ptr(), max_elements}; } diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 9bf7e6a7a43..ae1561b422b 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -59,8 +59,8 @@ conditional_join(table_view const& left, // Inner and left semi joins return empty output because no matches can exist. case join_kind::INNER_JOIN: case join_kind::LEFT_SEMI_JOIN: - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); default: CUDF_FAIL("Invalid join kind."); break; } } else if (left_num_rows == 0) { @@ -70,12 +70,12 @@ conditional_join(table_view const& left, case join_kind::LEFT_ANTI_JOIN: case join_kind::INNER_JOIN: case join_kind::LEFT_SEMI_JOIN: - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); // Full joins need to return the trivial complement. case join_kind::FULL_JOIN: { auto ret_flipped = get_trivial_left_join_indices(right, stream); - return std::make_pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); + return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); } default: CUDF_FAIL("Invalid join kind."); break; } @@ -139,8 +139,8 @@ conditional_join(table_view const& left, // all other cases (inner, left semi, and left anti joins) if we reach this // point we can safely return an empty result. if (join_size == 0) { - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } rmm::device_scalar write_index(0, stream); @@ -176,7 +176,7 @@ conditional_join(table_view const& left, swap_tables); } - auto join_indices = std::make_pair(std::move(left_indices), std::move(right_indices)); + auto join_indices = std::pair(std::move(left_indices), std::move(right_indices)); // For full joins, get the indices in the right table that were not joined to // by any row in the left table. diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 086e1e49986..3e0e76de708 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -13,11 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include +#include "join_common_utils.cuh" #include #include +#include +#include #include +#include #include #include @@ -38,13 +41,67 @@ namespace cudf { namespace detail { - -std::pair, std::unique_ptr
> get_empty_joined_table( - table_view const& probe, table_view const& build) +namespace { +/** + * @brief Calculates the exact size of the join output produced when + * joining two tables together. + * + * @throw cudf::logic_error if JoinKind is not INNER_JOIN or LEFT_JOIN + * + * @tparam JoinKind The type of join to be performed + * + * @param build_table The right hand table + * @param probe_table The left hand table + * @param hash_table A hash table built on the build table that maps the index + * of every row to the hash value of that row. + * @param nulls_equal Flag to denote nulls are equal or not. + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return The exact size of the output of the join operation + */ +template +std::size_t compute_join_output_size(table_device_view build_table, + table_device_view probe_table, + cudf::detail::multimap_type const& hash_table, + bool const has_nulls, + cudf::null_equality const nulls_equal, + rmm::cuda_stream_view stream) { - std::unique_ptr
empty_probe = empty_like(probe); - std::unique_ptr
empty_build = empty_like(build); - return std::make_pair(std::move(empty_probe), std::move(empty_build)); + const size_type build_table_num_rows{build_table.num_rows()}; + const size_type probe_table_num_rows{probe_table.num_rows()}; + + // If the build table is empty, we know exactly how large the output + // will be for the different types of joins and can return immediately + if (0 == build_table_num_rows) { + switch (JoinKind) { + // Inner join with an empty table will have no output + case join_kind::INNER_JOIN: return 0; + + // Left join with an empty table will have an output of NULL rows + // equal to the number of rows in the probe table + case join_kind::LEFT_JOIN: return probe_table_num_rows; + + default: CUDF_FAIL("Unsupported join type"); + } + } + + auto const probe_nulls = cudf::nullate::DYNAMIC{has_nulls}; + pair_equality equality{probe_table, build_table, probe_nulls, nulls_equal}; + + row_hash hash_probe{probe_nulls, probe_table}; + auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); + make_pair_function pair_func{hash_probe, empty_key_sentinel}; + + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); + + std::size_t size; + if constexpr (JoinKind == join_kind::LEFT_JOIN) { + size = hash_table.pair_count_outer(iter, iter + probe_table_num_rows, equality, stream.value()); + } else { + size = hash_table.pair_count(iter, iter + probe_table_num_rows, equality, stream.value()); + } + + return size; } /** @@ -69,7 +126,7 @@ std::pair>, std::unique_ptr>> probe_join_hash_table(cudf::table_device_view build_table, cudf::table_device_view probe_table, - multimap_type const& hash_table, + cudf::detail::multimap_type const& hash_table, bool has_nulls, null_equality compare_nulls, std::optional output_size, @@ -88,8 +145,8 @@ probe_join_hash_table(cudf::table_device_view build_table, // If output size is zero, return immediately if (join_size == 0) { - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } auto left_indices = std::make_unique>(join_size, stream, mr); @@ -125,7 +182,7 @@ probe_join_hash_table(cudf::table_device_view build_table, hash_table.pair_retrieve( iter, iter + probe_table_num_rows, out1_zip_begin, out2_zip_begin, equality, stream.value()); } - return std::make_pair(std::move(left_indices), std::move(right_indices)); + return std::pair(std::move(left_indices), std::move(right_indices)); } /** @@ -145,7 +202,7 @@ probe_join_hash_table(cudf::table_device_view build_table, */ std::size_t get_full_join_size(cudf::table_device_view build_table, cudf::table_device_view probe_table, - multimap_type const& hash_table, + cudf::detail::multimap_type const& hash_table, bool const has_nulls, null_equality const compare_nulls, rmm::cuda_stream_view stream, @@ -157,8 +214,6 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, // If output size is zero, return immediately if (join_size == 0) { return join_size; } - rmm::device_scalar write_index(0, stream); - auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); @@ -221,25 +276,12 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, } return join_size + left_join_complement_size; } +} // namespace -std::unique_ptr combine_table_pair(std::unique_ptr&& left, - std::unique_ptr&& right) -{ - auto joined_cols = left->release(); - auto right_cols = 
right->release(); - joined_cols.insert(joined_cols.end(), - std::make_move_iterator(right_cols.begin()), - std::make_move_iterator(right_cols.end())); - return std::make_unique(std::move(joined_cols)); -} - -} // namespace detail - -hash_join::hash_join_impl::~hash_join_impl() = default; - -hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, - null_equality compare_nulls, - rmm::cuda_stream_view stream) +template +hash_join::hash_join(cudf::table_view const& build, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream) : _is_empty{build.num_rows() == 0}, _nulls_equal{compare_nulls}, _hash_table{compute_hash_table_size(build.num_rows()), @@ -263,41 +305,45 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, cudf::detail::build_join_hash_table(_build, _hash_table, _nulls_equal, stream); } +template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::inner_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +hash_join::inner_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); return compute_hash_join(probe, output_size, stream, mr); } +template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::left_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +hash_join::left_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); return compute_hash_join(probe, output_size, stream, mr); } +template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::full_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +hash_join::full_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); return compute_hash_join(probe, output_size, stream, mr); } -std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const +template +std::size_t hash_join::inner_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const { CUDF_FUNC_RANGE(); @@ -320,8 +366,9 @@ std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& p stream); } -std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const +template +std::size_t hash_join::left_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const { CUDF_FUNC_RANGE(); @@ -344,9 +391,10 @@ std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& pr stream); } -std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +template +std::size_t hash_join::full_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); @@ -370,13 +418,51 @@ std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& pr mr); } +template +template +std::pair>, + std::unique_ptr>> +hash_join::probe_join_indices(cudf::table_view const& probe_table, + 
std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + // Trivial left join case - exit early + if (_is_empty and JoinKind != cudf::detail::join_kind::INNER_JOIN) { + return get_trivial_left_join_indices(probe_table, stream, mr); + } + + CUDF_EXPECTS(!_is_empty, "Hash table of hash join is null."); + + auto build_table_ptr = cudf::table_device_view::create(_build, stream); + auto probe_table_ptr = cudf::table_device_view::create(probe_table, stream); + + auto join_indices = cudf::detail::probe_join_hash_table( + *build_table_ptr, + *probe_table_ptr, + _hash_table, + cudf::has_nulls(probe_table) | cudf::has_nulls(_build), + _nulls_equal, + output_size, + stream, + mr); + + if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { + auto complement_indices = detail::get_left_join_indices_complement( + join_indices.second, probe_table.num_rows(), _build.num_rows(), stream, mr); + join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); + } + return join_indices; +} + +template template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +hash_join::compute_hash_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty"); CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE, @@ -390,8 +476,8 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, "Mismatch in number of columns to be joined on"); if (is_trivial_join(flattened_probe_table, _build, JoinKind)) { - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } CUDF_EXPECTS(std::equal(std::cbegin(_build), @@ -403,41 +489,64 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, return probe_join_indices(flattened_probe_table, output_size, stream, mr); } +} // namespace detail + +hash_join::~hash_join() = default; + +hash_join::hash_join(cudf::table_view const& build, + null_equality compare_nulls, + rmm::cuda_stream_view stream) + : _impl{std::make_unique(build, compare_nulls, stream)} +{ +} -template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe_table, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +hash_join::inner_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { - // Trivial left join case - exit early - if (_is_empty and JoinKind != cudf::detail::join_kind::INNER_JOIN) { - return get_trivial_left_join_indices(probe_table, stream, mr); - } + return _impl->inner_join(probe, output_size, stream, mr); +} - CUDF_EXPECTS(!_is_empty, "Hash table of hash join is null."); +std::pair>, + std::unique_ptr>> +hash_join::left_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + return _impl->left_join(probe, output_size, stream, mr); +} - auto build_table_ptr = cudf::table_device_view::create(_build, stream); - auto probe_table_ptr = 
cudf::table_device_view::create(probe_table, stream); +std::pair>, + std::unique_ptr>> +hash_join::full_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + return _impl->full_join(probe, output_size, stream, mr); +} - auto join_indices = cudf::detail::probe_join_hash_table( - *build_table_ptr, - *probe_table_ptr, - _hash_table, - cudf::has_nulls(probe_table) | cudf::has_nulls(_build), - _nulls_equal, - output_size, - stream, - mr); +std::size_t hash_join::inner_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const +{ + return _impl->inner_join_size(probe, stream); +} - if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { - auto complement_indices = detail::get_left_join_indices_complement( - join_indices.second, probe_table.num_rows(), _build.num_rows(), stream, mr); - join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); - } - return join_indices; +std::size_t hash_join::left_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const +{ + return _impl->left_join_size(probe, stream); +} + +std::size_t hash_join::full_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + return _impl->full_join_size(probe, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh deleted file mode 100644 index e55de043372..00000000000 --- a/cpp/src/join/hash_join.cuh +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include -#include - -namespace cudf { -namespace detail { - -/** - * @brief Remaps a hash value to a new value if it is equal to the specified sentinel value. - * - * @param hash The hash value to potentially remap - * @param sentinel The reserved value - */ -template -constexpr auto remap_sentinel_hash(H hash, S sentinel) -{ - // Arbitrarily choose hash - 1 - return (hash == sentinel) ? (hash - 1) : hash; -} - -/** - * @brief Device functor to create a pair of hash value and index for a given row. 
- */ -class make_pair_function { - public: - CUDF_HOST_DEVICE make_pair_function(row_hash const& hash, - hash_value_type const empty_key_sentinel) - : _hash{hash}, _empty_key_sentinel{empty_key_sentinel} - { - } - - __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept - { - // Compute the hash value of row `i` - auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel); - return cuco::make_pair(row_hash_value, i); - } - - private: - row_hash _hash; - hash_value_type const _empty_key_sentinel; -}; - -/** - * @brief Calculates the exact size of the join output produced when - * joining two tables together. - * - * @throw cudf::logic_error if JoinKind is not INNER_JOIN or LEFT_JOIN - * - * @tparam JoinKind The type of join to be performed - * @tparam multimap_type The type of the hash table - * - * @param build_table The right hand table - * @param probe_table The left hand table - * @param hash_table A hash table built on the build table that maps the index - * of every row to the hash value of that row. - * @param nulls_equal Flag to denote nulls are equal or not. - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return The exact size of the output of the join operation - */ -template -std::size_t compute_join_output_size(table_device_view build_table, - table_device_view probe_table, - multimap_type const& hash_table, - bool const has_nulls, - cudf::null_equality const nulls_equal, - rmm::cuda_stream_view stream) -{ - const size_type build_table_num_rows{build_table.num_rows()}; - const size_type probe_table_num_rows{probe_table.num_rows()}; - - // If the build table is empty, we know exactly how large the output - // will be for the different types of joins and can return immediately - if (0 == build_table_num_rows) { - switch (JoinKind) { - // Inner join with an empty table will have no output - case join_kind::INNER_JOIN: return 0; - - // Left join with an empty table will have an output of NULL rows - // equal to the number of rows in the probe table - case join_kind::LEFT_JOIN: return probe_table_num_rows; - - default: CUDF_FAIL("Unsupported join type"); - } - } - - auto const probe_nulls = cudf::nullate::DYNAMIC{has_nulls}; - pair_equality equality{probe_table, build_table, probe_nulls, nulls_equal}; - - row_hash hash_probe{probe_nulls, probe_table}; - auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); - make_pair_function pair_func{hash_probe, empty_key_sentinel}; - - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); - - std::size_t size; - if constexpr (JoinKind == join_kind::LEFT_JOIN) { - size = hash_table.pair_count_outer(iter, iter + probe_table_num_rows, equality, stream.value()); - } else { - size = hash_table.pair_count(iter, iter + probe_table_num_rows, equality, stream.value()); - } - - return size; -} - -std::pair, std::unique_ptr
> get_empty_joined_table( - table_view const& probe, table_view const& build); - -std::unique_ptr combine_table_pair(std::unique_ptr&& left, - std::unique_ptr&& right); - -/** - * @brief Builds the hash table based on the given `build_table`. - * - * @tparam MultimapType The type of the hash table - * - * @param build Table of columns used to build join hash. - * @param hash_table Build hash table. - * @param nulls_equal Flag to denote nulls are equal or not. - * @param stream CUDA stream used for device memory operations and kernel launches. - * - */ -template -void build_join_hash_table(cudf::table_view const& build, - MultimapType& hash_table, - null_equality const nulls_equal, - rmm::cuda_stream_view stream) -{ - auto build_table_ptr = cudf::table_device_view::create(build, stream); - - CUDF_EXPECTS(0 != build_table_ptr->num_columns(), "Selected build dataset is empty"); - CUDF_EXPECTS(0 != build_table_ptr->num_rows(), "Build side table has no rows"); - - row_hash hash_build{nullate::DYNAMIC{cudf::has_nulls(build)}, *build_table_ptr}; - auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); - make_pair_function pair_func{hash_build, empty_key_sentinel}; - - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); - - size_type const build_table_num_rows{build_table_ptr->num_rows()}; - if (nulls_equal == cudf::null_equality::EQUAL or (not nullable(build))) { - hash_table.insert(iter, iter + build_table_num_rows, stream.value()); - } else { - thrust::counting_iterator stencil(0); - auto const row_bitmask = cudf::detail::bitmask_and(build, stream).first; - row_is_valid pred{static_cast(row_bitmask.data())}; - - // insert valid rows - hash_table.insert_if(iter, iter + build_table_num_rows, stencil, pred, stream.value()); - } -} -} // namespace detail - -struct hash_join::hash_join_impl { - public: - hash_join_impl() = delete; - ~hash_join_impl(); - hash_join_impl(hash_join_impl const&) = delete; - hash_join_impl(hash_join_impl&&) = delete; - hash_join_impl& operator=(hash_join_impl const&) = delete; - hash_join_impl& operator=(hash_join_impl&&) = delete; - - private: - bool const _is_empty; - cudf::null_equality const _nulls_equal; - cudf::table_view _build; - std::vector> _created_null_columns; - cudf::structs::detail::flattened_table _flattened_build_table; - cudf::detail::multimap_type _hash_table; - - public: - /** - * @brief Constructor that internally builds the hash table based on the given `build` table - * - * @throw cudf::logic_error if the number of columns in `build` table is 0. - * @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE. - * - * @param build The build table, from which the hash table is built. - * @param compare_nulls Controls whether null join-key values should match or not. - * @param stream CUDA stream used for device memory operations and kernel launches. 
- */ - hash_join_impl(cudf::table_view const& build, - null_equality compare_nulls, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - std::pair>, - std::unique_ptr>> - inner_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - std::pair>, - std::unique_ptr>> - left_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - std::pair>, - std::unique_ptr>> - full_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - [[nodiscard]] std::size_t inner_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const; - - [[nodiscard]] std::size_t left_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const; - - std::size_t full_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - private: - template - std::pair>, - std::unique_ptr>> - compute_hash_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - /** - * @brief Probes the `_hash_table` built from `_build` for tuples in `probe_table`, - * and returns the output indices of `build_table` and `probe_table` as a combined table, - * i.e. if full join is specified as the join type then left join is called. Behavior - * is undefined if the provided `output_size` is smaller than the actual output size. - * - * @throw cudf::logic_error if hash table is null. - * - * @tparam JoinKind The type of join to be performed. - * - * @param probe_table Table of probe side columns to join. - * @param output_size Optional value which allows users to specify the exact output size. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned vectors. - * - * @return Join output indices vector pair. - */ - template - std::pair>, - std::unique_ptr>> - probe_join_indices(cudf::table_view const& probe_table, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; -}; - -} // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 7a478ca2eb3..5c529c88d9d 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -13,8 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "join/hash_join.cuh" -#include "join/join_common_utils.hpp" +#include "join_common_utils.hpp" #include #include @@ -26,6 +25,26 @@ namespace cudf { namespace detail { +namespace { +std::pair, std::unique_ptr
<table>> get_empty_joined_table(
+  table_view const& probe, table_view const& build)
+{
+  std::unique_ptr<table> empty_probe = empty_like(probe);
+  std::unique_ptr<table> empty_build = empty_like(build);
+  return std::pair(std::move(empty_probe), std::move(empty_build));
+}
+
+std::unique_ptr<table> combine_table_pair(std::unique_ptr<table>&& left,
+                                          std::unique_ptr<table>&& right)
+{
+  auto joined_cols = left->release();
+  auto right_cols  = right->release();
+  joined_cols.insert(joined_cols.end(),
+                     std::make_move_iterator(right_cols.begin()),
+                     std::make_move_iterator(right_cols.end()));
+  return std::make_unique<table>(std::move(joined_cols));
+}
+}  // namespace

 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
@@ -52,7 +71,7 @@ inner_join(table_view const& left_input,
   if (right.num_rows() > left.num_rows()) {
     cudf::hash_join hj_obj(left, compare_nulls, stream);
     auto [right_result, left_result] = hj_obj.inner_join(right, std::nullopt, stream, mr);
-    return std::make_pair(std::move(left_result), std::move(right_result));
+    return std::pair(std::move(left_result), std::move(right_result));
   } else {
     cudf::hash_join hj_obj(right, compare_nulls, stream);
     return hj_obj.inner_join(left, std::nullopt, stream, mr);
@@ -222,69 +241,8 @@ std::unique_ptr<table>
full_join(table_view const& left_input, mr); return combine_table_pair(std::move(left_result), std::move(right_result)); } - } // namespace detail -hash_join::~hash_join() = default; - -hash_join::hash_join(cudf::table_view const& build, - null_equality compare_nulls, - rmm::cuda_stream_view stream) - : impl{std::make_unique(build, compare_nulls, stream)} -{ -} - -std::pair>, - std::unique_ptr>> -hash_join::inner_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->inner_join(probe, output_size, stream, mr); -} - -std::pair>, - std::unique_ptr>> -hash_join::left_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->left_join(probe, output_size, stream, mr); -} - -std::pair>, - std::unique_ptr>> -hash_join::full_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->full_join(probe, output_size, stream, mr); -} - -std::size_t hash_join::inner_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const -{ - return impl->inner_join_size(probe, stream); -} - -std::size_t hash_join::left_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const -{ - return impl->left_join_size(probe, stream); -} - -std::size_t hash_join::full_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->full_join_size(probe, stream, mr); -} - -// external APIs - std::pair>, std::unique_ptr>> inner_join(table_view const& left, @@ -353,5 +311,4 @@ std::unique_ptr
full_join(table_view const& left, return detail::full_join( left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } - } // namespace cudf diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index b778f13b5e1..fdb63419c84 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,8 +15,10 @@ */ #pragma once -#include +#include "join_common_utils.hpp" +#include +#include #include #include @@ -26,6 +28,41 @@ namespace cudf { namespace detail { +/** + * @brief Remaps a hash value to a new value if it is equal to the specified sentinel value. + * + * @param hash The hash value to potentially remap + * @param sentinel The reserved value + */ +template +constexpr auto remap_sentinel_hash(H hash, S sentinel) +{ + // Arbitrarily choose hash - 1 + return (hash == sentinel) ? (hash - 1) : hash; +} + +/** + * @brief Device functor to create a pair of hash value and index for a given row. + */ +class make_pair_function { + public: + CUDF_HOST_DEVICE make_pair_function(row_hash const& hash, + hash_value_type const empty_key_sentinel) + : _hash{hash}, _empty_key_sentinel{empty_key_sentinel} + { + } + + __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept + { + // Compute the hash value of row `i` + auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel); + return cuco::make_pair(row_hash_value, i); + } + + private: + row_hash _hash; + hash_value_type const _empty_key_sentinel; +}; /** * @brief Device functor to determine if a row is valid. @@ -98,6 +135,47 @@ get_trivial_left_join_indices( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Builds the hash table based on the given `build_table`. + * + * @tparam MultimapType The type of the hash table + * + * @param build Table of columns used to build join hash. + * @param hash_table Build hash table. + * @param nulls_equal Flag to denote nulls are equal or not. + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * + */ +template +void build_join_hash_table(cudf::table_view const& build, + MultimapType& hash_table, + null_equality const nulls_equal, + rmm::cuda_stream_view stream) +{ + auto build_table_ptr = cudf::table_device_view::create(build, stream); + + CUDF_EXPECTS(0 != build_table_ptr->num_columns(), "Selected build dataset is empty"); + CUDF_EXPECTS(0 != build_table_ptr->num_rows(), "Build side table has no rows"); + + row_hash hash_build{nullate::DYNAMIC{cudf::has_nulls(build)}, *build_table_ptr}; + auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); + make_pair_function pair_func{hash_build, empty_key_sentinel}; + + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); + + size_type const build_table_num_rows{build_table_ptr->num_rows()}; + if (nulls_equal == cudf::null_equality::EQUAL or (not nullable(build))) { + hash_table.insert(iter, iter + build_table_num_rows, stream.value()); + } else { + thrust::counting_iterator stencil(0); + auto const row_bitmask = cudf::detail::bitmask_and(build, stream).first; + row_is_valid pred{static_cast(row_bitmask.data())}; + + // insert valid rows + hash_table.insert_if(iter, iter + build_table_num_rows, stencil, pred, stream.value()); + } +} + // Convenient alias for a pair of unique pointers to device uvectors. using VectorPair = std::pair>, std::unique_ptr>>; diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 526c22d1d5c..060e8bff6f8 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -15,8 +15,10 @@ */ #pragma once +#include #include #include +#include #include #include @@ -34,7 +36,6 @@ namespace cudf { namespace detail { constexpr size_type MAX_JOIN_SIZE{std::numeric_limits::max()}; -constexpr int DEFAULT_JOIN_CG_SIZE = 2; constexpr int DEFAULT_JOIN_BLOCK_SIZE = 128; constexpr int DEFAULT_JOIN_CACHE_SIZE = 128; constexpr size_type JoinNoneValue = std::numeric_limits::min(); @@ -45,12 +46,7 @@ using hash_type = cuco::detail::MurmurHash3_32; using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; -using multimap_type = - cuco::static_multimap>; +using multimap_type = cudf::hash_join::impl_type::map_type; // Multimap type used for mixed joins. TODO: This is a temporary alias used // until the mixed joins are converted to using CGs properly. Right now it's @@ -68,9 +64,6 @@ using row_hash = cudf::row_hasher; using row_equality = cudf::row_equality_comparator; -enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; - bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type); - } // namespace detail } // namespace cudf diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu index 151db830962..7fa6642b19f 100644 --- a/cpp/src/join/join_utils.cu +++ b/cpp/src/join/join_utils.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include "join_common_utils.cuh" #include @@ -61,7 +61,7 @@ get_trivial_left_join_indices(table_view const& left, std::make_unique>(left.num_rows(), stream, mr); thrust::uninitialized_fill( rmm::exec_policy(stream), right_indices->begin(), right_indices->end(), JoinNoneValue); - return std::make_pair(std::move(left_indices), std::move(right_indices)); + return std::pair(std::move(left_indices), std::move(right_indices)); } VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream) @@ -151,7 +151,7 @@ get_left_join_indices_complement(std::unique_ptr> left_invalid_indices->end(), JoinNoneValue); - return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement)); + return std::pair(std::move(left_invalid_indices), std::move(right_indices_complement)); } } // namespace detail diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index f9cbb2b5441..27ee77e3edd 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -14,6 +14,10 @@ * limitations under the License. */ +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include "mixed_join_kernels.cuh" + #include #include #include @@ -23,12 +27,9 @@ #include #include #include -#include -#include -#include -#include #include +#include #include #include @@ -81,8 +82,8 @@ mixed_join( case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left_conditional, stream); // Inner joins return empty output because no matches can exist. case join_kind::INNER_JOIN: - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); default: CUDF_FAIL("Invalid join kind."); break; } } else if (left_num_rows == 0) { @@ -90,12 +91,12 @@ mixed_join( // Left and inner joins all return empty sets. case join_kind::LEFT_JOIN: case join_kind::INNER_JOIN: - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); // Full joins need to return the trivial complement. case join_kind::FULL_JOIN: { auto ret_flipped = get_trivial_left_join_indices(right_conditional, stream); - return std::make_pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); + return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); } default: CUDF_FAIL("Invalid join kind."); break; } @@ -208,8 +209,8 @@ mixed_join( // all other cases (inner, left semi, and left anti joins) if we reach this // point we can safely return an empty result. if (join_size == 0) { - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } // Given the number of matches per row, we need to compute the offsets for insertion. @@ -258,7 +259,7 @@ mixed_join( swap_tables); } - auto join_indices = std::make_pair(std::move(left_indices), std::move(right_indices)); + auto join_indices = std::pair(std::move(left_indices), std::move(right_indices)); // For full joins, get the indices in the right table that were not joined to // by any row in the left table. 
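To make the hash-table build flow above concrete, here is a minimal host-side sketch (not part of this patch; everything except remap_sentinel_hash is a hypothetical name for illustration) of the sentinel remapping that build_join_hash_table relies on: each row's hash is nudged away from the hash table's reserved empty-key sentinel before being paired with its row index, so a valid row can never be mistaken for an empty slot.

// Compilable illustration of the sentinel remapping used when building the
// join hash table. The sentinel value and hash width are assumptions for
// this sketch; cuDF takes them from the cuco static_multimap.
#include <cassert>
#include <cstdint>
#include <utility>

template <typename H, typename S>
constexpr auto remap_sentinel_hash(H hash, S sentinel)
{
  // Arbitrarily choose hash - 1 (same convention as the helper above)
  return (hash == sentinel) ? (hash - 1) : hash;
}

int main()
{
  constexpr std::uint32_t empty_key_sentinel = 0xFFFFFFFFu;

  // A colliding hash is remapped; every other hash passes through unchanged.
  assert(remap_sentinel_hash(empty_key_sentinel, empty_key_sentinel) == 0xFFFFFFFEu);
  assert(remap_sentinel_hash(42u, empty_key_sentinel) == 42u);

  // The (hash, row index) pair that a make_pair_function-style functor emits:
  auto pair = std::pair{remap_sentinel_hash(42u, empty_key_sentinel), 7};
  assert(pair.first == 42u && pair.second == 7);
  return 0;
}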
diff --git a/cpp/src/join/mixed_join_kernel.cu b/cpp/src/join/mixed_join_kernel.cu new file mode 100644 index 00000000000..f8912f0c7bd --- /dev/null +++ b/cpp/src/join/mixed_join_kernel.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mixed_join_kernel.cuh" + +namespace cudf { +namespace detail { + +template __global__ void mixed_join( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables); + +} // namespace detail + +} // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels.cu b/cpp/src/join/mixed_join_kernel.cuh similarity index 80% rename from cpp/src/join/mixed_join_kernels.cu rename to cpp/src/join/mixed_join_kernel.cuh index efaea841e45..38955ef4667 100644 --- a/cpp/src/join/mixed_join_kernels.cu +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -14,10 +14,11 @@ * limitations under the License. */ -#include -#include -#include -#include +#pragma once + +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include "mixed_join_common_utils.cuh" #include #include @@ -32,6 +33,7 @@ namespace cudf { namespace detail { + namespace cg = cooperative_groups; template @@ -107,34 +109,6 @@ __launch_bounds__(block_size) __global__ } } -template __global__ void mixed_join( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - size_type* join_output_l, - size_type* join_output_r, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); - -template __global__ void mixed_join( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - size_type* join_output_l, - size_type* join_output_r, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); - } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernel_nulls.cu b/cpp/src/join/mixed_join_kernel_nulls.cu new file mode 100644 index 00000000000..a911c62b349 --- /dev/null +++ b/cpp/src/join/mixed_join_kernel_nulls.cu @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mixed_join_kernel.cuh" + +namespace cudf { +namespace detail { + +template __global__ void mixed_join( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables); + +} // namespace detail + +} // namespace cudf diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 60cc74991ef..13a1f1a0ce2 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -14,8 +14,14 @@ * limitations under the License. */ +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include "mixed_join_kernels_semi.cuh" + #include #include +#include +#include #include #include #include @@ -23,12 +29,9 @@ #include #include #include -#include -#include -#include -#include #include +#include #include #include diff --git a/cpp/src/join/mixed_join_size_kernel.cu b/cpp/src/join/mixed_join_size_kernel.cu new file mode 100644 index 00000000000..cf8236e2be2 --- /dev/null +++ b/cpp/src/join/mixed_join_size_kernel.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "mixed_join_size_kernel.cuh" + +namespace cudf { +namespace detail { + +template __global__ void compute_mixed_join_output_size( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row); + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/join/mixed_join_size_kernels.cu b/cpp/src/join/mixed_join_size_kernel.cuh similarity index 77% rename from cpp/src/join/mixed_join_size_kernels.cu rename to cpp/src/join/mixed_join_size_kernel.cuh index 22c71bfc33a..ce70f7f18ee 100644 --- a/cpp/src/join/mixed_join_size_kernels.cu +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -14,10 +14,9 @@ * limitations under the License. */ -#include -#include -#include -#include +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include "mixed_join_common_utils.cuh" #include #include @@ -99,32 +98,5 @@ __launch_bounds__(block_size) __global__ void compute_mixed_join_output_size( if (threadIdx.x == 0) atomicAdd(output_size, block_counter); } -template __global__ void compute_mixed_join_output_size( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - -template __global__ void compute_mixed_join_output_size( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - } // namespace detail - } // namespace cudf diff --git a/cpp/src/join/mixed_join_size_kernel_nulls.cu b/cpp/src/join/mixed_join_size_kernel_nulls.cu new file mode 100644 index 00000000000..f05d674b3b5 --- /dev/null +++ b/cpp/src/join/mixed_join_size_kernel_nulls.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "mixed_join_size_kernel.cuh" + +namespace cudf { +namespace detail { + +template __global__ void compute_mixed_join_output_size( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row); + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 9e1aa27a4e7..687e553fefd 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -45,7 +45,7 @@ namespace { /** * @brief Device functor to create a pair of hash value and index for a given row. */ -struct make_pair_function { +struct make_pair_fn { __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept { // The value is irrelevant since we only ever use the hash map to check for @@ -101,7 +101,7 @@ std::unique_ptr> left_semi_anti_join( auto const right_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(right_flattened_keys)}; row_hash const hash_build{right_nulls, *right_rows_d}; row_equality equality_build{right_nulls, *right_rows_d, *right_rows_d, compare_nulls}; - make_pair_function pair_func_build{}; + make_pair_fn pair_func_build{}; auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index fecdec0b1b2..f4d8e7678b1 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -81,7 +81,7 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, auto [null_mask, null_count] = [&] { if (!build_null_mask) - return std::make_pair(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); + return std::pair(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); // The output row will be null only if all lists on the input row are null. auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child(), stream); diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu index adc1b95a9e6..38f738b4035 100644 --- a/cpp/src/lists/copying/scatter_helper.cu +++ b/cpp/src/lists/copying/scatter_helper.cu @@ -21,8 +21,7 @@ #include #include #include -#include -#include +#include #include #include @@ -176,7 +175,7 @@ struct list_child_constructor { source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable() ? 
construct_child_nullmask( list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr) - : std::make_pair(rmm::device_buffer{}, 0); + : std::pair(rmm::device_buffer{}, 0); auto child_column = cudf::make_fixed_width_column(source_lists_column_view.child().type(), num_child_rows, @@ -233,6 +232,8 @@ struct list_child_constructor { auto string_views = rmm::device_uvector(num_child_rows, stream); + auto const null_string_view = string_view{nullptr, 0}; // placeholder for factory function + thrust::transform( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -242,7 +243,8 @@ struct list_child_constructor { offset_size = list_offsets.size(), d_list_vector = list_vector.begin(), source_lists, - target_lists] __device__(auto index) { + target_lists, + null_string_view] __device__(auto index) { auto const list_index_iter = thrust::upper_bound(thrust::seq, offset_begin, offset_begin + offset_size, index); auto const list_index = @@ -253,39 +255,18 @@ struct list_child_constructor { auto lists_column = actual_list_row.get_column(); auto lists_offsets_ptr = lists_column.offsets().template data(); auto child_strings_column = lists_column.child(); - auto string_offsets_ptr = - child_strings_column.child(cudf::strings_column_view::offsets_column_index) - .template data(); - auto string_chars_ptr = - child_strings_column.child(cudf::strings_column_view::chars_column_index) - .template data(); - - auto strings_offset = lists_offsets_ptr[row_index] + intra_index; - auto char_offset = string_offsets_ptr[strings_offset]; - auto char_ptr = string_chars_ptr + char_offset; - auto string_size = - string_offsets_ptr[strings_offset + 1] - string_offsets_ptr[strings_offset]; - return string_view{char_ptr, string_size}; + auto strings_offset = lists_offsets_ptr[row_index] + intra_index; + + if (child_strings_column.is_null(strings_offset)) { return null_string_view; } + auto const d_str = child_strings_column.template element(strings_offset); + // ensure a string from an all-empty column is not mapped to the null placeholder + auto const empty_string_view = string_view{}; + return d_str.empty() ? empty_string_view : d_str; }); // string_views should now have been populated with source and target references. - - auto string_offsets = cudf::strings::detail::child_offsets_from_string_iterator( - string_views.begin(), string_views.size(), stream, mr); - - auto string_chars = cudf::strings::detail::child_chars_from_string_vector( - string_views, string_offsets->view(), stream, mr); - auto child_null_mask = - source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable() - ? construct_child_nullmask( - list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr) - : std::make_pair(rmm::device_buffer{}, 0); - - return cudf::make_strings_column(num_child_rows, - std::move(string_offsets), - std::move(string_chars), - child_null_mask.second, // Null count. - std::move(child_null_mask.first)); + auto sv_span = cudf::device_span(string_views); + return cudf::make_strings_column(sv_span, null_string_view, stream, mr); } /** @@ -372,7 +353,7 @@ struct list_child_constructor { source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable() ? 
construct_child_nullmask( list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr) - : std::make_pair(rmm::device_buffer{}, 0); + : std::pair(rmm::device_buffer{}, 0); return cudf::make_lists_column(num_child_rows, std::move(child_offsets), @@ -468,7 +449,7 @@ struct list_child_constructor { source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable() ? construct_child_nullmask( list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr) - : std::make_pair(rmm::device_buffer{}, 0); + : std::pair(rmm::device_buffer{}, 0); return cudf::make_structs_column(num_child_rows, std::move(child_columns), diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 01a94457b69..9c94a6220d6 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -38,7 +39,6 @@ #include #include -#include "cudf/utilities/traits.hpp" #include #include diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 43686b7d257..0371065a2e5 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -595,8 +595,7 @@ std::pair, std::vector> hash_partition_table( } stream.synchronize(); // Async D2H copy must finish before returning host vec - return std::make_pair(std::make_unique
(std::move(output_cols)),
-                        std::move(partition_offsets));
+  return std::pair(std::make_unique<table>
(std::move(output_cols)), std::move(partition_offsets)); } else { // Compute a scatter map from input to output such that the output rows are // sorted by partition number @@ -613,7 +612,7 @@ std::pair, std::vector> hash_partition_table( input, row_partition_numbers.begin(), row_partition_numbers.end(), input, false, stream, mr); stream.synchronize(); // Async D2H copy must finish before returning host vec - return std::make_pair(std::move(output), std::move(partition_offsets)); + return std::pair(std::move(output), std::move(partition_offsets)); } } @@ -700,7 +699,7 @@ struct dispatch_map_type { auto scattered = cudf::detail::scatter(t, scatter_map.begin(), scatter_map.end(), t, false, stream, mr); - return std::make_pair(std::move(scattered), std::move(partition_offsets)); + return std::pair(std::move(scattered), std::move(partition_offsets)); } template @@ -728,7 +727,7 @@ std::pair, std::vector> hash_partition( // Return empty result if there are no partitions or nothing to hash if (num_partitions <= 0 || input.num_rows() == 0 || table_to_hash.num_columns() == 0) { - return std::make_pair(empty_like(input), std::vector{}); + return std::pair(empty_like(input), std::vector{}); } if (has_nulls(table_to_hash)) { @@ -753,7 +752,7 @@ std::pair, std::vector> partition( CUDF_EXPECTS(not partition_map.has_nulls(), "Unexpected null values in partition_map."); if (num_partitions == 0 or t.num_rows() == 0) { - return std::make_pair(empty_like(t), std::vector{}); + return std::pair(empty_like(t), std::vector{}); } return cudf::type_dispatcher( @@ -779,10 +778,10 @@ std::pair, std::vector> hash_partition( if (!is_numeric(input.column(column_id).type())) CUDF_FAIL("IdentityHash does not support this data type"); } - return detail::local::hash_partition( + return detail::local::hash_partition( input, columns_to_hash, num_partitions, seed, stream, mr); case (hash_id::HASH_MURMUR3): - return detail::local::hash_partition( + return detail::local::hash_partition( input, columns_to_hash, num_partitions, seed, stream, mr); default: CUDF_FAIL("Unsupported hash function in hash_partition"); } diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu index 193bb5a4353..9cfad602db0 100644 --- a/cpp/src/partitioning/round_robin.cu +++ b/cpp/src/partitioning/round_robin.cu @@ -104,8 +104,8 @@ std::pair, std::vector> degenerate stream, mr); - return std::make_pair(std::move(uniq_tbl), - cudf::detail::make_std_vector_sync(partition_offsets, stream)); + return std::pair(std::move(uniq_tbl), + cudf::detail::make_std_vector_sync(partition_offsets, stream)); } else { //( num_partitions > nrows ) rmm::device_uvector d_row_indices(nrows, stream); @@ -140,8 +140,8 @@ std::pair, std::vector> degenerate nedges_iter_begin + num_partitions, partition_offsets.begin()); - return std::make_pair(std::move(uniq_tbl), - cudf::detail::make_std_vector_sync(partition_offsets, stream)); + return std::pair(std::move(uniq_tbl), + cudf::detail::make_std_vector_sync(partition_offsets, stream)); } } } // namespace @@ -230,7 +230,7 @@ std::pair, std::vector> round_robin_part auto uniq_tbl = cudf::detail::gather( input, iter_begin, iter_begin + nrows, cudf::out_of_bounds_policy::DONT_CHECK, stream, mr); - auto ret_pair = std::make_pair(std::move(uniq_tbl), std::vector(num_partitions)); + auto ret_pair = std::pair(std::move(uniq_tbl), std::vector(num_partitions)); // this has the effect of rotating the set of partition sizes // right by start_partition positions: diff --git a/cpp/src/quantiles/quantile.cu 
b/cpp/src/quantiles/quantile.cu index a71fc862bf3..f38d0a921b7 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -113,10 +113,7 @@ struct quantile_functor { ordered_indices, [input = *d_input] __device__(size_type idx) { return input.is_valid_nocheck(idx); }); - rmm::device_buffer mask; - size_type null_count; - - std::tie(mask, null_count) = valid_if( + auto [mask, null_count] = valid_if( q_device.begin(), q_device.end(), [sorted_validity, interp = interp, size = size] __device__(double q) { diff --git a/cpp/src/reductions/scan/rank_scan.cu b/cpp/src/reductions/scan/rank_scan.cu index 521f8e2d06f..0ababbf0a3d 100644 --- a/cpp/src/reductions/scan/rank_scan.cu +++ b/cpp/src/reductions/scan/rank_scan.cu @@ -102,16 +102,15 @@ std::unique_ptr inclusive_rank_scan(column_view const& order_by, mr); } -std::unique_ptr inclusive_percent_rank_scan(column_view const& order_by, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr inclusive_one_normalized_percent_rank_scan( + column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto const rank_column = inclusive_rank_scan(order_by, stream, rmm::mr::get_current_device_resource()); auto const rank_view = rank_column->view(); - // Result type for PERCENT_RANK is independent of input type. - using result_type = cudf::detail::target_type_t; + // Result type for min 0-index percent rank is independent of input type. + using result_type = double; auto percent_rank_result = cudf::make_fixed_width_column( data_type{type_to_id()}, rank_view.size(), mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp index 52aaad5ddcf..b678b9441a5 100644 --- a/cpp/src/reductions/scan/scan.cpp +++ b/cpp/src/reductions/scan/scan.cpp @@ -35,17 +35,17 @@ std::unique_ptr scan(column_view const& input, if (agg->kind == aggregation::RANK) { CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, "Rank aggregation operator requires an inclusive scan"); - return inclusive_rank_scan(input, rmm::cuda_stream_default, mr); - } - if (agg->kind == aggregation::DENSE_RANK) { - CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, - "Dense rank aggregation operator requires an inclusive scan"); - return inclusive_dense_rank_scan(input, rmm::cuda_stream_default, mr); - } - if (agg->kind == aggregation::PERCENT_RANK) { - CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, - "Percent rank aggregation operator requires an inclusive scan"); - return inclusive_percent_rank_scan(input, rmm::cuda_stream_default, mr); + auto const& rank_agg = dynamic_cast(*agg); + if (rank_agg._method == rank_method::MIN) { + if (rank_agg._percentage == rank_percentage::NONE) { + return inclusive_rank_scan(input, rmm::cuda_stream_default, mr); + } else if (rank_agg._percentage == rank_percentage::ONE_NORMALIZED) { + return inclusive_one_normalized_percent_rank_scan(input, rmm::cuda_stream_default, mr); + } + } else if (rank_agg._method == rank_method::DENSE) { + return inclusive_dense_rank_scan(input, rmm::cuda_stream_default, mr); + } + CUDF_FAIL("Unsupported rank aggregation method for inclusive scan"); } return inclusive == scan_type::EXCLUSIVE diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 8b696854c25..73b224b0c99 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -76,7 +76,7 @@ std::pair, std::unique_ptr> form_offsets_and_cha cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column 
= cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - return std::make_pair(std::move(offsets_column), std::move(chars_column)); + return std::pair(std::move(offsets_column), std::move(chars_column)); } template diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index 9954cb4a299..d9e858e8e40 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -258,10 +258,7 @@ struct interleave_columns_impl()>> { func_value, func_validity); - rmm::device_buffer mask; - size_type null_count; - - std::tie(mask, null_count) = valid_if(index_begin, index_end, func_validity, stream, mr); + auto [mask, null_count] = valid_if(index_begin, index_end, func_validity, stream, mr); output->set_null_mask(std::move(mask), null_count); diff --git a/cpp/src/rolling/rolling_collect_list.cuh b/cpp/src/rolling/rolling_collect_list.cuh index 94703e320d0..13de4693e54 100644 --- a/cpp/src/rolling/rolling_collect_list.cuh +++ b/cpp/src/rolling/rolling_collect_list.cuh @@ -207,9 +207,7 @@ std::unique_ptr rolling_collect_list(column_view const& input, stream, mr); - rmm::device_buffer null_mask; - size_type null_count; - std::tie(null_mask, null_count) = valid_if( + auto [null_mask, null_count] = valid_if( thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.size()), [preceding_begin, following_begin, min_periods] __device__(auto i) { diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 70a6252e9b3..9473bed963e 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -1086,7 +1086,7 @@ struct dispatch_from_timestamps_fn { thrust::make_counting_iterator(0), d_timestamps.size(), pfn); - return std::make_pair(std::move(offsets_column), std::move(chars_column)); + return std::pair(std::move(offsets_column), std::move(chars_column)); } template diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index b8a10a00f5b..89c00c63bb5 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -67,8 +67,8 @@ __device__ inline double stod(string_view const& d_str) // special strings: NaN, Inf if ((in_ptr < end) && *in_ptr > '9') { auto const inf_nan = string_view(in_ptr, static_cast(thrust::distance(in_ptr, end))); - if (string::is_nan_str(inf_nan)) return std::numeric_limits::quiet_NaN(); - if (string::is_inf_str(inf_nan)) return sign * std::numeric_limits::infinity(); + if (is_nan_str(inf_nan)) return std::numeric_limits::quiet_NaN(); + if (is_inf_str(inf_nan)) return sign * std::numeric_limits::infinity(); } // Parse and store the mantissa as much as we can, @@ -124,16 +124,27 @@ __device__ inline double stod(string_view const& d_str) exp_ten *= exp_sign; exp_ten += exp_off; exp_ten += num_digits - 1; - if (exp_ten > std::numeric_limits::max_exponent10) + if (exp_ten > std::numeric_limits::max_exponent10) { return sign > 0 ? 
std::numeric_limits::infinity() : -std::numeric_limits::infinity(); - else if (exp_ten < std::numeric_limits::min_exponent10) - return double{0}; + } + + double base = sign * static_cast(digits); exp_ten += 1 - num_digits; - // exp10() is faster than pow(10.0,exp_ten) + // If 10^exp_ten would result in a subnormal value, the base and + // exponent should be adjusted so that 10^exp_ten is a normal value + auto const subnormal_shift = std::numeric_limits::min_exponent10 - exp_ten; + if (subnormal_shift > 0) { + // Handle subnormal values. Ensure that both base and exponent are + // normal values before computing their product. + base = base / exp10(static_cast(num_digits - 1 + subnormal_shift)); + exp_ten += num_digits - 1; // adjust exponent + auto const exponent = exp10(static_cast(exp_ten + subnormal_shift)); + return base * exponent; + } + double const exponent = exp10(static_cast(std::abs(exp_ten))); - double const base = sign * static_cast(digits); return exp_ten < 0 ? base / exponent : base * exponent; } @@ -567,7 +578,7 @@ std::unique_ptr is_float( d_results, [d_column] __device__(size_type idx) { if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); + return strings::is_float(d_column.element(idx)); }); results->set_null_count(strings.null_count()); return results; diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 95ddf1822a7..75c2f851bab 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -150,14 +150,13 @@ std::unique_ptr is_integer( d_column->pair_begin(), d_column->pair_end(), d_results, - [] __device__(auto const& p) { return p.second ? string::is_integer(p.first) : false; }); + [] __device__(auto const& p) { return p.second ? strings::is_integer(p.first) : false; }); } else { - thrust::transform( - rmm::exec_policy(stream), - d_column->pair_begin(), - d_column->pair_end(), - d_results, - [] __device__(auto const& p) { return p.second ? string::is_integer(p.first) : false; }); + thrust::transform(rmm::exec_policy(stream), + d_column->pair_begin(), + d_column->pair_end(), + d_results, + [] __device__(auto const& p) { return strings::is_integer(p.first); }); } // Calling mutable_view() on a column invalidates it's null count so we need to set it back diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index fedb8d38a08..0ab7ef5ff2b 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -72,9 +72,7 @@ auto create_strings_device_views(host_span views, rmm::cuda_s { CUDF_FUNC_RANGE(); // Assemble contiguous array of device views - std::unique_ptr device_view_owners; - column_device_view* device_views_ptr; - std::tie(device_view_owners, device_views_ptr) = + auto [device_view_owners, device_views_ptr] = contiguous_copy_column_device_views(views, stream); // Compute the partition offsets and size of offset column diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 30e8770c3c2..995b6223ddc 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -670,8 +670,8 @@ std::pair>, int> build_comma auto const is_empty = h_operators.size() == 1 && h_operators[0].type == path_operator_type::END; return is_empty - ? std::make_pair(thrust::nullopt, 0) - : std::make_pair( + ? 
std::pair(thrust::nullopt, 0) + : std::pair( thrust::make_optional(cudf::detail::make_device_uvector_sync(h_operators, stream)), max_stack_depth); } diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 6f36658523b..829230d0842 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -16,6 +16,7 @@ #include +#include #include #include @@ -58,6 +59,37 @@ const std::array escapable_chars{ {'.', '-', '+', '*', '\\', '?', '^', '$', '|', '{', '}', '(', ')', '[', ']', '<', '>', '"', '~', '\'', '`', '_', '@', '=', ';', ':', '!', '#', '%', '&', ',', '/', ' '}}; +/** + * @brief Converts UTF-8 string into fixed-width 32-bit character vector. + * + * No character conversion occurs. + * Each UTF-8 character is promoted into a 32-bit value. + * The last entry in the returned vector will be a 0 value. + * The fixed-width vector makes it easier to compile and faster to execute. + * + * @param pattern Regular expression encoded with UTF-8. + * @return Fixed-width 32-bit character vector. + */ +std::vector string_to_char32_vector(std::string_view pattern) +{ + size_type size = static_cast(pattern.size()); + size_type count = std::count_if(pattern.cbegin(), pattern.cend(), [](char ch) { + return is_begin_utf8_char(static_cast(ch)); + }); + std::vector result(count + 1); + char32_t* output_ptr = result.data(); + const char* input_ptr = pattern.data(); + for (size_type idx = 0; idx < size; ++idx) { + char_utf8 output_character = 0; + size_type ch_width = to_char_utf8(input_ptr, output_character); + input_ptr += ch_width; + idx += ch_width - 1; + *output_ptr++ = output_character; + } + result[count] = 0; // last entry set to 0 + return result; +} + } // namespace int32_t reprog::add_inst(int32_t t) @@ -838,10 +870,11 @@ class regex_compiler { }; // Convert pattern into program -reprog reprog::create_from(const char32_t* pattern, regex_flags const flags) +reprog reprog::create_from(std::string_view pattern, regex_flags const flags) { reprog rtn; - regex_compiler compiler(pattern, flags, rtn); + auto pattern32 = string_to_char32_vector(pattern); + regex_compiler compiler(pattern32.data(), flags, rtn); // for debugging, it can be helpful to call rtn.print(flags) here to dump // out the instructions that have been created from the given pattern return rtn; diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index 18735d0f980..798b43830b4 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -92,7 +92,7 @@ class reprog { * @brief Parses the given regex pattern and compiles * into a list of chained instructions. */ - static reprog create_from(const char32_t* pattern, regex_flags const flags); + static reprog create_from(std::string_view pattern, regex_flags const flags); int32_t add_inst(int32_t type); int32_t add_inst(reinst inst); diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index b172ceae2a6..bcdd15bceda 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -25,7 +25,6 @@ #include #include -#include #include namespace cudf { @@ -35,9 +34,7 @@ class string_view; namespace strings { namespace detail { -struct reljunk; -struct reinst; -class reprog; +struct relist; using match_pair = thrust::pair; using match_result = thrust::optional; @@ -65,19 +62,18 @@ constexpr int32_t RX_LARGE_INSTS = (RX_STACK_LARGE / 11); * * This class holds the unique data for any regex CCLASS instruction. 
*/ -class reclass_device { - public: +struct alignas(16) reclass_device { int32_t builtins{}; int32_t count{}; - char32_t* literals{}; + char32_t const* literals{}; - __device__ bool is_match(char32_t ch, const uint8_t* flags); + __device__ inline bool is_match(char32_t const ch, uint8_t const* flags) const; }; /** * @brief Regex program of instructions/data for a specific regex pattern. * - * Once create, this find/extract methods are used to evaluating the regex instructions + * Once created, the find/extract methods are used to evaluate the regex instructions * against a single string. */ class reprog_device { @@ -132,15 +128,7 @@ class reprog_device { /** * @brief Returns the number of regex instructions. */ - [[nodiscard]] __host__ __device__ int32_t insts_counts() const { return _insts_count; } - - /** - * @brief Returns true if this is an empty program. - */ - [[nodiscard]] __device__ bool is_empty() const - { - return insts_counts() == 0 || get_inst(0)->type == END; - } + [[nodiscard]] CUDF_HOST_DEVICE int32_t insts_counts() const { return _insts_count; } /** * @brief Returns the number of regex groups found in the expression. @@ -151,19 +139,9 @@ class reprog_device { } /** - * @brief Returns the regex instruction object for a given index. - */ - [[nodiscard]] __device__ inline reinst* get_inst(int32_t idx) const; - - /** - * @brief Returns the regex class object for a given index. - */ - [[nodiscard]] __device__ inline reclass_device get_class(int32_t idx) const; - - /** - * @brief Returns the start-instruction-ids vector. + * @brief Returns true if this is an empty program. */ - [[nodiscard]] __device__ inline int32_t* startinst_ids() const; + [[nodiscard]] __device__ inline bool is_empty() const; /** * @brief Does a find evaluation using the compiled expression on the given string. @@ -180,9 +158,9 @@ class reprog_device { */ template __device__ inline int32_t find(int32_t idx, - string_view const& d_str, - int32_t& begin, - int32_t& end); + string_view const d_str, + cudf::size_type& begin, + cudf::size_type& end) const; /** * @brief Does an extract evaluation using the compiled expression on the given string. @@ -192,8 +170,8 @@ class reprog_device { * the matched section. * * @tparam stack_size One of the `RX_STACK_` values based on the `insts_count`. - * @param idx The string index used for mapping the state memory for this string in global memory - * (if necessary). + * @param idx The string index used for mapping the state memory for this string in global + * memory (if necessary). * @param d_str The string to search. * @param begin Position index to begin the search. If found, returns the position found * in the string. 
@@ -204,34 +182,65 @@ class reprog_device { */ template __device__ inline match_result extract(cudf::size_type idx, - string_view const& d_str, + string_view const d_str, cudf::size_type begin, cudf::size_type end, - cudf::size_type group_id); + cudf::size_type const group_id) const; private: - int32_t _startinst_id, _num_capturing_groups; - int32_t _insts_count, _starts_count, _classes_count; - const uint8_t* _codepoint_flags{}; // table of character types - reinst* _insts{}; // array of regex instructions - int32_t* _startinst_ids{}; // array of start instruction ids - reclass_device* _classes{}; // array of regex classes - void* _relists_mem{}; // runtime relist memory for regexec + struct reljunk { + relist* __restrict__ list1; + relist* __restrict__ list2; + int32_t starttype{}; + char32_t startchar{}; + + __device__ inline reljunk(relist* list1, relist* list2, reinst const inst); + __device__ inline void swaplist(); + }; + + /** + * @brief Returns the regex instruction object for a given id. + */ + __device__ inline reinst get_inst(int32_t id) const; + + /** + * @brief Returns the regex class object for a given id. + */ + __device__ inline reclass_device get_class(int32_t id) const; /** * @brief Executes the regex pattern on the given string. */ - __device__ inline int32_t regexec( - string_view const& d_str, reljunk& jnk, int32_t& begin, int32_t& end, int32_t group_id = 0); + __device__ inline int32_t regexec(string_view const d_str, + reljunk jnk, + cudf::size_type& begin, + cudf::size_type& end, + cudf::size_type const group_id = 0) const; /** * @brief Utility wrapper to setup state memory structures for calling regexec */ template - __device__ inline int32_t call_regexec( - int32_t idx, string_view const& d_str, int32_t& begin, int32_t& end, int32_t group_id = 0); - - reprog_device(reprog&); // must use create() + __device__ inline int32_t call_regexec(int32_t idx, + string_view const d_str, + cudf::size_type& begin, + cudf::size_type& end, + cudf::size_type const group_id = 0) const; + + reprog_device(reprog&); + + int32_t _startinst_id; // first instruction id + int32_t _num_capturing_groups; // instruction groups + int32_t _insts_count; // number of instructions + int32_t _starts_count; // number of start-insts ids + int32_t _classes_count; // number of classes + + uint8_t const* _codepoint_flags{}; // table of character types + reinst const* _insts{}; // array of regex instructions + int32_t const* _startinst_ids{}; // array of start instruction ids + reclass_device const* _classes{}; // array of regex classes + + void* _relists_mem{}; // runtime relist memory for regexec() }; } // namespace detail diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 01e773960e4..9fe4440d7ec 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,16 +17,9 @@ #include #include -#include +#include #include -#include -#include -#include -#include -#include -#include - namespace cudf { namespace strings { namespace detail { @@ -40,95 +33,102 @@ namespace detail { * reflected here. The regexec function updates and manages this state data. 
*/ struct alignas(8) relist { - int16_t size{}; - int16_t listsize{}; - int32_t reserved; - int2* ranges{}; // pair per instruction - int16_t* inst_ids{}; // one per instruction - u_char* mask{}; // bit per instruction - - CUDF_HOST_DEVICE inline static int32_t data_size_for(int32_t insts) + /** + * @brief Compute the memory size for the state data. + */ + constexpr inline static std::size_t data_size_for(int32_t insts) { - return ((sizeof(ranges[0]) + sizeof(inst_ids[0])) * insts) + ((insts + 7) / 8); + return ((sizeof(ranges[0]) + sizeof(inst_ids[0])) * insts) + + cudf::util::div_rounding_up_unsafe(insts, 8); } - CUDF_HOST_DEVICE inline static int32_t alloc_size(int32_t insts) + /** + * @brief Compute the aligned memory allocation size. + */ + constexpr inline static std::size_t alloc_size(int32_t insts) { - int32_t size = sizeof(relist); - size += data_size_for(insts); - size = ((size + 7) / 8) * 8; // align it too - return size; + return cudf::util::round_up_unsafe(data_size_for(insts) + sizeof(relist), + sizeof(ranges[0])); } - CUDF_HOST_DEVICE inline relist() {} + struct alignas(16) restate { + int2 range; + int32_t inst_id; + int32_t reserved; + }; - CUDF_HOST_DEVICE inline relist(int16_t insts, u_char* data = nullptr) : listsize(insts) + __device__ __forceinline__ relist(int16_t insts, u_char* data = nullptr) + : masksize(cudf::util::div_rounding_up_unsafe(insts, 8)) { auto ptr = data == nullptr ? reinterpret_cast<u_char*>(this) + sizeof(relist) : data; ranges = reinterpret_cast<int2*>(ptr); - ptr += listsize * sizeof(ranges[0]); + ptr += insts * sizeof(ranges[0]); inst_ids = reinterpret_cast<int16_t*>(ptr); - ptr += listsize * sizeof(inst_ids[0]); + ptr += insts * sizeof(inst_ids[0]); mask = ptr; reset(); } - CUDF_HOST_DEVICE inline void reset() + __device__ __forceinline__ void reset() { - memset(mask, 0, (listsize + 7) / 8); + memset(mask, 0, masksize); size = 0; } - __device__ inline bool activate(int32_t i, int32_t begin, int32_t end) + __device__ __forceinline__ bool activate(int32_t id, int32_t begin, int32_t end) { - if (readMask(i)) return false; - writeMask(true, i); - inst_ids[size] = static_cast<int16_t>(i); + if (readMask(id)) { return false; } + writeMask(id); + inst_ids[size] = static_cast<int16_t>(id); ranges[size] = int2{begin, end}; ++size; return true; } - __device__ inline void writeMask(bool v, int32_t pos) + __device__ __forceinline__ restate get_state(int16_t idx) const { - u_char uc = 1 << (pos & 7); - if (v) - mask[pos >> 3] |= uc; - else - mask[pos >> 3] &= ~uc; + return restate{ranges[idx], inst_ids[idx]}; } - __device__ inline bool readMask(int32_t pos) + __device__ __forceinline__ int16_t get_size() const { return size; } + + private: + int16_t size{}; + int16_t const masksize; + int32_t reserved; + int2* __restrict__ ranges; // pair per instruction + int16_t* __restrict__ inst_ids; // one per instruction + u_char* __restrict__ mask; // bit per instruction + + __device__ __forceinline__ void writeMask(int32_t pos) const { - u_char uc = mask[pos >> 3]; - return static_cast<bool>((uc >> (pos & 7)) & 1); + u_char const uc = 1 << (pos & 7); + mask[pos >> 3] |= uc; } -}; -/** - * @brief This manages the two relist instances required by the regexec function.
- */ -struct reljunk { - relist* list1; - relist* list2; - int32_t starttype{}; - char32_t startchar{}; - - __host__ __device__ reljunk(relist* list1, relist* list2, int32_t stype, char32_t schar) - : list1(list1), list2(list2) + __device__ __forceinline__ bool readMask(int32_t pos) const { - if (starttype == CHAR || starttype == BOL) { - starttype = stype; - startchar = schar; - } + u_char const uc = mask[pos >> 3]; + return static_cast<bool>((uc >> (pos & 7)) & 1); } }; -__device__ inline void swaplist(relist*& l1, relist*& l2) +__device__ __forceinline__ reprog_device::reljunk::reljunk(relist* list1, + relist* list2, + reinst const inst) + : list1(list1), list2(list2) +{ + if (inst.type == CHAR || inst.type == BOL) { + starttype = inst.type; + startchar = inst.u1.c; + } +} + +__device__ __forceinline__ void reprog_device::reljunk::swaplist() { - relist* tmp = l1; - l1 = l2; - l2 = tmp; + auto tmp = list1; + list1 = list2; + list2 = tmp; } /** @@ -138,15 +138,13 @@ __device__ inline void swaplist(relist*& l1, relist*& l2) * @param codepoint_flags Used for mapping a character to type for builtin classes. * @return true if the character matches */ -__device__ inline bool reclass_device::is_match(char32_t ch, const uint8_t* codepoint_flags) +__device__ __forceinline__ bool reclass_device::is_match(char32_t const ch, + uint8_t const* codepoint_flags) const { - if (thrust::any_of(thrust::seq, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(count), - [ch, this] __device__(int i) { - return ((ch >= literals[i * 2]) && (ch <= literals[(i * 2) + 1])); - })) - return true; + for (int i = 0; i < count; ++i) { + if ((ch >= literals[i * 2]) && (ch <= literals[(i * 2) + 1])) { return true; } + } + if (!builtins) return false; uint32_t codept = utf8_to_codepoint(ch); if (codept > 0x00FFFF) return false; @@ -167,20 +165,18 @@ __device__ inline bool reclass_device::is_match(char32_t ch, const uint8_t* code return false; } -__device__ inline reinst* reprog_device::get_inst(int32_t idx) const +__device__ __forceinline__ reinst reprog_device::get_inst(int32_t id) const { return _insts[id]; } + +__device__ __forceinline__ reclass_device reprog_device::get_class(int32_t id) const { - assert((idx >= 0) && (idx < _insts_count)); - return _insts + idx; + return _classes[id]; } -__device__ inline reclass_device reprog_device::get_class(int32_t idx) const +__device__ __forceinline__ bool reprog_device::is_empty() const { - assert((idx >= 0) && (idx < _classes_count)); - return _classes[idx]; + return insts_counts() == 0 || get_inst(0).type == END; } -__device__ inline int32_t* reprog_device::startinst_ids() const { return _startinst_ids; } - /** * @brief Evaluate a specific string against regex pattern compiled to this instance. * @@ -195,35 +191,36 @@ __device__ inline int32_t* reprog_device::startinst_ids() const { return _starti * @param group_id Index of the group to match in a multi-group regex pattern.
* @return >0 if match found */ -__device__ inline int32_t reprog_device::regexec( - string_view const& dstr, reljunk& jnk, int32_t& begin, int32_t& end, int32_t group_id) +__device__ __forceinline__ int32_t reprog_device::regexec(string_view const dstr, + reljunk jnk, + cudf::size_type& begin, + cudf::size_type& end, + cudf::size_type const group_id) const { - int32_t match = 0; - auto checkstart = jnk.starttype; - auto pos = begin; - auto eos = end; - char32_t c = 0; - auto last_character = false; + int32_t match = 0; + auto pos = begin; + auto eos = end; + char_utf8 c = 0; + auto checkstart = jnk.starttype != 0; + auto last_character = false; + string_view::const_iterator itr = string_view::const_iterator(dstr, pos); jnk.list1->reset(); do { - /* fast check for first char */ + // fast check for first CHAR or BOL if (checkstart) { + auto startchar = static_cast<char_utf8>(jnk.startchar); switch (jnk.starttype) { - case CHAR: { - auto fidx = dstr.find(static_cast<char_utf8>(jnk.startchar), pos); - if (fidx < 0) return match; - pos = fidx; - break; - } - case BOL: { + case BOL: if (pos == 0) break; - if (jnk.startchar != '^') return match; + if (jnk.startchar != '^') { return match; } --pos; - int fidx = dstr.find(static_cast<char_utf8>('\n'), pos); - if (fidx < 0) return match; // update begin/end values? - pos = fidx + 1; + startchar = static_cast<char_utf8>('\n'); + case CHAR: { + auto const fidx = dstr.find(startchar, pos); + if (fidx < 0) { return match; } + pos = fidx + (jnk.starttype == BOL); break; } }
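One subtlety in the rewritten fast path above: the BOL case now deliberately falls through into the CHAR case, so a single dstr.find() serves both start types, and `pos = fidx + (jnk.starttype == BOL)` steps past the matched '\n' only on the BOL path. A sketch of the same switch with the intent annotated (the [[fallthrough]] marker is an editorial addition, not part of the patch):

switch (jnk.starttype) {
  case BOL:
    if (pos == 0) break;
    if (jnk.startchar != '^') { return match; }
    --pos;
    startchar = static_cast<char_utf8>('\n');  // now search for the line break
    [[fallthrough]];                           // shared find() below
  case CHAR: {
    auto const fidx = dstr.find(startchar, pos);
    if (fidx < 0) { return match; }
    pos = fidx + (jnk.starttype == BOL);       // BOL resumes just after the '\n'
    break;
  }
}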
@@ -231,128 +228,114 @@ __device__ inline int32_t reprog_device::regexec( } if (((eos < 0) || (pos < eos)) && match == 0) { - int32_t i = 0; - auto ids = startinst_ids(); - while (ids[i] >= 0) - jnk.list1->activate(ids[i++], (group_id == 0 ? pos : -1), -1); + auto ids = _startinst_ids; + while (*ids >= 0) + jnk.list1->activate(*ids++, (group_id == 0 ? pos : -1), -1); } - last_character = (pos >= dstr.length()); + last_character = itr.byte_offset() >= dstr.size_bytes(); - c = static_cast<char32_t>(last_character ? 0 : *itr); + c = last_character ? 0 : *itr; - // expand LBRA, RBRA, BOL, EOL, BOW, NBOW, and OR + // expand the non-character types like: LBRA, RBRA, BOL, EOL, BOW, NBOW, and OR bool expanded = false; do { jnk.list2->reset(); expanded = false; - for (int16_t i = 0; i < jnk.list1->size; i++) { - auto inst_id = static_cast<int32_t>(jnk.list1->inst_ids[i]); - int2& range = jnk.list1->ranges[i]; - const reinst* inst = get_inst(inst_id); + for (int16_t i = 0; i < jnk.list1->get_size(); i++) { + auto state = jnk.list1->get_state(i); + auto range = state.range; + auto const inst = get_inst(state.inst_id); int32_t id_activate = -1; - switch (inst->type) { + switch (inst.type) { case CHAR: case ANY: case ANYNL: case CCLASS: case NCCLASS: - case END: id_activate = inst_id; break; + case END: id_activate = state.inst_id; break; case LBRA: - if (inst->u1.subid == group_id) range.x = pos; - id_activate = inst->u2.next_id; + if (inst.u1.subid == group_id) range.x = pos; + id_activate = inst.u2.next_id; expanded = true; break; case RBRA: - if (inst->u1.subid == group_id) range.y = pos; - id_activate = inst->u2.next_id; + if (inst.u1.subid == group_id) range.y = pos; + id_activate = inst.u2.next_id; expanded = true; break; case BOL: - if ((pos == 0) || - ((inst->u1.c == '^') && (dstr[pos - 1] == static_cast<char_utf8>('\n')))) { - id_activate = inst->u2.next_id; + if ((pos == 0) || ((inst.u1.c == '^') && (dstr[pos - 1] == '\n'))) { + id_activate = inst.u2.next_id; expanded = true; } break; case EOL: - if (last_character || (c == '\n' && inst->u1.c == '$')) { - id_activate = inst->u2.next_id; - expanded = true; - } - break; - case BOW: { - auto codept = utf8_to_codepoint(c); - auto last_c = static_cast<char32_t>(pos ? dstr[pos - 1] : 0); - auto last_codept = utf8_to_codepoint(last_c); - bool cur_alphaNumeric = (codept < 0x010000) && IS_ALPHANUM(_codepoint_flags[codept]); - bool last_alphaNumeric = - (last_codept < 0x010000) && IS_ALPHANUM(_codepoint_flags[last_codept]); - if (cur_alphaNumeric != last_alphaNumeric) { - id_activate = inst->u2.next_id; + if (last_character || (c == '\n' && inst.u1.c == '$')) { + id_activate = inst.u2.next_id; expanded = true; } break; - } + case BOW: case NBOW: { - auto codept = utf8_to_codepoint(c); - auto last_c = static_cast<char32_t>(pos ? dstr[pos - 1] : 0); - auto last_codept = utf8_to_codepoint(last_c); - bool cur_alphaNumeric = (codept < 0x010000) && IS_ALPHANUM(_codepoint_flags[codept]); - bool last_alphaNumeric = + auto const codept = utf8_to_codepoint(c); + auto const last_c = pos > 0 ? dstr[pos - 1] : 0; + auto const last_codept = utf8_to_codepoint(last_c); + + bool const cur_alphaNumeric = + (codept < 0x010000) && IS_ALPHANUM(_codepoint_flags[codept]); + bool const last_alphaNumeric = (last_codept < 0x010000) && IS_ALPHANUM(_codepoint_flags[last_codept]); - if (cur_alphaNumeric == last_alphaNumeric) { - id_activate = inst->u2.next_id; + if ((cur_alphaNumeric == last_alphaNumeric) != (inst.type == BOW)) { + id_activate = inst.u2.next_id; expanded = true; } break; }
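The merged BOW/NBOW case folds the two formerly duplicated blocks into one predicate: BOW fires when the alphanumeric class changes between the previous and current character, NBOW when it does not. A self-contained sanity check of `(cur_alphaNumeric == last_alphaNumeric) != (inst.type == BOW)` (names shortened; illustrative only):

// Folded word-boundary predicate: true means "activate the next instruction".
constexpr bool activates(bool cur_alnum, bool last_alnum, bool is_bow)
{
  return (cur_alnum == last_alnum) != is_bow;
}
static_assert(activates(true, false, true), "BOW fires on a class transition");
static_assert(!activates(true, true, true), "BOW stays quiet inside a word");
static_assert(activates(true, true, false), "NBOW fires when there is no transition");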
case OR: - jnk.list2->activate(inst->u1.right_id, range.x, range.y); - id_activate = inst->u2.left_id; + jnk.list2->activate(inst.u1.right_id, range.x, range.y); + id_activate = inst.u2.left_id; expanded = true; break; } if (id_activate >= 0) jnk.list2->activate(id_activate, range.x, range.y); } - swaplist(jnk.list1, jnk.list2); + jnk.swaplist(); } while (expanded); - // execute + // execute instructions bool continue_execute = true; jnk.list2->reset(); - for (int16_t i = 0; continue_execute && i < jnk.list1->size; i++) { - auto inst_id = static_cast<int32_t>(jnk.list1->inst_ids[i]); - int2& range = jnk.list1->ranges[i]; - const reinst* inst = get_inst(inst_id); + for (int16_t i = 0; continue_execute && i < jnk.list1->get_size(); i++) { + auto const state = jnk.list1->get_state(i); + auto const range = state.range; + auto const inst = get_inst(state.inst_id); int32_t id_activate = -1; - switch (inst->type) { + switch (inst.type) { case CHAR: - if (inst->u1.c == c) id_activate = inst->u2.next_id; + if (inst.u1.c == c) id_activate = inst.u2.next_id; break; case ANY: - if (c != '\n') id_activate = inst->u2.next_id; + if (c != '\n') id_activate = inst.u2.next_id; break; - case ANYNL: id_activate = inst->u2.next_id; break; + case ANYNL: id_activate = inst.u2.next_id; break; + case NCCLASS: case CCLASS: { - reclass_device cls = get_class(inst->u1.cls_id); - if (cls.is_match(c, _codepoint_flags)) id_activate = inst->u2.next_id; - break; - } - case NCCLASS: { - reclass_device cls = get_class(inst->u1.cls_id); - if (!cls.is_match(c, _codepoint_flags)) id_activate = inst->u2.next_id; + auto const cls = get_class(inst.u1.cls_id); + if (cls.is_match(static_cast<char32_t>(c), _codepoint_flags) == (inst.type == CCLASS)) { + id_activate = inst.u2.next_id; + } break; } case END: match = 1; begin = range.x; end = group_id == 0 ? pos : range.y; - + // done with execute continue_execute = false; break; } @@ -362,18 +345,18 @@ __device__ inline int32_t reprog_device::regexec( ++pos; ++itr; - swaplist(jnk.list1, jnk.list2); - checkstart = jnk.list1->size > 0 ?
0 : 1; - } while (!last_character && (jnk.list1->size > 0 || match == 0)); + jnk.swaplist(); + checkstart = jnk.list1->get_size() == 0; + } while (!last_character && (!checkstart || !match)); return match; } template <int stack_size> -__device__ inline int32_t reprog_device::find(int32_t idx, - string_view const& dstr, - int32_t& begin, - int32_t& end) +__device__ __forceinline__ int32_t reprog_device::find(int32_t idx, + string_view const dstr, + cudf::size_type& begin, + cudf::size_type& end) const { int32_t rtn = call_regexec<stack_size>(idx, dstr, begin, end); if (rtn <= 0) begin = end = -1; return rtn; } @@ -381,11 +364,11 @@ __device__ inline int32_t reprog_device::find(int32_t idx, template <int stack_size> -__device__ inline match_result reprog_device::extract(cudf::size_type idx, - string_view const& dstr, - cudf::size_type begin, - cudf::size_type end, - cudf::size_type group_id) +__device__ __forceinline__ match_result reprog_device::extract(cudf::size_type idx, + string_view const dstr, + cudf::size_type begin, + cudf::size_type end, + cudf::size_type const group_id) const { end = begin + 1; return call_regexec<stack_size>(idx, dstr, begin, end, group_id + 1) > 0 @@ -394,28 +377,29 @@ __device__ inline match_result reprog_device::extract(cudf::size_type idx, template <int stack_size> -__device__ inline int32_t reprog_device::call_regexec( - int32_t idx, string_view const& dstr, int32_t& begin, int32_t& end, int32_t group_id) +__device__ __forceinline__ int32_t reprog_device::call_regexec(int32_t idx, + string_view const dstr, + cudf::size_type& begin, + cudf::size_type& end, + cudf::size_type const group_id) const { u_char data1[stack_size], data2[stack_size]; - auto const stype = get_inst(_startinst_id)->type; - auto const schar = get_inst(_startinst_id)->u1.c; - relist list1(static_cast<int16_t>(_insts_count), data1); relist list2(static_cast<int16_t>(_insts_count), data2); - reljunk jnk(&list1, &list2, stype, schar); + reljunk jnk(&list1, &list2, get_inst(_startinst_id)); return regexec(dstr, jnk, begin, end, group_id); } template <> -__device__ inline int32_t reprog_device::call_regexec<RX_STACK_ANY>( - int32_t idx, string_view const& dstr, int32_t& begin, int32_t& end, int32_t group_id) +__device__ __forceinline__ int32_t +reprog_device::call_regexec<RX_STACK_ANY>(int32_t idx, + string_view const dstr, + cudf::size_type& begin, + cudf::size_type& end, + cudf::size_type const group_id) const { - auto const stype = get_inst(_startinst_id)->type; - auto const schar = get_inst(_startinst_id)->u1.c; - auto const relists_size = relist::alloc_size(_insts_count); auto* listmem = reinterpret_cast<u_char*>(_relists_mem); // beginning of relist buffer; listmem += (idx * relists_size * 2); // two relist ptrs in reljunk: @@ -423,7 +407,7 @@ __device__ inline int32_t reprog_device::call_regexec( auto* list1 = new (listmem) relist(static_cast<int16_t>(_insts_count)); auto* list2 = new (listmem + relists_size) relist(static_cast<int16_t>(_insts_count)); - reljunk jnk(list1, list2, stype, schar); + reljunk jnk(list1, list2, get_inst(_startinst_id)); return regexec(dstr, jnk, begin, end, group_id); }
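To make the state-memory arithmetic behind relist::alloc_size() concrete: data_size_for() packs one int2 range (8 bytes) and one int16_t instruction id per instruction, plus one mask bit per instruction rounded up to whole bytes, and alloc_size() then pads the total (including the relist header) to sizeof(int2). A standalone re-derivation for an arbitrary 100-instruction program (int2x is a host-side stand-in for CUDA's built-in int2):

#include <cstddef>
#include <cstdint>

struct int2x { int32_t x, y; };  // 8-byte stand-in for CUDA's int2

constexpr std::size_t data_size_for(int32_t insts)
{
  return (sizeof(int2x) + sizeof(int16_t)) * insts + (insts + 7) / 8;
}
// 100 instructions: 800 bytes of ranges + 200 bytes of ids + 13 mask bytes
static_assert(data_size_for(100) == 800 + 200 + 13, "packing math as in relist");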
diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu index 3bcf55cf069..70d6079972a 100644 --- a/cpp/src/strings/regex/regexec.cu +++ b/cpp/src/strings/regex/regexec.cu @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -27,39 +28,6 @@ namespace cudf { namespace strings { namespace detail { -namespace { -/** - * @brief Converts UTF-8 string into fixed-width 32-bit character vector. - * - * No character conversion occurs. - * Each UTF-8 character is promoted into a 32-bit value. - * The last entry in the returned vector will be a 0 value. - * The fixed-width vector makes it easier to compile and faster to execute. - * - * @param pattern Regular expression encoded with UTF-8. - * @return Fixed-width 32-bit character vector. - */ -std::vector<char32_t> string_to_char32_vector(std::string const& pattern) -{ - size_type size = static_cast<size_type>(pattern.size()); - size_type count = std::count_if(pattern.cbegin(), pattern.cend(), [](char ch) { - return is_begin_utf8_char(static_cast<uint8_t>(ch)); - }); - std::vector<char32_t> result(count + 1); - char32_t* output_ptr = result.data(); - const char* input_ptr = pattern.data(); - for (size_type idx = 0; idx < size; ++idx) { - char_utf8 output_character = 0; - size_type ch_width = to_char_utf8(input_ptr, output_character); - input_ptr += ch_width; - idx += ch_width - 1; - *output_ptr++ = output_character; - } - result[count] = 0; // last entry set to 0 - return result; -} - -} // namespace // Copy reprog primitive values reprog_device::reprog_device(reprog& prog) @@ -89,75 +57,76 @@ std::unique_ptr<reprog_device, std::function<void(reprog_device*)>> reprog_devic size_type strings_count, rmm::cuda_stream_view stream) { - std::vector<char32_t> pattern32 = string_to_char32_vector(pattern); // compile pattern into host object - reprog h_prog = reprog::create_from(pattern32.data(), flags); + reprog h_prog = reprog::create_from(pattern, flags); + // compute size to hold all the member data - auto insts_count = h_prog.insts_count(); - auto classes_count = h_prog.classes_count(); - auto starts_count = h_prog.starts_count(); - // compute size of each section; make sure each is aligned appropriately - auto insts_size = - cudf::util::round_up_safe(insts_count * sizeof(_insts[0]), sizeof(size_t)); - auto startids_size = - cudf::util::round_up_safe(starts_count * sizeof(_startinst_ids[0]), sizeof(size_t)); - auto classes_size = - cudf::util::round_up_safe(classes_count * sizeof(_classes[0]), sizeof(size_t)); - for (int32_t idx = 0; idx < classes_count; ++idx) + auto const insts_count = h_prog.insts_count(); + auto const classes_count = h_prog.classes_count(); + auto const starts_count = h_prog.starts_count(); + + // compute size of each section + auto insts_size = insts_count * sizeof(_insts[0]); + auto startids_size = starts_count * sizeof(_startinst_ids[0]); + auto classes_size = classes_count * sizeof(_classes[0]); + for (auto idx = 0; idx < classes_count; ++idx) classes_size += static_cast<int32_t>((h_prog.class_at(idx).literals.size()) * sizeof(char32_t)); - size_t memsize = insts_size + startids_size + classes_size; - size_t rlm_size = 0; - // check memory size needed for executing regex - if (insts_count > RX_LARGE_INSTS) { - auto relist_alloc_size = relist::alloc_size(insts_count); - rlm_size = relist_alloc_size * 2L * strings_count; // reljunk has 2 relist ptrs - } + // make sure each section is aligned for the subsequent section's data type + auto const memsize = cudf::util::round_up_safe(insts_size, sizeof(_startinst_ids[0])) + + cudf::util::round_up_safe(startids_size, sizeof(_classes[0])) + + cudf::util::round_up_safe(classes_size, sizeof(char32_t));
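The chained round_up_safe() calls encode the layout rule for the flat buffer: each section is padded so the section that follows starts at an offset aligned for its element type (start ids are int32_t; reclass_device is declared alignas(16) earlier in this patch). A small standalone illustration of that rule, with the section sizes 93 and 100 chosen arbitrarily:

#include <cstddef>
#include <cstdint>

// Minimal stand-in for cudf::util::round_up_safe, shown only to make the
// padding rule visible; not the library's implementation.
constexpr std::size_t round_up(std::size_t value, std::size_t multiple)
{
  return ((value + multiple - 1) / multiple) * multiple;
}
static_assert(round_up(93, sizeof(int32_t)) == 96, "insts section padded for start ids");
static_assert(round_up(100, 16) == 112, "start ids padded for 16-byte-aligned classes");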
+ + // allocate memory to store all the prog data in a flat contiguous buffer + std::vector<u_char> h_buffer(memsize); // copy everything into here; + auto h_ptr = h_buffer.data(); // this is our running host ptr; + auto d_buffer = new rmm::device_buffer(memsize, stream); // output device memory; + auto d_ptr = reinterpret_cast<u_char*>(d_buffer->data()); // running device pointer - // allocate memory to store prog data - std::vector<u_char> h_buffer(memsize); - u_char* h_ptr = h_buffer.data(); // running pointer - auto* d_buffer = new rmm::device_buffer(memsize, stream); - u_char* d_ptr = reinterpret_cast<u_char*>(d_buffer->data()); // running device pointer // put everything into a flat host buffer first reprog_device* d_prog = new reprog_device(h_prog); - // copy the instructions array first (fixed-size structs) - reinst* insts = reinterpret_cast<reinst*>(h_ptr); - memcpy(insts, h_prog.insts_data(), insts_size); - h_ptr += insts_size; // next section + + // copy the instructions array first (fixed-sized structs) + memcpy(h_ptr, h_prog.insts_data(), insts_size); d_prog->_insts = reinterpret_cast<reinst*>(d_ptr); + + // point to the end for the next section + insts_size = cudf::util::round_up_safe(insts_size, sizeof(_startinst_ids[0])); + h_ptr += insts_size; d_ptr += insts_size; - // copy the startinst_ids next (ints) - int32_t* startinst_ids = reinterpret_cast<int32_t*>(h_ptr); - memcpy(startinst_ids, h_prog.starts_data(), startids_size); - h_ptr += startids_size; // next section + // copy the startinst_ids next + memcpy(h_ptr, h_prog.starts_data(), startids_size); d_prog->_startinst_ids = reinterpret_cast<int32_t*>(d_ptr); + + // next section; align the size for next data type + startids_size = cudf::util::round_up_safe(startids_size, sizeof(_classes[0])); + h_ptr += startids_size; d_ptr += startids_size; // copy classes into flat memory: [class1,class2,...][char32 arrays] - reclass_device* classes = reinterpret_cast<reclass_device*>(h_ptr); - d_prog->_classes = reinterpret_cast<reclass_device*>(d_ptr); + auto classes = reinterpret_cast<reclass_device*>(h_ptr); + d_prog->_classes = reinterpret_cast<reclass_device const*>(d_ptr); // get pointer to the end to handle variable length data - u_char* h_end = h_ptr + (classes_count * sizeof(reclass_device)); - u_char* d_end = d_ptr + (classes_count * sizeof(reclass_device)); + auto h_end = h_ptr + (classes_count * sizeof(reclass_device)); + auto d_end = d_ptr + (classes_count * sizeof(reclass_device)); // place each class and append the variable length data for (int32_t idx = 0; idx < classes_count; ++idx) { reclass& h_class = h_prog.class_at(idx); - reclass_device d_class; - d_class.builtins = h_class.builtins; - d_class.count = h_class.literals.size() / 2; - d_class.literals = reinterpret_cast<char32_t*>(d_end); - memcpy(classes++, &d_class, sizeof(d_class)); + reclass_device d_class{h_class.builtins, + static_cast<int32_t>(h_class.literals.size() / 2), + reinterpret_cast<char32_t const*>(d_end)}; + *classes++ = d_class; memcpy(h_end, h_class.literals.c_str(), h_class.literals.size() * sizeof(char32_t)); h_end += h_class.literals.size() * sizeof(char32_t); d_end += h_class.literals.size() * sizeof(char32_t); } + // initialize the rest of the elements - d_prog->_insts_count = insts_count; - d_prog->_starts_count = starts_count; - d_prog->_classes_count = classes_count; d_prog->_codepoint_flags = codepoint_flags; + // allocate execute memory if needed rmm::device_buffer* d_relists{}; - if (rlm_size > 0) { + if (insts_count > RX_LARGE_INSTS) { + // two relist state structures are needed for execute per string + auto const rlm_size = relist::alloc_size(insts_count) * 2 * strings_count; d_relists = new rmm::device_buffer(rlm_size, stream); d_prog->_relists_mem = d_relists->data(); } @@ -165,7 +134,8 @@ std::unique_ptr<reprog_device, std::function<void(reprog_device*)>> reprog_devic // copy flat prog to device memory CUDF_CUDA_TRY(cudaMemcpyAsync( d_buffer->data(), h_buffer.data(), memsize, cudaMemcpyHostToDevice, stream.value())); - // + + // build deleter to cleanup device memory auto deleter = [d_buffer, d_relists](reprog_device* t) { t->destroy(); delete d_buffer;
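create() hands the device program back through a unique_ptr whose deleter captures the flat buffer and the optional relist buffer, so all of the device allocations are released together when the handle is dropped. A reduced model of that ownership wiring (host-only toy types standing in for the device objects; not the patch's code):

#include <functional>
#include <memory>

struct device_prog { /* stand-in for the device-side program handle */ };

// The deleter owns the auxiliary buffers (raw arrays here in place of
// rmm::device_buffer) and frees them together with the program object.
std::unique_ptr<device_prog, std::function<void(device_prog*)>> make_prog()
{
  auto* buffer  = new char[64];  // stand-in for the flat device buffer
  auto* relists = new char[64];  // stand-in for the relist working memory
  auto deleter  = [buffer, relists](device_prog* p) {
    delete[] buffer;
    delete[] relists;
    delete p;
  };
  return {new device_prog{}, deleter};
}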
diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index d496b46bc36..7a3e0fb0243 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -283,7 +283,7 @@ auto make_strings_children(Func fn, for_each_fn(fn); } - return std::make_pair(std::move(offsets_column), std::move(chars_column)); + return std::pair(std::move(offsets_column), std::move(chars_column)); } } // namespace @@ -345,7 +345,7 @@ std::pair<std::unique_ptr<column>, int64_t> repeat_strings_output_sizes( auto const strings_count = input.size(); if (strings_count == 0) { - return std::make_pair(make_empty_column(type_to_id<size_type>()), int64_t{0}); + return std::pair(make_empty_column(type_to_id<size_type>()), int64_t{0}); } auto output_sizes = make_numeric_column( @@ -374,7 +374,7 @@ std::pair<std::unique_ptr<column>, int64_t> repeat_strings_output_sizes( int64_t{0}, thrust::plus{}); - return std::make_pair(std::move(output_sizes), total_bytes); + return std::pair(std::move(output_sizes), total_bytes); } } // namespace detail diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index d7cc72fdfff..ac073f8efbc 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -20,7 +20,6 @@ #include #include -#include #include #include @@ -28,12 +27,8 @@ #include #include -#include #include -#include -#include - -#include +#include namespace cudf { namespace strings { @@ -42,65 +37,33 @@ namespace detail { /** * @copydoc create_string_vector_from_column */ -rmm::device_uvector<string_view> create_string_vector_from_column(cudf::strings_column_view strings, - rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - - rmm::device_uvector<string_view> strings_vector(strings.size(), stream); - string_view* d_strings = strings_vector.data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator<size_type>(0), - strings.size(), - [d_column, d_strings] __device__(size_type idx) { - if (d_column.is_null(idx)) - d_strings[idx] = string_view(nullptr, 0); - else - d_strings[idx] = d_column.element<string_view>(idx); - }); - return strings_vector; -} - -/** - * @copydoc child_offsets_from_string_vector - */ -std::unique_ptr<column> child_offsets_from_string_vector( - cudf::device_span<string_view> strings, +rmm::device_uvector<string_view> create_string_vector_from_column( + cudf::strings_column_view const input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return child_offsets_from_string_iterator(strings.begin(), strings.size(), stream, mr); -} + auto d_strings = column_device_view::create(input.parent(), stream); + + auto strings_vector = rmm::device_uvector<string_view>(input.size(), stream, mr); + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator<size_type>(0), + thrust::make_counting_iterator<size_type>(input.size()), + strings_vector.begin(), + [d_strings = *d_strings] __device__(size_type idx) { + // placeholder for factory function that takes a span of string_views + auto const null_string_view = string_view{nullptr, 0}; + if (d_strings.is_null(idx)) { return null_string_view; } + auto const d_str = d_strings.element<string_view>(idx); + // special case when the entire column is filled with empty strings: + // here the empty d_str may have a d_str.data() == nullptr + auto const empty_string_view = string_view{}; + return d_str.empty() ?
empty_string_view : d_str; + }); -/** - * @copydoc child_chars_from_string_vector - */ -std::unique_ptr<column> child_chars_from_string_vector(cudf::device_span<string_view> strings, - column_view const& offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const d_strings = strings.data(); - auto const bytes = cudf::detail::get_value<size_type>(offsets, strings.size(), stream); - auto const d_offsets = offsets.data<int32_t>(); - - // create column - auto chars_column = create_chars_child_column(bytes, stream, mr); - // get it's view - auto d_chars = chars_column->mutable_view().data<char>(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator<size_type>(0), - strings.size(), - [d_strings, d_offsets, d_chars] __device__(size_type idx) { - string_view const d_str = d_strings[idx]; - memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes()); - }); - - return chars_column; + return strings_vector; } -// std::unique_ptr<column> create_chars_child_column(cudf::size_type total_bytes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/structs/structs_column_view.cpp b/cpp/src/structs/structs_column_view.cpp index db9496f18be..7d8c8837d2d 100644 --- a/cpp/src/structs/structs_column_view.cpp +++ b/cpp/src/structs/structs_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "cudf/utilities/error.hpp" #include #include +#include <cudf/utilities/error.hpp> namespace cudf { @@ -25,6 +25,8 @@ structs_column_view::structs_column_view(column_view const& rhs) : column_view{r CUDF_EXPECTS(type().id() == type_id::STRUCT, "structs_column_view only supports struct columns"); } +column_view structs_column_view::parent() const { return *this; } + column_view structs_column_view::get_sliced_child(int index) const { std::vector<column_view> children; diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 852a32bed3d..a2c173cae5f 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -371,7 +371,7 @@ std::tuple<cudf::table_view, std::vector<rmm::device_buffer>> superimpose_paren auto [new_child_mask, null_count] = [&] { if (not child.nullable()) { // Adopt parent STRUCT's null mask. - return std::make_pair(structs_column.null_mask(), 0); + return std::pair(structs_column.null_mask(), 0); } // Both STRUCT and child are nullable. AND() for the child's new null mask. @@ -387,8 +387,8 @@ std::tuple<cudf::table_view, std::vector<rmm::device_buffer>> superimpose_paren stream, mr); ret_validity_buffers.push_back(std::move(new_mask)); - return std::make_pair( - reinterpret_cast<bitmask_type const*>(ret_validity_buffers.back().data()), null_count); + return std::pair(reinterpret_cast<bitmask_type const*>(ret_validity_buffers.back().data()), + null_count); }(); return cudf::column_view( diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index fb631b3f31f..404ecf1248c 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -144,8 +145,8 @@ struct byte_pair_encoding_fn { * @param rhs Second string. * @return The hash value to match with `d_map`.
*/ - __device__ hash_value_type compute_hash(cudf::string_view const& lhs, - cudf::string_view const& rhs) + __device__ cudf::hash_value_type compute_hash(cudf::string_view const& lhs, + cudf::string_view const& rhs) { __shared__ char shmem[48 * 1024]; // max for Pascal auto const total_size = lhs.size_bytes() + rhs.size_bytes() + 1; diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 31cc29a8d8a..24b10fc4a36 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -36,12 +36,12 @@ namespace detail { using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; -using merge_pairs_map_type = cuco::static_map; -using string_hasher_type = MurmurHash3_32; +using string_hasher_type = cudf::detail::MurmurHash3_32; } // namespace detail diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 2ed59c3ae0c..71f9e3f7043 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -278,8 +278,8 @@ uvector_pair data_normalizer::normalize(char const* d_strings, rmm::cuda_stream_view stream) const { if (num_strings == 0) - return std::make_pair(std::make_unique>(0, stream), - std::make_unique>(0, stream)); + return std::pair(std::make_unique>(0, stream), + std::make_unique>(0, stream)); // copy offsets to working memory size_t const num_offsets = num_strings + 1; @@ -294,8 +294,8 @@ uvector_pair data_normalizer::normalize(char const* d_strings, }); uint32_t const bytes_count = d_strings_offsets->element(num_strings, stream); if (bytes_count == 0) // if no bytes, nothing to do - return std::make_pair(std::make_unique>(0, stream), - std::make_unique>(0, stream)); + return std::pair(std::make_unique>(0, stream), + std::make_unique>(0, stream)); cudf::detail::grid_1d const grid{static_cast(bytes_count), THREADS_PER_BLOCK, 1}; size_t const threads_on_device = grid.num_threads_per_block * grid.num_blocks; diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index 31f579dc9d4..1e0c9c81fcd 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -42,7 +43,7 @@ struct make_pair_function { /** * @brief Hash the merge pair entry */ - __device__ cuco::pair_type operator()(cudf::size_type idx) + __device__ cuco::pair_type operator()(cudf::size_type idx) { auto const result = _hasher(d_strings.element(idx)); return cuco::make_pair(result, idx); @@ -105,9 +106,9 @@ std::unique_ptr initialize_merge_pairs_map( // Ensure capacity is at least (size/0.7) as documented here: // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182 auto merge_pairs_map = std::make_unique( - static_cast(input.size() * 2), // capacity is 2x; - std::numeric_limits::max(), // empty key; - -1, // empty value is not used + static_cast(input.size() * 2), // capacity is 2x; + std::numeric_limits::max(), // empty key; + -1, // empty value is not used hash_table_allocator_type{default_allocator{}, stream}, stream.value()); @@ -117,8 +118,8 @@ std::unique_ptr initialize_merge_pairs_map( merge_pairs_map->insert(iter, iter + input.size(), - cuco::detail::MurmurHash3_32{}, - thrust::equal_to{}, + cuco::detail::MurmurHash3_32{}, + thrust::equal_to{}, stream.value()); return merge_pairs_map; diff --git a/cpp/src/transform/bools_to_mask.cu 
b/cpp/src/transform/bools_to_mask.cu index 2cf4771890b..a1f49a5685f 100644 --- a/cpp/src/transform/bools_to_mask.cu +++ b/cpp/src/transform/bools_to_mask.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ std::pair, cudf::size_type> bools_to_mask( { CUDF_EXPECTS(input.type().id() == type_id::BOOL8, "Input is not of type bool"); - if (input.is_empty()) { return std::make_pair(std::make_unique(), 0); } + if (input.is_empty()) { return std::pair(std::make_unique(), 0); } auto input_device_view_ptr = column_device_view::create(input, stream); auto input_device_view = *input_device_view_ptr; @@ -45,12 +45,12 @@ std::pair, cudf::size_type> bools_to_mask( auto mask = detail::valid_if(input_begin, input_begin + input.size(), pred, stream, mr); - return std::make_pair(std::make_unique(std::move(mask.first)), mask.second); + return std::pair(std::make_unique(std::move(mask.first)), mask.second); } else { auto mask = detail::valid_if( input_device_view.begin(), input_device_view.end(), pred, stream, mr); - return std::make_pair(std::make_unique(std::move(mask.first)), mask.second); + return std::pair(std::make_unique(std::move(mask.first)), mask.second); } } diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 04821b09eab..60769665fca 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -57,7 +57,7 @@ std::pair, std::unique_ptr> encode( auto indices_column = cudf::detail::lower_bound( sorted_unique_keys->view(), input_table, column_order, null_precedence, stream, mr); - return std::make_pair(std::move(sorted_unique_keys), std::move(indices_column)); + return std::pair(std::move(sorted_unique_keys), std::move(indices_column)); } } // namespace detail diff --git a/cpp/src/transform/nans_to_nulls.cu b/cpp/src/transform/nans_to_nulls.cu index ee63e6d366f..42d41b44779 100644 --- a/cpp/src/transform/nans_to_nulls.cu +++ b/cpp/src/transform/nans_to_nulls.cu @@ -53,8 +53,7 @@ struct dispatch_nan_to_null { stream, mr); - return std::make_pair(std::make_unique(std::move(mask.first)), - mask.second); + return std::pair(std::make_unique(std::move(mask.first)), mask.second); } else { auto pred = [input_device_view] __device__(cudf::size_type idx) { return not(std::isnan(input_device_view.element(idx))); @@ -66,8 +65,7 @@ struct dispatch_nan_to_null { stream, mr); - return std::make_pair(std::make_unique(std::move(mask.first)), - mask.second); + return std::pair(std::make_unique(std::move(mask.first)), mask.second); } } @@ -85,7 +83,7 @@ struct dispatch_nan_to_null { std::pair, cudf::size_type> nans_to_nulls( column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (input.is_empty()) { return std::make_pair(std::make_unique(), 0); } + if (input.is_empty()) { return std::pair(std::make_unique(), 0); } return cudf::type_dispatcher(input.type(), dispatch_nan_to_null{}, input, stream, mr); } diff --git a/cpp/src/transform/one_hot_encode.cu b/cpp/src/transform/one_hot_encode.cu index 16aee349bb5..b1a8858f847 100644 --- a/cpp/src/transform/one_hot_encode.cu +++ b/cpp/src/transform/one_hot_encode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,7 +89,7 @@ struct one_hot_encode_launcher { auto views = cudf::split(all_encodings->view(), split_indices); table_view encodings_view{views}; - return std::make_pair(std::move(all_encodings), encodings_view); + return std::pair(std::move(all_encodings), encodings_view); } template , table_view> one_hot_encode(column_view const& { CUDF_EXPECTS(input.type() == categories.type(), "Mismatch type between input and categories."); - if (categories.is_empty()) { - return std::make_pair(make_empty_column(type_id::BOOL8), table_view{}); - } + if (categories.is_empty()) { return std::pair(make_empty_column(type_id::BOOL8), table_view{}); } if (input.is_empty()) { auto empty_data = make_empty_column(type_id::BOOL8); std::vector views(categories.size(), empty_data->view()); - return std::make_pair(std::move(empty_data), table_view{views}); + return std::pair(std::move(empty_data), table_view{views}); } return type_dispatcher(input.type(), one_hot_encode_launcher{}, input, categories, stream, mr); diff --git a/cpp/src/transpose/transpose.cu b/cpp/src/transpose/transpose.cu index b5b00b11a0f..a87cf60a252 100644 --- a/cpp/src/transpose/transpose.cu +++ b/cpp/src/transpose/transpose.cu @@ -37,7 +37,7 @@ std::pair, table_view> transpose(table_view const& input { // If there are no rows in the input, return successfully if (input.num_columns() == 0 || input.num_rows() == 0) { - return std::make_pair(std::make_unique(), table_view{}); + return std::pair(std::make_unique(), table_view{}); } // Check datatype homogeneity @@ -54,7 +54,7 @@ std::pair, table_view> transpose(table_view const& input auto splits = std::vector(splits_iter, splits_iter + input.num_rows() - 1); auto output_column_views = split(output_column->view(), splits, stream); - return std::make_pair(std::move(output_column), table_view(output_column_views)); + return std::pair(std::move(output_column), table_view(output_column_views)); } } // namespace detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e016f47616b..95c54d7596e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -250,6 +250,7 @@ ConfigureTest( copying/gather_tests.cpp copying/get_value_tests.cpp copying/pack_tests.cpp + copying/purge_nonempty_nulls_tests.cpp copying/sample_tests.cpp copying/scatter_tests.cpp copying/scatter_list_tests.cpp diff --git a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp index 64462669f90..28df893aff1 100644 --- a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp @@ -20,13 +20,13 @@ #include #include #include +#include #include #include #include #include -#include "cudf/utilities/error.hpp" #include #include diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 4e0e70bf15c..44a79e63cd8 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -645,7 +645,7 @@ TYPED_TEST(ListsStructsLeafTest, FromNonNested) 0, cudf::create_null_mask(2, cudf::mask_state::UNALLOCATED)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*col, *expected); } TYPED_TEST(ListsStructsLeafTest, FromNested) diff --git a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp new file mode 100644 index 
00000000000..77fd3f66ee5 --- /dev/null +++ b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf::test { + +using iterators::no_nulls; +using iterators::null_at; +using iterators::nulls_at; +using T = int32_t; // The actual type of the leaf node isn't really important. +using values_col_t = fixed_width_column_wrapper; +using offsets_col_t = fixed_width_column_wrapper; +using gather_map_t = fixed_width_column_wrapper; + +template +using LCW = cudf::test::lists_column_wrapper; + +struct PurgeNonEmptyNullsTest : public cudf::test::BaseFixture { + /// Helper to run gather() on a single column, and extract the single column from the result. + std::unique_ptr gather(column_view const& input, gather_map_t const& gather_map) + { + auto gathered = + cudf::gather(cudf::table_view{{input}}, gather_map, out_of_bounds_policy::NULLIFY); + return std::move(gathered->release()[0]); + } + + /// Verify that the result of `sanitize()` is equivalent to the unsanitized input, + /// except that the null rows are also empty. + template + void test_purge(ColumnViewT const& unpurged) + { + auto const purged = cudf::purge_nonempty_nulls(unpurged); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(unpurged.parent(), *purged); + EXPECT_FALSE(cudf::has_nonempty_nulls(*purged)); + } +}; + +// List. +TEST_F(PurgeNonEmptyNullsTest, SingleLevelList) +{ + auto const input = LCW{{{{1, 2, 3, 4}, null_at(2)}, + {5}, + {6, 7}, // <--- Will be set to NULL. Unsanitized row. + {8, 9, 10}}, + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 2, 3, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + // Selecting all rows from input, in different order. + auto const results = gather(input->view(), {1, 2, 0, 3}); + auto const results_list_view = lists_column_view(*results); + + auto const expected = LCW{{{5}, + {}, // NULL. + {{1, 2, 3, 4}, null_at(2)}, + {8, 9, 10}}, + null_at(1)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.offsets(), offsets_col_t{0, 1, 1, 5, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.child(), + values_col_t{{5, 1, 2, 3, 4, 8, 9, 10}, null_at(3)}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Test when gather selects rows preceded by unsanitized rows. + auto const results = gather(input->view(), {3, 100, 0}); + auto const expected = LCW{{ + {8, 9, 10}, + {}, // NULL. 
+ {{1, 2, 3, 4}, null_at(2)}, + }, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Test when gather selects rows followed by unsanitized rows. + auto const results = gather(input->view(), {1, 100, 0}); + auto const expected = LCW{{ + {5}, + {}, // NULL. + {{1, 2, 3, 4}, null_at(2)}, + }, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Test when gather selects unsanitized row specifically. + auto const results = gather(input->view(), {2}); + auto const results_lists_view = lists_column_view(*results); + auto const expected = LCW{{ + LCW{} // NULL. + }, + null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.child(), values_col_t{}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List>. +TEST_F(PurgeNonEmptyNullsTest, TwoLevelList) +{ + auto const input = + LCW{ + {{{1, 2, 3}, {4, 5, 6, 7}, {8}, {9, 1}, {2}}, + {{11, 12}, {13, 14, 15}, {16, 17, 18}, {19}}, + {{21}, {22, 23}, {24, 25, 26}}, + {{31, 32}, {33, 34, 35, 36}, {}, {37, 38}}, //<--- Will be set to NULL. Unsanitized row. + {{41}, {42, 43}}}, + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 3, 4, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + // Verify that gather() output is sanitized. + auto const results = gather(input->view(), {100, 3, 0, 1}); + auto const results_lists_view = lists_column_view(*results); + + auto const expected = LCW{{ + LCW{}, // NULL, because of out of bounds. + LCW{}, // NULL, because input row was null. + {{1, 2, 3}, {4, 5, 6, 7}, {8}, {9, 1}, {2}}, // i.e. input[0] + {{11, 12}, {13, 14, 15}, {16, 17, 18}, {19}} // i.e. input[1] + }, + nulls_at({0, 1})}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 0, 0, 5, 9}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + results_lists_view.child(), + LCW{ + {1, 2, 3}, {4, 5, 6, 7}, {8}, {9, 1}, {2}, {11, 12}, {13, 14, 15}, {16, 17, 18}, {19}}); + + auto const child_lists_view = lists_column_view(results_lists_view.child()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(child_lists_view.offsets(), + offsets_col_t{0, 3, 7, 8, 10, 11, 13, 16, 19, 20}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + child_lists_view.child(), + values_col_t{1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 11, 12, 13, 14, 15, 16, 17, 18, 19}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List>>. +TEST_F(PurgeNonEmptyNullsTest, ThreeLevelList) +{ + auto const input = LCW{{{{{1, 2}, {3}}, {{4, 5}, {6, 7}}, {{8, 8}, {}}, {{9, 1}}, {{2, 3}}}, + {{{11, 12}}, {{13}, {14, 15}}, {{16, 17, 18}}, {{19, 19}, {}}}, + {{{21, 21}}, {{22, 23}, {}}, {{24, 25}, {26}}}, + {{{31, 32}, {}}, + {{33, 34, 35}, {36}}, + {}, + {{37, 38}}}, //<--- Will be set to NULL. Unsanitized row. 
+ {{{41, 41, 41}}, {{42, 43}}}}, + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 3, 4, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + auto const results = gather(input->view(), {100, 3, 0, 1}); + auto const results_lists_view = lists_column_view(*results); + + auto const expected = LCW{ + { + LCW{}, // NULL, because of out of bounds. + LCW{}, // NULL, because input row was null. + {{{1, 2}, {3}}, {{4, 5}, {6, 7}}, {{8, 8}, {}}, {{9, 1}}, {{2, 3}}}, // i.e. input[0] + {{{11, 12}}, {{13}, {14, 15}}, {{16, 17, 18}}, {{19, 19}, {}}} // i.e. input[1] + }, + nulls_at({0, 1})}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 0, 0, 5, 9}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.child(), + LCW{{{1, 2}, {3}}, + {{4, 5}, {6, 7}}, + {{8, 8}, {}}, + {{9, 1}}, + {{2, 3}}, + {{11, 12}}, + {{13}, {14, 15}}, + {{16, 17, 18}}, + {{19, 19}, {}}}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List. +TEST_F(PurgeNonEmptyNullsTest, ListOfStrings) +{ + using T = string_view; + + auto const input = LCW{{{{"1", "22", "", "4444"}, null_at(2)}, + {"55555"}, + {"666666", "7777777"}, // <--- Will be set to NULL. Unsanitized row. + {"88888888", "999999999", "1010101010"}, + {"11", "22", "33", "44"}, + {"55", "66", "77", "88"}}, + no_nulls()} + .release(); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 2, 3, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + // Selecting all rows from input, in different order. + auto const results = gather(input->view(), {1, 2, 0, 3}); + auto const results_list_view = lists_column_view(*results); + + auto const expected = LCW{{{"55555"}, + {}, // NULL. + {{"1", "22", "", "4444"}, null_at(2)}, + {"88888888", "999999999", "1010101010"}}, + null_at(1)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.offsets(), offsets_col_t{0, 1, 1, 5, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + results_list_view.child(), + strings_column_wrapper{ + {"55555", "1", "22", "", "4444", "88888888", "999999999", "1010101010"}, null_at(3)}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Gathering from a sliced column. + auto const sliced = cudf::slice({input->view()}, {1, 5})[0]; // Lop off 1 row at each end. 
+  EXPECT_TRUE(cudf::may_have_nonempty_nulls(sliced)); + EXPECT_TRUE(cudf::has_nonempty_nulls(sliced)); + + auto const results = gather(sliced, {1, 2, 0, 3}); + auto const results_list_view = lists_column_view(*results); + auto const expected = LCW<T>{{ + {}, + {"88888888", "999999999", "1010101010"}, + {"55555"}, + {"11", "22", "33", "44"}, + }, + null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.offsets(), offsets_col_t{0, 0, 3, 4, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + results_list_view.child(), + strings_column_wrapper{ + "88888888", "999999999", "1010101010", "55555", "11", "22", "33", "44"}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List. +TEST_F(PurgeNonEmptyNullsTest, UnsanitizedListOfUnsanitizedStrings) +{ + auto strings = + strings_column_wrapper{ + {"1", "22", "3", "44", "5", "66", "7", "8888", "9", "1010"}, //<--- "8888" will be + // unsanitized. + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*strings)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*strings)); + + // Set strings nullmask, post construction. + set_null_mask(strings->mutable_view().null_mask(), 7, 8, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*strings)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*strings)); + + test_purge(strings_column_view{*strings}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + strings_column_view(*strings).offsets(), offsets_col_t{0, 1, 3, 4, 6, 7, 9, 10, 14, 15, 19} + // 10-14 indicates that "8888" is unsanitized. + ); + + // Construct a list column from the strings column. + auto const lists = make_lists_column(4, + offsets_col_t{0, 4, 5, 7, 10}.release(), + std::move(strings), + 0, + detail::make_null_mask(no_nulls(), no_nulls() + 4)); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*lists)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*lists)); + + // Set lists nullmask, post construction. + cudf::detail::set_null_mask(lists->mutable_view().null_mask(), 2, 3, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*lists)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*lists)); + + test_purge(lists_column_view{*lists}); + + // At this point, + // 1. {"66", "7"} will be unsanitized. + // 2. {"8888", "9", "1010"} will actually be {NULL, "9", "1010"}. + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + lists_column_view(*lists).offsets(), + offsets_col_t{0, 4, 5, 7, 10}); // 5-7 indicates that list row#2 is unsanitized. + + auto const result = gather(lists->view(), {1, 2, 0, 3}); + auto const expected = LCW<cudf::string_view>{{{"5"}, + {}, // NULL. + {"1", "22", "3", "44"}, + {{"", "9", "1010"}, null_at(0)}}, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); + + // Ensure row#2 has been sanitized. + auto const results_lists_view = lists_column_view(*result); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 1, 1, 5, 8} + // 1-1 indicates that row#2 is sanitized. + ); + + // Ensure that "8888" has been sanitized, and stored as "". + auto const child_strings_view = strings_column_view(results_lists_view.child()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(child_strings_view.offsets(), + offsets_col_t{0, 1, 2, 4, 5, 7, 7, 8, 12}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*result)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*result)); +} + +// Struct<List>. +TEST_F(PurgeNonEmptyNullsTest, StructOfList) +{ + auto const structs_input = + [] { + auto child = LCW<T>{{{{1, 2, 3, 4}, null_at(2)}, + {5}, + {6, 7}, //<--- Unsanitized row.
+ {8, 9, 10}}, + no_nulls()}; + EXPECT_FALSE(cudf::has_nonempty_nulls(child)); + return structs_column_wrapper{{child}, null_at(2)}; + }() + .release(); + + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*structs_input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*structs_input)); + + test_purge(structs_column_view{*structs_input}); + + // At this point, even though the structs column has a null at index 2, + // the child column has a non-empty list row at index 2: {6, 7}. + CUDF_TEST_EXPECT_COLUMNS_EQUAL(lists_column_view(structs_input->child(0)).child(), + values_col_t{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, null_at(2)}); + + { + // Test rearrange. + auto const gather_map = gather_map_t{1, 2, 0, 3}; + auto const result = gather(structs_input->view(), gather_map); + auto const expected_result = [] { + auto child = LCW{{{5}, + LCW{}, //<--- Now, sanitized. + {{1, 2, 3, 4}, null_at(2)}, + {8, 9, 10}}, + null_at(1)}; + return structs_column_wrapper{{child}, null_at(1)}; + }(); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_result); + auto const results_child = lists_column_view(result->child(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_child.offsets(), offsets_col_t{0, 1, 1, 5, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_child.child(), + values_col_t{{5, 1, 2, 3, 4, 8, 9, 10}, null_at(3)}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*result)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*result)); + } +} + +} // namespace cudf::test diff --git a/cpp/tests/copying/scatter_tests.cpp b/cpp/tests/copying/scatter_tests.cpp index 28ebb6cbcb6..306ab8a3d5c 100644 --- a/cpp/tests/copying/scatter_tests.cpp +++ b/cpp/tests/copying/scatter_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -573,6 +573,17 @@ TEST_F(ScatterStringsTests, ScatterScalarNoNulls) CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected_table); } +TEST_F(ScatterStringsTests, EmptyStrings) +{ + cudf::test::strings_column_wrapper input{"", "", ""}; + cudf::table_view t({input}); + + // Test for issue 10717: all-empty-string column scatter + auto map = cudf::test::fixed_width_column_wrapper({0}); + auto result = cudf::scatter(t, map, t); + CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), t); +} + template class BooleanMaskScatter : public cudf::test::BaseFixture { }; diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 4327a8b694b..bde8ccc6de7 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -39,6 +39,7 @@ TEST(CudaTryTest, Error) CUDA_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorLaunchFailure), "cudaErrorLaunchFailure unspecified launch failure"); } + TEST(CudaTryTest, Success) { EXPECT_NO_THROW(CUDF_CUDA_TRY(cudaSuccess)); } TEST(CudaTryTest, TryCatch) diff --git a/cpp/tests/groupby/m2_tests.cpp b/cpp/tests/groupby/m2_tests.cpp index be7d6c1ce05..6f5a04e3752 100644 --- a/cpp/tests/groupby/m2_tests.cpp +++ b/cpp/tests/groupby/m2_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -48,8 +48,7 @@ auto compute_M2(cudf::column_view const& keys, cudf::column_view const& values) auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys})); auto result = gb_obj.aggregate(requests); - return std::make_pair(std::move(result.first->release()[0]), - std::move(result.second[0].results[0])); + return std::pair(std::move(result.first->release()[0]), std::move(result.second[0].results[0])); } } // namespace diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp index 7c24c6267ca..593bb7c50af 100644 --- a/cpp/tests/groupby/merge_lists_tests.cpp +++ b/cpp/tests/groupby/merge_lists_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,8 +47,7 @@ auto merge_lists(vcol_views const& keys_cols, vcol_views const& values_cols) auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); auto result = gb_obj.aggregate(requests); - return std::make_pair(std::move(result.first->release()[0]), - std::move(result.second[0].results[0])); + return std::pair(std::move(result.first->release()[0]), std::move(result.second[0].results[0])); } } // namespace diff --git a/cpp/tests/groupby/merge_m2_tests.cpp b/cpp/tests/groupby/merge_m2_tests.cpp index 60067e78022..79ffebf146c 100644 --- a/cpp/tests/groupby/merge_m2_tests.cpp +++ b/cpp/tests/groupby/merge_m2_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,10 +67,9 @@ auto compute_partial_results(cudf::column_view const& keys, cudf::column_view co auto [out_keys, out_results] = gb_obj.aggregate(requests); auto const num_output_rows = out_keys->num_rows(); - return std::make_pair( - std::move(out_keys->release()[0]), - cudf::make_structs_column( - num_output_rows, std::move(out_results[0].results), 0, rmm::device_buffer{})); + return std::pair(std::move(out_keys->release()[0]), + cudf::make_structs_column( + num_output_rows, std::move(out_results[0].results), 0, rmm::device_buffer{})); } /** @@ -93,8 +92,7 @@ auto merge_M2(vcol_views const& keys_cols, vcol_views const& values_cols) auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); auto result = gb_obj.aggregate(requests); - return std::make_pair(std::move(result.first->release()[0]), - std::move(result.second[0].results[0])); + return std::pair(std::move(result.first->release()[0]), std::move(result.second[0].results[0])); } } // namespace diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp index 1e2f0c9fa9e..57f67f6b81a 100644 --- a/cpp/tests/groupby/merge_sets_tests.cpp +++ b/cpp/tests/groupby/merge_sets_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -47,8 +47,7 @@ auto merge_sets(vcol_views const& keys_cols, vcol_views const& values_cols) auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); auto result = gb_obj.aggregate(requests); - return std::make_pair(std::move(result.first->release()[0]), - std::move(result.second[0].results[0])); + return std::pair(std::move(result.first->release()[0]), std::move(result.second[0].results[0])); } } // namespace diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp index 81369beb2ec..d4e8b4cbf0f 100644 --- a/cpp/tests/groupby/rank_scan_tests.cpp +++ b/cpp/tests/groupby/rank_scan_tests.cpp @@ -29,11 +29,9 @@ namespace test { using namespace iterators; template -using input = fixed_width_column_wrapper; -using rank_result_col = fixed_width_column_wrapper; -using percent_result_t = - cudf::detail::target_type_t; -using percent_result_col = fixed_width_column_wrapper; +using input = fixed_width_column_wrapper; +using rank_result_col = fixed_width_column_wrapper; +using percent_result_col = fixed_width_column_wrapper; using null_iter_t = decltype(nulls_at({})); auto constexpr X = int32_t{0}; // Placeholder for NULL rows. @@ -45,27 +43,31 @@ inline void test_rank_scans(column_view const& keys, column_view const& expected_rank, column_view const& expected_percent_rank) { - test_single_scan(keys, - order, - keys, - expected_dense, - make_dense_rank_aggregation(), - null_policy::INCLUDE, - sorted::YES); - test_single_scan(keys, - order, - keys, - expected_rank, - make_rank_aggregation(), - null_policy::INCLUDE, - sorted::YES); - test_single_scan(keys, - order, - keys, - expected_percent_rank, - make_percent_rank_aggregation(), - null_policy::INCLUDE, - sorted::YES); + test_single_scan( + keys, + order, + keys, + expected_dense, + make_rank_aggregation(rank_method::DENSE, {}, null_policy::INCLUDE), + null_policy::INCLUDE, + sorted::YES); + test_single_scan( + keys, + order, + keys, + expected_rank, + make_rank_aggregation(rank_method::MIN, {}, null_policy::INCLUDE), + null_policy::INCLUDE, + sorted::YES); + test_single_scan( + keys, + order, + keys, + expected_percent_rank, + make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED), + null_policy::INCLUDE, + sorted::YES); } struct groupby_rank_scan_test : public BaseFixture { @@ -148,7 +150,7 @@ TYPED_TEST(typed_groupby_rank_scan_test, basic) { using T = TypeParam; - auto const keys = input{0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; + auto const keys = /* */ input{0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; auto const make_order_by = [&] { return input{5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1}; }; auto const order_by = make_order_by(); auto const order_by_struct = [&] { @@ -244,9 +246,12 @@ TYPED_TEST(typed_groupby_rank_scan_test, mixedStructs) std::vector requests; requests.emplace_back(groupby::scan_request()); requests[0].values = *struct_col; - requests[0].aggregations.push_back(make_dense_rank_aggregation()); - requests[0].aggregations.push_back(make_rank_aggregation()); - requests[0].aggregations.push_back(make_percent_rank_aggregation()); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::DENSE, {}, null_policy::INCLUDE)); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::MIN, {}, null_policy::INCLUDE)); + requests[0].aggregations.push_back(make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED)); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, 
sorted::YES); auto [result_keys, agg_results] = gb_obj.scan(requests); @@ -288,13 +293,19 @@ TYPED_TEST(typed_groupby_rank_scan_test, nestedStructs) requests.emplace_back(groupby::scan_request()); requests.emplace_back(groupby::scan_request()); requests[0].values = *nested_structs; - requests[0].aggregations.push_back(make_dense_rank_aggregation()); - requests[0].aggregations.push_back(make_rank_aggregation()); - requests[0].aggregations.push_back(make_percent_rank_aggregation()); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::DENSE)); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::MIN)); + requests[0].aggregations.push_back(make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED)); requests[1].values = *flat_struct; - requests[1].aggregations.push_back(make_dense_rank_aggregation()); - requests[1].aggregations.push_back(make_rank_aggregation()); - requests[1].aggregations.push_back(make_percent_rank_aggregation()); + requests[1].aggregations.push_back( + make_rank_aggregation(rank_method::DENSE)); + requests[1].aggregations.push_back( + make_rank_aggregation(rank_method::MIN)); + requests[1].aggregations.push_back(make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED)); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); auto [result_keys, agg_results] = gb_obj.scan(requests); @@ -339,13 +350,19 @@ TYPED_TEST(typed_groupby_rank_scan_test, structsWithNullPushdown) requests.emplace_back(groupby::scan_request()); requests.emplace_back(groupby::scan_request()); requests[0].values = *possibly_null_structs; - requests[0].aggregations.push_back(make_dense_rank_aggregation()); - requests[0].aggregations.push_back(make_rank_aggregation()); - requests[0].aggregations.push_back(make_percent_rank_aggregation()); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::DENSE, {}, null_policy::INCLUDE)); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::MIN, {}, null_policy::INCLUDE)); + requests[0].aggregations.push_back(make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED)); requests[1].values = *definitely_null_structs; - requests[1].aggregations.push_back(make_dense_rank_aggregation()); - requests[1].aggregations.push_back(make_rank_aggregation()); - requests[1].aggregations.push_back(make_percent_rank_aggregation()); + requests[1].aggregations.push_back( + make_rank_aggregation(rank_method::DENSE, {}, null_policy::INCLUDE)); + requests[1].aggregations.push_back( + make_rank_aggregation(rank_method::MIN, {}, null_policy::INCLUDE)); + requests[1].aggregations.push_back(make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED)); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); auto [result_keys, agg_results] = gb_obj.scan(requests); @@ -405,11 +422,11 @@ TYPED_TEST(list_groupby_rank_scan_test, lists) requests.emplace_back(groupby::aggregation_request()); requests.emplace_back(groupby::aggregation_request()); requests[0].values = list_col; - requests[0].aggregations.push_back(make_dense_rank_aggregation()); - requests[0].aggregations.push_back(make_rank_aggregation()); + requests[0].aggregations.push_back(make_rank_aggregation(rank_method::DENSE)); + requests[0].aggregations.push_back(make_rank_aggregation(rank_method::MIN)); requests[1].values = 
struct_col; - requests[1].aggregations.push_back(make_dense_rank_aggregation()); - requests[1].aggregations.push_back(make_rank_aggregation()); + requests[1].aggregations.push_back(make_rank_aggregation(rank_method::DENSE)); + requests[1].aggregations.push_back(make_rank_aggregation(rank_method::MIN)); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); auto result = gb_obj.scan(requests); @@ -484,7 +501,7 @@ TEST(groupby_rank_scan_test, strings) keys, order_by_structs_with_nulls, expected_dense, expected_rank, expected_percent); } -TEST_F(groupby_rank_scan_test_failures, test_exception_triggers) +TEST_F(groupby_rank_scan_test_failures, DISABLED_test_exception_triggers) { using T = uint32_t; @@ -496,57 +513,60 @@ TEST_F(groupby_rank_scan_test_failures, test_exception_triggers) col, keys, col, - make_dense_rank_aggregation(), + make_rank_aggregation(rank_method::DENSE), null_policy::INCLUDE, sorted::NO), - "Dense rank aggregate in groupby scan requires the keys to be presorted"); + "Rank aggregate in groupby scan requires the keys to be presorted"); - CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, - col, - keys, - col, - make_rank_aggregation(), - null_policy::INCLUDE, - sorted::NO), - "Rank aggregate in groupby scan requires the keys to be presorted"); + CUDF_EXPECT_THROW_MESSAGE( + test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(rank_method::MIN), + null_policy::INCLUDE, + sorted::NO), + "Rank aggregate in groupby scan requires the keys to be presorted"); + + CUDF_EXPECT_THROW_MESSAGE( + test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(rank_method::DENSE), + null_policy::EXCLUDE, + sorted::YES), + "Rank aggregate in groupby scan requires the keys to be presorted"); CUDF_EXPECT_THROW_MESSAGE( test_single_scan(keys, col, keys, col, - make_dense_rank_aggregation(), + make_rank_aggregation(rank_method::MIN), null_policy::EXCLUDE, sorted::YES), - "Dense rank aggregate in groupby scan requires the keys to be presorted"); + "Rank aggregate in groupby scan requires the keys to be presorted"); - CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, - col, - keys, - col, - make_rank_aggregation(), - null_policy::EXCLUDE, - sorted::YES), - "Rank aggregate in groupby scan requires the keys to be presorted"); + CUDF_EXPECT_THROW_MESSAGE( + test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(rank_method::DENSE), + null_policy::EXCLUDE, + sorted::NO), + "Rank aggregate in groupby scan requires the keys to be presorted"); CUDF_EXPECT_THROW_MESSAGE( test_single_scan(keys, col, keys, col, - make_dense_rank_aggregation(), + make_rank_aggregation(rank_method::MIN), null_policy::EXCLUDE, sorted::NO), - "Dense rank aggregate in groupby scan requires the keys to be presorted"); - - CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, - col, - keys, - col, - make_rank_aggregation(), - null_policy::EXCLUDE, - sorted::NO), - "Rank aggregate in groupby scan requires the keys to be presorted"); + "Rank aggregate in groupby scan requires the keys to be presorted"); } } // namespace test diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu index d69aee57756..f42549514e6 100644 --- a/cpp/tests/hash_map/map_test.cu +++ b/cpp/tests/hash_map/map_test.cu @@ -23,12 +23,12 @@ #include #include +#include #include #include #include -#include "rmm/exec_policy.hpp" #include #include #include diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index d1dc60119b6..4b481ade83f 100644 --- 
a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -148,7 +148,7 @@ std::pair, std::shared_ptr> get_table auto schema = std::make_shared(schema_vector); - return std::make_pair( + return std::pair( std::make_unique(std::move(columns)), arrow::Table::Make( schema, {int64array, string_array, dict_array, boolarray, list_array, struct_array})); diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp index dd00b201df9..a325cadf6a5 100644 --- a/cpp/tests/io/comp/decomp_test.cpp +++ b/cpp/tests/io/comp/decomp_test.cpp @@ -15,6 +15,7 @@ */ #include +#include #include @@ -24,6 +25,8 @@ #include +using cudf::device_span; + /** * @brief Base test fixture for decompression * @@ -32,19 +35,6 @@ */ template struct DecompressTest : public cudf::test::BaseFixture { - void SetUp() override - { - ASSERT_CUDA_SUCCEEDED(cudaMallocHost((void**)&inf_args, sizeof(cudf::io::gpu_inflate_input_s))); - ASSERT_CUDA_SUCCEEDED( - cudaMallocHost((void**)&inf_stat, sizeof(cudf::io::gpu_inflate_status_s))); - } - - void TearDown() override - { - ASSERT_CUDA_SUCCEEDED(cudaFreeHost(inf_stat)); - ASSERT_CUDA_SUCCEEDED(cudaFreeHost(inf_args)); - } - std::vector vector_from_string(const char* str) const { return std::vector(reinterpret_cast(str), @@ -55,49 +45,43 @@ struct DecompressTest : public cudf::test::BaseFixture { const uint8_t* compressed, size_t compressed_size) { - rmm::device_buffer src{compressed, compressed_size, rmm::cuda_stream_default}; - rmm::device_buffer dst{decompressed->size(), rmm::cuda_stream_default}; - - inf_args->srcDevice = static_cast(src.data()); - inf_args->dstDevice = static_cast(dst.data()); - inf_args->srcSize = src.size(); - inf_args->dstSize = dst.size(); - rmm::device_uvector d_inf_args(1, rmm::cuda_stream_default); - rmm::device_uvector d_inf_stat(1, rmm::cuda_stream_default); - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_inf_args.data(), - inf_args, - sizeof(cudf::io::gpu_inflate_input_s), - cudaMemcpyHostToDevice, - 0)); - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_inf_stat.data(), - inf_stat, - sizeof(cudf::io::gpu_inflate_status_s), - cudaMemcpyHostToDevice, - 0)); - ASSERT_CUDA_SUCCEEDED( - static_cast(this)->dispatch(d_inf_args.data(), d_inf_stat.data())); - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(inf_stat, - d_inf_stat.data(), - sizeof(cudf::io::gpu_inflate_status_s), - cudaMemcpyDeviceToHost, - 0)); - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( - decompressed->data(), inf_args->dstDevice, inf_args->dstSize, cudaMemcpyDeviceToHost, 0)); - ASSERT_CUDA_SUCCEEDED(cudaStreamSynchronize(0)); + auto stream = rmm::cuda_stream_default; + rmm::device_buffer src{compressed, compressed_size, stream}; + rmm::device_uvector dst{decompressed->size(), stream}; + + hostdevice_vector> inf_in(1, stream); + inf_in[0] = {static_cast(src.data()), src.size()}; + inf_in.host_to_device(stream); + + hostdevice_vector> inf_out(1, stream); + inf_out[0] = dst; + inf_out.host_to_device(stream); + + hostdevice_vector inf_stat(1, stream); + inf_stat[0] = {}; + inf_stat.host_to_device(stream); + + static_cast(this)->dispatch(inf_in, inf_out, inf_stat); + cudaMemcpyAsync( + decompressed->data(), dst.data(), dst.size(), cudaMemcpyDeviceToHost, stream.value()); + inf_stat.device_to_host(stream, true); + ASSERT_EQ(inf_stat[0].status, 0); } - - cudf::io::gpu_inflate_input_s* inf_args = nullptr; - cudf::io::gpu_inflate_status_s* inf_stat = nullptr; }; /** * @brief Derived fixture for GZIP decompression */ struct GzipDecompressTest : public DecompressTest { - cudaError_t 
dispatch(cudf::io::gpu_inflate_input_s* d_inf_args, - cudf::io::gpu_inflate_status_s* d_inf_stat) + void dispatch(device_span> d_inf_in, + device_span> d_inf_out, + device_span d_inf_stat) { - return cudf::io::gpuinflate(d_inf_args, d_inf_stat, 1, 1, rmm::cuda_stream_default); + cudf::io::gpuinflate(d_inf_in, + d_inf_out, + d_inf_stat, + cudf::io::gzip_header_included::YES, + rmm::cuda_stream_default); } }; @@ -105,10 +89,11 @@ struct GzipDecompressTest : public DecompressTest { * @brief Derived fixture for Snappy decompression */ struct SnappyDecompressTest : public DecompressTest { - cudaError_t dispatch(cudf::io::gpu_inflate_input_s* d_inf_args, - cudf::io::gpu_inflate_status_s* d_inf_stat) + void dispatch(device_span> d_inf_in, + device_span> d_inf_out, + device_span d_inf_stat) { - return cudf::io::gpu_unsnap(d_inf_args, d_inf_stat, 1, rmm::cuda_stream_default); + cudf::io::gpu_unsnap(d_inf_in, d_inf_out, d_inf_stat, rmm::cuda_stream_default); } }; @@ -116,14 +101,19 @@ struct SnappyDecompressTest : public DecompressTest { * @brief Derived fixture for Brotli decompression */ struct BrotliDecompressTest : public DecompressTest { - cudaError_t dispatch(cudf::io::gpu_inflate_input_s* d_inf_args, - cudf::io::gpu_inflate_status_s* d_inf_stat) + void dispatch(device_span> d_inf_in, + device_span> d_inf_out, + device_span d_inf_stat) { rmm::device_buffer d_scratch{cudf::io::get_gpu_debrotli_scratch_size(1), rmm::cuda_stream_default}; - return cudf::io::gpu_debrotli( - d_inf_args, d_inf_stat, d_scratch.data(), d_scratch.size(), 1, rmm::cuda_stream_default); + cudf::io::gpu_debrotli(d_inf_in, + d_inf_out, + d_inf_stat, + d_scratch.data(), + d_scratch.size(), + rmm::cuda_stream_default); } }; diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index cd0aab3caeb..3905df2b274 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -219,15 +219,21 @@ struct ParquetWriterTimestampTypeTest : public ParquetWriterTest { auto type() { return cudf::data_type{cudf::type_to_id()}; } }; +// Typed test fixture for all types +template +struct ParquetWriterSchemaTest : public ParquetWriterTest { + auto type() { return cudf::data_type{cudf::type_to_id()}; } +}; + // Declare typed test cases // TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352 using SupportedTypes = cudf::test::Types; TYPED_TEST_SUITE(ParquetWriterNumericTypeTest, SupportedTypes); -using SupportedChronoTypes = cudf::test::Concat; -TYPED_TEST_SUITE(ParquetWriterChronoTypeTest, SupportedChronoTypes); +TYPED_TEST_SUITE(ParquetWriterChronoTypeTest, cudf::test::ChronoTypes); using SupportedTimestampTypes = cudf::test::Types; TYPED_TEST_SUITE(ParquetWriterTimestampTypeTest, SupportedTimestampTypes); +TYPED_TEST_SUITE(ParquetWriterSchemaTest, cudf::test::AllTypes); // Base test fixture for chunked writer tests struct ParquetChunkedWriterTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/iterator/value_iterator_test_strings.cu b/cpp/tests/iterator/value_iterator_test_strings.cu index 5bddbfbd4aa..9aa18eb844f 100644 --- a/cpp/tests/iterator/value_iterator_test_strings.cu +++ b/cpp/tests/iterator/value_iterator_test_strings.cu @@ -12,10 +12,12 @@ * or implied. See the License for the specific language governing permissions and limitations under * the License. 
*/ -#include "cudf/detail/utilities/vector_factories.hpp" -#include "rmm/cuda_stream_view.hpp" -#include "rmm/device_uvector.hpp" -#include +#include "iterator_tests.cuh" + +#include + +#include +#include #include #include diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu index 73b355d496d..13852027bf0 100644 --- a/cpp/tests/join/conditional_join_tests.cu +++ b/cpp/tests/join/conditional_join_tests.cu @@ -93,7 +93,7 @@ std::pair, std::vector> gen_random_repeated_columns( std::mt19937 gen(rd()); std::shuffle(left.begin(), left.end(), gen); std::shuffle(right.begin(), right.end(), gen); - return std::make_pair(std::move(left), std::move(right)); + return std::pair(std::move(left), std::move(right)); } // Generate a single pair of left/right nullable columns of random data @@ -120,8 +120,8 @@ gen_random_nullable_repeated_columns(unsigned int N = 10000, unsigned int num_re return uniform_dist(gen) > 0.5; }); - return std::make_pair(std::make_pair(std::move(left), std::move(left_nulls)), - std::make_pair(std::move(right), std::move(right_nulls))); + return std::pair(std::pair(std::move(left), std::move(left_nulls)), + std::pair(std::move(right), std::move(right_nulls))); } } // namespace diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index f560ce7f20c..8ed50c8fb39 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -67,7 +67,7 @@ struct JoinTest : public cudf::test::BaseFixture { auto gold_sort_order = cudf::sorted_order(gold); auto sorted_gold = cudf::gather(gold, *gold_sort_order); - return std::make_pair(std::move(sorted_gold), std::move(sorted_result)); + return std::pair(std::move(sorted_gold), std::move(sorted_result)); } }; diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index df5b1f5c14a..edcf1d1be27 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -94,7 +94,7 @@ std::pair, std::vector> gen_random_repeated_columns( std::mt19937 gen(rd()); std::shuffle(left.begin(), left.end(), gen); std::shuffle(right.begin(), right.end(), gen); - return std::make_pair(std::move(left), std::move(right)); + return std::pair(std::move(left), std::move(right)); } // Generate a single pair of left/right nullable columns of random data @@ -121,8 +121,8 @@ gen_random_nullable_repeated_columns(unsigned int N = 10000, unsigned int num_re return uniform_dist(gen) > 0.5; }); - return std::make_pair(std::make_pair(std::move(left), std::move(left_nulls)), - std::make_pair(std::move(right), std::move(right_nulls))); + return std::pair(std::pair(std::move(left), std::move(left_nulls)), + std::pair(std::move(right), std::move(right_nulls))); } } // namespace diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index ea26cad3b59..129d1ad66f3 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -652,8 +652,8 @@ TYPED_TEST(MergeTest_, NMerge1KeyColumns) std::vector> facts{}; std::vector tables{}; for (int i = 0; i < num_tables; ++i) { - facts.emplace_back(std::make_pair(PairT0(sequence0, sequence0 + inputRows), - PairT1(sequence1, sequence1 + inputRows))); + facts.emplace_back(std::pair(PairT0(sequence0, sequence0 + inputRows), + PairT1(sequence1, sequence1 + inputRows))); tables.push_back(cudf::table_view{{facts.back().first, facts.back().second}}); } std::vector key_cols{0}; diff --git a/cpp/tests/partitioning/hash_partition_test.cpp b/cpp/tests/partitioning/hash_partition_test.cpp 
index befd9884b11..3ec6ae97595 100644 --- a/cpp/tests/partitioning/hash_partition_test.cpp +++ b/cpp/tests/partitioning/hash_partition_test.cpp @@ -67,9 +67,7 @@ TEST_F(HashPartition, ZeroPartitions) auto columns_to_hash = std::vector({2}); cudf::size_type const num_partitions = 0; - std::unique_ptr output; - std::vector offsets; - std::tie(output, offsets) = cudf::hash_partition(input, columns_to_hash, num_partitions); + auto [output, offsets] = cudf::hash_partition(input, columns_to_hash, num_partitions); // Expect empty table with same number of columns and zero partitions EXPECT_EQ(input.num_columns(), output->num_columns()); @@ -87,9 +85,7 @@ TEST_F(HashPartition, ZeroRows) auto columns_to_hash = std::vector({2}); cudf::size_type const num_partitions = 3; - std::unique_ptr output; - std::vector offsets; - std::tie(output, offsets) = cudf::hash_partition(input, columns_to_hash, num_partitions); + auto [output, offsets] = cudf::hash_partition(input, columns_to_hash, num_partitions); // Expect empty table with same number of columns and zero partitions EXPECT_EQ(input.num_columns(), output->num_columns()); @@ -104,9 +100,7 @@ TEST_F(HashPartition, ZeroColumns) auto columns_to_hash = std::vector({}); cudf::size_type const num_partitions = 3; - std::unique_ptr output; - std::vector offsets; - std::tie(output, offsets) = cudf::hash_partition(input, columns_to_hash, num_partitions); + auto [output, offsets] = cudf::hash_partition(input, columns_to_hash, num_partitions); // Expect empty table with same number of columns and zero partitions EXPECT_EQ(input.num_columns(), output->num_columns()); @@ -124,10 +118,8 @@ TEST_F(HashPartition, MixedColumnTypes) auto columns_to_hash = std::vector({0, 2}); cudf::size_type const num_partitions = 3; - std::unique_ptr output1, output2; - std::vector offsets1, offsets2; - std::tie(output1, offsets1) = cudf::hash_partition(input, columns_to_hash, num_partitions); - std::tie(output2, offsets2) = cudf::hash_partition(input, columns_to_hash, num_partitions); + auto [output1, offsets1] = cudf::hash_partition(input, columns_to_hash, num_partitions); + auto [output2, offsets2] = cudf::hash_partition(input, columns_to_hash, num_partitions); // Expect output to have size num_partitions EXPECT_EQ(static_cast(num_partitions), offsets1.size()); @@ -148,9 +140,7 @@ TEST_F(HashPartition, NullableStrings) std::vector const columns_to_hash({0}); cudf::size_type const num_partitions = 3; - std::unique_ptr result; - std::vector offsets; - std::tie(result, offsets) = cudf::hash_partition(input, columns_to_hash, num_partitions); + auto [result, offsets] = cudf::hash_partition(input, columns_to_hash, num_partitions); auto const& col = result->get_column(0); EXPECT_EQ(0, col.null_count()); @@ -167,11 +157,9 @@ TEST_F(HashPartition, ColumnsToHash) auto columns_to_hash = std::vector({0}); cudf::size_type const num_partitions = 3; - std::unique_ptr first_result, second_result; - std::vector first_offsets, second_offsets; - std::tie(first_result, first_offsets) = + auto [first_result, first_offsets] = cudf::hash_partition(first_input, columns_to_hash, num_partitions); - std::tie(second_result, second_offsets) = + auto [second_result, second_offsets] = cudf::hash_partition(second_input, columns_to_hash, num_partitions); // Expect offsets to be equal and num_partitions in length @@ -228,11 +216,9 @@ TEST_F(HashPartition, CustomSeedValue) auto columns_to_hash = std::vector({0, 2}); cudf::size_type const num_partitions = 3; - std::unique_ptr output1, output2; - std::vector offsets1, 
offsets2; - std::tie(output1, offsets1) = cudf::hash_partition( + auto [output1, offsets1] = cudf::hash_partition( input, columns_to_hash, num_partitions, cudf::hash_id::HASH_MURMUR3, 12345); - std::tie(output2, offsets2) = cudf::hash_partition( + auto [output2, offsets2] = cudf::hash_partition( input, columns_to_hash, num_partitions, cudf::hash_id::HASH_MURMUR3, 12345); // Expect output to have size num_partitions @@ -260,9 +246,7 @@ TYPED_TEST(HashPartitionFixedWidth, NullableFixedWidth) std::vector const columns_to_hash({0}); cudf::size_type const num_partitions = 3; - std::unique_ptr result; - std::vector offsets; - std::tie(result, offsets) = cudf::hash_partition(input, columns_to_hash, num_partitions); + auto [result, offsets] = cudf::hash_partition(input, columns_to_hash, num_partitions); auto const& col = result->get_column(0); EXPECT_EQ(0, col.null_count()); @@ -294,10 +278,8 @@ void run_fixed_width_test(size_t cols, auto columns_to_hash = std::vector(cols); std::iota(columns_to_hash.begin(), columns_to_hash.end(), 0); - std::unique_ptr output1, output2; - std::vector offsets1, offsets2; - std::tie(output1, offsets1) = cudf::hash_partition(input, columns_to_hash, num_partitions); - std::tie(output2, offsets2) = cudf::hash_partition(input, columns_to_hash, num_partitions); + auto [output1, offsets1] = cudf::hash_partition(input, columns_to_hash, num_partitions); + auto [output2, offsets2] = cudf::hash_partition(input, columns_to_hash, num_partitions); // Expect output to have size num_partitions EXPECT_EQ(static_cast(num_partitions), offsets1.size()); @@ -367,9 +349,7 @@ TEST_F(HashPartition, FixedPointColumnsToHash) auto columns_to_hash = std::vector({0}); cudf::size_type const num_partitions = 1; - std::unique_ptr first_result; - std::vector first_offsets; - std::tie(first_result, first_offsets) = + auto [first_result, first_offsets] = cudf::hash_partition(first_input, columns_to_hash, num_partitions); // Expect offsets to be equal and num_partitions in length diff --git a/cpp/tests/partitioning/partition_test.cpp b/cpp/tests/partitioning/partition_test.cpp index 785af409c4c..014a19e93a9 100644 --- a/cpp/tests/partitioning/partition_test.cpp +++ b/cpp/tests/partitioning/partition_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,16 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include -#include -#include #include #include #include #include #include -#include "cudf/sorting.hpp" +#include +#include +#include +#include template class PartitionTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/reductions/list_rank_test.cpp b/cpp/tests/reductions/list_rank_test.cpp index d263677f23b..b3a8e7e0c28 100644 --- a/cpp/tests/reductions/list_rank_test.cpp +++ b/cpp/tests/reductions/list_rank_test.cpp @@ -42,10 +42,11 @@ TEST_F(ListRankScanTest, BasicList) auto const expected_dense_vals = cudf::test::fixed_width_column_wrapper{1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 9}; - this->test_ungrouped_rank_scan(col, - expected_dense_vals, - cudf::make_dense_rank_aggregation(), - cudf::null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + col, + expected_dense_vals, + cudf::make_rank_aggregation(cudf::rank_method::DENSE), + cudf::null_policy::INCLUDE); } TEST_F(ListRankScanTest, DeepList) @@ -73,20 +74,22 @@ TEST_F(ListRankScanTest, DeepList) { // Non-sliced auto const expected_dense_vals = cudf::test::fixed_width_column_wrapper{ 1, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 7, 8, 9, 10, 11}; - this->test_ungrouped_rank_scan(col, - expected_dense_vals, - cudf::make_dense_rank_aggregation(), - cudf::null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + col, + expected_dense_vals, + cudf::make_rank_aggregation(cudf::rank_method::DENSE), + cudf::null_policy::INCLUDE); } { // sliced auto sliced_col = cudf::slice(col, {3, 12})[0]; auto const expected_dense_vals = cudf::test::fixed_width_column_wrapper{1, 2, 3, 3, 3, 4, 4, 5, 5}; - this->test_ungrouped_rank_scan(sliced_col, - expected_dense_vals, - cudf::make_dense_rank_aggregation(), - cudf::null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + sliced_col, + expected_dense_vals, + cudf::make_rank_aggregation(cudf::rank_method::DENSE), + cudf::null_policy::INCLUDE); } } @@ -138,10 +141,11 @@ TEST_F(ListRankScanTest, ListOfStruct) auto expect = cudf::test::fixed_width_column_wrapper{ 1, 1, 2, 2, 3, 4, 4, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10}; - this->test_ungrouped_rank_scan(list_column, - expect, - cudf::make_dense_rank_aggregation(), - cudf::null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + list_column, + expect, + cudf::make_rank_aggregation(cudf::rank_method::DENSE), + cudf::null_policy::INCLUDE); } { // Sliced @@ -149,10 +153,11 @@ TEST_F(ListRankScanTest, ListOfStruct) auto expect = cudf::test::fixed_width_column_wrapper{1, 2, 3, 3, 3, 4, 5, 6, 7, 7, 8, 8}; - this->test_ungrouped_rank_scan(sliced_col, - expect, - cudf::make_dense_rank_aggregation(), - cudf::null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + sliced_col, + expect, + cudf::make_rank_aggregation(cudf::rank_method::DENSE), + cudf::null_policy::INCLUDE); } } @@ -192,10 +197,11 @@ TEST_F(ListRankScanTest, ListOfEmptyStruct) auto expect = cudf::test::fixed_width_column_wrapper{1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6}; - this->test_ungrouped_rank_scan(*list_column, - expect, - cudf::make_dense_rank_aggregation(), - cudf::null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + *list_column, + expect, + cudf::make_rank_aggregation(cudf::rank_method::DENSE), + cudf::null_policy::INCLUDE); } TEST_F(ListRankScanTest, EmptyDeepList) @@ -221,8 +227,9 @@ TEST_F(ListRankScanTest, EmptyDeepList) auto expect = cudf::test::fixed_width_column_wrapper{1, 1, 2, 2}; - this->test_ungrouped_rank_scan(*list_column, - expect, - cudf::make_dense_rank_aggregation(), - cudf::null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + *list_column, + expect, + 
cudf::make_rank_aggregation(cudf::rank_method::DENSE), + cudf::null_policy::INCLUDE); } diff --git a/cpp/tests/reductions/rank_tests.cpp b/cpp/tests/reductions/rank_tests.cpp index fb2cd17fe30..3bf2899ce2f 100644 --- a/cpp/tests/reductions/rank_tests.cpp +++ b/cpp/tests/reductions/rank_tests.cpp @@ -36,15 +36,14 @@ namespace cudf::test { using namespace iterators; template -using input = fixed_width_column_wrapper; -using rank_result_col = fixed_width_column_wrapper; -using percent_result_t = - cudf::detail::target_type_t; -using percent_result_col = fixed_width_column_wrapper; +using input = fixed_width_column_wrapper; +using rank_result_col = fixed_width_column_wrapper; +using percent_result_col = fixed_width_column_wrapper; -auto const rank = cudf::make_rank_aggregation(); -auto const dense_rank = cudf::make_dense_rank_aggregation(); -auto const percent_rank = cudf::make_percent_rank_aggregation(); +auto const rank = cudf::make_rank_aggregation(cudf::rank_method::MIN); +auto const dense_rank = cudf::make_rank_aggregation(cudf::rank_method::DENSE); +auto const percent_rank = cudf::make_rank_aggregation( + cudf::rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED); auto constexpr INCLUSIVE_SCAN = cudf::scan_type::INCLUSIVE; auto constexpr INCLUDE_NULLS = cudf::null_policy::INCLUDE; @@ -56,6 +55,8 @@ struct TypedRankScanTest : BaseScanTest { std::unique_ptr const& agg) { auto col_out = cudf::scan(input, agg, INCLUSIVE_SCAN, INCLUDE_NULLS); + std::cout << "expect type: " << static_cast(expect_vals.type().id()) << std::endl; + std::cout << "out type: " << static_cast(col_out->type().id()) << std::endl; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expect_vals, col_out->view()); } }; @@ -318,11 +319,11 @@ TEST(RankScanTest, ExclusiveScan) auto const vals = input{3, 4, 5}; CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, dense_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), - "Dense rank aggregation operator requires an inclusive scan"); + "Rank aggregation operator requires an inclusive scan"); CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), "Rank aggregation operator requires an inclusive scan"); CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, percent_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), - "Percent rank aggregation operator requires an inclusive scan"); + "Rank aggregation operator requires an inclusive scan"); } } // namespace cudf::test diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index d533a91f4d0..68b4d85db2a 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -84,6 +84,7 @@ struct ScanTest : public BaseScanTest { case aggregation::PRODUCT: return std::is_invocable_v; case aggregation::MIN: return std::is_invocable_v; case aggregation::MAX: return std::is_invocable_v; + case aggregation::RANK: return std::is_invocable_v; // comparable default: return false; } return false; diff --git a/cpp/tests/rolling/grouped_rolling_test.cpp b/cpp/tests/rolling/grouped_rolling_test.cpp index f484661eee8..3a69c13c889 100644 --- a/cpp/tests/rolling/grouped_rolling_test.cpp +++ b/cpp/tests/rolling/grouped_rolling_test.cpp @@ -340,10 +340,8 @@ class GroupedRollingTest : public cudf::test::BaseFixture { thrust::host_vector ref_valid(num_rows); // input data and mask - thrust::host_vector in_col; - std::vector in_valid; - std::tie(in_col, in_valid) = cudf::test::to_host(input); - bitmask_type* valid_mask = in_valid.data(); + auto [in_col, in_valid] = 
cudf::test::to_host(input); + bitmask_type* valid_mask = in_valid.data(); agg_op op; for (size_type i = 0; i < num_rows; i++) { @@ -973,10 +971,8 @@ class GroupedTimeRangeRollingTest : public cudf::test::BaseFixture { thrust::host_vector ref_valid(num_rows); // input data and mask - thrust::host_vector in_col; - std::vector in_valid; - std::tie(in_col, in_valid) = cudf::test::to_host(input); - bitmask_type* valid_mask = in_valid.data(); + auto [in_col, in_valid] = cudf::test::to_host(input); + bitmask_type* valid_mask = in_valid.data(); agg_op op; for (size_type i = 0; i < num_rows; i++) { diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index c54fe073e3a..9549569d9f6 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -552,10 +552,8 @@ class RollingTest : public cudf::test::BaseFixture { thrust::host_vector ref_valid(num_rows); // input data and mask - thrust::host_vector in_col; - std::vector in_valid; - std::tie(in_col, in_valid) = cudf::test::to_host(input); - bitmask_type* valid_mask = in_valid.data(); + auto [in_col, in_valid] = cudf::test::to_host(input); + bitmask_type* valid_mask = in_valid.data(); agg_op op; for (size_type i = 0; i < num_rows; i++) { diff --git a/cpp/tests/search/search_struct_test.cpp b/cpp/tests/search/search_struct_test.cpp index a1f0b1d81cf..159b082890a 100644 --- a/cpp/tests/search/search_struct_test.cpp +++ b/cpp/tests/search/search_struct_test.cpp @@ -57,7 +57,7 @@ auto search_bounds(cudf::column_view const& t_col_view, auto const values = cudf::table_view{std::vector{values_col->view()}}; auto result_lower_bound = cudf::lower_bound(t, values, column_orders, null_precedence); auto result_upper_bound = cudf::upper_bound(t, values, column_orders, null_precedence); - return std::make_pair(std::move(result_lower_bound), std::move(result_upper_bound)); + return std::pair(std::move(result_lower_bound), std::move(result_upper_bound)); } auto search_bounds(std::unique_ptr const& t_col, diff --git a/cpp/tests/sort/rank_test.cpp b/cpp/tests/sort/rank_test.cpp index c4d0b6b04f4..28c9b40de11 100644 --- a/cpp/tests/sort/rank_test.cpp +++ b/cpp/tests/sort/rank_test.cpp @@ -91,8 +91,7 @@ struct Rank : public BaseFixture { test_case_t{table_view{{col1, col2, col3}}, table_view{{col1_rank, col2_rank, col3_rank}}}, }) { - table_view input, output; - std::tie(input, output) = test_case; + auto [input, output] = test_case; run_rank_test(input, output, diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index 0529539c4b2..31bbd43c78d 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -71,7 +71,7 @@ TYPED_TEST(TypedDistinctCount, TableNoNull) std::vector> pair_input; std::transform( input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { - return std::make_pair(a, b); + return std::pair(a, b); }); cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); diff --git a/cpp/tests/stream_compaction/unique_count_tests.cpp b/cpp/tests/stream_compaction/unique_count_tests.cpp index 3285cd1a711..591fe042592 100644 --- a/cpp/tests/stream_compaction/unique_count_tests.cpp +++ b/cpp/tests/stream_compaction/unique_count_tests.cpp @@ -71,7 +71,7 @@ TYPED_TEST(TypedUniqueCount, TableNoNull) std::vector> pair_input; std::transform( input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, 
T b) { - return std::make_pair(a, b); + return std::pair(a, b); }); cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index bec06f7e601..360ea8be178 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ b/cpp/tests/strings/floats_tests.cpp @@ -125,11 +125,15 @@ TEST_F(StringsConvertTest, FromFloats32) TEST_F(StringsConvertTest, ToFloats64) { + // clang-format off std::vector h_strings{ "1234", nullptr, "-876", "543.2", "-0.12", ".25", "-.002", "", "-0.0", "1.28e256", "NaN", "abc123", "123abc", "456e", "-1.78e+5", "-122.33644782", "12e+309", "1.7976931348623159E308", - "-Inf", "-INFINITY"}; + "-Inf", "-INFINITY", "1.0", "1.7976931348623157e+308", "1.7976931348623157e-307", + // subnormal numbers: v--- smallest double v--- result is 0 + "4e-308", "3.3333333333e-320", "4.940656458412465441765688e-324", "1.e-324" }; + // clang-format on cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), diff --git a/cpp/tests/strings/translate_tests.cpp b/cpp/tests/strings/translate_tests.cpp index e928065dca4..53c6982b880 100644 --- a/cpp/tests/strings/translate_tests.cpp +++ b/cpp/tests/strings/translate_tests.cpp @@ -38,7 +38,7 @@ std::pair make_entry(const char* from, const c cudf::char_utf8 out = 0; cudf::strings::detail::to_char_utf8(from, in); if (to) cudf::strings::detail::to_char_utf8(to, out); - return std::make_pair(in, out); + return std::pair(in, out); } TEST_F(StringsTranslateTest, Translate) diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 8ed50b6eae0..9c3326cf575 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -123,9 +123,7 @@ TYPED_TEST(RowBitCountTyped, Lists) { using T = TypeParam; - std::unique_ptr col; - std::unique_ptr expected_sizes; - std::tie(col, expected_sizes) = build_list_column(); + auto [col, expected_sizes] = build_list_column(); table_view t({*col}); auto result = cudf::row_bit_count(t); @@ -326,9 +324,7 @@ TEST_F(RowBitCount, StructsNoNulls) TEST_F(RowBitCount, StructsNulls) { - std::unique_ptr struct_col; - std::unique_ptr expected_sizes; - std::tie(struct_col, expected_sizes) = build_struct_column(); + auto [struct_col, expected_sizes] = build_struct_column(); table_view t({*struct_col}); auto result = cudf::row_bit_count(t); @@ -440,9 +436,7 @@ TEST_F(RowBitCount, NestedTypes) { // List, float, List, int16> { - std::unique_ptr col_no_nulls; - std::unique_ptr expected_sizes; - std::tie(col_no_nulls, expected_sizes) = + auto [col_no_nulls, expected_sizes] = build_nested_and_expected_column({1, 1, 1, 1, 1, 1, 1, 1}); table_view no_nulls_t({*col_no_nulls}); auto no_nulls_result = cudf::row_bit_count(no_nulls_t); @@ -600,19 +594,13 @@ struct sum_functor { TEST_F(RowBitCount, Table) { // complex nested column - std::unique_ptr col0; - std::unique_ptr col0_sizes; - std::tie(col0, col0_sizes) = build_nested_and_expected_column({1, 1, 1, 1, 1, 1, 1, 1}); + auto [col0, col0_sizes] = build_nested_and_expected_column({1, 1, 1, 1, 1, 1, 1, 1}); // struct column - std::unique_ptr col1; - std::unique_ptr col1_sizes; - std::tie(col1, col1_sizes) = build_struct_column(); + auto [col1, col1_sizes] = build_struct_column(); // list column - std::unique_ptr col2; - std::unique_ptr col2_sizes; - std::tie(col2, col2_sizes) = build_list_column(); + auto [col2, col2_sizes] = build_list_column(); table_view t({*col0, *col1, *col2}); auto result = 
cudf::row_bit_count(t); diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp index f53498bccec..16fb02e06bc 100644 --- a/cpp/tests/unary/cast_tests.cpp +++ b/cpp/tests/unary/cast_tests.cpp @@ -174,9 +174,7 @@ void validate_cast_result(cudf::column_view expected, cudf::column_view actual) { using namespace cudf::test; // round-trip through the host because sizeof(T) may not equal sizeof(R) - thrust::host_vector h_data; - std::vector null_mask; - std::tie(h_data, null_mask) = to_host(expected); + auto [h_data, null_mask] = to_host(expected); if (null_mask.empty()) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(make_column(h_data), actual); } else { diff --git a/docs/cudf/source/_static/params.css b/docs/cudf/source/_static/params.css index 2bdd6f5a299..9e6be7ca75f 100644 --- a/docs/cudf/source/_static/params.css +++ b/docs/cudf/source/_static/params.css @@ -52,4 +52,9 @@ table.io-supported-types-table thead{ .special-table td, .special-table th { border: 1px solid #dee2e6; -} \ No newline at end of file +} + +/* Needed to resolve https://github.com/executablebooks/jupyter-book/issues/1611 */ +.output.text_html { + overflow: auto; +} diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst index 1aa1ea8beac..e0ef3cb2ff0 100644 --- a/docs/cudf/source/api_docs/dataframe.rst +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -105,6 +105,7 @@ Function application, GroupBy & window :toctree: api/ DataFrame.apply + DataFrame.applymap DataFrame.apply_chunks DataFrame.apply_rows DataFrame.pipe @@ -148,6 +149,7 @@ Computations / descriptive stats DataFrame.round DataFrame.skew DataFrame.sum + DataFrame.sum_of_squares DataFrame.std DataFrame.var DataFrame.nunique @@ -247,9 +249,11 @@ Serialization / IO / conversion DataFrame.to_dlpack DataFrame.to_parquet DataFrame.to_csv + DataFrame.to_cupy DataFrame.to_hdf DataFrame.to_dict DataFrame.to_json + DataFrame.to_numpy DataFrame.to_pandas DataFrame.to_feather DataFrame.to_records diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 6f5affd0ecd..8e0e3bbd411 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -92,7 +92,9 @@ Conversion Index.astype Index.to_arrow + Index.to_cupy Index.to_list + Index.to_numpy Index.to_series Index.to_frame Index.to_pandas diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 95aa71919e4..d7015c9348d 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -390,10 +390,12 @@ Serialization / IO / conversion :toctree: api/ Series.to_arrow + Series.to_cupy Series.to_dlpack Series.to_frame Series.to_hdf Series.to_json + Series.to_numpy Series.to_pandas Series.to_string Series.from_arrow diff --git a/docs/cudf/source/api_docs/string_handling.rst b/docs/cudf/source/api_docs/string_handling.rst index 3087bcaa826..8d4646c47a7 100644 --- a/docs/cudf/source/api_docs/string_handling.rst +++ b/docs/cudf/source/api_docs/string_handling.rst @@ -83,7 +83,6 @@ strings and apply several methods to it. These can be accessed like rsplit startswith strip - subword_tokenize swapcase title token_count diff --git a/docs/cudf/source/basics/basics.rst b/docs/cudf/source/basics/basics.rst index 60a65558033..9b8983fba49 100644 --- a/docs/cudf/source/basics/basics.rst +++ b/docs/cudf/source/basics/basics.rst @@ -15,36 +15,40 @@ The following table lists all of cudf types. For methods requiring dtype argumen .. 
rst-class:: special-table .. table:: - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Kind of Data | Data Type | Scalar | String Aliases | - +========================+==================+=====================================================================================+=============================================+ - | Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, np.uint16_, | ``'int8'``, ``'int16'``, ``'int32'``, | - | | | np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | - | | | | ``'uint32'``, ``'uint64'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Float | | np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Strings | | `str `_ | ``'string'``, ``'object'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | - | | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``,| - | (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'``| - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Categorical | CategoricalDtype | (none) | ``'category'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Boolean | | np.bool_ | ``'bool'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Decimal | Decimal32Dtype, | (none) | (none) | - | | Decimal64Dtype, | | | - | | Decimal128Dtype | | | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Kind of Data | Data Type | Scalar | String Aliases | + +=================+==================+==============================================================+==============================================+ + | Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, | ``'int8'``, ``'int16'``, ``'int32'``, | + | | | np.uint16_, np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | + | | | | ``'uint32'``, ``'uint64'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Float | | np.float32_, np.float64_ | 
``'float32'``, ``'float64'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Strings | | `str `_ | ``'string'``, ``'object'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | + | | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``, | + | (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Categorical | CategoricalDtype | (none) | ``'category'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Boolean | | np.bool_ | ``'bool'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Decimal | Decimal32Dtype, | (none) | (none) | + | | Decimal64Dtype, | | | + | | Decimal128Dtype | | | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Lists | ListDtype | list | ``'list'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Structs | StructDtype | dict | ``'struct'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ **Note: All dtypes above are Nullable** -.. _np.int8: -.. _np.int16: +.. _np.int8: +.. _np.int16: .. _np.int32: .. _np.int64: .. _np.uint8: diff --git a/docs/cudf/source/basics/internals.rst b/docs/cudf/source/basics/internals.rst index 60b63c6fab8..96ef40d51e6 100644 --- a/docs/cudf/source/basics/internals.rst +++ b/docs/cudf/source/basics/internals.rst @@ -54,7 +54,7 @@ As another example, the ``StringColumn`` backing the Series 2. No mask buffer as there are no nulls in the Series 3. Two children columns: - - A column of 8-bit characters + - A column of UTF-8 characters ``['d', 'o', 'y', 'o', 'u', h' ... 
'?']`` - A column of "offsets" to the characters column (in this case, ``[0, 2, 5, 9, 12, 19]``) @@ -172,7 +172,7 @@ Selecting columns by index: >>> ca.select_by_index(1) ColumnAccessor(OrderedColumnDict([('y', )]), multiindex=False, level_names=(None,)) >>> ca.select_by_index([0, 1]) - ColumnAccessor(OrderedColumnDict([('x', ), ('y', )]), multiindex=False, level_names=(None,)) + ColumnAccessor(OrderedColumnDict([('x', ), ('y', )]), multiindex=False, level_names=(None,)) >>> ca.select_by_index(slice(1, 3)) ColumnAccessor(OrderedColumnDict([('y', ), ('z', )]), multiindex=False, level_names=(None,)) diff --git a/docs/cudf/source/basics/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst index 71c114e9149..ce774453386 100644 --- a/docs/cudf/source/basics/io-gds-integration.rst +++ b/docs/cudf/source/basics/io-gds-integration.rst @@ -1,39 +1,42 @@ GPUDirect Storage Integration ============================= -Many IO APIs can use GPUDirect Storage (GDS) library to optimize IO operations. -GDS enables a direct data path for direct memory access (DMA) transfers between GPU memory and storage, which avoids a bounce buffer through the CPU. -GDS also has a compatibility mode that allows the library to fall back to copying through a CPU bounce buffer. +Many IO APIs can use GPUDirect Storage (GDS) library to optimize IO operations. +GDS enables a direct data path for direct memory access (DMA) transfers between GPU memory and storage, which avoids a bounce buffer through the CPU. +GDS also has a compatibility mode that allows the library to fall back to copying through a CPU bounce buffer. The SDK is available for download `here `_. GDS is also included in CUDA Toolkit 11.4 and higher. -Use of GPUDirect Storage in cuDF is enabled by default, but can be disabled through the environment variable ``LIBCUDF_CUFILE_POLICY``. -This variable also controls the GDS compatibility mode. +Use of GPUDirect Storage in cuDF is enabled by default, but can be disabled through the environment variable ``LIBCUDF_CUFILE_POLICY``. +This variable also controls the GDS compatibility mode. -There are three valid values for the environment variable: +There are four valid values for the environment variable: - "GDS": Enable GDS use; GDS compatibility mode is *off*. - "ALWAYS": Enable GDS use; GDS compatibility mode is *on*. +- "KVIKIO": Enable GDS through `KvikIO `_. - "OFF": Completely disable GDS use. If no value is set, behavior will be the same as the "GDS" option. This environment variable also affects how cuDF treats GDS errors. When ``LIBCUDF_CUFILE_POLICY`` is set to "GDS" and a GDS API call fails for any reason, cuDF falls back to the internal implementation with bounce buffers. -When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on), -cuDF throws an exception to propagate the error to te user. +When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on), +cuDF throws an exception to propagate the error to the user. +When ``LIBCUDF_CUFILE_POLICY`` is set to "KVIKIO" and a KvikIO API call fails for any reason (unlikely, given that KvikIO implements its own compatibility mode) cuDF throws an exception to propagate the error to the user. 
+For more information about error handling, compatibility mode, and tuning parameters in KvikIO, see: https://github.com/rapidsai/kvikio

 Operations that support the use of GPUDirect Storage:
-- `read_avro`
-- `read_parquet`
-- `read_orc`
-- `to_csv`
-- `to_parquet`
-- `to_orc`
+- :py:func:`cudf.read_avro`
+- :py:func:`cudf.read_parquet`
+- :py:func:`cudf.read_orc`
+- :py:meth:`cudf.DataFrame.to_csv`
+- :py:meth:`cudf.DataFrame.to_parquet`
+- :py:meth:`cudf.DataFrame.to_orc`

 Several parameters that can be used to tune the performance of GDS-enabled I/O are exposed through environment variables:
 - ``LIBCUDF_CUFILE_THREAD_COUNT``: Integral value, maximum number of parallel reads/writes per file (default 16);
 - ``LIBCUDF_CUFILE_SLICE_SIZE``: Integral value, maximum size of each GDS read/write, in bytes (default 4MB).
-  Larger I/O operations are split into multiple calls.
\ No newline at end of file
+  Larger I/O operations are split into multiple calls.
diff --git a/docs/cudf/source/basics/io-nvcomp-integration.rst b/docs/cudf/source/basics/io-nvcomp-integration.rst
index 521833e2afd..fc24e0c15f4 100644
--- a/docs/cudf/source/basics/io-nvcomp-integration.rst
+++ b/docs/cudf/source/basics/io-nvcomp-integration.rst
@@ -1,14 +1,14 @@
 nvCOMP Integration
 =============================
-Some types of compression/decompression can be performed using either `nvCOMP library `_ or the internal implementation.
+Some types of compression/decompression can be performed using either the `nvCOMP library `_ or the internal implementation.
 Which implementation is used by default depends on the data format and the compression type.
 Behavior can be influenced through environment variable ``LIBCUDF_NVCOMP_POLICY``.

 There are three valid values for the environment variable:
-- "STABLE": Only enable the nvCOMP in places where it has been deemed stable for production use.
+- "STABLE": Only enable nvCOMP in places where it has been deemed stable for production use.
- "ALWAYS": Enable all available uses of nvCOMP, including new, experimental combinations.
- "OFF": Disable nvCOMP use whenever possible and use the internal implementations instead.
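Both policies are plain environment variables read by libcudf at startup, so they apply to any frontend that wraps it, including the Java bindings changed later in this diff. Below is a minimal sketch of exercising them from Java; it assumes the existing `ai.rapids.cudf.Table.readParquet` entry point (not part of this change) and a placeholder input file at `/tmp/example.parquet`, and the variable values shown are illustrative rather than defaults.

```java
// Sketch: the variables must be set in the environment before the process launches, e.g.
//   LIBCUDF_CUFILE_POLICY=ALWAYS LIBCUDF_NVCOMP_POLICY=STABLE \
//   LIBCUDF_CUFILE_THREAD_COUNT=32 java -cp ... GdsPolicyDemo
import java.io.File;
import ai.rapids.cudf.Table;

public class GdsPolicyDemo {
  public static void main(String[] args) {
    // null means the variable is unset and the documented default applies
    // ("GDS" for the cuFile policy; format-dependent behavior for nvCOMP).
    System.out.println("LIBCUDF_CUFILE_POLICY=" + System.getenv("LIBCUDF_CUFILE_POLICY"));
    System.out.println("LIBCUDF_NVCOMP_POLICY=" + System.getenv("LIBCUDF_NVCOMP_POLICY"));
    // read_parquet is one of the GDS-enabled operations listed above.
    try (Table t = Table.readParquet(new File("/tmp/example.parquet"))) {
      System.out.println("rows read: " + t.getRowCount());
    }
  }
}
```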
diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index dbdf8e59e6a..c8b30120924 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -46,10 +46,13 @@ "numpydoc", "IPython.sphinxext.ipython_console_highlighting", "IPython.sphinxext.ipython_directive", - "nbsphinx", "PandasCompat", + "myst_nb", ] +jupyter_execute_notebooks = "force" +execution_timeout = 300 + copybutton_prompt_text = ">>> " autosummary_generate = True ipython_mplbackend = "str" @@ -252,6 +255,7 @@ def process_class_docstrings(app, what, name, obj, options, lines): lines[:] = lines[:cut_index] +nitpick_ignore = [("py:class", "SeriesOrIndex"),] def setup(app): diff --git a/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb b/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb index 1bcb9335256..35ca21f380e 100644 --- a/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb +++ b/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb @@ -1067,7 +1067,7 @@ " if sparseformat == 'row':\n", " _sparse_constructor = cp.sparse.csr_matrix\n", "\n", - " return _sparse_constructor(cp.from_dlpack(data.to_dlpack()))" + " return _sparse_constructor(cupy_from_dlpack(data.to_dlpack()))" ] }, { diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index ab006847fc6..9bb95406e8a 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -2484,6 +2484,14 @@ "execution_count": 35, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/mmccarty/miniconda3/envs/cudf_dev/lib/python3.8/site-packages/cudf/core/series.py:2223: FutureWarning: Series.applymap is deprecated and will be removed in a future cuDF release. Use Series.apply instead.\n", + " warnings.warn(\n" + ] + }, { "data": { "text/plain": [ @@ -3024,7 +3032,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/ashwin/workspace/rapids/cudf/python/cudf/cudf/core/indexed_frame.py:2271: FutureWarning: append is deprecated and will be removed in a future version. Use concat instead.\n", + "/home/mmccarty/miniconda3/envs/cudf_dev/lib/python3.8/site-packages/cudf/core/indexed_frame.py:2329: FutureWarning: append is deprecated and will be removed in a future version. Use concat instead.\n", " warnings.warn(\n" ] }, @@ -5850,7 +5858,32 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/home/mmccarty/sandbox/rapids/cudf/python/cudf/cudf/tests/data/orc/TestOrcFile.test1.orc'" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "from pathlib import Path\n", + "current_dir = os.path.dirname(os.path.realpath(\"__file__\"))\n", + "cudf_root = Path(current_dir).parents[3]\n", + "file_path = os.path.join(cudf_root, \"python\", \"cudf\", \"cudf\", \"tests\", \"data\", \"orc\", \"TestOrcFile.test1.orc\")\n", + "file_path" + ] + }, + { + "cell_type": "code", + "execution_count": 81, "metadata": {}, "outputs": [ { @@ -5941,13 +5974,13 @@ "1 [{'key': 'chani', 'value': {'int1': 5, 'string... 
" ] }, - "execution_count": 79, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df2 = cudf.read_orc('/rapids/cudf/python/cudf/cudf/tests/data/orc/TestOrcFile.test1.orc')\n", + "df2 = cudf.read_orc(file_path)\n", "df2" ] }, @@ -5974,15 +6007,17 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2022-03-29 12:21:32,328 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n", - "2022-03-29 12:21:32,394 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n" + "2022-04-21 10:11:07,360 - distributed.diskutils - INFO - Found stale lock file and directory '/home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-ghcx5g0e', purging\n", + "2022-04-21 10:11:07,360 - distributed.diskutils - INFO - Found stale lock file and directory '/home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-wh16f0h3', purging\n", + "2022-04-21 10:11:07,360 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n", + "2022-04-21 10:11:07,388 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n" ] }, { @@ -5992,7 +6027,7 @@ "
 [Stripped HTML widget output: the Dask ``Client`` / ``LocalCUDACluster`` / ``Scheduler`` summary tables (2 workers, 1 thread each, started just now); only the run-specific values changed:]
- Client-4be800f5-af7c-11ec-8df8-c8d9d2247354
+ Client-e3492c89-c17c-11ec-813e-fc3497a62adc
- LocalCUDACluster 137d0882
+ LocalCUDACluster db2501e1
- Scheduler-08f95e9e-2c10-4d66-a103-955ab4218e91
+ Scheduler-6f476508-e52f-49e9-8f1f-6a8641e177bd
- Scheduler Comm: tcp://127.0.0.1:35157, Total memory: 45.79 GiB
+ Scheduler Comm: tcp://127.0.0.1:39755, Total memory: 125.65 GiB
- Worker Comm: tcp://127.0.0.1:41411, Dashboard: http://127.0.0.1:40997/status, Memory: 22.89 GiB, Nanny: tcp://127.0.0.1:42959
+ Worker Comm: tcp://127.0.0.1:33491, Dashboard: http://127.0.0.1:34333/status, Memory: 62.82 GiB, Nanny: tcp://127.0.0.1:43093
- Local directory: /home/ashwin/workspace/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-ruvvgno2
+ Local directory: /home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-jsuvfju4
- GPU: Quadro GV100, GPU memory: 31.75 GiB
+ GPU: NVIDIA RTX A6000, GPU memory: 47.51 GiB
@@ -6216,10 +6251,10 @@
 "" ],
 "text/plain": [
- ""
+ ""
 ] },
- "execution_count": 80,
+ "execution_count": 82,
 "metadata": {},
 "output_type": "execute_result" }
@@ -6245,7 +6280,7 @@ }, { "cell_type": "code",
- "execution_count": 81,
+ "execution_count": 83,
 "metadata": {},
 "outputs": [ {
@@ -6321,7 +6356,7 @@
 "" ] },
- "execution_count": 81,
+ "execution_count": 83,
 "metadata": {},
 "output_type": "execute_result" }
@@ -6337,14 +6372,14 @@ }, { "cell_type": "code",
- "execution_count": 82,
+ "execution_count": 84,
 "metadata": {},
 "outputs": [ {
 "name": "stdout",
 "output_type": "stream",
 "text": [
- "Tue Mar 29 12:21:33 2022 \n",
+ "Thu Apr 21 10:11:07 2022 \n",
 "+-----------------------------------------------------------------------------+\n",
 "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\n",
"|-------------------------------+----------------------+----------------------+\n", @@ -6486,12 +6530,12 @@ "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. |\n", "|===============================+======================+======================|\n", - "| 0 Quadro GV100 Off | 00000000:15:00.0 Off | Off |\n", - "| 36% 49C P2 50W / 250W | 1113MiB / 32508MiB | 0% Default |\n", + "| 0 NVIDIA RTX A6000 On | 00000000:01:00.0 On | Off |\n", + "| 30% 48C P2 84W / 300W | 2970MiB / 48651MiB | 3% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", - "| 1 Quadro GV100 Off | 00000000:2D:00.0 Off | Off |\n", - "| 40% 54C P2 50W / 250W | 306MiB / 32498MiB | 0% Default |\n", + "| 1 NVIDIA RTX A6000 On | 00000000:02:00.0 Off | Off |\n", + "| 30% 36C P2 37W / 300W | 265MiB / 48685MiB | 0% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", @@ -6500,6 +6544,15 @@ "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", + "| 0 N/A N/A 2292 G /usr/lib/xorg/Xorg 871MiB |\n", + "| 0 N/A N/A 2441 G /usr/bin/gnome-shell 316MiB |\n", + "| 0 N/A N/A 1240494 G ...AAAAAAAAA= --shared-files 68MiB |\n", + "| 0 N/A N/A 1240525 G ...RendererForSitePerProcess 41MiB |\n", + "| 0 N/A N/A 1243689 C .../envs/cudf_dev/bin/python 593MiB |\n", + "| 0 N/A N/A 1245502 C .../envs/cudf_dev/bin/python 753MiB |\n", + "| 0 N/A N/A 1245751 C .../envs/cudf_dev/bin/python 257MiB |\n", + "| 1 N/A N/A 2292 G /usr/lib/xorg/Xorg 4MiB |\n", + "| 1 N/A N/A 1245748 C .../envs/cudf_dev/bin/python 257MiB |\n", "+-----------------------------------------------------------------------------+\n" ] } @@ -6527,7 +6580,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ -6552,7 +6605,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ @@ -6569,16 +6622,16 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "DoneAndNotDoneFutures(done={, , , , }, not_done=set())" + "DoneAndNotDoneFutures(done={, , , , }, not_done=set())" ] }, - "execution_count": 87, + "execution_count": 89, "metadata": {}, "output_type": "execute_result" } diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 41bce8b865e..8026c378156 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -4,7 +4,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Overview of User Defined Functions with cuDF" + "Overview of User Defined Functions with cuDF\n", + "====================================" ] }, { @@ -40,7 +41,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Series UDFs\n", + "Series UDFs\n", + "--------------\n", "\n", "You can execute UDFs on Series in two ways:\n", "\n", @@ -54,7 +56,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# `cudf.Series.apply`" + "`cudf.Series.apply`\n", + "---------------------" ] }, { @@ -126,7 +129,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Functions with Additional Scalar Arguments" + "Functions with Additional Scalar Arguments\n", + 
"---------------------------------------------------" ] }, { @@ -138,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -148,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -160,7 +164,7 @@ "dtype: int64" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -181,7 +185,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Nullable Data" + "Nullable Data\n", + "----------------" ] }, { @@ -193,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -205,7 +210,7 @@ "dtype: int64" ] }, - "execution_count": 9, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -218,7 +223,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -229,7 +234,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -241,7 +246,7 @@ "dtype: int64" ] }, - "execution_count": 11, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -260,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -274,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -286,7 +291,7 @@ "dtype: int64" ] }, - "execution_count": 14, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -307,7 +312,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Lower level control with custom `numba` kernels" + "Lower level control with custom `numba` kernels\n", + "---------------------------------------------------------" ] }, { @@ -322,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -331,7 +337,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -355,7 +361,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -373,7 +379,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -452,7 +458,7 @@ "4 979 982 1011 9790.0" ] }, - "execution_count": 19, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -472,7 +478,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## DataFrame UDFs\n", + "DataFrame UDFs\n", + "--------------------\n", "\n", "Like `cudf.Series`, there are multiple ways of using UDFs on dataframes, which essentially amount to UDFs that expect multiple columns as input:\n", "\n", @@ -485,7 +492,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# `cudf.DataFrame.apply`" + "`cudf.DataFrame.apply`\n", + "---------------------------" ] }, { @@ -497,7 +505,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -514,7 +522,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -569,7 +577,7 @@ "2 3 6" ] }, - "execution_count": 21, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -591,7 +599,7 @@ }, { "cell_type": "code", - 
"execution_count": 22, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -603,7 +611,7 @@ "dtype: int64" ] }, - "execution_count": 22, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -621,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -633,7 +641,7 @@ "dtype: object" ] }, - "execution_count": 23, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -658,7 +666,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -709,7 +717,7 @@ "2 3" ] }, - "execution_count": 24, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -728,7 +736,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -740,7 +748,7 @@ "dtype: int64" ] }, - "execution_count": 25, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -758,7 +766,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -813,7 +821,7 @@ "2 3 1" ] }, - "execution_count": 26, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -836,7 +844,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -848,7 +856,7 @@ "dtype: int64" ] }, - "execution_count": 27, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -866,7 +874,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -921,7 +929,7 @@ "2 3 3.14" ] }, - "execution_count": 28, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -939,7 +947,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -951,7 +959,7 @@ "dtype: float64" ] }, - "execution_count": 29, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -982,7 +990,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -1033,7 +1041,7 @@ "2 5" ] }, - "execution_count": 30, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1054,7 +1062,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1066,7 +1074,7 @@ "dtype: float64" ] }, - "execution_count": 31, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1084,7 +1092,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1151,7 +1159,7 @@ "2 3 6 4 8 6" ] }, - "execution_count": 32, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1172,7 +1180,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1184,7 +1192,7 @@ "dtype: float64" ] }, - "execution_count": 33, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1197,7 +1205,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Numba kernels for DataFrames" + "Numba kernels for DataFrames\n", + "------------------------------------" ] }, { @@ -1212,7 +1221,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -1241,7 +1250,7 @@ }, { "cell_type": "code", - "execution_count": 35, 
+ "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1312,7 +1321,7 @@ "2 3 6 4 8 6 9.0" ] }, - "execution_count": 35, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1344,7 +1353,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -1417,7 +1426,7 @@ "4 979 982 1011" ] }, - "execution_count": 36, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1443,7 +1452,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -1522,7 +1531,7 @@ "4 979 982 1011 1961.0" ] }, - "execution_count": 37, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -1546,7 +1555,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Rolling Window UDFs\n", + "Rolling Window UDFs\n", + "-------------------------\n", "\n", "For time-series data, we may need to operate on a small \\\"window\\\" of our column at a time, processing each portion independently. We could slide (\\\"roll\\\") this window over the entire column to answer questions like \\\"What is the 3-day moving average of a stock price over the past year?\"\n", "\n", @@ -1555,7 +1565,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -1570,7 +1580,7 @@ "dtype: float64" ] }, - "execution_count": 38, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -1582,7 +1592,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -1591,7 +1601,7 @@ "Rolling [window=3,min_periods=3,center=False]" ] }, - "execution_count": 39, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -1610,7 +1620,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -1634,7 +1644,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -1649,7 +1659,7 @@ "dtype: float64" ] }, - "execution_count": 41, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -1667,7 +1677,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -1734,7 +1744,7 @@ "4 59.0 59.0" ] }, - "execution_count": 42, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -1748,7 +1758,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -1845,7 +1855,7 @@ "9 100.0 100.0" ] }, - "execution_count": 43, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -1859,16 +1869,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## GroupBy DataFrame UDFs\n", + "GroupBy DataFrame UDFs\n", + "-------------------------------\n", "\n", "We can also apply UDFs to grouped DataFrames using `apply_grouped`. This example is also drawn and adapted from the RAPIDS [API documentation]().\n", "\n", - "First, we'll group our DataFrame based on column `b`, which is either True or False. Note that we currently need to pass `method=\"cudf\"` to use UDFs with GroupBy objects." + "First, we'll group our DataFrame based on column `b`, which is either True or False." 
] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -1947,7 +1958,7 @@ "4 -0.970850 False Sarah 0.342905" ] }, - "execution_count": 44, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -1959,7 +1970,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -1975,7 +1986,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -2002,7 +2013,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -2132,7 +2143,7 @@ "9 -0.725581 True George 0.405245 0.271319" ] }, - "execution_count": 47, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -2155,14 +2166,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Numba Kernels on CuPy Arrays\n", + "Numba Kernels on CuPy Arrays\n", + "-------------------------------------\n", "\n", "We can also execute Numba kernels on CuPy NDArrays, again thanks to the `__cuda_array_interface__`. We can even run the same UDF on the Series and the CuPy array. First, we define a Series and then create a CuPy array from that Series." ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -2171,7 +2183,7 @@ "array([ 1., 2., 3., 4., 10.])" ] }, - "execution_count": 48, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -2193,7 +2205,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -2207,14 +2219,12 @@ "dtype: int32" ] }, - "execution_count": 49, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from cudf.utils import cudautils\n", - "\n", "@cuda.jit\n", "def multiply_by_5(x, out):\n", " i = cuda.grid(1)\n", @@ -2235,7 +2245,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -2244,7 +2254,7 @@ "array([ 5., 10., 15., 20., 50.])" ] }, - "execution_count": 50, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -2259,7 +2269,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Caveats" + "Caveats\n", + "---------" ] }, { @@ -2274,7 +2285,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Summary\n", + "Summary\n", + "-----------\n", "\n", "This guide has covered a lot of content. At this point, you should hopefully feel comfortable writing UDFs (with or without null values) that operate on\n", "\n", diff --git a/java/README.md b/java/README.md index afd69df11ef..ea1b9e3e4e4 100644 --- a/java/README.md +++ b/java/README.md @@ -75,16 +75,6 @@ If you decide to build without Docker and the build script, examining the cmake settings in the [Java CI build script](ci/build-in-docker.sh) can be helpful if you are encountering difficulties during the build. -## Dynamically Linking Arrow - -Since libcudf builds by default with a dynamically linked Arrow dependency, it may be -desirable to build the Java bindings without requiring a statically-linked Arrow to avoid -rebuilding an already built libcudf.so. To do so, specify the additional command-line flag -`-DCUDF_JNI_ARROW_STATIC=OFF` when building the Java bindings with Maven. 
However this will
-result in a jar that requires the correct Arrow version to be available in the runtime
-environment, and therefore is not recommended unless you are only performing local testing
-within the libcudf build environment.
-
 ## Statically Linking the CUDA Runtime

 If you use the default cmake options libcudart will be dynamically linked to libcudf and libcudfjni.
diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh
index d6a193fbeaf..d21010ba30e 100755
--- a/java/ci/build-in-docker.sh
+++ b/java/ci/build-in-docker.sh
@@ -78,7 +78,6 @@ BUILD_ARG="-Dmaven.repo.local=\"$WORKSPACE/.m2\"\
 -DPER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS\
 -DCUDA_STATIC_RUNTIME=$ENABLE_CUDA_STATIC_RUNTIME\
 -DCUDF_JNI_LIBCUDF_STATIC=ON\
- -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL\
 -DUSE_GDS=$ENABLE_GDS -Dtest=*,!CuFileTest"

 if [ "$SIGN_FILE" == true ]; then
diff --git a/java/pom.xml b/java/pom.xml
index 8eccd652a46..50b6ca59440 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -147,7 +147,7 @@
 org.apache.hadoop
 hadoop-common
- 3.1.4
+ 3.2.3
 test
@@ -165,10 +165,8 @@
 OFF
 OFF
 OFF
- INFO
 OFF
 ALL
- ON
 OFF
 ${project.build.directory}/cmake-build
 1.7.30
@@ -386,13 +384,11 @@
-
-
diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index ed3ac124216..cc1bc35f951 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -1546,6 +1546,31 @@ public ColumnVector segmentedReduce(ColumnView offsets, SegmentedReductionAggreg
     }
   }

+  /**
+   * Segmented gather of the elements within a list element in each row of a list column.
+   * For each list of size N, the valid gather-map indices range over [-N, N); out-of-bounds
+   * indices map to null.
+   * @param gatherMap ListColumnView carrying lists of integral indices that map the elements
+   * within each list row of the source column to the elements of the corresponding list row
+   * in the result column.
+   * @return the result.
+   */
+  public ColumnVector segmentedGather(ColumnView gatherMap) {
+    return segmentedGather(gatherMap, OutOfBoundsPolicy.NULLIFY);
+  }
+
+  /**
+   * Segmented gather of the elements within a list element in each row of a list column.
+   * @param gatherMap ListColumnView carrying lists of integral indices that map the elements
+   * within each list row of the source column to the elements of the corresponding list row
+   * in the result column.
+   * @param policy OutOfBoundsPolicy; `DONT_CHECK` leads to undefined behaviour, while `NULLIFY`
+   * replaces out-of-bounds indices with null.
+   * @return the result.
+   */
+  public ColumnVector segmentedGather(ColumnView gatherMap, OutOfBoundsPolicy policy) {
+    return new ColumnVector(segmentedGather(getNativeView(), gatherMap.getNativeView(),
+        policy.equals(OutOfBoundsPolicy.NULLIFY)));
+  }
+
   /**
    * Do a reduction on the values in a list. The output type will be the type of the data column
    * of this list.
@@ -3448,6 +3473,16 @@ public final ColumnVector listSortRows(boolean isDescending, boolean isNullSmall
     return new ColumnVector(listSortRows(getNativeView(), isDescending, isNullSmallest));
   }

+  /**
+   * Generate list offsets from the sizes of each list.
+   * NOTICE: This API only works for INT32 columns; otherwise, the behavior is undefined.
+   * Null and negative values are not allowed.
+   *
+   * @return a column of list offsets whose size is N + 1, where N is the number of rows
+   */
+  public final ColumnVector generateListOffsets() {
+    return new ColumnVector(generateListOffsets(getNativeView()));
+  }
+
   /**
    * Get a single item from the column at the specified index as a Scalar.
* @@ -3998,6 +4033,9 @@ private static native long scan(long viewHandle, long aggregation, private static native long segmentedReduce(long dataViewHandle, long offsetsViewHandle, long aggregation, boolean includeNulls, int dtype, int scale) throws CudfException; + private static native long segmentedGather(long sourceColumnHandle, long gatherMapListHandle, + boolean isNullifyOutBounds) throws CudfException; + private static native long isNullNative(long viewHandle); private static native long isNanNative(long viewHandle); @@ -4134,6 +4172,8 @@ static native long makeCudfColumnView(int type, int scale, long data, long dataS static native long copyColumnViewToCV(long viewHandle) throws CudfException; + static native long generateListOffsets(long handle) throws CudfException; + /** * A utility class to create column vector like objects without refcounts and other APIs when * creating the device side vector from host side nested vectors. Eventually this can go away or diff --git a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java index 78b3d5d52ec..f3fb7de6abe 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java @@ -33,9 +33,15 @@ public class ColumnWriterOptions { private boolean isNullable; private boolean isMap = false; private String columnName; + // only for Parquet + private boolean hasParquetFieldId; + private int parquetFieldId; + private ColumnWriterOptions(AbstractStructBuilder builder) { this.columnName = builder.name; this.isNullable = builder.isNullable; + this.hasParquetFieldId = builder.hasParquetFieldId; + this.parquetFieldId = builder.parquetFieldId; this.childColumnOptions = (ColumnWriterOptions[]) builder.children.toArray(new ColumnWriterOptions[0]); } @@ -67,6 +73,10 @@ public AbstractStructBuilder(String name, boolean isNullable) { super(name, isNullable); } + public AbstractStructBuilder(String name, boolean isNullable, int parquetFieldId) { + super(name, isNullable, parquetFieldId); + } + protected AbstractStructBuilder() { super(); } @@ -84,6 +94,9 @@ public static abstract class NestedBuilder children = new ArrayList<>(); protected boolean isNullable = true; protected String name = ""; + // Parquet structure needs + protected boolean hasParquetFieldId; + protected int parquetFieldId; /** * Builder specific to build a Struct meta @@ -93,22 +106,43 @@ protected NestedBuilder(String name, boolean isNullable) { this.isNullable = isNullable; } + protected NestedBuilder(String name, boolean isNullable, int parquetFieldId) { + this.name = name; + this.isNullable = isNullable; + this.hasParquetFieldId = true; + this.parquetFieldId = parquetFieldId; + } + protected NestedBuilder() {} - protected ColumnWriterOptions withColumns(String name, boolean isNullable) { + protected ColumnWriterOptions withColumn(String name, boolean isNullable) { return new ColumnWriterOptions(name, isNullable); } + protected ColumnWriterOptions withColumn(String name, boolean isNullable, int parquetFieldId) { + return new ColumnWriterOptions(name, isNullable, parquetFieldId); + } + protected ColumnWriterOptions withDecimal(String name, int precision, boolean isNullable) { return new ColumnWriterOptions(name, false, precision, isNullable); } + protected ColumnWriterOptions withDecimal(String name, int precision, + boolean isNullable, int parquetFieldId) { + return new ColumnWriterOptions(name, false, precision, isNullable, parquetFieldId); + } + 
protected ColumnWriterOptions withTimestamp(String name, boolean isInt96, boolean isNullable) { return new ColumnWriterOptions(name, isInt96, UNKNOWN_PRECISION, isNullable); } + protected ColumnWriterOptions withTimestamp(String name, boolean isInt96, + boolean isNullable, int parquetFieldId) { + return new ColumnWriterOptions(name, isInt96, UNKNOWN_PRECISION, isNullable, parquetFieldId); + } + /** * Set the list column meta. * Lists should have only one child in ColumnVector, but the metadata expects a @@ -155,16 +189,16 @@ public T withStructColumn(StructColumnWriterOptions child) { /** * Set column name */ - public T withNonNullableColumns(String... name) { - withColumns(false, name); + public T withNonNullableColumns(String... names) { + withColumns(false, names); return (T) this; } /** * Set nullable column meta data */ - public T withNullableColumns(String... name) { - withColumns(true, name); + public T withNullableColumns(String... names) { + withColumns(true, names); return (T) this; } @@ -172,13 +206,22 @@ public T withNullableColumns(String... name) { * Set a simple child meta data * @return this for chaining. */ - public T withColumns(boolean nullable, String... name) { - for (String n : name) { - children.add(withColumns(n, nullable)); + public T withColumns(boolean nullable, String... names) { + for (String n : names) { + children.add(withColumn(n, nullable)); } return (T) this; } + /** + * Set a simple child meta data + * @return this for chaining. + */ + public T withColumn(boolean nullable, String name, int parquetFieldId) { + children.add(withColumn(name, nullable, parquetFieldId)); + return (T) this; + } + /** * Set a Decimal child meta data * @return this for chaining. @@ -188,6 +231,15 @@ public T withDecimalColumn(String name, int precision, boolean nullable) { return (T) this; } + /** + * Set a Decimal child meta data + * @return this for chaining. + */ + public T withDecimalColumn(String name, int precision, boolean nullable, int parquetFieldId) { + children.add(withDecimal(name, precision, nullable, parquetFieldId)); + return (T) this; + } + /** * Set a Decimal child meta data * @return this for chaining. @@ -206,6 +258,15 @@ public T withDecimalColumn(String name, int precision) { return (T) this; } + /** + * Set a timestamp child meta data + * @return this for chaining. + */ + public T withTimestampColumn(String name, boolean isInt96, boolean nullable, int parquetFieldId) { + children.add(withTimestamp(name, isInt96, nullable, parquetFieldId)); + return (T) this; + } + /** * Set a timestamp child meta data * @return this for chaining. 
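The overloads above thread an optional `parquetFieldId` through every node of the writer metadata (plain, decimal, timestamp, and struct columns). A hedged sketch of how a caller might use them follows; it assumes the `ParquetWriterOptions.builder()` entry point and `Table.writeParquetChunked(...)` from the existing Java API (neither appears in this hunk), plus a placeholder input file whose schema matches the declared metadata.

```java
import java.io.File;
import ai.rapids.cudf.ColumnWriterOptions;
import ai.rapids.cudf.ParquetWriterOptions;
import ai.rapids.cudf.Table;
import ai.rapids.cudf.TableWriter;

public class FieldIdWriteDemo {
  public static void main(String[] args) {
    // Attach explicit parquet field IDs so readers that resolve columns by ID
    // rather than by name can match the written schema.
    ParquetWriterOptions options = ParquetWriterOptions.builder()
        .withColumn(false, "id", 1)                 // non-nullable column, field ID 1
        .withDecimalColumn("price", 10, true, 2)    // precision 10, nullable, field ID 2
        .withTimestampColumn("ts", false, true, 3)  // not INT96, nullable, field ID 3
        .withStructColumn(ColumnWriterOptions.structBuilder("meta", true, 4)
            .withNullableColumns("tag")             // child keeps name-based matching
            .build())
        .build();

    // in.parquet is a placeholder assumed to hold a table matching the schema above.
    try (Table t = Table.readParquet(new File("in.parquet"));
         TableWriter writer = Table.writeParquetChunked(options, new File("out.parquet"))) {
      writer.write(t);
    }
  }
}
```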
@@ -244,6 +305,13 @@ public ColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, this.columnName = columnName; } + public ColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, + int precision, boolean isNullable, int parquetFieldId) { + this(columnName, isTimestampTypeInt96, precision, isNullable); + this.hasParquetFieldId = true; + this.parquetFieldId = parquetFieldId; + } + public ColumnWriterOptions(String columnName, boolean isNullable) { this.isTimestampTypeInt96 = false; this.precision = UNKNOWN_PRECISION; @@ -251,6 +319,12 @@ public ColumnWriterOptions(String columnName, boolean isNullable) { this.columnName = columnName; } + public ColumnWriterOptions(String columnName, boolean isNullable, int parquetFieldId) { + this(columnName, isNullable); + this.hasParquetFieldId = true; + this.parquetFieldId = parquetFieldId; + } + public ColumnWriterOptions(String columnName) { this(columnName, true); } @@ -302,6 +376,24 @@ int[] getFlatPrecision() { } } + boolean[] getFlatHasParquetFieldId() { + boolean[] ret = {hasParquetFieldId}; + if (childColumnOptions.length > 0) { + return getFlatBooleans(ret, (opt) -> opt.getFlatHasParquetFieldId()); + } else { + return ret; + } + } + + int[] getFlatParquetFieldId() { + int[] ret = {parquetFieldId}; + if (childColumnOptions.length > 0) { + return getFlatInts(ret, (opt) -> opt.getFlatParquetFieldId()); + } else { + return ret; + } + } + boolean[] getFlatIsNullable() { boolean[] ret = {isNullable}; if (childColumnOptions.length > 0) { @@ -418,6 +510,13 @@ public static StructBuilder structBuilder(String name, boolean isNullable) { return new StructBuilder(name, isNullable); } + /** + * Creates a StructBuilder for column called 'name' + */ + public static StructBuilder structBuilder(String name, boolean isNullable, int parquetFieldId) { + return new StructBuilder(name, isNullable, parquetFieldId); + } + /** * Creates a StructBuilder for column called 'name' */ @@ -477,6 +576,10 @@ public StructBuilder(String name, boolean isNullable) { super(name, isNullable); } + public StructBuilder(String name, boolean isNullable, int parquetFieldId) { + super(name, isNullable, parquetFieldId); + } + public StructColumnWriterOptions build() { return new StructColumnWriterOptions(this); } diff --git a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java index 9292975d0ce..3a3b7d721b7 100644 --- a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java @@ -41,6 +41,16 @@ int[] getFlatPrecision() { return super.getFlatInts(new int[]{}, (opt) -> opt.getFlatPrecision()); } + @Override + boolean[] getFlatHasParquetFieldId() { + return super.getFlatBooleans(new boolean[]{}, (opt) -> opt.getFlatHasParquetFieldId()); + } + + @Override + int[] getFlatParquetFieldId() { + return super.getFlatInts(new int[]{}, (opt) -> opt.getFlatParquetFieldId()); + } + @Override int[] getFlatNumChildren() { return super.getFlatInts(new int[]{}, (opt) -> opt.getFlatNumChildren()); diff --git a/java/src/main/java/ai/rapids/cudf/CudaException.java b/java/src/main/java/ai/rapids/cudf/CudaException.java index 2d862b47ef8..ff7ca308f3c 100755 --- a/java/src/main/java/ai/rapids/cudf/CudaException.java +++ b/java/src/main/java/ai/rapids/cudf/CudaException.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,6 +15,9 @@
  */
 package ai.rapids.cudf;

+import java.util.HashMap;
+import java.util.Map;
+
 /**
  * Exception from the cuda language/library. Be aware that because of how cuda does asynchronous
  * processing exceptions from cuda can be thrown by method calls that did not cause the exception
  * don't switch between threads for different parts of processing that can be retried as a chunk.
  */
 public class CudaException extends RuntimeException {
-  CudaException(String message) {
+  CudaException(String message, int errorCode) {
     super(message);
+    cudaError = CudaError.parseErrorCode(errorCode);
   }

-  CudaException(String message, Throwable cause) {
+  CudaException(String message, int errorCode, Throwable cause) {
     super(message, cause);
+    cudaError = CudaError.parseErrorCode(errorCode);
+  }
+
+  public final CudaError cudaError;
+
+  /**
+   * The Java mirror of cudaError, which facilitates the tracking of CUDA errors in the JVM.
+   */
+  public enum CudaError {
+    UnknownNativeError(-1), // a native CUDA error type for which Java has no representation
+    cudaErrorInvalidValue(1),
+    cudaErrorMemoryAllocation(2),
+    cudaErrorInitializationError(3),
+    cudaErrorCudartUnloading(4),
+    cudaErrorProfilerDisabled(5),
+    cudaErrorProfilerNotInitialized(6),
+    cudaErrorProfilerAlreadyStarted(7),
+    cudaErrorProfilerAlreadyStopped(8),
+    cudaErrorInvalidConfiguration(9),
+    cudaErrorInvalidPitchValue(12),
+    cudaErrorInvalidSymbol(13),
+    cudaErrorInvalidHostPointer(16),
+    cudaErrorInvalidDevicePointer(17),
+    cudaErrorInvalidTexture(18),
+    cudaErrorInvalidTextureBinding(19),
+    cudaErrorInvalidChannelDescriptor(20),
+    cudaErrorInvalidMemcpyDirection(21),
+    cudaErrorAddressOfConstant(22),
+    cudaErrorTextureFetchFailed(23),
+    cudaErrorTextureNotBound(24),
+    cudaErrorSynchronizationError(25),
+    cudaErrorInvalidFilterSetting(26),
+    cudaErrorInvalidNormSetting(27),
+    cudaErrorMixedDeviceExecution(28),
+    cudaErrorNotYetImplemented(31),
+    cudaErrorMemoryValueTooLarge(32),
+    cudaErrorStubLibrary(34),
+    cudaErrorInsufficientDriver(35),
+    cudaErrorCallRequiresNewerDriver(36),
+    cudaErrorInvalidSurface(37),
+    cudaErrorDuplicateVariableName(43),
+    cudaErrorDuplicateTextureName(44),
+    cudaErrorDuplicateSurfaceName(45),
+    cudaErrorDevicesUnavailable(46),
+    cudaErrorIncompatibleDriverContext(49),
+    cudaErrorMissingConfiguration(52),
+    cudaErrorPriorLaunchFailure(53),
+    cudaErrorLaunchMaxDepthExceeded(65),
+    cudaErrorLaunchFileScopedTex(66),
+    cudaErrorLaunchFileScopedSurf(67),
+    cudaErrorSyncDepthExceeded(68),
+    cudaErrorLaunchPendingCountExceeded(69),
+    cudaErrorInvalidDeviceFunction(98),
+    cudaErrorNoDevice(100),
+    cudaErrorInvalidDevice(101),
+    cudaErrorDeviceNotLicensed(102),
+    cudaErrorSoftwareValidityNotEstablished(103),
+    cudaErrorStartupFailure(127),
+    cudaErrorInvalidKernelImage(200),
+    cudaErrorDeviceUninitialized(201),
+    cudaErrorMapBufferObjectFailed(205),
+    cudaErrorUnmapBufferObjectFailed(206),
+    cudaErrorArrayIsMapped(207),
+    cudaErrorAlreadyMapped(208),
+    cudaErrorNoKernelImageForDevice(209),
+    cudaErrorAlreadyAcquired(210),
+    cudaErrorNotMapped(211),
+    cudaErrorNotMappedAsArray(212),
+    cudaErrorNotMappedAsPointer(213),
+    cudaErrorECCUncorrectable(214),
+    cudaErrorUnsupportedLimit(215),
+    cudaErrorDeviceAlreadyInUse(216),
+    cudaErrorPeerAccessUnsupported(217),
+    cudaErrorInvalidPtx(218),
+    cudaErrorInvalidGraphicsContext(219),
+    cudaErrorNvlinkUncorrectable(220),
+
cudaErrorJitCompilerNotFound(221), + cudaErrorUnsupportedPtxVersion(222), + cudaErrorJitCompilationDisabled(223), + cudaErrorUnsupportedExecAffinity(224), + cudaErrorInvalidSource(300), + cudaErrorFileNotFound(301), + cudaErrorSharedObjectSymbolNotFound(302), + cudaErrorSharedObjectInitFailed(303), + cudaErrorOperatingSystem(304), + cudaErrorInvalidResourceHandle(400), + cudaErrorIllegalState(401), + cudaErrorSymbolNotFound(500), + cudaErrorNotReady(600), + cudaErrorIllegalAddress(700), + cudaErrorLaunchOutOfResources(701), + cudaErrorLaunchTimeout(702), + cudaErrorLaunchIncompatibleTexturing(703), + cudaErrorPeerAccessAlreadyEnabled(704), + cudaErrorPeerAccessNotEnabled(705), + cudaErrorSetOnActiveProcess(708), + cudaErrorContextIsDestroyed(709), + cudaErrorAssert(710), + cudaErrorTooManyPeers(711), + cudaErrorHostMemoryAlreadyRegistered(712), + cudaErrorHostMemoryNotRegistered(713), + cudaErrorHardwareStackError(714), + cudaErrorIllegalInstruction(715), + cudaErrorMisalignedAddress(716), + cudaErrorInvalidAddressSpace(717), + cudaErrorInvalidPc(718), + cudaErrorLaunchFailure(719), + cudaErrorCooperativeLaunchTooLarge(720), + cudaErrorNotPermitted(800), + cudaErrorNotSupported(801), + cudaErrorSystemNotReady(802), + cudaErrorSystemDriverMismatch(803), + cudaErrorCompatNotSupportedOnDevice(804), + cudaErrorMpsConnectionFailed(805), + cudaErrorMpsRpcFailure(806), + cudaErrorMpsServerNotReady(807), + cudaErrorMpsMaxClientsReached(808), + cudaErrorMpsMaxConnectionsReached(809), + cudaErrorStreamCaptureUnsupported(900), + cudaErrorStreamCaptureInvalidated(901), + cudaErrorStreamCaptureMerge(902), + cudaErrorStreamCaptureUnmatched(903), + cudaErrorStreamCaptureUnjoined(904), + cudaErrorStreamCaptureIsolation(905), + cudaErrorStreamCaptureImplicit(906), + cudaErrorCapturedEvent(907), + cudaErrorStreamCaptureWrongThread(908), + cudaErrorTimeout(909), + cudaErrorGraphExecUpdateFailure(910), + cudaErrorExternalDevice(911), + cudaErrorUnknown(999), + cudaErrorApiFailureBase(10000); + + final int code; + + private static Map codeToError = new HashMap(){{ + put(cudaErrorInvalidValue.code, cudaErrorInvalidValue); + put(cudaErrorMemoryAllocation.code, cudaErrorMemoryAllocation); + put(cudaErrorInitializationError.code, cudaErrorInitializationError); + put(cudaErrorCudartUnloading.code, cudaErrorCudartUnloading); + put(cudaErrorProfilerDisabled.code, cudaErrorProfilerDisabled); + put(cudaErrorProfilerNotInitialized.code, cudaErrorProfilerNotInitialized); + put(cudaErrorProfilerAlreadyStarted.code, cudaErrorProfilerAlreadyStarted); + put(cudaErrorProfilerAlreadyStopped.code, cudaErrorProfilerAlreadyStopped); + put(cudaErrorInvalidConfiguration.code, cudaErrorInvalidConfiguration); + put(cudaErrorInvalidPitchValue.code, cudaErrorInvalidPitchValue); + put(cudaErrorInvalidSymbol.code, cudaErrorInvalidSymbol); + put(cudaErrorInvalidHostPointer.code, cudaErrorInvalidHostPointer); + put(cudaErrorInvalidDevicePointer.code, cudaErrorInvalidDevicePointer); + put(cudaErrorInvalidTexture.code, cudaErrorInvalidTexture); + put(cudaErrorInvalidTextureBinding.code, cudaErrorInvalidTextureBinding); + put(cudaErrorInvalidChannelDescriptor.code, cudaErrorInvalidChannelDescriptor); + put(cudaErrorInvalidMemcpyDirection.code, cudaErrorInvalidMemcpyDirection); + put(cudaErrorAddressOfConstant.code, cudaErrorAddressOfConstant); + put(cudaErrorTextureFetchFailed.code, cudaErrorTextureFetchFailed); + put(cudaErrorTextureNotBound.code, cudaErrorTextureNotBound); + put(cudaErrorSynchronizationError.code, 
cudaErrorSynchronizationError); + put(cudaErrorInvalidFilterSetting.code, cudaErrorInvalidFilterSetting); + put(cudaErrorInvalidNormSetting.code, cudaErrorInvalidNormSetting); + put(cudaErrorMixedDeviceExecution.code, cudaErrorMixedDeviceExecution); + put(cudaErrorNotYetImplemented.code, cudaErrorNotYetImplemented); + put(cudaErrorMemoryValueTooLarge.code, cudaErrorMemoryValueTooLarge); + put(cudaErrorStubLibrary.code, cudaErrorStubLibrary); + put(cudaErrorInsufficientDriver.code, cudaErrorInsufficientDriver); + put(cudaErrorCallRequiresNewerDriver.code, cudaErrorCallRequiresNewerDriver); + put(cudaErrorInvalidSurface.code, cudaErrorInvalidSurface); + put(cudaErrorDuplicateVariableName.code, cudaErrorDuplicateVariableName); + put(cudaErrorDuplicateTextureName.code, cudaErrorDuplicateTextureName); + put(cudaErrorDuplicateSurfaceName.code, cudaErrorDuplicateSurfaceName); + put(cudaErrorDevicesUnavailable.code, cudaErrorDevicesUnavailable); + put(cudaErrorIncompatibleDriverContext.code, cudaErrorIncompatibleDriverContext); + put(cudaErrorMissingConfiguration.code, cudaErrorMissingConfiguration); + put(cudaErrorPriorLaunchFailure.code, cudaErrorPriorLaunchFailure); + put(cudaErrorLaunchMaxDepthExceeded.code, cudaErrorLaunchMaxDepthExceeded); + put(cudaErrorLaunchFileScopedTex.code, cudaErrorLaunchFileScopedTex); + put(cudaErrorLaunchFileScopedSurf.code, cudaErrorLaunchFileScopedSurf); + put(cudaErrorSyncDepthExceeded.code, cudaErrorSyncDepthExceeded); + put(cudaErrorLaunchPendingCountExceeded.code, cudaErrorLaunchPendingCountExceeded); + put(cudaErrorInvalidDeviceFunction.code, cudaErrorInvalidDeviceFunction); + put(cudaErrorNoDevice.code, cudaErrorNoDevice); + put(cudaErrorInvalidDevice.code, cudaErrorInvalidDevice); + put(cudaErrorDeviceNotLicensed.code, cudaErrorDeviceNotLicensed); + put(cudaErrorSoftwareValidityNotEstablished.code, cudaErrorSoftwareValidityNotEstablished); + put(cudaErrorStartupFailure.code, cudaErrorStartupFailure); + put(cudaErrorInvalidKernelImage.code, cudaErrorInvalidKernelImage); + put(cudaErrorDeviceUninitialized.code, cudaErrorDeviceUninitialized); + put(cudaErrorMapBufferObjectFailed.code, cudaErrorMapBufferObjectFailed); + put(cudaErrorUnmapBufferObjectFailed.code, cudaErrorUnmapBufferObjectFailed); + put(cudaErrorArrayIsMapped.code, cudaErrorArrayIsMapped); + put(cudaErrorAlreadyMapped.code, cudaErrorAlreadyMapped); + put(cudaErrorNoKernelImageForDevice.code, cudaErrorNoKernelImageForDevice); + put(cudaErrorAlreadyAcquired.code, cudaErrorAlreadyAcquired); + put(cudaErrorNotMapped.code, cudaErrorNotMapped); + put(cudaErrorNotMappedAsArray.code, cudaErrorNotMappedAsArray); + put(cudaErrorNotMappedAsPointer.code, cudaErrorNotMappedAsPointer); + put(cudaErrorECCUncorrectable.code, cudaErrorECCUncorrectable); + put(cudaErrorUnsupportedLimit.code, cudaErrorUnsupportedLimit); + put(cudaErrorDeviceAlreadyInUse.code, cudaErrorDeviceAlreadyInUse); + put(cudaErrorPeerAccessUnsupported.code, cudaErrorPeerAccessUnsupported); + put(cudaErrorInvalidPtx.code, cudaErrorInvalidPtx); + put(cudaErrorInvalidGraphicsContext.code, cudaErrorInvalidGraphicsContext); + put(cudaErrorNvlinkUncorrectable.code, cudaErrorNvlinkUncorrectable); + put(cudaErrorJitCompilerNotFound.code, cudaErrorJitCompilerNotFound); + put(cudaErrorUnsupportedPtxVersion.code, cudaErrorUnsupportedPtxVersion); + put(cudaErrorJitCompilationDisabled.code, cudaErrorJitCompilationDisabled); + put(cudaErrorUnsupportedExecAffinity.code, cudaErrorUnsupportedExecAffinity); + put(cudaErrorInvalidSource.code, 
cudaErrorInvalidSource); + put(cudaErrorFileNotFound.code, cudaErrorFileNotFound); + put(cudaErrorSharedObjectSymbolNotFound.code, cudaErrorSharedObjectSymbolNotFound); + put(cudaErrorSharedObjectInitFailed.code, cudaErrorSharedObjectInitFailed); + put(cudaErrorOperatingSystem.code, cudaErrorOperatingSystem); + put(cudaErrorInvalidResourceHandle.code, cudaErrorInvalidResourceHandle); + put(cudaErrorIllegalState.code, cudaErrorIllegalState); + put(cudaErrorSymbolNotFound.code, cudaErrorSymbolNotFound); + put(cudaErrorNotReady.code, cudaErrorNotReady); + put(cudaErrorIllegalAddress.code, cudaErrorIllegalAddress); + put(cudaErrorLaunchOutOfResources.code, cudaErrorLaunchOutOfResources); + put(cudaErrorLaunchTimeout.code, cudaErrorLaunchTimeout); + put(cudaErrorLaunchIncompatibleTexturing.code, cudaErrorLaunchIncompatibleTexturing); + put(cudaErrorPeerAccessAlreadyEnabled.code, cudaErrorPeerAccessAlreadyEnabled); + put(cudaErrorPeerAccessNotEnabled.code, cudaErrorPeerAccessNotEnabled); + put(cudaErrorSetOnActiveProcess.code, cudaErrorSetOnActiveProcess); + put(cudaErrorContextIsDestroyed.code, cudaErrorContextIsDestroyed); + put(cudaErrorAssert.code, cudaErrorAssert); + put(cudaErrorTooManyPeers.code, cudaErrorTooManyPeers); + put(cudaErrorHostMemoryAlreadyRegistered.code, cudaErrorHostMemoryAlreadyRegistered); + put(cudaErrorHostMemoryNotRegistered.code, cudaErrorHostMemoryNotRegistered); + put(cudaErrorHardwareStackError.code, cudaErrorHardwareStackError); + put(cudaErrorIllegalInstruction.code, cudaErrorIllegalInstruction); + put(cudaErrorMisalignedAddress.code, cudaErrorMisalignedAddress); + put(cudaErrorInvalidAddressSpace.code, cudaErrorInvalidAddressSpace); + put(cudaErrorInvalidPc.code, cudaErrorInvalidPc); + put(cudaErrorLaunchFailure.code, cudaErrorLaunchFailure); + put(cudaErrorCooperativeLaunchTooLarge.code, cudaErrorCooperativeLaunchTooLarge); + put(cudaErrorNotPermitted.code, cudaErrorNotPermitted); + put(cudaErrorNotSupported.code, cudaErrorNotSupported); + put(cudaErrorSystemNotReady.code, cudaErrorSystemNotReady); + put(cudaErrorSystemDriverMismatch.code, cudaErrorSystemDriverMismatch); + put(cudaErrorCompatNotSupportedOnDevice.code, cudaErrorCompatNotSupportedOnDevice); + put(cudaErrorMpsConnectionFailed.code, cudaErrorMpsConnectionFailed); + put(cudaErrorMpsRpcFailure.code, cudaErrorMpsRpcFailure); + put(cudaErrorMpsServerNotReady.code, cudaErrorMpsServerNotReady); + put(cudaErrorMpsMaxClientsReached.code, cudaErrorMpsMaxClientsReached); + put(cudaErrorMpsMaxConnectionsReached.code, cudaErrorMpsMaxConnectionsReached); + put(cudaErrorStreamCaptureUnsupported.code, cudaErrorStreamCaptureUnsupported); + put(cudaErrorStreamCaptureInvalidated.code, cudaErrorStreamCaptureInvalidated); + put(cudaErrorStreamCaptureMerge.code, cudaErrorStreamCaptureMerge); + put(cudaErrorStreamCaptureUnmatched.code, cudaErrorStreamCaptureUnmatched); + put(cudaErrorStreamCaptureUnjoined.code, cudaErrorStreamCaptureUnjoined); + put(cudaErrorStreamCaptureIsolation.code, cudaErrorStreamCaptureIsolation); + put(cudaErrorStreamCaptureImplicit.code, cudaErrorStreamCaptureImplicit); + put(cudaErrorCapturedEvent.code, cudaErrorCapturedEvent); + put(cudaErrorStreamCaptureWrongThread.code, cudaErrorStreamCaptureWrongThread); + put(cudaErrorTimeout.code, cudaErrorTimeout); + put(cudaErrorGraphExecUpdateFailure.code, cudaErrorGraphExecUpdateFailure); + put(cudaErrorExternalDevice.code, cudaErrorExternalDevice); + put(cudaErrorUnknown.code, cudaErrorUnknown); + put(cudaErrorApiFailureBase.code, 
cudaErrorApiFailureBase); + }}; + + CudaError(int errorCode) { + this.code = errorCode; + } + + public static CudaError parseErrorCode(int errorCode) { + if (!codeToError.containsKey(errorCode)) { + return UnknownNativeError; + } + return codeToError.get(errorCode); + } + } } diff --git a/java/src/main/java/ai/rapids/cudf/CudaFatalException.java b/java/src/main/java/ai/rapids/cudf/CudaFatalException.java new file mode 100644 index 00000000000..cf36726aa80 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/CudaFatalException.java @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ai.rapids.cudf; + +/** + * CudaFatalException is a kind of CudaException which leaves the process in an inconsistent state + * and any further CUDA work will return the same error. + * To continue using CUDA, the process must be terminated and relaunched. + */ +public class CudaFatalException extends CudaException { + CudaFatalException(String message, int errorCode) { + super(message, errorCode); + } + + CudaFatalException(String message, int errorCode, Throwable cause) { + super(message, errorCode, cause); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index ff966643866..24f7d44ed28 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -289,7 +289,10 @@ private static native long writeParquetFileBegin(String[] columnNames, int statsFreq, boolean[] isInt96, int[] precisions, - boolean[] isMapValues, String filename) throws CudfException; + boolean[] isMapValues, + boolean[] hasParquetFieldIds, + int[] parquetFieldIds, + String filename) throws CudfException; /** * Setup everything to write parquet formatted data to a buffer. 
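With the constructor changes above, every `CudaException` now carries the parsed `cudaError` enum value, and the new `CudaFatalException` subtype marks errors that leave the CUDA context unusable. A minimal sketch of how calling code might branch on the two; the `doCudfWork()` helper is a hypothetical stand-in for any cudf/JNI call:

```java
import ai.rapids.cudf.CudaException;
import ai.rapids.cudf.CudaFatalException;

public class CudaErrorHandlingDemo {
  // Hypothetical stand-in for any cudf/JNI call that can surface a CUDA error.
  static void doCudfWork() { /* e.g. Table.readParquet(...), kernel launches, ... */ }

  public static void main(String[] args) {
    try {
      doCudfWork();
    } catch (CudaFatalException e) {
      // Per the CudaFatalException docs: the process is in an inconsistent state
      // and must be terminated and relaunched to keep using CUDA.
      System.exit(1);
    } catch (CudaException e) {
      // Non-fatal errors expose the enum code for targeted recovery.
      if (e.cudaError == CudaException.CudaError.cudaErrorMemoryAllocation) {
        // e.g. spill, back off, and retry the chunk
      } else {
        throw e;
      }
    }
  }
}
```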
@@ -319,6 +322,8 @@ private static native long writeParquetBufferBegin(String[] columnNames, boolean[] isInt96, int[] precisions, boolean[] isMapValues, + boolean[] hasParquetFieldIds, + int[] parquetFieldIds, HostBufferConsumer consumer) throws CudfException; /** @@ -1201,6 +1206,8 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { boolean[] timeInt96Values = options.getFlatIsTimeTypeInt96(); boolean[] isMapValues = options.getFlatIsMap(); int[] precisions = options.getFlatPrecision(); + boolean[] hasParquetFieldIds = options.getFlatHasParquetFieldId(); + int[] parquetFieldIds = options.getFlatParquetFieldId(); int[] flatNumChildren = options.getFlatNumChildren(); this.consumer = null; @@ -1215,6 +1222,8 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { timeInt96Values, precisions, isMapValues, + hasParquetFieldIds, + parquetFieldIds, outputFile.getAbsolutePath()); } @@ -1224,6 +1233,8 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons boolean[] timeInt96Values = options.getFlatIsTimeTypeInt96(); boolean[] isMapValues = options.getFlatIsMap(); int[] precisions = options.getFlatPrecision(); + boolean[] hasParquetFieldIds = options.getFlatHasParquetFieldId(); + int[] parquetFieldIds = options.getFlatParquetFieldId(); int[] flatNumChildren = options.getFlatNumChildren(); this.consumer = consumer; @@ -1238,6 +1249,8 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons timeInt96Values, precisions, isMapValues, + hasParquetFieldIds, + parquetFieldIds, consumer); } diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 9851102d011..3a375412bbd 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -41,7 +41,6 @@ option(BUILD_TESTS "Configure CMake to build tests" ON) option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF) option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) option(USE_GDS "Build with GPUDirect Storage (GDS)/cuFile support" OFF) -option(CUDF_JNI_ARROW_STATIC "Statically link Arrow" ON) option(CUDF_JNI_LIBCUDF_STATIC "Link with libcudf.a" OFF) message(VERBOSE "CUDF_JNI: Build with NVTX support: ${USE_NVTX}") @@ -50,7 +49,6 @@ message(VERBOSE "CUDF_JNI: Configure CMake to build tests: ${BUILD_TESTS}") message(VERBOSE "CUDF_JNI: Build with per-thread default stream: ${PER_THREAD_DEFAULT_STREAM}") message(VERBOSE "CUDF_JNI: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") message(VERBOSE "CUDF_JNI: Build with GPUDirect Storage support: ${USE_GDS}") -message(VERBOSE "CUDF_JNI: Build with static Arrow library: ${CUDF_JNI_ARROW_STATIC}") message(VERBOSE "CUDF_JNI: Link with libcudf statically: ${CUDF_JNI_LIBCUDF_STATIC}") set(CUDF_SOURCE_DIR "${PROJECT_SOURCE_DIR}/../../../../cpp") @@ -93,67 +91,16 @@ endif() rapids_cmake_build_type("Release") # ################################################################################################## -# * Thrust/CUB -# ------------------------------------------------------------------------------------ -include(${CUDF_SOURCE_DIR}/cmake/thirdparty/get_thrust.cmake) +# * nvcomp------------------------------------------------------------------------------------------ -# ################################################################################################## -# * CUDF ------------------------------------------------------------------------------------------ - -set(CUDF_INCLUDE 
"${PROJECT_SOURCE_DIR}/../../../../cpp/include" - "${PROJECT_SOURCE_DIR}/../../../../cpp/src/" -) - -set(CUDF_LIB_HINTS HINTS "$ENV{CUDF_ROOT}" "$ENV{CUDF_ROOT}/lib" "$ENV{CONDA_PREFIX}/lib" - "${CUDF_CPP_BUILD_DIR}" -) - -find_library(CUDF_LIB "cudf" REQUIRED HINTS ${CUDF_LIB_HINTS}) - -# ################################################################################################## -# * ZLIB ------------------------------------------------------------------------------------------ - -# find zlib -rapids_find_package(ZLIB REQUIRED) +set(nvcomp_DIR "${CUDF_CPP_BUILD_DIR}/_deps/nvcomp-build") +rapids_find_package(nvcomp REQUIRED) # ################################################################################################## -# * RMM ------------------------------------------------------------------------------------------- +# * CUDF ------------------------------------------------------------------------------------------ -include(${CUDF_SOURCE_DIR}/cmake/thirdparty/get_rmm.cmake) - -# ################################################################################################## -# * ARROW ----------------------------------------------------------------------------------------- - -find_path(ARROW_INCLUDE "arrow" HINTS "$ENV{ARROW_ROOT}/include" - "${CUDF_CPP_BUILD_DIR}/_deps/arrow-src/cpp/src" -) - -message(STATUS "ARROW: ARROW_INCLUDE set to ${ARROW_INCLUDE}") - -if(CUDF_JNI_ARROW_STATIC) - # Find static version of Arrow lib - set(CUDF_JNI_ARROW_LIBNAME "libarrow.a") -else() - set(CUDF_JNI_ARROW_LIBNAME "arrow") -endif() - -find_library( - ARROW_LIBRARY ${CUDF_JNI_ARROW_LIBNAME} REQUIRED - HINTS "$ENV{ARROW_ROOT}/lib" "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/release" - "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/debug" -) - -if(NOT ARROW_LIBRARY) - if(CUDF_JNI_ARROW_STATIC) - message( - FATAL_ERROR "Arrow static library not found. Was libcudf built with CUDF_USE_ARROW_STATIC=ON?" 
- ) - else() - message(FATAL_ERROR "Arrow dynamic library not found.") - endif() -else() - message(STATUS "ARROW: ARROW_LIBRARY set to ${ARROW_LIBRARY}") -endif() +set(cudf_ROOT "${CUDF_CPP_BUILD_DIR}") +rapids_find_package(cudf REQUIRED) # ################################################################################################## # * find JNI ------------------------------------------------------------------------------------- @@ -164,27 +111,6 @@ else() message(FATAL_ERROR "JDK with JNI not found, please check your settings.") endif() -# ################################################################################################## -# * nvcomp ---------------------------------------------------------------------------------------- - -find_path(NVCOMP_INCLUDE "nvcomp" HINTS "${CUDF_CPP_BUILD_DIR}/_deps/nvcomp-src/include" - "$ENV{CONDA_PREFIX}/include" -) - -message(STATUS "NVCOMP: NVCOMP_INCLUDE set to ${NVCOMP_INCLUDE}") - -set(CUDF_JNI_NVCOMP_LIBNAME "libnvcomp.a") -find_library( - NVCOMP_LIBRARY ${CUDF_JNI_NVCOMP_LIBNAME} REQUIRED HINTS "${CUDF_CPP_BUILD_DIR}/lib" - "$ENV{CONDA_PREFIX}/lib" -) - -if(NOT NVCOMP_LIBRARY) - message(FATAL_ERROR "nvcomp static library not found.") -else() - message(STATUS "NVCOMP: NVCOMP_LIBRARY set to ${NVCOMP_LIBRARY}") -endif() - # ################################################################################################## # * GDS/cufile ------------------------------------------------------------------------------------ @@ -238,17 +164,8 @@ endif() # * include paths --------------------------------------------------------------------------------- target_include_directories( - cudfjni - PUBLIC cudf::Thrust - "${LIBCUDACXX_INCLUDE}" - "${CUDAToolkit_INCLUDE_DIRS}" - "${NVCOMP_INCLUDE}" - "${CMAKE_BINARY_DIR}/include" - "${CMAKE_SOURCE_DIR}/include" - "${CMAKE_SOURCE_DIR}/src" - "${JNI_INCLUDE_DIRS}" - "${CUDF_INCLUDE}" - "${ARROW_INCLUDE}" + cudfjni PUBLIC "${CMAKE_BINARY_DIR}/include" "${CMAKE_SOURCE_DIR}/include" + "${CMAKE_SOURCE_DIR}/src" "${JNI_INCLUDE_DIRS}" ) # ################################################################################################## @@ -291,39 +208,24 @@ if(USE_GDS) POSITION_INDEPENDENT_CODE ON INTERFACE_POSITION_INDEPENDENT_CODE ON ) - target_include_directories( - cufilejni - PUBLIC "${LIBCUDACXX_INCLUDE}" "${CUDF_INCLUDE}" - PRIVATE "${cuFile_INCLUDE_DIRS}" - ) - target_link_libraries(cufilejni PRIVATE cudfjni rmm::rmm "${cuFile_LIBRARIES}") + target_include_directories(cufilejni PRIVATE "${cuFile_INCLUDE_DIRS}") + target_link_libraries(cufilejni PRIVATE cudfjni "${cuFile_LIBRARIES}") endif() -# ################################################################################################## -# * rmm logging level ----------------------------------------------------------------------------- - -set(RMM_LOGGING_LEVEL - "INFO" - CACHE STRING "Choose the logging level." 
-) -# Set the possible values of build type for cmake-gui -set_property( - CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF" -) -message(STATUS "RMM_LOGGING_LEVEL = '${RMM_LOGGING_LEVEL}'.") - -target_compile_definitions(cudfjni PUBLIC SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}) - # ################################################################################################## # * link libraries -------------------------------------------------------------------------------- -set(CUDF_LINK ${CUDF_LIB}) +set(CUDF_LINK PUBLIC cudf::cudf) if(CUDF_JNI_LIBCUDF_STATIC) - set(CUDF_LINK -Wl,--whole-archive ${CUDF_LIB} -Wl,--no-whole-archive ZLIB::ZLIB) + # Whole-link libcudf.a into the shared library but not its dependencies + set(CUDF_LINK PRIVATE -Wl,--whole-archive cudf::cudf -Wl,--no-whole-archive PUBLIC cudf::cudf) endif() +# When nvcomp is installed, we need to use nvcomp::nvcomp, but from the cudf build directory it +# will just be nvcomp. target_link_libraries( - cudfjni PRIVATE ${CUDF_LINK} ${NVCOMP_LIBRARY} ${ARROW_LIBRARY} rmm::rmm CUDA::cuda_driver + cudfjni ${CUDF_LINK} PRIVATE $ + $ ) # ################################################################################################## diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index a45716a89b3..eca424132a5 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -30,6 +30,7 @@ namespace jni { constexpr jint MINIMUM_JNI_VERSION = JNI_VERSION_1_6; constexpr char const *CUDA_ERROR_CLASS = "ai/rapids/cudf/CudaException"; +constexpr char const *CUDA_FATAL_ERROR_CLASS = "ai/rapids/cudf/CudaFatalException"; constexpr char const *CUDF_ERROR_CLASS = "ai/rapids/cudf/CudfException"; constexpr char const *INDEX_OOB_CLASS = "java/lang/ArrayIndexOutOfBoundsException"; constexpr char const *ILLEGAL_ARG_CLASS = "java/lang/IllegalArgumentException"; @@ -737,12 +738,26 @@ class native_jstringArray { * @brief create a cuda exception from a given cudaError_t */ inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowable cause = NULL) { - jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS); + const char *ex_class_name; + + // Calls cudaGetLastError twice. It is nearly certain that a fatal error occurred if the second + // call doesn't return with cudaSuccess. + cudaGetLastError(); + auto const last = cudaGetLastError(); + // Call cudaDeviceSynchronize to ensure `last` did not result from an asynchronous error that + // arrived between the two calls.
+ if (status == last && last == cudaDeviceSynchronize()) { + ex_class_name = cudf::jni::CUDA_FATAL_ERROR_CLASS; + } else { + ex_class_name = cudf::jni::CUDA_ERROR_CLASS; + } + + jclass ex_class = env->FindClass(ex_class_name); if (ex_class == NULL) { return NULL; } jmethodID ctor_id = - env->GetMethodID(ex_class, "<init>", "(Ljava/lang/String;Ljava/lang/Throwable;)V"); + env->GetMethodID(ex_class, "<init>", "(Ljava/lang/String;ILjava/lang/Throwable;)V"); if (ctor_id == NULL) { return NULL; } @@ -752,19 +767,20 @@ inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowab return NULL; } - jobject ret = env->NewObject(ex_class, ctor_id, msg, cause); + jint err_code = static_cast<jint>(status); + + jobject ret = env->NewObject(ex_class, ctor_id, msg, err_code, cause); return (jthrowable)ret; } inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { if (cudaSuccess != cuda_status) { - // Clear the last error so it does not propagate. - cudaGetLastError(); jthrowable jt = cuda_exception(env, cuda_status); if (jt != NULL) { env->Throw(jt); - throw jni_exception("CUDA ERROR"); } + throw jni_exception(std::string("CUDA ERROR: code ") + + std::to_string(static_cast<int>(cuda_status))); } } @@ -790,18 +806,26 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { JNI_THROW_NEW(env, class_name, message, ret_val) \ } -#define JNI_CUDA_TRY(env, ret_val, call) \ +// Throw a new exception only if one is not already pending, then always return the specified value +#define JNI_CHECK_CUDA_ERROR(env, class_name, e, ret_val) \ { \ - cudaError_t internal_cuda_status = (call); \ - if (cudaSuccess != internal_cuda_status) { \ - /* Clear the last error so it does not propagate.*/ \ - cudaGetLastError(); \ - jthrowable jt = cudf::jni::cuda_exception(env, internal_cuda_status); \ - if (jt != NULL) { \ - env->Throw(jt); \ - } \ + if (env->ExceptionOccurred()) { \ return ret_val; \ } \ + std::string n_msg = e.what() == nullptr ? "" : e.what(); \ + jstring j_msg = env->NewStringUTF(n_msg.c_str()); \ + jint e_code = static_cast<jint>(e.error_code()); \ + jclass ex_class = env->FindClass(class_name); \ + if (ex_class != NULL) { \ + jmethodID ctor_id = env->GetMethodID(ex_class, "<init>", "(Ljava/lang/String;I)V"); \ + if (ctor_id != NULL) { \ + jobject cuda_error = env->NewObject(ex_class, ctor_id, j_msg, e_code); \ + if (cuda_error != NULL) { \ + env->Throw((jthrowable)cuda_error); \ + } \ + } \ + } \ + return ret_val; \ } #define JNI_NULL_CHECK(env, obj, error_msg, ret_val) \ @@ -831,6 +855,12 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what()); \ JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ } \ + catch (const cudf::fatal_cuda_error &e) { \ + JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e, ret_val); \ + } \ + catch (const cudf::cuda_error &e) { \ + JNI_CHECK_CUDA_ERROR(env, cudf::jni::CUDA_ERROR_CLASS, e, ret_val); \ + } \ catch (const std::exception &e) { \ /* If a jni_exception was caught then a Java exception is pending and this will not overwrite it.
*/ \ JNI_CHECK_THROW_NEW(env, class_name, e.what(), ret_val); \ diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp index f8c448566c8..6ac73282615 100644 --- a/java/src/main/native/src/AggregationJni.cpp +++ b/java/src/main/native/src/AggregationJni.cpp @@ -82,11 +82,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNoParamAgg(JNIEnv case 27: // MERGE_M2 return cudf::make_merge_m2_aggregation(); case 28: // RANK - return cudf::make_rank_aggregation(); + return cudf::make_rank_aggregation(cudf::rank_method::MIN, {}, + cudf::null_policy::INCLUDE); case 29: // DENSE_RANK - return cudf::make_dense_rank_aggregation(); - case 30: // PERCENT_RANK - return cudf::make_percent_rank_aggregation(); + return cudf::make_rank_aggregation(cudf::rank_method::DENSE, {}, + cudf::null_policy::INCLUDE); + case 30: // ANSI SQL PERCENT_RANK + return cudf::make_rank_aggregation(cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE, + {}, cudf::rank_percentage::ONE_NORMALIZED); default: throw std::logic_error("Unsupported No Parameter Aggregation Operation"); } }(); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 8c8e9b91e8d..e074180c312 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -288,6 +289,23 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_segmentedReduce( CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_segmentedGather( + JNIEnv *env, jclass, jlong source_column, jlong gather_map_list, jboolean nullify_out_bounds) { + JNI_NULL_CHECK(env, source_column, "source column view is null", 0); + JNI_NULL_CHECK(env, gather_map_list, "gather map is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const &src_col = + cudf::lists_column_view(*reinterpret_cast<cudf::column_view const *>(source_column)); + auto const &gather_map = + cudf::lists_column_view(*reinterpret_cast<cudf::column_view const *>(gather_map_list)); + auto out_bounds_policy = nullify_out_bounds ? cudf::out_of_bounds_policy::NULLIFY : + cudf::out_of_bounds_policy::DONT_CHECK; + return release_as_jlong(cudf::lists::segmented_gather(src_col, gather_map, out_bounds_policy)); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_scan(JNIEnv *env, jclass, jlong j_col_view, jlong j_agg, jboolean is_inclusive, jboolean include_nulls) { @@ -606,6 +624,17 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_generateListOffsets(JNIEnv *env, jclass, + jlong handle) { + JNI_NULL_CHECK(env, handle, "handle is null", 0) + try { + cudf::jni::auto_set_device(env); + auto const cv = reinterpret_cast<cudf::column_view const *>(handle); + return release_as_jlong(cudf::jni::generate_list_offsets(*cv)); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, jlong input_handle, jstring pattern_obj, diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu index 47055ca1611..6b4db39eb34 100644 --- a/java/src/main/native/src/ColumnViewJni.cu +++ b/java/src/main/native/src/ColumnViewJni.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION.
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,8 +15,11 @@ */ #include +#include #include #include +#include +#include #include "ColumnViewJni.hpp" @@ -51,4 +54,22 @@ new_column_with_boolean_column_as_validity(cudf::column_view const &exemplar, return deep_copy; } +std::unique_ptr<cudf::column> generate_list_offsets(cudf::column_view const &list_length, + rmm::cuda_stream_view stream) { + CUDF_EXPECTS(list_length.type().id() == cudf::type_id::INT32, + "Input column does not have type INT32."); + + auto const begin_iter = list_length.template begin<int32_t>(); + auto const end_iter = list_length.template end<int32_t>(); + + auto offsets_column = make_numeric_column(data_type{type_id::INT32}, list_length.size() + 1, + mask_state::UNALLOCATED, stream); + auto offsets_view = offsets_column->mutable_view(); + auto d_offsets = offsets_view.template begin<int32_t>(); + + thrust::inclusive_scan(rmm::exec_policy(stream), begin_iter, end_iter, d_offsets + 1); + CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(int32_t), stream)); + + return offsets_column; +} } // namespace cudf::jni diff --git a/java/src/main/native/src/ColumnViewJni.hpp b/java/src/main/native/src/ColumnViewJni.hpp index 37e58ecb63a..429f36bcb1d 100644 --- a/java/src/main/native/src/ColumnViewJni.hpp +++ b/java/src/main/native/src/ColumnViewJni.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include +#include namespace cudf::jni { @@ -35,4 +36,19 @@ std::unique_ptr<cudf::column> new_column_with_boolean_column_as_validity(cudf::column_view const &exemplar, cudf::column_view const &bool_column); +/** + * @brief Generates list offsets from the lengths of each list. + * + * For example, + * given a list column: [[1,2,3], [4,5], [6], [], [7,8]], + * its list lengths are: [3, 2, 1, 0, 2], + * and its list offsets are: [0, 3, 5, 6, 6, 8]. + * + * @param list_length The column representing list lengths. + * @return The column of generated list offsets.
+ */ +std::unique_ptr<cudf::column> +generate_list_offsets(cudf::column_view const &list_length, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + } // namespace cudf::jni diff --git a/java/src/main/native/src/CudaJni.cpp b/java/src/main/native/src/CudaJni.cpp index 9862c3bface..926521c55f9 100644 --- a/java/src/main/native/src/CudaJni.cpp +++ b/java/src/main/native/src/CudaJni.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include "jni_utils.hpp" @@ -66,7 +67,7 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Cuda_memGetInfo(JNIEnv *env, jclas cudf::jni::auto_set_device(env); size_t free, total; - JNI_CUDA_TRY(env, NULL, cudaMemGetInfo(&free, &total)); + CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total)); jclass info_class = env->FindClass("Lai/rapids/cudf/CudaMemInfo;"); if (info_class == NULL) { @@ -90,7 +91,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_hostAllocPinned(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); void *ret = nullptr; - JNI_CUDA_TRY(env, 0, cudaMallocHost(&ret, size)); + CUDF_CUDA_TRY(cudaMallocHost(&ret, size)); return reinterpret_cast<jlong>(ret); } CATCH_STD(env, 0); @@ -99,7 +100,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_hostAllocPinned(JNIEnv *env, jc JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freePinned(JNIEnv *env, jclass, jlong ptr) { try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(env, , cudaFreeHost(reinterpret_cast<void *>(ptr))); + CUDF_CUDA_TRY(cudaFreeHost(reinterpret_cast<void *>(ptr))); } CATCH_STD(env, ); } @@ -109,8 +110,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memset(JNIEnv *env, jclass, jlon JNI_NULL_CHECK(env, dst, "dst memory pointer is null", ); try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(env, , cudaMemsetAsync((void *)dst, value, count)); - JNI_CUDA_TRY(env, , cudaStreamSynchronize(0)); + CUDF_CUDA_TRY(cudaMemsetAsync((void *)dst, value, count)); + CUDF_CUDA_TRY(cudaStreamSynchronize(0)); } CATCH_STD(env, ); } @@ -120,7 +121,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemset(JNIEnv *env, jclass, JNI_NULL_CHECK(env, dst, "dst memory pointer is null", ); try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(env, , cudaMemsetAsync((void *)dst, value, count)); + CUDF_CUDA_TRY(cudaMemsetAsync((void *)dst, value, count)); } CATCH_STD(env, ); } @@ -129,7 +130,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDevice(JNIEnv *env, jclass) { try { cudf::jni::auto_set_device(env); jint dev; - JNI_CUDA_TRY(env, -2, cudaGetDevice(&dev)); + CUDF_CUDA_TRY(cudaGetDevice(&dev)); return dev; } CATCH_STD(env, -2); @@ -139,7 +140,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDeviceCount(JNIEnv *env, jcla try { cudf::jni::auto_set_device(env); jint count; - JNI_CUDA_TRY(env, -2, cudaGetDeviceCount(&count)); + CUDF_CUDA_TRY(cudaGetDeviceCount(&count)); return count; } CATCH_STD(env, -2); @@ -151,7 +152,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_setDevice(JNIEnv *env, jclass, j cudf::jni::throw_java_exception(env, cudf::jni::CUDF_ERROR_CLASS, "Cannot change device after RMM init"); } - JNI_CUDA_TRY(env, , cudaSetDevice(dev)); + CUDF_CUDA_TRY(cudaSetDevice(dev)); } CATCH_STD(env, ); } @@ -167,7 +168,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDriverVersion(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); jint driver_version; - JNI_CUDA_TRY(env, -2, cudaDriverGetVersion(&driver_version)); + CUDF_CUDA_TRY(cudaDriverGetVersion(&driver_version)); return driver_version; } CATCH_STD(env, -2); @@ -177,7 +178,7 @@ JNIEXPORT jint JNICALL
Java_ai_rapids_cudf_Cuda_getRuntimeVersion(JNIEnv *env, j try { cudf::jni::auto_set_device(env); jint runtime_version; - JNI_CUDA_TRY(env, -2, cudaRuntimeGetVersion(&runtime_version)); + CUDF_CUDA_TRY(cudaRuntimeGetVersion(&runtime_version)); return runtime_version; } CATCH_STD(env, -2); @@ -187,9 +188,9 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getNativeComputeMode(JNIEnv *env try { cudf::jni::auto_set_device(env); int device; - JNI_CUDA_TRY(env, -2, cudaGetDevice(&device)); + CUDF_CUDA_TRY(cudaGetDevice(&device)); cudaDeviceProp device_prop; - JNI_CUDA_TRY(env, -2, cudaGetDeviceProperties(&device_prop, device)); + CUDF_CUDA_TRY(cudaGetDeviceProperties(&device_prop, device)); return device_prop.computeMode; } CATCH_STD(env, -2); @@ -199,10 +200,9 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMajor(JNIEnv try { cudf::jni::auto_set_device(env); int device; - JNI_CUDA_TRY(env, -2, ::cudaGetDevice(&device)); + CUDF_CUDA_TRY(::cudaGetDevice(&device)); int attribute_value; - JNI_CUDA_TRY( - env, -2, + CUDF_CUDA_TRY( ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMajor, device)); return attribute_value; } @@ -213,10 +213,9 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMinor(JNIEnv try { cudf::jni::auto_set_device(env); int device; - JNI_CUDA_TRY(env, -2, ::cudaGetDevice(&device)); + CUDF_CUDA_TRY(::cudaGetDevice(&device)); int attribute_value; - JNI_CUDA_TRY( - env, -2, + CUDF_CUDA_TRY( ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMinor, device)); return attribute_value; } @@ -226,7 +225,7 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMinor(JNIEnv JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freeZero(JNIEnv *env, jclass) { try { cudf::jni::auto_set_device(env); - JNI_CUDA_TRY(env, , cudaFree(0)); + CUDF_CUDA_TRY(cudaFree(0)); } CATCH_STD(env, ); } @@ -237,7 +236,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createStream(JNIEnv *env, jclas cudf::jni::auto_set_device(env); cudaStream_t stream = nullptr; auto flags = isNonBlocking ? 
cudaStreamNonBlocking : cudaStreamDefault; - JNI_CUDA_TRY(env, 0, cudaStreamCreateWithFlags(&stream, flags)); + CUDF_CUDA_TRY(cudaStreamCreateWithFlags(&stream, flags)); return reinterpret_cast<jlong>(stream); } CATCH_STD(env, 0); @@ -247,7 +246,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyStream(JNIEnv *env, jclas try { cudf::jni::auto_set_device(env); auto stream = reinterpret_cast<cudaStream_t>(jstream); - JNI_CUDA_TRY(env, , cudaStreamDestroy(stream)); + CUDF_CUDA_TRY(cudaStreamDestroy(stream)); } CATCH_STD(env, ); } @@ -258,7 +257,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamWaitEvent(JNIEnv *env, jcl cudf::jni::auto_set_device(env); auto stream = reinterpret_cast<cudaStream_t>(jstream); auto event = reinterpret_cast<cudaEvent_t>(jevent); - JNI_CUDA_TRY(env, , cudaStreamWaitEvent(stream, event, 0)); + CUDF_CUDA_TRY(cudaStreamWaitEvent(stream, event, 0)); } CATCH_STD(env, ); } @@ -268,7 +267,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamSynchronize(JNIEnv *env, j try { cudf::jni::auto_set_device(env); auto stream = reinterpret_cast<cudaStream_t>(jstream); - JNI_CUDA_TRY(env, , cudaStreamSynchronize(stream)); + CUDF_CUDA_TRY(cudaStreamSynchronize(stream)); } CATCH_STD(env, ); } @@ -286,7 +285,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createEvent(JNIEnv *env, jclass if (blockingSync) { flags = flags | cudaEventBlockingSync; } - JNI_CUDA_TRY(env, 0, cudaEventCreateWithFlags(&event, flags)); + CUDF_CUDA_TRY(cudaEventCreateWithFlags(&event, flags)); return reinterpret_cast<jlong>(event); } CATCH_STD(env, 0); @@ -296,7 +295,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyEvent(JNIEnv *env, jclass try { cudf::jni::auto_set_device(env); auto event = reinterpret_cast<cudaEvent_t>(jevent); - JNI_CUDA_TRY(env, , cudaEventDestroy(event)); + CUDF_CUDA_TRY(cudaEventDestroy(event)); } CATCH_STD(env, ); } @@ -311,7 +310,7 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_eventQuery(JNIEnv *env, jcla } else if (result == cudaErrorNotReady) { return false; } // else - JNI_CUDA_TRY(env, false, result); + CUDF_CUDA_TRY(result); } CATCH_STD(env, false); return false; @@ -323,7 +322,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventRecord(JNIEnv *env, jclass, cudf::jni::auto_set_device(env); auto event = reinterpret_cast<cudaEvent_t>(jevent); auto stream = reinterpret_cast<cudaStream_t>(jstream); - JNI_CUDA_TRY(env, , cudaEventRecord(event, stream)); + CUDF_CUDA_TRY(cudaEventRecord(event, stream)); } CATCH_STD(env, ); } @@ -333,7 +332,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventSynchronize(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); auto event = reinterpret_cast<cudaEvent_t>(jevent); - JNI_CUDA_TRY(env, , cudaEventSynchronize(event)); + CUDF_CUDA_TRY(cudaEventSynchronize(event)); } CATCH_STD(env, ); } @@ -352,8 +351,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memcpyOnStream(JNIEnv *env, jcla auto dst = reinterpret_cast<void *>(jdst); auto src = reinterpret_cast<void *>(jsrc); auto kind = static_cast<cudaMemcpyKind>(jkind); auto stream = reinterpret_cast<cudaStream_t>(jstream); - JNI_CUDA_TRY(env, , cudaMemcpyAsync(dst, src, count, kind, stream)); - JNI_CUDA_TRY(env, , cudaStreamSynchronize(stream)); + CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream)); + CUDF_CUDA_TRY(cudaStreamSynchronize(stream)); } CATCH_STD(env, ); } @@ -372,7 +371,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemcpyOnStream(JNIEnv *env, auto dst = reinterpret_cast<void *>(jdst); auto src = reinterpret_cast<void *>(jsrc); auto kind = static_cast<cudaMemcpyKind>(jkind); auto stream = reinterpret_cast<cudaStream_t>(jstream); - JNI_CUDA_TRY(env, , cudaMemcpyAsync(dst, src, count, kind, stream)); + CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream)); } CATCH_STD(env, ); }
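With the `JNI_CUDA_TRY` call sites above now routed through `CUDF_CUDA_TRY`, CUDA failures propagate as `cudf::cuda_error` (or `cudf::fatal_cuda_error`) and surface in Java as `CudaException` or `CudaFatalException` via the `CATCH_STD` handlers in `jni_utils.hpp`. A minimal caller-side sketch of consuming that split follows; the recovery policy shown is hypothetical, not part of this change:

    import ai.rapids.cudf.Cuda;
    import ai.rapids.cudf.CudaException;
    import ai.rapids.cudf.CudaFatalException;

    public class CudaErrorHandlingSketch {
      public static void memsetChecked(long devicePtr, long sizeBytes) {
        try {
          Cuda.memset(devicePtr, (byte) 0, sizeBytes);
        } catch (CudaFatalException fatal) {
          // CudaFatalException extends CudaException, so it must be caught first.
          // The process is in an inconsistent state and every later CUDA call
          // will keep failing; the only safe recovery is a process restart.
          System.err.println("Fatal CUDA error: " + fatal.getMessage());
          System.exit(1);
        } catch (CudaException ex) {
          // Non-fatal: the concrete error code travels with the exception and
          // can drive a retry/abort decision without restarting the process.
          System.err.println("Recoverable CUDA error: " + ex.cudaError);
          throw ex;
        }
      }
    }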
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index cebe476dd87..919958d4db2 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -673,6 +673,8 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, cudf::jni::native_jbooleanArray &is_int96, cudf::jni::native_jintArray &precisions, cudf::jni::native_jbooleanArray &is_map, + cudf::jni::native_jbooleanArray &hasParquetFieldIds, + cudf::jni::native_jintArray &parquetFieldIds, cudf::jni::native_jintArray &children, int num_children, int read_index) { int write_index = 0; for (int i = 0; i < num_children; i++, write_index++) { @@ -687,12 +689,15 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, if (is_map[read_index]) { child.set_list_column_as_map(); } + if (!parquetFieldIds.is_null() && hasParquetFieldIds[read_index]) { + child.set_parquet_field_id(parquetFieldIds[read_index]); + } column_metadata.add_child(child); int childs_children = children[read_index++]; if (childs_children > 0) { - read_index = - set_column_metadata(column_metadata.child(write_index), col_names, nullability, is_int96, - precisions, is_map, children, childs_children, read_index); + read_index = set_column_metadata(column_metadata.child(write_index), col_names, nullability, + is_int96, precisions, is_map, hasParquetFieldIds, + parquetFieldIds, children, childs_children, read_index); } } return read_index; @@ -701,12 +706,15 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_names, jintArray &j_children, jbooleanArray &j_col_nullability, jbooleanArray &j_is_int96, jintArray &j_precisions, - jbooleanArray &j_is_map, cudf::io::table_input_metadata &metadata) { + jbooleanArray &j_is_map, cudf::io::table_input_metadata &metadata, + jbooleanArray &j_hasParquetFieldIds, jintArray &j_parquetFieldIds) { cudf::jni::auto_set_device(env); cudf::jni::native_jstringArray col_names(env, j_col_names); cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); cudf::jni::native_jbooleanArray is_int96(env, j_is_int96); cudf::jni::native_jintArray precisions(env, j_precisions); + cudf::jni::native_jbooleanArray hasParquetFieldIds(env, j_hasParquetFieldIds); + cudf::jni::native_jintArray parquetFieldIds(env, j_parquetFieldIds); cudf::jni::native_jintArray children(env, j_children); cudf::jni::native_jbooleanArray is_map(env, j_is_map); @@ -729,11 +737,14 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam if (is_map[read_index]) { metadata.column_metadata[write_index].set_list_column_as_map(); } + if (!parquetFieldIds.is_null() && hasParquetFieldIds[read_index]) { + metadata.column_metadata[write_index].set_parquet_field_id(parquetFieldIds[read_index]); + } int childs_children = children[read_index++]; if (childs_children > 0) { - read_index = - set_column_metadata(metadata.column_metadata[write_index], cpp_names, col_nullability, - is_int96, precisions, is_map, children, childs_children, read_index); + read_index = set_column_metadata( + metadata.column_metadata[write_index], cpp_names, col_nullability, is_int96, precisions, + is_map, hasParquetFieldIds, parquetFieldIds, children, childs_children, read_index); } } } @@ -1539,7 +1550,8 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children, jbooleanArray 
j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, - jbooleanArray j_is_map, jobject consumer) { + jbooleanArray j_is_map, jbooleanArray j_hasParquetFieldIds, jintArray j_parquetFieldIds, + jobject consumer) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); @@ -1554,7 +1566,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( sink_info sink{data_sink.get()}; table_input_metadata metadata; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96, - j_precisions, j_is_map, metadata); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); @@ -1583,7 +1595,8 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children, jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, - jbooleanArray j_is_map, jstring j_output_path) { + jbooleanArray j_is_map, jbooleanArray j_hasParquetFieldIds, jintArray j_parquetFieldIds, + jstring j_output_path) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); @@ -1596,7 +1609,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( using namespace cudf::jni; table_input_metadata metadata; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96, - j_precisions, j_is_map, metadata); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); @@ -1721,8 +1734,12 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( table_input_metadata metadata; // ORC has no `j_is_int96`, but `createTableMetaData` needs an lvalue. jbooleanArray j_is_int96 = NULL; + // ORC has no `j_parquetFieldIds`, but `createTableMetaData` needs an lvalue. + jbooleanArray j_hasParquetFieldIds = NULL; + jintArray j_parquetFieldIds = NULL; + createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96, - j_precisions, j_is_map, metadata); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); @@ -1766,8 +1783,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( table_input_metadata metadata; // ORC has no `j_is_int96`, but `createTableMetaData` needs an lvalue. jbooleanArray j_is_int96 = NULL; + // ORC has no `j_parquetFieldIds`, but `createTableMetaData` needs an lvalue.
+ jbooleanArray j_hasParquetFieldIds = NULL; + jintArray j_parquetFieldIds = NULL; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96, - j_precisions, j_is_map, metadata); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 58901d5743b..a42846aac05 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -20,6 +20,7 @@ import ai.rapids.cudf.ColumnView.FindOptions; import ai.rapids.cudf.HostColumnVector.*; +import com.google.common.collect.Lists; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -6259,4 +6260,43 @@ void testCopyWithBooleanColumnAsValidity() { }); assertTrue(x.getMessage().contains("Exemplar and validity columns must have the same size")); } + + @Test + void testSegmentedGather() { + HostColumnVector.DataType dt = new ListType(true, new BasicType(true, DType.STRING)); + try (ColumnVector source = ColumnVector.fromLists(dt, + Lists.newArrayList("a", "b", null, "c"), + null, + Lists.newArrayList(), + Lists.newArrayList(null, "A", "B", "C", "D")); + ColumnVector gatherMap = ColumnVector.fromLists( + new ListType(false, new BasicType(false, DType.INT32)), + Lists.newArrayList(-3, 0, 2, 3, 4), + Lists.newArrayList(), + Lists.newArrayList(1), + Lists.newArrayList(1, -4, 5, -1, -6)); + ColumnVector actual = source.segmentedGather(gatherMap); + ColumnVector expected = ColumnVector.fromLists(dt, + Lists.newArrayList("b", "a", null, "c", null), + null, + Lists.newArrayList((String) null), + Lists.newArrayList("A", "A", null, "D", null))) { + assertColumnsAreEqual(expected, actual); + } + } + + @Test + void testGenerateListOffsets() { + try (ColumnVector index = ColumnVector.fromInts(1, 3, 3, 0, 2, 0, 0, 5, 10, 25); + ColumnVector actual = index.generateListOffsets(); + ColumnVector expected = ColumnVector.fromInts(0, 1, 4, 7, 7, 9, 9, 9, 14, 24, 49)) { + assertColumnsAreEqual(expected, actual); + } + + try (ColumnVector index = ColumnVector.fromInts(0, 0, 1, 0, 0); + ColumnVector actual = index.generateListOffsets(); + ColumnVector expected = ColumnVector.fromInts(0, 0, 0, 1, 1, 1)) { + assertColumnsAreEqual(expected, actual); + } + } } diff --git a/java/src/test/java/ai/rapids/cudf/CudaTest.java b/java/src/test/java/ai/rapids/cudf/CudaTest.java index 8905c2edd56..1a86dbb374d 100644 --- a/java/src/test/java/ai/rapids/cudf/CudaTest.java +++ b/java/src/test/java/ai/rapids/cudf/CudaTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,7 +18,7 @@ import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.*; public class CudaTest { @@ -32,4 +32,17 @@ public void testGetCudaRuntimeInfo() { assertEquals(Cuda.getNativeComputeMode(), Cuda.getComputeMode().nativeId); } + @Test + public void testCudaException() { + assertThrows(CudaException.class, () -> { + try { + Cuda.memset(Long.MAX_VALUE, (byte) 0, 1024); + } catch (CudaFatalException ignored) { + } catch (CudaException ex) { + assertEquals(CudaException.CudaError.cudaErrorInvalidValue, ex.cudaError); + throw ex; + } + } + ); + } } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 7be1ca2118b..af28cfb6d6c 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -36,6 +36,7 @@ import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; import org.junit.jupiter.api.Test; @@ -7899,6 +7900,126 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException { } } + @Test + void testParquetWriteWithFieldId() throws IOException { + // field IDs are: + // c1: -1, c2: 2, c3: 3, c31: 31, c32: 32, c4: -4, c5: not specified + ColumnWriterOptions.StructBuilder sBuilder = + structBuilder("c3", true, 3) + .withColumn(true, "c31", 31) + .withColumn(true, "c32", 32); + ParquetWriterOptions options = ParquetWriterOptions.builder() + .withColumn(true, "c1", -1) + .withDecimalColumn("c2", 9, true, 2) + .withStructColumn(sBuilder.build()) + .withTimestampColumn("c4", true, true, -4) + .withColumns( true, "c5") + .build(); + + File tempFile = File.createTempFile("test-field-id", ".parquet"); + try { + HostColumnVector.StructType structType = new HostColumnVector.StructType( + true, + new HostColumnVector.BasicType(true, DType.STRING), + new HostColumnVector.BasicType(true, DType.STRING)); + + try (Table table0 = new Table.TestBuilder() + .column(true, false) // c1 + .decimal32Column(0, 298, 2473) // c2 + .column(structType, // c3 + new HostColumnVector.StructData("a", "b"), new HostColumnVector.StructData("a", "b")) + .timestampMicrosecondsColumn(1000L, 2000L) // c4 + .column("a", "b") // c5 + .build()) { + try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) { + writer.write(table0); + } + } + + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath( + new Path(tempFile.getAbsolutePath()), + new Configuration()))) { + MessageType schema = reader.getFooter().getFileMetaData().getSchema(); + assert (schema.getFields().get(0).getId().intValue() == -1); + assert (schema.getFields().get(1).getId().intValue() == 2); + assert (schema.getFields().get(2).getId().intValue() == 3); + assert (((GroupType) schema.getFields().get(2)).getFields().get(0).getId().intValue() == 31); + assert (((GroupType) schema.getFields().get(2)).getFields().get(1).getId().intValue() == 32); + assert (schema.getFields().get(3).getId().intValue() == -4); + assert (schema.getFields().get(4).getId() == null); + } + } finally { + tempFile.delete(); + } + } + + @Test + void testParquetWriteWithFieldIdNestNotSpecified() throws IOException { + // field IDs are: + // c0: no field ID + // c1: 1 + // c2: no field ID + // c21: 21 + 
// c22: no field ID + // c3: 3 + // c31: 31 + // c32: no field ID + // c4: 0 + ColumnWriterOptions.StructBuilder c2Builder = + structBuilder("c2", true) + .withColumn(true, "c21", 21) + .withColumns(true, "c22"); + ColumnWriterOptions.StructBuilder c3Builder = + structBuilder("c3", true, 3) + .withColumn(true, "c31", 31) + .withColumns(true, "c32"); + ParquetWriterOptions options = ParquetWriterOptions.builder() + .withColumns(true, "c0") + .withDecimalColumn("c1", 9, true, 1) + .withStructColumn(c2Builder.build()) + .withStructColumn(c3Builder.build()) + .withColumn(true, "c4", 0) + .build(); + + File tempFile = File.createTempFile("test-field-id", ".parquet"); + try { + HostColumnVector.StructType structType = new HostColumnVector.StructType( + true, + new HostColumnVector.BasicType(true, DType.STRING), + new HostColumnVector.BasicType(true, DType.STRING)); + + try (Table table0 = new Table.TestBuilder() + .column(true, false) // c0 + .decimal32Column(0, 298, 2473) // c1 + .column(structType, // c2 + new HostColumnVector.StructData("a", "b"), new HostColumnVector.StructData("a", "b")) + .column(structType, // c3 + new HostColumnVector.StructData("a", "b"), new HostColumnVector.StructData("a", "b")) + .column("a", "b") // c4 + .build()) { + try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) { + writer.write(table0); + } + } + + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath( + new Path(tempFile.getAbsolutePath()), + new Configuration()))) { + MessageType schema = reader.getFooter().getFileMetaData().getSchema(); + assert (schema.getFields().get(0).getId() == null); + assert (schema.getFields().get(1).getId().intValue() == 1); + assert (schema.getFields().get(2).getId() == null); + assert (((GroupType) schema.getFields().get(2)).getFields().get(0).getId().intValue() == 21); + assert (((GroupType) schema.getFields().get(2)).getFields().get(1).getId() == null); + assert (((GroupType) schema.getFields().get(3)).getFields().get(0).getId().intValue() == 31); + assert (((GroupType) schema.getFields().get(3)).getFields().get(1).getId() == null); + assert (schema.getFields().get(4).getId().intValue() == 0); + } + } finally { + tempFile.delete(); + } + } + /** Return a column where DECIMAL64 has been up-casted to DECIMAL128 */ private ColumnVector castDecimal64To128(ColumnView c) { DType dtype = c.getType(); diff --git a/python/.flake8 b/python/.flake8 index c645c46a216..667875030cc 100644 --- a/python/.flake8 +++ b/python/.flake8 @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. [flake8] exclude = __init__.py @@ -9,14 +9,14 @@ ignore = E203 [pydocstyle] -match = ^(.*abc\.py|.*api/types\.py|.*single_column_frame\.py|.*indexed_frame\.py)$ -# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather than include using match-dir. -match-dir = ^(?!ci|cpp|python/dask_cudf|python/cudf_kafka|python/custreamz).*$ -# In addition to numpy style, we additionally ignore: -add-ignore = - # magic methods - D105, - # no docstring in __init__ - D107, - # newlines before docstrings - D204 +# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather +# than include using match-dir. 
Note that as discussed in +# https://stackoverflow.com/questions/65478393/how-to-filter-directories-using-the-match-dir-flag-for-pydocstyle, +# unlike the match option above this match-dir will have no effect when +# pydocstyle is invoked from pre-commit. Therefore this exclusion list must +# also be maintained in the pre-commit config file. +match-dir = ^(?!(ci|cpp|conda|docs|java|notebooks|dask_cudf|cudf_kafka|custreamz|tests)).*$ +# Allow missing docstrings for docutils +ignore-decorators = .*(docutils|doc_apply|copy_docstring).* +select = + D30 diff --git a/python/cudf/cudf/_fuzz_testing/fuzzer.py b/python/cudf/cudf/_fuzz_testing/fuzzer.py index b99cd938a63..59d6f198681 100644 --- a/python/cudf/cudf/_fuzz_testing/fuzzer.py +++ b/python/cudf/cudf/_fuzz_testing/fuzzer.py @@ -57,7 +57,7 @@ def log_stats(self): logging.info(f"Run-Time elapsed (hh:mm:ss.ms) {total_time_taken}") def write_crash(self, error): - error_file_name = datetime.datetime.now().__str__() + error_file_name = str(datetime.datetime.now()) if self._crash_dir: crash_path = os.path.join( self._crash_dir, diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py index f850a7e79f9..29e0aeb7050 100644 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ b/python/cudf/cudf/_fuzz_testing/json.py @@ -2,7 +2,7 @@ import logging import random -from collections import abc as abc +from collections import abc import numpy as np diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index bd25aa53405..542262b7908 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import numpy as np from . import ( @@ -8,6 +8,7 @@ copying, csv, datetime, + expressions, filling, gpuarrow, groupby, diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 4dc91268d57..84dd9c3a576 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -30,7 +30,10 @@ from cudf._lib.types import Interpolation cimport cudf._lib.cpp.aggregation as libcudf_aggregation cimport cudf._lib.cpp.types as libcudf_types -from cudf._lib.cpp.aggregation cimport underlying_type_t_correlation_type +from cudf._lib.cpp.aggregation cimport ( + underlying_type_t_correlation_type, + underlying_type_t_rank_method, +) import cudf @@ -54,6 +57,7 @@ class AggregationKind(Enum): ARGMIN = libcudf_aggregation.aggregation.Kind.ARGMIN NUNIQUE = libcudf_aggregation.aggregation.Kind.NUNIQUE NTH = libcudf_aggregation.aggregation.Kind.NTH_ELEMENT + RANK = libcudf_aggregation.aggregation.Kind.RANK COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT UNIQUE = libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX @@ -77,6 +81,14 @@ class CorrelationType(IntEnum): ) +class RankMethod(IntEnum): + FIRST = libcudf_aggregation.rank_method.FIRST + AVERAGE = libcudf_aggregation.rank_method.AVERAGE + MIN = libcudf_aggregation.rank_method.MIN + MAX = libcudf_aggregation.rank_method.MAX + DENSE = libcudf_aggregation.rank_method.DENSE + + cdef class RollingAggregation: """A Cython wrapper for rolling window aggregations. 
@@ -564,6 +576,33 @@ cdef class GroupbyScanAggregation: cummin = min cummax = max + @classmethod + def rank(cls, method, ascending, na_option, pct): + cdef GroupbyScanAggregation agg = cls() + cdef libcudf_aggregation.rank_method c_method = ( + <libcudf_aggregation.rank_method> ( + <underlying_type_t_rank_method> ( + RankMethod[method.upper()] + ) + ) + ) + agg.c_obj = move( + libcudf_aggregation. + make_rank_aggregation[groupby_scan_aggregation]( + c_method, + (libcudf_types.order.ASCENDING if ascending else + libcudf_types.order.DESCENDING), + (libcudf_types.null_policy.EXCLUDE if na_option == "keep" else + libcudf_types.null_policy.INCLUDE), + (libcudf_types.null_order.BEFORE + if (na_option == "top") == ascending else + libcudf_types.null_order.AFTER), + (libcudf_aggregation.rank_percentage.ZERO_NORMALIZED + if pct else + libcudf_aggregation.rank_percentage.NONE) + )) + return agg + cdef class ReduceAggregation: """A Cython wrapper for reduce aggregations. diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 448a22425a4..8cbadfa19a5 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -7,7 +7,7 @@ import pandas as pd import rmm import cudf -import cudf._lib as libcudfxx +import cudf._lib as libcudf from cudf.api.types import is_categorical_dtype, is_list_dtype, is_struct_dtype from cudf.core.buffer import Buffer @@ -160,7 +160,7 @@ cdef class Column: if self.base_mask is None or self.offset == 0: self._mask = self.base_mask else: - self._mask = libcudfxx.null_mask.copy_bitmask(self) + self._mask = libcudf.null_mask.copy_bitmask(self) return self._mask @property diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 399deb74c9c..a1d1485e1e8 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION.
from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector @@ -7,11 +8,14 @@ from libcpp.vector cimport vector from cudf._lib.cpp.types cimport ( data_type, interpolation, + null_order, null_policy, + order, size_type, ) ctypedef int32_t underlying_type_t_correlation_type +ctypedef int32_t underlying_type_t_rank_method cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: @@ -35,6 +39,7 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: ARGMIN 'cudf::aggregation::ARGMIN' NUNIQUE 'cudf::aggregation::NUNIQUE' NTH_ELEMENT 'cudf::aggregation::NTH_ELEMENT' + RANK 'cudf::aggregation::RANK' COLLECT 'cudf::aggregation::COLLECT_LIST' COLLECT_SET 'cudf::aggregation::COLLECT_SET' PTX 'cudf::aggregation::PTX' @@ -68,6 +73,18 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: KENDALL 'cudf::correlation_type::KENDALL' SPEARMAN 'cudf::correlation_type::SPEARMAN' + ctypedef enum rank_method: + FIRST "cudf::rank_method::FIRST" + AVERAGE "cudf::rank_method::AVERAGE" + MIN "cudf::rank_method::MIN" + MAX "cudf::rank_method::MAX" + DENSE "cudf::rank_method::DENSE" + + ctypedef enum rank_percentage: + NONE "cudf::rank_percentage::NONE" + ZERO_NORMALIZED "cudf::rank_percentage::ZERO_NORMALIZED" + ONE_NORMALIZED "cudf::rank_percentage::ONE_NORMALIZED" + cdef unique_ptr[T] make_sum_aggregation[T]() except + cdef unique_ptr[T] make_product_aggregation[T]() except + @@ -127,3 +144,10 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: cdef unique_ptr[T] make_covariance_aggregation[T]( size_type min_periods, size_type ddof) except + + + cdef unique_ptr[T] make_rank_aggregation[T]( + rank_method method, + order column_order, + null_policy null_handling, + null_order null_precedence, + rank_percentage percentage) except + diff --git a/python/cudf/cudf/_lib/cpp/expressions.pxd b/python/cudf/cudf/_lib/cpp/expressions.pxd new file mode 100644 index 00000000000..1721f8aa734 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/expressions.pxd @@ -0,0 +1,88 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.scalar.scalar cimport ( + duration_scalar, + numeric_scalar, + timestamp_scalar, +) +from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp.types cimport size_type + + +cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: + ctypedef enum ast_operator: + # Binary operators + ADD "cudf::ast::ast_operator::ADD" + SUB "cudf::ast::ast_operator::SUB" + MUL "cudf::ast::ast_operator::MUL" + DIV "cudf::ast::ast_operator::DIV" + TRUE_DIV "cudf::ast::ast_operator::TRUE_DIV" + FLOOR_DIV "cudf::ast::ast_operator::FLOOR_DIV" + MOD "cudf::ast::ast_operator::MOD" + PYMOD "cudf::ast::ast_operator::PYMOD" + POW "cudf::ast::ast_operator::POW" + EQUAL "cudf::ast::ast_operator::EQUAL" + NULL_EQUAL "cudf::ast::ast_operator::NULL_EQUAL" + NOT_EQUAL "cudf::ast::ast_operator::NOT_EQUAL" + LESS "cudf::ast::ast_operator::LESS" + GREATER "cudf::ast::ast_operator::GREATER" + LESS_EQUAL "cudf::ast::ast_operator::LESS_EQUAL" + GREATER_EQUAL "cudf::ast::ast_operator::GREATER_EQUAL" + BITWISE_AND "cudf::ast::ast_operator::BITWISE_AND" + BITWISE_OR "cudf::ast::ast_operator::BITWISE_OR" + BITWISE_XOR "cudf::ast::ast_operator::BITWISE_XOR" + NULL_LOGICAL_AND "cudf::ast::ast_operator::NULL_LOGICAL_AND" + LOGICAL_AND "cudf::ast::ast_operator::LOGICAL_AND" + NULL_LOGICAL_OR "cudf::ast::ast_operator::NULL_LOGICAL_OR" + LOGICAL_OR "cudf::ast::ast_operator::LOGICAL_OR" + # Unary operators + IDENTITY "cudf::ast::ast_operator::IDENTITY" + SIN "cudf::ast::ast_operator::SIN" + COS "cudf::ast::ast_operator::COS" + TAN "cudf::ast::ast_operator::TAN" + ARCSIN "cudf::ast::ast_operator::ARCSIN" + ARCCOS "cudf::ast::ast_operator::ARCCOS" + ARCTAN "cudf::ast::ast_operator::ARCTAN" + SINH "cudf::ast::ast_operator::SINH" + COSH "cudf::ast::ast_operator::COSH" + TANH "cudf::ast::ast_operator::TANH" + ARCSINH "cudf::ast::ast_operator::ARCSINH" + ARCCOSH "cudf::ast::ast_operator::ARCCOSH" + ARCTANH "cudf::ast::ast_operator::ARCTANH" + EXP "cudf::ast::ast_operator::EXP" + LOG "cudf::ast::ast_operator::LOG" + SQRT "cudf::ast::ast_operator::SQRT" + CBRT "cudf::ast::ast_operator::CBRT" + CEIL "cudf::ast::ast_operator::CEIL" + FLOOR "cudf::ast::ast_operator::FLOOR" + ABS "cudf::ast::ast_operator::ABS" + RINT "cudf::ast::ast_operator::RINT" + BIT_INVERT "cudf::ast::ast_operator::BIT_INVERT" + NOT "cudf::ast::ast_operator::NOT" + + cdef cppclass expression: + pass + + ctypedef enum table_reference: + LEFT "cudf::ast::table_reference::LEFT" + RIGHT "cudf::ast::table_reference::RIGHT" + + cdef cppclass literal(expression): + # Due to https://github.com/cython/cython/issues/3198, we need to + # specify a return type for templated constructors. + literal literal[T](numeric_scalar[T] &) except + + literal literal[T](timestamp_scalar[T] &) except + + literal literal[T](duration_scalar[T] &) except + + + cdef cppclass column_reference(expression): + # Allow for default C++ parameters by declaring multiple constructors + # with the default parameters optionally omitted. 
+ column_reference(size_type) except + + column_reference(size_type, table_reference) except + + + cdef cppclass operation(expression): + operation(ast_operator, const expression &) + operation(ast_operator, const expression &, const expression&) diff --git a/python/cudf/cudf/_lib/cpp/lists/contains.pxd b/python/cudf/cudf/_lib/cpp/lists/contains.pxd index 46aea37643f..e3cb01721a0 100644 --- a/python/cudf/cudf/_lib/cpp/lists/contains.pxd +++ b/python/cudf/cudf/_lib/cpp/lists/contains.pxd @@ -18,3 +18,8 @@ cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil: lists_column_view lists, scalar search_key, ) except + + + cdef unique_ptr[column] index_of( + lists_column_view lists, + column_view search_keys, + ) except + diff --git a/python/cudf/cudf/_lib/cpp/sorting.pxd b/python/cudf/cudf/_lib/cpp/sorting.pxd index 243b841ce4b..c6c42c327ac 100644 --- a/python/cudf/cudf/_lib/cpp/sorting.pxd +++ b/python/cudf/cudf/_lib/cpp/sorting.pxd @@ -7,20 +7,13 @@ from libcpp.vector cimport vector from cudf._lib.types import cudf_to_np_types, np_to_cudf_types cimport cudf._lib.cpp.types as libcudf_types +from cudf._lib.cpp.aggregation cimport rank_method from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: - ctypedef enum rank_method: - FIRST "cudf::rank_method::FIRST" - AVERAGE "cudf::rank_method::AVERAGE" - MIN "cudf::rank_method::MIN" - MAX "cudf::rank_method::MAX" - DENSE "cudf::rank_method::DENSE" - cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: cdef unique_ptr[column] sorted_order( table_view source_table, diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd index 590a371ff52..d9de04b676e 100644 --- a/python/cudf/cudf/_lib/cpp/transform.pxd +++ b/python/cudf/cudf/_lib/cpp/transform.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -9,6 +9,7 @@ from rmm._lib.device_buffer cimport device_buffer from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.expressions cimport expression from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type @@ -42,3 +43,8 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: column_view input_column, column_view categories ) + + cdef unique_ptr[column] compute_column( + const table_view table, + const expression& expr + ) except + diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 05ff32392fe..f1a75baa951 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.map cimport map @@ -19,9 +19,9 @@ import cudf from cudf._lib.cpp.types cimport size_type -import collections.abc as abc import errno import os +from collections import abc from enum import IntEnum from io import BytesIO, StringIO @@ -238,7 +238,7 @@ cdef csv_reader_options make_csv_reader_options( "`parse_dates`: dictionaries are unsupported") if not isinstance(parse_dates, abc.Iterable): raise NotImplementedError( - "`parse_dates`: non-lists are unsupported") + "`parse_dates`: an iterable is required") for col in parse_dates: if isinstance(col, str): c_parse_dates_names.push_back(str(col).encode()) @@ -279,7 +279,7 @@ cdef csv_reader_options make_csv_reader_options( ) csv_reader_options_c.set_dtypes(c_dtypes_list) csv_reader_options_c.set_parse_hex(c_hex_col_indexes) - elif isinstance(dtype, abc.Iterable): + elif isinstance(dtype, abc.Collection): c_dtypes_list.reserve(len(dtype)) for index, col_dtype in enumerate(dtype): if col_dtype in CSV_HEX_TYPE_MAP: diff --git a/python/cudf/cudf/_lib/expressions.pxd b/python/cudf/cudf/_lib/expressions.pxd new file mode 100644 index 00000000000..85665822174 --- /dev/null +++ b/python/cudf/cudf/_lib/expressions.pxd @@ -0,0 +1,38 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from libc.stdint cimport int32_t, int64_t +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.expressions cimport ( + column_reference, + expression, + literal, + operation, +) +from cudf._lib.cpp.scalar.scalar cimport numeric_scalar + +ctypedef enum scalar_type_t: + INT + DOUBLE + + +ctypedef union int_or_double_scalar_ptr: + unique_ptr[numeric_scalar[int64_t]] int_ptr + unique_ptr[numeric_scalar[double]] double_ptr + + +cdef class Expression: + cdef unique_ptr[expression] c_obj + + +cdef class Literal(Expression): + cdef scalar_type_t c_scalar_type + cdef int_or_double_scalar_ptr c_scalar + + +cdef class ColumnReference(Expression): + pass + + +cdef class Operation(Expression): + pass diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx new file mode 100644 index 00000000000..f069bcdbe73 --- /dev/null +++ b/python/cudf/cudf/_lib/expressions.pyx @@ -0,0 +1,130 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from enum import Enum + +from cython.operator cimport dereference +from libc.stdint cimport int64_t +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp cimport expressions as libcudf_exp +from cudf._lib.cpp.types cimport size_type + +# Necessary for proper casting, see below. 
+ctypedef int32_t underlying_type_ast_operator + + +# Aliases for simplicity +ctypedef unique_ptr[libcudf_exp.expression] expression_ptr + + +class ASTOperator(Enum): + ADD = libcudf_exp.ast_operator.ADD + SUB = libcudf_exp.ast_operator.SUB + MUL = libcudf_exp.ast_operator.MUL + DIV = libcudf_exp.ast_operator.DIV + TRUE_DIV = libcudf_exp.ast_operator.TRUE_DIV + FLOOR_DIV = libcudf_exp.ast_operator.FLOOR_DIV + MOD = libcudf_exp.ast_operator.MOD + PYMOD = libcudf_exp.ast_operator.PYMOD + POW = libcudf_exp.ast_operator.POW + EQUAL = libcudf_exp.ast_operator.EQUAL + NULL_EQUAL = libcudf_exp.ast_operator.NULL_EQUAL + NOT_EQUAL = libcudf_exp.ast_operator.NOT_EQUAL + LESS = libcudf_exp.ast_operator.LESS + GREATER = libcudf_exp.ast_operator.GREATER + LESS_EQUAL = libcudf_exp.ast_operator.LESS_EQUAL + GREATER_EQUAL = libcudf_exp.ast_operator.GREATER_EQUAL + BITWISE_AND = libcudf_exp.ast_operator.BITWISE_AND + BITWISE_OR = libcudf_exp.ast_operator.BITWISE_OR + BITWISE_XOR = libcudf_exp.ast_operator.BITWISE_XOR + LOGICAL_AND = libcudf_exp.ast_operator.LOGICAL_AND + NULL_LOGICAL_AND = libcudf_exp.ast_operator.NULL_LOGICAL_AND + LOGICAL_OR = libcudf_exp.ast_operator.LOGICAL_OR + NULL_LOGICAL_OR = libcudf_exp.ast_operator.NULL_LOGICAL_OR + # Unary operators + IDENTITY = libcudf_exp.ast_operator.IDENTITY + SIN = libcudf_exp.ast_operator.SIN + COS = libcudf_exp.ast_operator.COS + TAN = libcudf_exp.ast_operator.TAN + ARCSIN = libcudf_exp.ast_operator.ARCSIN + ARCCOS = libcudf_exp.ast_operator.ARCCOS + ARCTAN = libcudf_exp.ast_operator.ARCTAN + SINH = libcudf_exp.ast_operator.SINH + COSH = libcudf_exp.ast_operator.COSH + TANH = libcudf_exp.ast_operator.TANH + ARCSINH = libcudf_exp.ast_operator.ARCSINH + ARCCOSH = libcudf_exp.ast_operator.ARCCOSH + ARCTANH = libcudf_exp.ast_operator.ARCTANH + EXP = libcudf_exp.ast_operator.EXP + LOG = libcudf_exp.ast_operator.LOG + SQRT = libcudf_exp.ast_operator.SQRT + CBRT = libcudf_exp.ast_operator.CBRT + CEIL = libcudf_exp.ast_operator.CEIL + FLOOR = libcudf_exp.ast_operator.FLOOR + ABS = libcudf_exp.ast_operator.ABS + RINT = libcudf_exp.ast_operator.RINT + BIT_INVERT = libcudf_exp.ast_operator.BIT_INVERT + NOT = libcudf_exp.ast_operator.NOT + + +class TableReference(Enum): + LEFT = libcudf_exp.table_reference.LEFT + RIGHT = libcudf_exp.table_reference.RIGHT + + +# Note that this function only currently supports numeric literals. libcudf +# expressions don't really support other types yet though, so this isn't +# restrictive at the moment. +cdef class Literal(Expression): + def __cinit__(self, value): + # TODO: Would love to find a better solution than unions for literals. 
+ cdef int64_t intval
+ cdef double doubleval
+
+ if isinstance(value, int):
+ self.c_scalar_type = scalar_type_t.INT
+ intval = value
+ self.c_scalar.int_ptr = make_unique[numeric_scalar[int64_t]](
+ intval, True
+ )
+ self.c_obj = make_unique[libcudf_exp.literal](
+ dereference(self.c_scalar.int_ptr)
+ )
+ elif isinstance(value, float):
+ self.c_scalar_type = scalar_type_t.DOUBLE
+ doubleval = value
+ self.c_scalar.double_ptr = make_unique[numeric_scalar[double]](
+ doubleval, True
+ )
+ self.c_obj = make_unique[libcudf_exp.literal](
+ dereference(self.c_scalar.double_ptr)
+ )
+
+
+cdef class ColumnReference(Expression):
+ def __cinit__(self, size_type index):
+ self.c_obj = make_unique[libcudf_exp.column_reference](
+ index
+ )
+
+
+cdef class Operation(Expression):
+ def __cinit__(self, op, Expression left, Expression right=None):
+ # This awkward double casting is the only way to get Cython to generate
+ # valid C++. Cython doesn't support scoped enumerations, so it assumes
+ # that enums correspond to their underlying value types and will thus
+ # attempt operations that are invalid without first explicitly casting
+ # to the underlying before casting to the desired type.
+ cdef libcudf_exp.ast_operator op_value = <libcudf_exp.ast_operator>(
+ <underlying_type_ast_operator> op.value
+ )
+
+ if right is None:
+ self.c_obj = make_unique[libcudf_exp.operation](
+ op_value, dereference(left.c_obj)
+ )
+ else:
+ self.c_obj = make_unique[libcudf_exp.operation](
+ op_value, dereference(left.c_obj), dereference(right.c_obj)
+ )
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 48f566b846d..be5bb2741b4 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -341,7 +341,7 @@ cdef class GroupBy:
return columns_from_unique_ptr(move(c_result.second))
-_GROUPBY_SCANS = {"cumcount", "cumsum", "cummin", "cummax"}
+_GROUPBY_SCANS = {"cumcount", "cumsum", "cummin", "cummax", "rank"}
def _is_all_scan_aggregate(all_aggs):
diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
index 301f571f5fb..8bb8ab92a48 100644
--- a/python/cudf/cudf/_lib/hash.pyx
+++ b/python/cudf/cudf/_lib/hash.pyx
@@ -14,16 +14,14 @@ from cudf._lib.cpp.hash cimport hash as cpp_hash, hash_id as cpp_hash_id
from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
+from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
-def hash_partition(source_table, object columns_to_hash,
- int num_partitions, bool keep_index=True):
+def hash_partition(list source_columns, object columns_to_hash,
+ int num_partitions):
cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash
cdef int c_num_partitions = num_partitions
- cdef table_view c_source_view = table_view_from_table(
- source_table, not keep_index
- )
+ cdef table_view c_source_view = table_view_from_columns(source_columns)
cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
with nogil:
@@ -36,27 +34,17 @@ def hash_partition(source_table, object columns_to_hash,
)
# Note that the offsets (`c_result.second`) may be empty when
- # the original table (`source_table`) is empty. We need to
+ # the original table (`source_columns`) is empty. We need to
# return a list of zeros in this case.
return ( - *data_from_unique_ptr( - move(c_result.first), - column_names=source_table._column_names, - index_names=( - source_table._index_names - if keep_index is True - else None - ) - - ), - list(c_result.second) if c_result.second.size() - else [0] * num_partitions + columns_from_unique_ptr(move(c_result.first)), + list(c_result.second) + if c_result.second.size() else [0] * num_partitions ) -def hash(source_table, str method, int seed=0): - cdef table_view c_source_view = table_view_from_table( - source_table, ignore_index=True) +def hash(list source_columns, str method, int seed=0): + cdef table_view c_source_view = table_view_from_columns(source_columns) cdef unique_ptr[column] c_result cdef cpp_hash_id c_hash_function if method == "murmur3": diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 06e287ee670..88c8b19ded0 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import cudf @@ -20,12 +20,12 @@ from cudf._lib.cpp.interop cimport ( ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def from_dlpack(dlpack_capsule): """ - Converts a DLPack Tensor PyCapsule into a cudf Frame object. + Converts a DLPack Tensor PyCapsule into a list of columns. DLPack Tensor PyCapsule is expected to have the name "dltensor". """ @@ -40,31 +40,25 @@ def from_dlpack(dlpack_capsule): cpp_from_dlpack(dlpack_tensor) ) - res = data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) - ) + res = columns_from_unique_ptr(move(c_result)) dlpack_tensor.deleter(dlpack_tensor) return res -def to_dlpack(source_table): +def to_dlpack(list source_columns): """ - Converts a cudf Frame into a DLPack Tensor PyCapsule. + Converts a list of columns into a DLPack Tensor PyCapsule. DLPack Tensor PyCapsule will have the name "dltensor". """ - for column in source_table._columns: - if column.null_count: - raise ValueError( - "Cannot create a DLPack tensor with null values. \ - Input is required to have null count as zero." - ) + if any(column.null_count for column in source_columns): + raise ValueError( + "Cannot create a DLPack tensor with null values. \ + Input is required to have null count as zero." + ) cdef DLManagedTensor *dlpack_tensor - cdef table_view source_table_view = table_view_from_table( - source_table, ignore_index=True - ) + cdef table_view source_table_view = table_view_from_columns(source_columns) with nogil: dlpack_tensor = cpp_to_dlpack( @@ -110,17 +104,14 @@ cdef vector[column_metadata] gather_metadata(object metadata) except *: raise ValueError("Malformed metadata has been encountered") -def to_arrow(input_table, - object metadata, - bool keep_index=True): - """Convert from cudf Frame to PyArrow Table. +def to_arrow(list source_columns, object metadata): + """Convert a list of columns from + cudf Frame to a PyArrow Table. 
Parameters ---------- - input_table : cudf table - column_names : names for the pyarrow arrays - field_names : field names for nested type arrays - keep_index : whether index needs to be part of arrow table + source_columns : a list of columns to convert + metadata : a list of metadata, see `gather_metadata` for layout Returns ------- @@ -128,9 +119,7 @@ def to_arrow(input_table, """ cdef vector[column_metadata] cpp_metadata = gather_metadata(metadata) - cdef table_view input_table_view = ( - table_view_from_table(input_table, not keep_index) - ) + cdef table_view input_table_view = table_view_from_columns(source_columns) cdef shared_ptr[CTable] cpp_arrow_table with nogil: @@ -141,22 +130,16 @@ def to_arrow(input_table, return pyarrow_wrap_table(cpp_arrow_table) -def from_arrow( - object input_table, - object column_names=None, - object index_names=None -): - """Convert from PyArrow Table to cudf Frame. +def from_arrow(object input_table): + """Convert from PyArrow Table to a list of columns. Parameters ---------- input_table : PyArrow table - column_names : names for the cudf table data columns - index_names : names for the cudf table index columns Returns ------- - cudf Frame + A list of columns to construct Frame object """ cdef shared_ptr[CTable] cpp_arrow_table = ( pyarrow_unwrap_table(input_table) @@ -166,8 +149,4 @@ def from_arrow( with nogil: c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) - return data_from_unique_ptr( - move(c_result), - column_names=column_names, - index_names=index_names - ) + return columns_from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 5921f06d36e..1baef266dab 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from itertools import chain @@ -16,31 +16,25 @@ from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type, type_id -from cudf._lib.utils cimport table_view_from_table +from cudf._lib.utils cimport table_view_from_columns # The functions below return the *gathermaps* that represent # the join result when joining on the keys `lhs` and `rhs`. 
-cpdef join(lhs, rhs, how=None): +cpdef join(list lhs, list rhs, how=None): cdef pair[cpp_join.gather_map_type, cpp_join.gather_map_type] c_result - cdef table_view c_lhs = table_view_from_table(lhs) - cdef table_view c_rhs = table_view_from_table(rhs) + cdef table_view c_lhs = table_view_from_columns(lhs) + cdef table_view c_rhs = table_view_from_columns(rhs) if how == "inner": - c_result = move(cpp_join.inner_join( - c_lhs, - c_rhs - )) + with nogil: + c_result = move(cpp_join.inner_join(c_lhs, c_rhs)) elif how == "left": - c_result = move(cpp_join.left_join( - c_lhs, - c_rhs - )) + with nogil: + c_result = move(cpp_join.left_join(c_lhs, c_rhs)) elif how == "outer": - c_result = move(cpp_join.full_join( - c_lhs, - c_rhs - )) + with nogil: + c_result = move(cpp_join.full_join(c_lhs, c_rhs)) else: raise ValueError(f"Invalid join type {how}") @@ -49,30 +43,23 @@ cpdef join(lhs, rhs, how=None): return left_rows, right_rows -cpdef semi_join(lhs, rhs, how=None): +cpdef semi_join(list lhs, list rhs, how=None): # left-semi and left-anti joins cdef cpp_join.gather_map_type c_result - cdef table_view c_lhs = table_view_from_table(lhs) - cdef table_view c_rhs = table_view_from_table(rhs) + cdef table_view c_lhs = table_view_from_columns(lhs) + cdef table_view c_rhs = table_view_from_columns(rhs) if how == "leftsemi": - c_result = move(cpp_join.left_semi_join( - c_lhs, - c_rhs - )) + with nogil: + c_result = move(cpp_join.left_semi_join(c_lhs, c_rhs)) elif how == "leftanti": - c_result = move(cpp_join.left_anti_join( - c_lhs, - c_rhs - )) + with nogil: + c_result = move(cpp_join.left_anti_join(c_lhs, c_rhs)) else: raise ValueError(f"Invalid join type {how}") cdef Column left_rows = _gather_map_as_column(move(c_result)) - return ( - left_rows, - None - ) + return left_rows, None cdef Column _gather_map_as_column(cpp_join.gather_map_type gather_map): diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 48da83450d7..263d70afe26 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -1,11 +1,11 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
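# Usage sketch for the gathermap-returning joins above (internal API,
# hypothetical key columns built via the internal Series._column accessor):
import cudf
from cudf._lib.join import join

lhs_key = cudf.Series([0, 1, 2])._column
rhs_key = cudf.Series([1, 2, 3])._column
left_rows, right_rows = join([lhs_key], [rhs_key], how="inner")
# Output row i pairs lhs row left_rows[i] with rhs row right_rows[i];
# callers then gather their payload columns with these maps.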
# cython: boundscheck = False -import collections.abc as abc import io import os +from collections import abc import cudf @@ -82,15 +82,15 @@ cpdef read_json(object filepaths_or_buffers, for k, v in dtype.items(): c_dtypes_map[str(k).encode()] = \ _get_cudf_data_type_from_dtype(v) - elif not isinstance(dtype, abc.Iterable): - raise TypeError("`dtype` must be 'list like' or 'dict'") - else: + elif isinstance(dtype, abc.Collection): is_list_like_dtypes = True c_dtypes_list.reserve(len(dtype)) for col_dtype in dtype: c_dtypes_list.push_back( _get_cudf_data_type_from_dtype( col_dtype)) + else: + raise TypeError("`dtype` must be 'list like' or 'dict'") cdef json_reader_options opts = move( json_reader_options.builder(make_source_info(filepaths_or_buffers)) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 523686fafe6..025fb0665d3 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -42,7 +42,7 @@ from cudf.core.dtypes import ListDtype from cudf._lib.cpp.lists.contains cimport contains, index_of as cpp_index_of from cudf._lib.cpp.lists.extract cimport extract_list_element -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def count_elements(Column col): @@ -61,8 +61,10 @@ def count_elements(Column col): return result -def explode_outer(tbl, int explode_column_idx, bool ignore_index=False): - cdef table_view c_table_view = table_view_from_table(tbl, ignore_index) +def explode_outer( + list source_columns, int explode_column_idx +): + cdef table_view c_table_view = table_view_from_columns(source_columns) cdef size_type c_explode_column_idx = explode_column_idx cdef unique_ptr[table] c_result @@ -70,11 +72,7 @@ def explode_outer(tbl, int explode_column_idx, bool ignore_index=False): with nogil: c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx)) - return data_from_unique_ptr( - move(c_result), - column_names=tbl._column_names, - index_names=None if ignore_index else tbl._index_names - ) + return columns_from_unique_ptr(move(c_result)) def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal): @@ -178,7 +176,7 @@ def contains_scalar(Column col, object py_search_key): return result -def index_of(Column col, object py_search_key): +def index_of_scalar(Column col, object py_search_key): cdef DeviceScalar search_key = py_search_key.device_value @@ -197,18 +195,35 @@ def index_of(Column col, object py_search_key): return Column.from_unique_ptr(move(c_result)) -def concatenate_rows(tbl): +def index_of_column(Column col, Column search_keys): + + cdef column_view keys_view = search_keys.view() + + cdef shared_ptr[lists_column_view] list_view = ( + make_shared[lists_column_view](col.view()) + ) + cdef unique_ptr[column] c_result - cdef table_view c_table_view = table_view_from_table(tbl) + with nogil: + c_result = move(cpp_index_of( + list_view.get()[0], + keys_view, + )) + return Column.from_unique_ptr(move(c_result)) + + +def concatenate_rows(list source_columns): + cdef unique_ptr[column] c_result + + cdef table_view c_table_view = table_view_from_columns(source_columns) with nogil: c_result = move(cpp_concatenate_rows( c_table_view, )) - result = Column.from_unique_ptr(move(c_result)) - return result + return Column.from_unique_ptr(move(c_result)) def concatenate_list_elements(Column input_column, dropna=False): diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx index 
915b46c5691..dae2c466266 100644 --- a/python/cudf/cudf/_lib/merge.pyx +++ b/python/cudf/cudf/_lib/merge.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -10,79 +10,43 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.merge cimport merge as cpp_merge from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def merge_sorted( - object tables, - object keys=None, - bool by_index=False, - bool ignore_index=False, + list input_columns, + list key_columns_indices, bool ascending=True, - object na_position="last", + str na_position="last", ): - cdef vector[libcudf_types.size_type] c_column_keys + """Merge multiple lists of lexicographically sorted columns into one list + of sorted columns. `input_columns` is a list of lists of columns to be + merged. + """ + cdef vector[libcudf_types.size_type] c_column_keys = key_columns_indices cdef vector[table_view] c_input_tables cdef vector[libcudf_types.order] c_column_order cdef vector[libcudf_types.null_order] c_null_precedence - cdef libcudf_types.order column_order - cdef libcudf_types.null_order null_precedence - cdef source_table - # Create vector of tables - # Use metadata from 0th table for names, etc - c_input_tables.reserve(len(tables)) - for source_table in tables: + c_input_tables.reserve(len(input_columns)) + for source_columns in input_columns: c_input_tables.push_back( - table_view_from_table(source_table, ignore_index)) - source_table = tables[0] + table_view_from_columns(source_columns)) - # Define sorting order and null precedence - column_order = (libcudf_types.order.ASCENDING - if ascending - else libcudf_types.order.DESCENDING) + num_keys = len(key_columns_indices) - if ascending is False: - if na_position == "last": - na_position = "first" - else: - na_position = "last" - null_precedence = ( + cdef libcudf_types.order column_order = ( + libcudf_types.order.ASCENDING if ascending + else libcudf_types.order.DESCENDING + ) + c_column_order = vector[libcudf_types.order](num_keys, column_order) + + if not ascending: + na_position = "last" if na_position == "first" else "first" + cdef libcudf_types.null_order null_precedence = ( libcudf_types.null_order.BEFORE if na_position == "first" else libcudf_types.null_order.AFTER ) - - # Determine index-column offset and index names - if ignore_index: - num_index_columns = 0 - index_names = None - else: - num_index_columns = ( - 0 if source_table._index is None - else source_table._index._num_columns - ) - index_names = source_table._index_names - - # Define C vectors for each key column - if not by_index and keys is not None: - num_keys = len(keys) - c_column_keys.reserve(num_keys) - for name in keys: - c_column_keys.push_back( - num_index_columns + source_table._column_names.index(name) - ) - else: - if by_index: - start = 0 - stop = num_index_columns - else: - start = num_index_columns - stop = num_index_columns + source_table._num_columns - num_keys = stop - start - c_column_keys.reserve(num_keys) - for key in range(start, stop): - c_column_keys.push_back(key) - c_column_order = vector[libcudf_types.order](num_keys, column_order) c_null_precedence = vector[libcudf_types.null_order]( num_keys, null_precedence @@ -100,8 +64,4 @@ def merge_sorted( ) ) - return 
data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=index_names, - ) + return columns_from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx index b6e26fe594f..ce83a6f0f18 100644 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ b/python/cudf/cudf/_lib/null_mask.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from enum import Enum @@ -8,9 +8,6 @@ from libcpp.utility cimport move from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from cudf._lib.column cimport Column - -import cudf._lib as libcudfxx - from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.null_mask cimport ( bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes, diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 8cb7dd942c1..e363ea875f0 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # cython: boundscheck = False @@ -17,7 +17,7 @@ except ImportError: import json import numpy as np -from cython.operator import dereference +from cython.operator cimport dereference from cudf.api.types import ( is_categorical_dtype, diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx index e53667e7589..f2f5a92aca1 100644 --- a/python/cudf/cudf/_lib/partitioning.pyx +++ b/python/cudf/cudf/_lib/partitioning.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -11,21 +11,19 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.partitioning cimport partition as cpp_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count cimport cudf._lib.cpp.types as libcudf_types -def partition(source_table, Column partition_map, - object num_partitions, bool keep_index=True): +def partition(list source_columns, Column partition_map, + object num_partitions): if num_partitions is None: num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True) cdef int c_num_partitions = num_partitions - cdef table_view c_source_view = table_view_from_table( - source_table, not keep_index - ) + cdef table_view c_source_view = table_view_from_columns(source_columns) cdef column_view c_partition_map_view = partition_map.view() @@ -40,13 +38,5 @@ def partition(source_table, Column partition_map, ) return ( - *data_from_unique_ptr( - move(c_result.first), - column_names=source_table._column_names, - index_names=source_table._index_names if( - keep_index is True) - else None - - ), - list(c_result.second) + columns_from_unique_ptr(move(c_result.first)), list(c_result.second) ) diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 497a71df89d..f65c29a55a8 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
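# Illustrative call for the reworked merge_sorted above (internal API,
# hypothetical pre-sorted inputs): each inner list is one table, and
# key_columns_indices selects merge keys by position.
import cudf
from cudf._lib.merge import merge_sorted

col_a = cudf.Series([1, 3, 5])._column
col_b = cudf.Series([2, 4, 6])._column
merged = merge_sorted(
    [[col_a], [col_b]],  # two single-column tables, each already sorted
    [0],                 # merge on the first (only) column of each table
    ascending=True,
    na_position="last",
)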
from libcpp cimport bool
from libcpp.memory cimport unique_ptr
@@ -31,7 +31,7 @@ from cudf._lib.cpp.types cimport (
order_info,
sorted,
)
-from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
+from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
def quantile(
@@ -74,14 +74,13 @@ def quantile(
return Column.from_unique_ptr(move(c_result))
-def quantiles(source_table,
+def quantiles(list source_columns,
vector[double] q,
object interp,
object is_input_sorted,
list column_order,
list null_precedence):
- cdef table_view c_input = table_view_from_table(
- source_table, ignore_index=True)
+ cdef table_view c_input = table_view_from_columns(source_columns)
cdef vector[double] c_q = q
cdef interpolation c_interp = <interpolation>(
<underlying_type_t_interpolation> interp
@@ -119,7 +118,4 @@
)
)
- return data_from_unique_ptr(
- move(c_result),
- column_names=source_table._column_names
- )
+ return columns_from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx
index d64d0543892..29223947eea 100644
--- a/python/cudf/cudf/_lib/reshape.pyx
+++ b/python/cudf/cudf/_lib/reshape.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
@@ -13,32 +13,25 @@ from cudf._lib.cpp.reshape cimport (
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type
-from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
+from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
-def interleave_columns(source_table):
- cdef table_view c_view = table_view_from_table(
- source_table, ignore_index=True)
+def interleave_columns(list source_columns):
+ cdef table_view c_view = table_view_from_columns(source_columns)
cdef unique_ptr[column] c_result
with nogil:
c_result = move(cpp_interleave_columns(c_view))
- return Column.from_unique_ptr(
- move(c_result)
- )
+ return Column.from_unique_ptr(move(c_result))
-def tile(source_table, size_type count):
+def tile(list source_columns, size_type count):
cdef size_type c_count = count
- cdef table_view c_view = table_view_from_table(source_table)
+ cdef table_view c_view = table_view_from_columns(source_columns)
cdef unique_ptr[table] c_result
with nogil:
c_result = move(cpp_tile(c_view, c_count))
- return data_from_unique_ptr(
- move(c_result),
- column_names=source_table._column_names,
- index_names=source_table._index_names
- )
+ return columns_from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/rolling.pyx b/python/cudf/cudf/_lib/rolling.pyx
index b4b3384032c..a2cb115f668 100644
--- a/python/cudf/cudf/_lib/rolling.pyx
+++ b/python/cudf/cudf/_lib/rolling.pyx
@@ -1,6 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
-
-from __future__ import print_function
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
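# Quick sketch of the list-of-columns reshape API above (internal API,
# hypothetical columns): tile repeats rows, while interleave_columns weaves
# same-typed columns together row by row.
import cudf
from cudf._lib.reshape import interleave_columns, tile

col_a = cudf.Series([1, 2])._column
col_b = cudf.Series([3, 4])._column
tiled = tile([col_a], 3)                    # rows of col_a repeated 3 times
woven = interleave_columns([col_a, col_b])  # 1, 3, 2, 4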
import pandas as pd
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 32d6cb2ea6d..71ac022ba2d 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -65,6 +65,7 @@ from cudf._lib.cpp.wrappers.timestamps cimport (
timestamp_us,
)
from cudf._lib.utils cimport (
+ columns_from_table_view,
data_from_table_view,
table_view_from_columns,
table_view_from_table,
@@ -172,10 +173,10 @@ cdef class DeviceScalar:
if self.value is cudf.NA:
return (
f"{self.__class__.__name__}"
- f"({self.value}, {self.dtype.__repr__()})"
+ f"({self.value}, {repr(self.dtype)})"
)
else:
- return f"{self.__class__.__name__}({self.value.__repr__()})"
+ return f"{self.__class__.__name__}({repr(self.value)})"
@staticmethod
cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr, dtype=None):
@@ -361,8 +362,8 @@ cdef _set_struct_from_pydict(unique_ptr[scalar]& s,
names=columns
)
- data, _ = from_arrow(pyarrow_table, column_names=columns)
- cdef table_view struct_view = table_view_from_columns(data.values())
+ data = from_arrow(pyarrow_table)
+ cdef table_view struct_view = table_view_from_columns(data)
s.reset(
new struct_scalar(struct_view, valid)
)
@@ -373,18 +374,10 @@ cdef _get_py_dict_from_struct(unique_ptr[scalar]& s):
return cudf.NA
cdef table_view struct_table_view = (<struct_scalar*>s.get()).view()
- columns = [str(i) for i in range(struct_table_view.num_columns())]
+ column_names = [str(i) for i in range(struct_table_view.num_columns())]
- data, _ = data_from_table_view(
- struct_table_view,
- None,
- column_names=columns
- )
- to_arrow_table = cudf.core.frame.Frame(
- cudf.core.column_accessor.ColumnAccessor(data)
- )
-
- python_dict = to_arrow(to_arrow_table, columns).to_pydict()
+ columns = columns_from_table_view(struct_table_view, None)
+ python_dict = to_arrow(columns, column_names).to_pydict()
return {k: _nested_na_replace(python_dict[k])[0] for k in python_dict}
@@ -415,9 +408,8 @@ cdef _get_py_list_from_list(unique_ptr[scalar]& s):
cdef column_view list_col_view = (<list_scalar*>s.get()).view()
cdef Column list_col = Column.from_column_view(list_col_view, None)
- to_arrow_table = cudf.core.frame.Frame({"col": list_col})
- arrow_table = to_arrow(to_arrow_table, [["col", []]])
+ arrow_table = to_arrow([list_col], [["col", []]])
result = arrow_table['col'].to_pylist()
return _nested_na_replace(result)
diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx
index f92ef753fc2..d5568f53231 100644
--- a/python/cudf/cudf/_lib/search.pyx
+++ b/python/cudf/cudf/_lib/search.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
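# The `metadata` argument threaded through to_arrow above mirrors the
# column-name hierarchy: a plain name for a flat column, or a
# [name, [child-metadata, ...]] pair for nested columns, matching the two
# call sites in scalar.pyx. Hypothetical columns; the layout is inferred
# from gather_metadata's usage in this diff.
import cudf
from cudf._lib.interop import to_arrow

flat = cudf.Series([1, 2, 3])._column
nested = cudf.Series([[1, 2], [3]])._column
flat_tbl = to_arrow([flat], ["a"])              # flat column named "a"
nested_tbl = to_arrow([nested], [["col", []]])  # list column, unnamed children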
from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -10,20 +10,20 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport table_view_from_table +from cudf._lib.utils cimport table_view_from_columns def search_sorted( - table, values, side, ascending=True, na_position="last" + list source, list values, side, ascending=True, na_position="last" ): """Find indices where elements should be inserted to maintain order Parameters ---------- - table : Frame - Frame to search in - values : Frame - Frame of values to search for + source : list of columns + List of columns to search in + values : List of columns + List of value columns to search for side : str {‘left’, ‘right’} optional If ‘left’, the index of the first suitable location is given. If ‘right’, return the last such index @@ -33,10 +33,8 @@ def search_sorted( cdef vector[libcudf_types.null_order] c_null_precedence cdef libcudf_types.order c_order cdef libcudf_types.null_order c_null_order - cdef table_view c_table_data = table_view_from_table( - table, ignore_index=True) - cdef table_view c_values_data = table_view_from_table( - values, ignore_index=True) + cdef table_view c_table_data = table_view_from_columns(source) + cdef table_view c_values_data = table_view_from_columns(values) # Note: We are ignoring index columns here c_order = (libcudf_types.order.ASCENDING @@ -47,9 +45,9 @@ def search_sorted( if na_position=="last" else libcudf_types.null_order.BEFORE ) - c_column_order = vector[libcudf_types.order](table._num_columns, c_order) + c_column_order = vector[libcudf_types.order](len(source), c_order) c_null_precedence = vector[libcudf_types.null_order]( - table._num_columns, c_null_order + len(source), c_null_order ) if side == 'left': diff --git a/python/cudf/cudf/_lib/sort.pxd b/python/cudf/cudf/_lib/sort.pxd deleted file mode 100644 index d7488889555..00000000000 --- a/python/cudf/cudf/_lib/sort.pxd +++ /dev/null @@ -1,3 +0,0 @@ -from libc.stdint cimport int32_t - -ctypedef int32_t underlying_type_t_rank_method diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 3aa0b35e90e..1d7204a0a39 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -1,6 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - -import pandas as pd +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
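# Usage sketch for the list-based search_sorted above (internal API,
# hypothetical columns; the source columns must already be sorted):
import cudf
from cudf._lib.search import search_sorted

haystack = cudf.Series([1, 3, 5, 7])._column
needles = cudf.Series([2, 6])._column
positions = search_sorted([haystack], [needles], "left")  # -> [1, 3]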
from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -10,32 +8,34 @@ from libcpp.vector cimport vector from enum import IntEnum from cudf._lib.column cimport Column +from cudf._lib.cpp.aggregation cimport ( + rank_method, + underlying_type_t_rank_method, +) from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.search cimport lower_bound, upper_bound from cudf._lib.cpp.sorting cimport ( is_sorted as cpp_is_sorted, rank, - rank_method, sorted_order, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport null_order, null_policy, order -from cudf._lib.sort cimport underlying_type_t_rank_method -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def is_sorted( - source_table, object ascending=None, object null_position=None + list source_columns, object ascending=None, object null_position=None ): """ Checks whether the rows of a `table` are sorted in lexicographical order. Parameters ---------- - source_table : Frame - Frame whose columns are to be checked for sort order + source_columns : list of columns + columns to be checked for sort order ascending : None or list-like of booleans None or list-like of boolean values indicating expected sort order of each column. If list-like, size of list-like must be len(columns). If @@ -58,51 +58,39 @@ def is_sorted( cdef vector[null_order] null_precedence if ascending is None: - column_order = vector[order]( - source_table._num_columns, order.ASCENDING - ) - elif pd.api.types.is_list_like(ascending): - if len(ascending) != source_table._num_columns: + column_order = vector[order](len(source_columns), order.ASCENDING) + else: + if len(ascending) != len(source_columns): raise ValueError( - f"Expected a list-like of length {source_table._num_columns}, " + f"Expected a list-like of length {len(source_columns)}, " f"got length {len(ascending)} for `ascending`" ) column_order = vector[order]( - source_table._num_columns, order.DESCENDING + len(source_columns), order.DESCENDING ) for idx, val in enumerate(ascending): if val: column_order[idx] = order.ASCENDING - else: - raise TypeError( - f"Expected a list-like or None for `ascending`, got " - f"{type(ascending)}" - ) if null_position is None: null_precedence = vector[null_order]( - source_table._num_columns, null_order.AFTER + len(source_columns), null_order.AFTER ) - elif pd.api.types.is_list_like(null_position): - if len(null_position) != source_table._num_columns: + else: + if len(null_position) != len(source_columns): raise ValueError( - f"Expected a list-like of length {source_table._num_columns}, " + f"Expected a list-like of length {len(source_columns)}, " f"got length {len(null_position)} for `null_position`" ) null_precedence = vector[null_order]( - source_table._num_columns, null_order.AFTER + len(source_columns), null_order.AFTER ) for idx, val in enumerate(null_position): if val: null_precedence[idx] = null_order.BEFORE - else: - raise TypeError( - f"Expected a list-like or None for `null_position`, got " - f"{type(null_position)}" - ) cdef bool c_result - cdef table_view source_table_view = table_view_from_table(source_table) + cdef table_view source_table_view = table_view_from_columns(source_columns) with nogil: c_result = cpp_is_sorted( source_table_view, @@ -113,34 +101,34 @@ def is_sorted( return c_result -def 
order_by(source_table, object ascending, str na_position): +def order_by(list columns_from_table, object ascending, str na_position): """ - Sorting the table ascending/descending + Get index to sort the table in ascending/descending order. Parameters ---------- - source_table : table which will be sorted - ascending : list of boolean values which correspond to each column + columns_from_table : columns from the table which will be sorted + ascending : sequence of boolean values which correspond to each column in source_table signifying order of each column True - Ascending and False - Descending na_position : whether null value should show up at the "first" or "last" position of **all** sorted column. """ - cdef table_view source_table_view = table_view_from_table( - source_table, ignore_index=True + cdef table_view source_table_view = table_view_from_columns( + columns_from_table ) cdef vector[order] column_order column_order.reserve(len(ascending)) cdef vector[null_order] null_precedence null_precedence.reserve(len(ascending)) - for i in ascending: - if i is True: + for asc in ascending: + if asc: column_order.push_back(order.ASCENDING) else: column_order.push_back(order.DESCENDING) - if i ^ (na_position == "first"): + if asc ^ (na_position == "first"): null_precedence.push_back(null_order.AFTER) else: null_precedence.push_back(null_order.BEFORE) @@ -154,21 +142,21 @@ def order_by(source_table, object ascending, str na_position): return Column.from_unique_ptr(move(c_result)) -def digitize(source_values_table, bins, bool right=False): +def digitize(list source_columns, list bins, bool right=False): """ Return the indices of the bins to which each value in source_table belongs. Parameters ---------- - source_table : Input table to be binned. - bins : Frame containing columns of bins + source_columns : Input columns to be binned. + bins : List containing columns of bins right : Indicating whether the intervals include the right or the left bin edge. 
""" - cdef table_view bins_view = table_view_from_table(bins) - cdef table_view source_values_table_view = table_view_from_table( - source_values_table + cdef table_view bins_view = table_view_from_columns(bins) + cdef table_view source_table_view = table_view_from_columns( + source_columns ) cdef vector[order] column_order = ( vector[order]( @@ -184,11 +172,11 @@ def digitize(source_values_table, bins, bool right=False): ) cdef unique_ptr[column] c_result - if right is True: + if right: with nogil: c_result = move(lower_bound( bins_view, - source_values_table_view, + source_table_view, column_order, null_precedence) ) @@ -196,7 +184,7 @@ def digitize(source_values_table, bins, bool right=False): with nogil: c_result = move(upper_bound( bins_view, - source_values_table_view, + source_table_view, column_order, null_precedence) ) @@ -204,23 +192,13 @@ def digitize(source_values_table, bins, bool right=False): return Column.from_unique_ptr(move(c_result)) -class RankMethod(IntEnum): - FIRST = < underlying_type_t_rank_method > rank_method.FIRST - AVERAGE = < underlying_type_t_rank_method > rank_method.AVERAGE - MIN = < underlying_type_t_rank_method > rank_method.MIN - MAX = < underlying_type_t_rank_method > rank_method.MAX - DENSE = < underlying_type_t_rank_method > rank_method.DENSE - - -def rank_columns(source_table, object method, str na_option, +def rank_columns(list source_columns, object method, str na_option, bool ascending, bool pct ): """ Compute numerical data ranks (1 through n) of each column in the dataframe """ - cdef table_view source_table_view = table_view_from_table( - source_table, ignore_index=True - ) + cdef table_view source_table_view = table_view_from_columns(source_columns) cdef rank_method c_rank_method = < rank_method > ( < underlying_type_t_rank_method > method @@ -260,7 +238,7 @@ def rank_columns(source_table, object method, str na_option, cdef vector[unique_ptr[column]] c_results cdef column_view c_view cdef Column col - for col in source_table._columns: + for col in source_columns: c_view = col.view() with nogil: c_results.push_back(move( @@ -274,11 +252,6 @@ def rank_columns(source_table, object method, str na_option, ) )) - cdef unique_ptr[table] c_result - c_result.reset(new table(move(c_results))) - data, _ = data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=None - ) - return data, source_table._index + return [Column.from_unique_ptr( + move(c_results[i]) + ) for i in range(c_results.size())] diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 3b5ef33a668..eeb39f70728 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
@@ -18,10 +18,10 @@ from cudf._lib.cpp.strings.combine cimport (
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type
from cudf._lib.scalar cimport DeviceScalar
-from cudf._lib.utils cimport table_view_from_table
+from cudf._lib.utils cimport table_view_from_columns
-def concatenate(source_strings,
+def concatenate(list source_strings,
object sep,
object na_rep):
"""
@@ -33,8 +33,7 @@ def concatenate(source_strings,
cdef DeviceScalar narep = na_rep.device_value
cdef unique_ptr[column] c_result
- cdef table_view source_view = table_view_from_table(
- source_strings, ignore_index=True)
+ cdef table_view source_view = table_view_from_columns(source_strings)
cdef const string_scalar* scalar_separator = \
<const string_scalar*>(separator.get_raw_ptr())
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 96d25cb92c9..2d94ef2cedf 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -1,13 +1,15 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
import numpy as np
from numba.np import numpy_support
import cudf
from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
+from cudf.core._internals.expressions import parse_expression
from cudf.core.buffer import Buffer
from cudf.utils import cudautils
+from cython.operator cimport dereference
from libc.stdint cimport uintptr_t
from libcpp.memory cimport unique_ptr
from libcpp.pair cimport pair
@@ -20,13 +22,17 @@ cimport cudf._lib.cpp.transform as libcudf_transform
from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.expressions cimport expression
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type, type_id
+from cudf._lib.expressions cimport Expression
from cudf._lib.types cimport underlying_type_t_type_id
from cudf._lib.utils cimport (
+ columns_from_unique_ptr,
data_from_table_view,
data_from_unique_ptr,
+ table_view_from_columns,
table_view_from_table,
)
@@ -123,21 +129,15 @@ def transform(Column input, op):
return Column.from_unique_ptr(move(c_output))
-def table_encode(input):
- cdef table_view c_input = table_view_from_table(
- input, ignore_index=True)
+def table_encode(list source_columns):
+ cdef table_view c_input = table_view_from_columns(source_columns)
cdef pair[unique_ptr[table], unique_ptr[column]] c_result
with nogil:
c_result = move(libcudf_transform.encode(c_input))
- return (
- *data_from_unique_ptr(
- move(c_result.first),
- column_names=input._column_names,
- ),
- Column.from_unique_ptr(move(c_result.second))
- )
+ return columns_from_unique_ptr(
+ move(c_result.first)), Column.from_unique_ptr(move(c_result.second))
def one_hot_encode(Column input_column, Column categories):
@@ -162,3 +162,34 @@ def one_hot_encode(Column input_column, Column categories):
)
return encodings
+
+
+def compute_column(list columns, tuple column_names, expr: str):
+ """Compute a new column by evaluating an expression on a set of columns.
+
+ Parameters
+ ----------
+ columns : list
+ The set of columns forming the table to evaluate the expression on.
+ column_names : tuple[str]
+ The names associated with each column.
These names are necessary to map + column names in the expression to indices in the provided list of + columns, which are what will be used by libcudf to evaluate the + expression on the table. + expr : str + The expression to evaluate. + """ + visitor = parse_expression(expr, column_names) + + # At the end, all the stack contains is the expression to evaluate. + cdef Expression cudf_expr = visitor.expression + cdef table_view tbl = table_view_from_columns(columns) + cdef unique_ptr[column] col + with nogil: + col = move( + libcudf_transform.compute_column( + tbl, + dereference(cudf_expr.c_obj.get()) + ) + ) + return Column.from_unique_ptr(move(col)) diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index 931a2702612..b9eea6169bd 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -1,7 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - -import cudf -from cudf.api.types import is_categorical_dtype +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair @@ -9,65 +6,22 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.transpose cimport transpose as cpp_transpose -from cudf._lib.utils cimport data_from_table_view, table_view_from_table - +from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns -def transpose(source): - """Transpose index and columns. - See Also - -------- - cudf.core.DataFrame.transpose +def transpose(list source_columns): + """Transpose m n-row columns into n m-row columns """ - - if source._num_columns == 0: - return source - - cats = None - columns = source._columns - dtype = columns[0].dtype - - if is_categorical_dtype(dtype): - if any(not is_categorical_dtype(c.dtype) for c in columns): - raise ValueError('Columns must all have the same dtype') - cats = list(c.categories for c in columns) - cats = cudf.core.column.concat_columns(cats).unique() - source = cudf.core.frame.Frame(index=source._index, data=[ - (name, col._set_categories(cats, is_unique=True).codes) - for name, col in source._data.items() - ]) - elif any(c.dtype != dtype for c in columns): - raise ValueError('Columns must all have the same dtype') - cdef pair[unique_ptr[column], table_view] c_result - cdef table_view c_input = table_view_from_table( - source, ignore_index=True) + cdef table_view c_input = table_view_from_columns(source_columns) with nogil: c_result = move(cpp_transpose(c_input)) result_owner = Column.from_unique_ptr(move(c_result.first)) - data, _ = data_from_table_view( + return columns_from_table_view( c_result.second, - owner=result_owner, - column_names=range(c_input.num_rows()) + owners=[result_owner] * c_result.second.num_columns() ) - - if cats is not None: - data= [ - (name, cudf.core.column.column.build_categorical_column( - codes=cudf.core.column.column.build_column( - col.base_data, dtype=col.dtype), - mask=col.base_mask, - size=col.size, - categories=cats, - offset=col.offset, - )) - for name, col in data.items() - ] - - return data diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 8557f430e25..643a1adca9f 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -317,10 +317,10 @@ cdef columns_from_table_view( ): 
""" Given a ``cudf::table_view``, construsts a list of columns from it, - along with referencing an ``owner`` Python object that owns the memory - lifetime. ``owner`` must be either None or a list of column. If ``owner`` - is a list of columns, the owner of the `i`th ``cudf::column_view`` in the - table view is ``owners[i]``. For more about memory ownership, + along with referencing an owner Python object that owns the memory + lifetime. owner must be either None or a list of column. If owner + is a list of columns, the owner of the `i`th ``cudf::column_view`` + in the table view is ``owners[i]``. For more about memory ownership, see ``Column.from_column_view``. """ diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index fad2a973681..62f8377a323 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -4,7 +4,7 @@ from __future__ import annotations -from collections.abc import Sequence +from collections import abc from functools import wraps from inspect import isclass from typing import List, Union @@ -48,16 +48,12 @@ def is_numeric_dtype(obj): if issubclass(obj, _BaseDtype): return False else: - if isinstance(obj, cudf.Decimal128Dtype) or isinstance( - getattr(obj, "dtype", None), cudf.Decimal128Dtype - ): - return True - if isinstance(obj, cudf.Decimal64Dtype) or isinstance( - getattr(obj, "dtype", None), cudf.Decimal64Dtype - ): - return True - if isinstance(obj, cudf.Decimal32Dtype) or isinstance( - getattr(obj, "dtype", None), cudf.Decimal32Dtype + if isinstance( + obj, + (cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype), + ) or isinstance( + getattr(obj, "dtype", None), + (cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype), ): return True if isinstance(obj, _BaseDtype) or isinstance( @@ -129,12 +125,14 @@ def is_scalar(val): bool Return True if given object is scalar. """ - return ( - isinstance(val, cudf._lib.scalar.DeviceScalar) - or isinstance(val, cudf.Scalar) - or isinstance(val, cudf.core.tools.datetimes.DateOffset) - or pd_types.is_scalar(val) - ) + return isinstance( + val, + ( + cudf.Scalar, + cudf._lib.scalar.DeviceScalar, + cudf.core.tools.datetimes.DateOffset, + ), + ) or pd_types.is_scalar(val) def _is_scalar_or_zero_d_array(val): @@ -174,7 +172,7 @@ def is_list_like(obj): bool Return True if given object is list-like. """ - return isinstance(obj, (Sequence, np.ndarray)) and not isinstance( + return isinstance(obj, (abc.Sequence, np.ndarray)) and not isinstance( obj, (str, bytes) ) diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py index f21eb4e4d8c..0c4d9d7f77e 100644 --- a/python/cudf/cudf/comm/gpuarrow.py +++ b/python/cudf/cudf/comm/gpuarrow.py @@ -1,6 +1,5 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. 
-from collections import OrderedDict
-from collections.abc import Sequence
+from collections import OrderedDict, abc
import numpy as np
import pandas as pd
@@ -32,7 +31,7 @@ def __init__(self, source, schema=None):
self._open(source, schema)
-class GpuArrowReader(Sequence):
+class GpuArrowReader(abc.Sequence):
def __init__(self, schema, dev_ary):
self._table = CudaRecordBatchStreamReader(dev_ary, schema).read_all()
@@ -120,12 +119,12 @@ def null(self):
@property
def data_raw(self):
- "Accessor for the data buffer as a device array"
+ """Accessor for the data buffer as a device array"""
return self._series._column.data_array_view
@property
def null_raw(self):
- "Accessor for the null buffer as a device array"
+ """Accessor for the null buffer as a device array"""
return self._series._column.mask_array_view
def make_series(self):
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 259a7f711c3..8dbd71739b5 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -3,6 +3,7 @@ from __future__ import annotations
import pickle
+import warnings
from functools import cached_property
from typing import Any, Set
@@ -118,7 +119,7 @@ def get_level_values(self, level):
See Also
--------
- cudf.core.multiindex.MultiIndex.get_level_values : Get values for
+ cudf.MultiIndex.get_level_values : Get values for
a level of a MultiIndex.
Notes
@@ -707,7 +708,18 @@ def difference(self, other, sort=None):
if is_mixed_with_object_dtype(self, other):
difference = self.copy()
else:
- difference = self.join(other, how="leftanti")
+ other = other.copy(deep=False)
+ other.names = self.names
+ difference = cudf.core.index._index_from_data(
+ cudf.DataFrame._from_data(self._data)
+ ._merge(
+ cudf.DataFrame._from_data(other._data),
+ how="leftanti",
+ on=self.name,
+ )
+ ._data
+ )
+
if self.dtype != other.dtype:
difference = difference.astype(self.dtype)
@@ -989,7 +1001,17 @@ def _union(self, other, sort=None):
return union_result
def _intersection(self, other, sort=None):
- intersection_result = self.unique().join(other.unique(), how="inner")
+ other_unique = other.unique()
+ other_unique.names = self.names
+ intersection_result = cudf.core.index._index_from_data(
+ cudf.DataFrame._from_data(self.unique()._data)
+ ._merge(
+ cudf.DataFrame._from_data(other_unique._data),
+ how="inner",
+ on=self.name,
+ )
+ ._data
+ )
if sort is None and len(other):
return intersection_result.sort_values()
@@ -1141,6 +1163,9 @@ def join(
(1, 2)],
names=['a', 'b'])
"""
+ warnings.warn(
+ "Index.join is deprecated and will be removed", FutureWarning
+ )
if isinstance(self, cudf.MultiIndex) and isinstance(
other, cudf.MultiIndex
diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py
new file mode 100644
index 00000000000..bc587d4e1e2
--- /dev/null
+++ b/python/cudf/cudf/core/_internals/expressions.py
@@ -0,0 +1,222 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+import ast
+import functools
+from typing import List, Tuple
+
+from cudf._lib.expressions import (
+ ASTOperator,
+ ColumnReference,
+ Expression,
+ Literal,
+ Operation,
+)
+
+# This dictionary encodes the mapping from Python AST operators to their cudf
+# counterparts.
+python_cudf_operator_map = {
+ # Binary operators
+ ast.Add: ASTOperator.ADD,
+ ast.Sub: ASTOperator.SUB,
+ ast.Mult: ASTOperator.MUL,
+ ast.Div: ASTOperator.DIV,
+ ast.FloorDiv: ASTOperator.FLOOR_DIV,
+ ast.Mod: ASTOperator.PYMOD,
+ ast.Pow: ASTOperator.POW,
+ ast.Eq: ASTOperator.EQUAL,
+ ast.NotEq: ASTOperator.NOT_EQUAL,
+ ast.Lt: ASTOperator.LESS,
+ ast.Gt: ASTOperator.GREATER,
+ ast.LtE: ASTOperator.LESS_EQUAL,
+ ast.GtE: ASTOperator.GREATER_EQUAL,
+ ast.BitXor: ASTOperator.BITWISE_XOR,
+ # TODO: The mapping of logical/bitwise operators here is inconsistent with
+ # pandas. In pandas, both `BitAnd` and `And` map to
+ # `ASTOperator.LOGICAL_AND` for booleans, while they map to
+ # `ASTOperator.BITWISE_AND` for integers. However, there is no good way to
+ # encode this at present because expressions can be arbitrarily nested so
+ # we won't know the dtype of the input without inserting a much more
+ # complex traversal of the expression tree to determine the output types at
+ # each node. For now, we'll rely on users to use the appropriate operator.
+ ast.BitAnd: ASTOperator.BITWISE_AND,
+ ast.BitOr: ASTOperator.BITWISE_OR,
+ ast.And: ASTOperator.LOGICAL_AND,
+ ast.Or: ASTOperator.LOGICAL_OR,
+ # Unary operators
+ ast.Invert: ASTOperator.BIT_INVERT,
+ ast.Not: ASTOperator.NOT,
+ # TODO: Missing USub, possibly other unary ops?
+}
+
+
+# Mapping between Python function names encoded in an ast.Call node and the
+# corresponding libcudf C++ AST operators.
+python_cudf_function_map = {
+ # TODO: Operators listed on
+ # https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#expression-evaluation-via-eval # noqa: E501
+ # that we don't support yet:
+ # expm1, log1p, arctan2 and log10.
+ "sin": ASTOperator.SIN,
+ "cos": ASTOperator.COS,
+ "tan": ASTOperator.TAN,
+ "arcsin": ASTOperator.ARCSIN,
+ "arccos": ASTOperator.ARCCOS,
+ "arctan": ASTOperator.ARCTAN,
+ "sinh": ASTOperator.SINH,
+ "cosh": ASTOperator.COSH,
+ "tanh": ASTOperator.TANH,
+ "arcsinh": ASTOperator.ARCSINH,
+ "arccosh": ASTOperator.ARCCOSH,
+ "arctanh": ASTOperator.ARCTANH,
+ "exp": ASTOperator.EXP,
+ "log": ASTOperator.LOG,
+ "sqrt": ASTOperator.SQRT,
+ "abs": ASTOperator.ABS,
+ "ceil": ASTOperator.CEIL,
+ "floor": ASTOperator.FLOOR,
+ # TODO: Operators supported by libcudf with no Python function analog.
+ # ast.rint: ASTOperator.RINT,
+ # ast.cbrt: ASTOperator.CBRT,
+}
+
+
+class libcudfASTVisitor(ast.NodeVisitor):
+ """A NodeVisitor specialized for constructing a libcudf expression tree.
+
+ This visitor is designed to handle AST nodes that have libcudf equivalents.
+ It constructs column references from names and literals from constants,
+ then builds up operations. The final result can be accessed using the
+ `expression` property. The visitor must be kept in scope for as long as the
+ expression is needed because all of the underlying libcudf expressions will
+ be destroyed when the libcudfASTVisitor is.
+
+ Parameters
+ ----------
+ col_names : Tuple[str]
+ The column names used to map the names in an expression.
+ """ + + def __init__(self, col_names: Tuple[str]): + self.stack: List[Expression] = [] + self.nodes: List[Expression] = [] + self.col_names = col_names + + @property + def expression(self): + """Expression: The result of parsing an AST.""" + assert len(self.stack) == 1 + return self.stack[-1] + + def visit_Name(self, node): + try: + col_id = self.col_names.index(node.id) + except ValueError: + raise ValueError(f"Unknown column name {node.id}") + self.stack.append(ColumnReference(col_id)) + + def visit_Constant(self, node): + if not isinstance(node, ast.Num): + raise ValueError( + f"Unsupported literal {repr(node.value)} of type " + "{type(node.value).__name__}" + ) + self.stack.append(Literal(node.value)) + + def visit_UnaryOp(self, node): + self.visit(node.operand) + self.nodes.append(self.stack.pop()) + if isinstance(node.op, ast.USub): + # TODO: Except for leaf nodes, we won't know the type of the + # operand, so there's no way to know whether this should be a float + # or an int. We should maybe see what Spark does, and this will + # probably require casting. + self.nodes.append(Literal(-1)) + op = ASTOperator.MUL + self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) + elif isinstance(node.op, ast.UAdd): + self.stack.append(self.nodes[-1]) + else: + op = python_cudf_operator_map[type(node.op)] + self.stack.append(Operation(op, self.nodes[-1])) + + def visit_BinOp(self, node): + self.visit(node.left) + self.visit(node.right) + self.nodes.append(self.stack.pop()) + self.nodes.append(self.stack.pop()) + + op = python_cudf_operator_map[type(node.op)] + self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) + + def _visit_BoolOp_Compare(self, operators, operands, has_multiple_ops): + # Helper function handling the common components of parsing BoolOp and + # Compare AST nodes. These two types of nodes both support chaining + # (e.g. `a > b > c` is equivalent to `a > b and b > c`, so this + # function helps standardize that. + + # TODO: Whether And/Or and BitAnd/BitOr actually correspond to + # logical or bitwise operators depends on the data types that they + # are applied to. We'll need to add logic to map to that. + inner_ops = [] + for op, (left, right) in zip(operators, operands): + # Note that this will lead to duplicate nodes, e.g. if + # the comparison is `a < b < c` that will be encoded as + # `a < b and b < c`. We could potentially optimize by caching + # expressions by name so that we only construct them once. + self.visit(left) + self.visit(right) + + self.nodes.append(self.stack.pop()) + self.nodes.append(self.stack.pop()) + + op = python_cudf_operator_map[type(op)] + inner_ops.append(Operation(op, self.nodes[-1], self.nodes[-2])) + + self.nodes.extend(inner_ops) + + # If we have more than one comparator, we need to link them + # together with LOGICAL_AND operators. 
+ if has_multiple_ops: + op = ASTOperator.LOGICAL_AND + + def _combine_compare_ops(left, right): + self.nodes.append(Operation(op, left, right)) + return self.nodes[-1] + + functools.reduce(_combine_compare_ops, inner_ops) + + self.stack.append(self.nodes[-1]) + + def visit_BoolOp(self, node): + operators = [node.op] * (len(node.values) - 1) + operands = zip(node.values[:-1], node.values[1:]) + self._visit_BoolOp_Compare(operators, operands, len(node.values) > 2) + + def visit_Compare(self, node): + operands = (node.left, *node.comparators) + has_multiple_ops = len(operands) > 2 + operands = zip(operands[:-1], operands[1:]) + self._visit_BoolOp_Compare(node.ops, operands, has_multiple_ops) + + def visit_Call(self, node): + try: + op = python_cudf_function_map[node.func.id] + except KeyError: + raise ValueError(f"Unsupported function {node.func}.") + # Assuming only unary functions are supported, which is checked above. + if len(node.args) != 1 or node.keywords: + raise ValueError( + f"Function {node.func} only accepts one positional " + "argument." + ) + self.visit(node.args[0]) + + self.nodes.append(self.stack.pop()) + self.stack.append(Operation(op, self.nodes[-1])) + + +@functools.lru_cache(256) +def parse_expression(expr: str, col_names: Tuple[str]): + visitor = libcudfASTVisitor(col_names) + visitor.visit(ast.parse(expr)) + return visitor diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 22a5666ef3f..d13c55dfcc0 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -1,5 +1,5 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -from warnings import warn +import warnings import cupy as cp import numpy as np @@ -50,7 +50,7 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): raise NotImplementedError("na_sentinel can not be None.") if size_hint: - warn("size_hint is not applicable for cudf.factorize") + warnings.warn("size_hint is not applicable for cudf.factorize") return_cupy_array = isinstance(values, cp.ndarray) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 911391ef984..f9bb7ea2f1a 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -3,7 +3,7 @@ from __future__ import annotations import pickle -from collections.abc import MutableSequence +from collections import abc from functools import cached_property from typing import ( TYPE_CHECKING, @@ -911,8 +911,8 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: ) return other - ary = cudf.utils.utils.scalar_broadcast_to( - self._encode(other), size=len(self), dtype=self.codes.dtype + ary = column.full( + len(self), self._encode(other), dtype=self.codes.dtype ) return column.build_categorical_column( categories=self.dtype.categories._values, @@ -1379,7 +1379,9 @@ def view(self, dtype: Dtype) -> ColumnBase: ) @staticmethod - def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: + def _concat( + objs: abc.MutableSequence[CategoricalColumn], + ) -> CategoricalColumn: # TODO: This function currently assumes it is being called from # column.concat_columns, at least to the extent that all the # preprocessing in that function has already been done. 
That should be @@ -1627,9 +1629,9 @@ def _create_empty_categorical_column( return column.build_categorical_column( categories=column.as_column(dtype.categories), codes=column.as_column( - cudf.utils.utils.scalar_broadcast_to( - _DEFAULT_CATEGORICAL_VALUE, + column.full( categorical_column.size, + _DEFAULT_CATEGORICAL_VALUE, categorical_column.codes.dtype, ) ), diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b2e3e42531b..3fb71173178 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -68,7 +68,6 @@ StructDtype, ) from cudf.core.mixins import BinaryOperand, Reducible -from cudf.utils import utils from cudf.utils.dtypes import ( cudf_dtype_from_pa_type, get_time_unit, @@ -229,13 +228,9 @@ def to_arrow(self) -> pa.Array: 4 ] """ - return libcudf.interop.to_arrow( - cudf.core.frame.Frame( - cudf.core.column_accessor.ColumnAccessor({"None": self}) - ), - [["None"]], - keep_index=False, - )["None"].chunk(0) + return libcudf.interop.to_arrow([self], [["None"]],)[ + "None" + ].chunk(0) @classmethod def from_arrow(cls, array: pa.Array) -> ColumnBase: @@ -280,12 +275,8 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: } ) - codes = libcudf.interop.from_arrow( - indices_table, indices_table.column_names - )[0]["None"] - categories = libcudf.interop.from_arrow( - dictionaries_table, dictionaries_table.column_names - )[0]["None"] + codes = libcudf.interop.from_arrow(indices_table)[0] + categories = libcudf.interop.from_arrow(dictionaries_table)[0] return build_categorical_column( categories=categories, @@ -301,7 +292,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: ): return cudf.core.column.IntervalColumn.from_arrow(array) - result = libcudf.interop.from_arrow(data, data.column_names)[0]["None"] + result = libcudf.interop.from_arrow(data)[0] return result._with_type_metadata(cudf_dtype_from_pa_type(array.type)) @@ -1782,9 +1773,7 @@ def as_column( if dtype is None: dtype = cudf.dtype("float64") - data = as_column( - utils.scalar_broadcast_to(arbitrary, length, dtype=dtype) - ) + data = as_column(full(length, arbitrary, dtype=dtype)) if not nan_as_null and not is_decimal_dtype(data.dtype): if np.issubdtype(data.dtype, np.floating): data = data.fillna(np.nan) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index fac8af652c1..375a19f5423 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -2,7 +2,7 @@ from __future__ import annotations -import datetime as dt +import datetime import locale import re from locale import nl_langinfo @@ -237,9 +237,9 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: if isinstance(other, (cudf.Scalar, ColumnBase, cudf.DateOffset)): return other - if isinstance(other, dt.datetime): + if isinstance(other, datetime.datetime): other = np.datetime64(other) - elif isinstance(other, dt.timedelta): + elif isinstance(other, datetime.timedelta): other = np.timedelta64(other) elif isinstance(other, pd.Timestamp): other = other.to_datetime64() diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index f10e257d359..d8ddb3d8d1a 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,8 +1,8 @@ # Copyright (c) 2021-2022, NVIDIA CORPORATION. 
+import warnings
 from decimal import Decimal
 from typing import Any, Sequence, Tuple, Union, cast
-from warnings import warn

 import cupy as cp
 import numpy as np
@@ -43,7 +43,7 @@ def as_decimal_column(
             isinstance(dtype, cudf.core.dtypes.DecimalDtype)
             and dtype.scale < self.dtype.scale
         ):
-            warn(
+            warnings.warn(
                 "cuDF truncates when downcasting decimals to a lower scale. "
                 "To round, use Series.round() or DataFrame.round()."
             )
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 8578bfe8147..2964378d114 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -17,7 +17,8 @@
     drop_list_duplicates,
     extract_element_column,
     extract_element_scalar,
-    index_of,
+    index_of_column,
+    index_of_scalar,
     sort_lists,
 )
 from cudf._lib.strings.convert.convert_lists import format_list_column
@@ -113,9 +114,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
             return NotImplemented
         if isinstance(other.dtype, ListDtype):
             if op == "__add__":
-                return concatenate_rows(
-                    cudf.core.frame.Frame({0: self, 1: other})
-                )
+                return concatenate_rows([self, other])
             else:
                 raise NotImplementedError(
                     "Lists concatenation for this operation is not yet"
@@ -301,16 +300,31 @@ def as_string_column(
         """
         Create a strings column from a list column
         """
-        # Convert the leaf child column to strings column
+        lc = self._transform_leaves(
+            lambda col, dtype: col.as_string_column(dtype), dtype
+        )
+
+        # Separator strings to match the Python format
+        separators = as_column([", ", "[", "]"])
+
+        # Call libcudf to format the list column
+        return format_list_column(lc, separators)
+
+    def _transform_leaves(self, func, *args, **kwargs):
+        # return a new list column with the same nested structure
+        # as ``self``, but with the leaf column transformed
+        # by applying ``func`` to it
+
         cc: List[ListColumn] = []
         c: ColumnBase = self
+
         while isinstance(c, ListColumn):
             cc.insert(0, c)
             c = c.children[1]
-        s = c.as_string_column(dtype)
+
+        lc = func(c, *args, **kwargs)

         # Rebuild the list column replacing just the leaf child
-        lc = s
         for c in cc:
             o = c.children[0]
             lc = cudf.core.column.ListColumn(  # type: ignore
@@ -321,12 +335,7 @@
                 null_count=c.null_count,
                 children=(o, lc),
             )
-
-        # Separator strings to match the Python format
-        separators = as_column([", ", "[", "]"])
-
-        # Call libcudf to format the list column
-        return format_list_column(lc, separators)
+        return lc


 class ListMethods(ColumnMethods):
@@ -455,10 +464,61 @@ def contains(self, search_key: ScalarLike) -> ParentType:
                 raise
         return res

-    def index(self, search_key: ScalarLike) -> ParentType:
-        search_key = cudf.Scalar(search_key)
+    def index(self, search_key: Union[ScalarLike, ColumnLike]) -> ParentType:
+        """
+        Returns integers representing the index of the search key for each
+        row.
+
+        If ``search_key`` is a sequence, it must be the same length as the
+        Series and ``search_key[i]`` represents the search key for the
+        ``i``-th row of the Series.
+
+        If the search key is not contained in a row, -1 is returned. If
+        either the row or the search key is null, ``<NA>`` is returned. If
+        the search key is contained multiple times, the smallest matching
+        index is returned.
+
+        Parameters
+        ----------
+        search_key : scalar or sequence of scalars
+            Element or elements being searched for in each row of the list
+            column
+
+        Returns
+        -------
+        Series or Index
+
+        Examples
+        --------
+        >>> s = cudf.Series([[1, 2, 3], [3, 4, 5], [4, 5, 6]])
+        >>> s.list.index(4)
+        0   -1
+        1    1
+        2    0
+        dtype: int32
+
+        >>> s = cudf.Series([["a", "b", "c"], ["x", "y", "z"]])
+        >>> s.list.index(["b", "z"])
+        0    1
+        1    2
+        dtype: int32
+
+        >>> s = cudf.Series([[4, 5, 6], None, [-3, -2, -1]])
+        >>> s.list.index([None, 3, -2])
+        0    <NA>
+        1    <NA>
+        2       1
+        dtype: int32
+        """
+
         try:
-            res = self._return_or_inplace(index_of(self._column, search_key))
+            if is_scalar(search_key):
+                return self._return_or_inplace(
+                    index_of_scalar(self._column, cudf.Scalar(search_key))
+                )
+            else:
+                return self._return_or_inplace(
+                    index_of_column(self._column, as_column(search_key))
+                )
+
         except RuntimeError as e:
             if (
                 "Type/Scale of search key does not "
@@ -466,7 +526,6 @@
             ):
                 raise TypeError(str(e)) from e
             raise
-        return res

     @property
     def leaves(self) -> ParentType:
@@ -717,3 +776,31 @@ def concat(self, dropna=True) -> ParentType:
                 "of nesting"
             )
         return self._return_or_inplace(result)
+
+    def astype(self, dtype):
+        """
+        Return a new list Series with the leaf values cast
+        to the specified data type.
+
+        Parameters
+        ----------
+        dtype : data type to cast leaf values to
+
+        Returns
+        -------
+        A new Series of lists
+
+        Examples
+        --------
+        >>> s = cudf.Series([[1, 2], [3, 4]])
+        >>> s.dtype
+        ListDtype(int64)
+        >>> s2 = s.list.astype("float64")
+        >>> s2.dtype
+        ListDtype(float64)
+        """
+        return self._return_or_inplace(
+            self._column._transform_leaves(
+                lambda col, dtype: col.astype(dtype), dtype
+            )
+        )
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 216faaa8250..d30026e8bfa 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -40,11 +40,12 @@
     as_column,
     build_column,
     column,
+    full,
     string,
 )
 from cudf.core.dtypes import CategoricalDtype
 from cudf.core.mixins import BinaryOperand
-from cudf.utils import cudautils, utils
+from cudf.utils import cudautils
 from cudf.utils.dtypes import (
     NUMERIC_TYPES,
     min_column_type,
@@ -254,9 +255,7 @@ def normalize_binop_value(
             if np.isscalar(other):
                 return cudf.dtype(other_dtype).type(other)
             else:
-                ary = utils.scalar_broadcast_to(
-                    other, size=len(self), dtype=other_dtype
-                )
+                ary = full(len(self), other, dtype=other_dtype)
                 return column.build_column(
                     data=Buffer(ary),
                     dtype=ary.dtype,
@@ -438,9 +437,7 @@ def find_and_replace(
             )
         if len(replacement_col) == 1 and len(to_replace_col) > 1:
             replacement_col = column.as_column(
-                utils.scalar_broadcast_to(
-                    replacement[0], (len(to_replace_col),), self.dtype
-                )
+                full(len(to_replace_col), replacement[0], self.dtype)
             )
         elif len(replacement_col) == 1 and len(to_replace_col) == 0:
             return self.copy()
@@ -774,6 +771,4 @@ def digitize(
     if bin_col.nullable:
         raise ValueError("`bins` cannot contain null entries.")

-    return as_column(
-        libcudf.sort.digitize(column.as_frame(), bin_col.as_frame(), right)
-    )
+    return as_column(libcudf.sort.digitize([column], [bin_col], right))
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index d5d45c341d5..0db7e7d9a27 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -37,7 +37,6 @@
 from cudf.core.buffer import Buffer
 from cudf.core.column import
column, datetime from cudf.core.column.methods import ColumnMethods, ParentType -from cudf.utils import utils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import can_convert_to_column @@ -202,7 +201,7 @@ def __getitem__(self, key): return self.get(key) def len(self) -> SeriesOrIndex: - """ + r""" Computes the length of each element in the Series/Index. Returns @@ -214,7 +213,7 @@ def len(self) -> SeriesOrIndex: Examples -------- >>> import cudf - >>> s = cudf.Series(["dog", "", "\\n", None]) + >>> s = cudf.Series(["dog", "", "\n", None]) >>> s.str.len() 0 3 1 0 @@ -365,9 +364,7 @@ def cat(self, others=None, sep=None, na_rep=None): other_cols = _get_cols_list(self._parent, others) all_cols = [self._column] + other_cols data = libstrings.concatenate( - cudf.DataFrame( - {index: value for index, value in enumerate(all_cols)} - ), + all_cols, cudf.Scalar(sep), cudf.Scalar(na_rep, "str"), ) @@ -963,7 +960,7 @@ def replace( ) def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: - """ + r""" Use the ``repl`` back-ref template to create a new string with the extracted elements found using the ``pat`` expression. @@ -983,7 +980,7 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: -------- >>> import cudf >>> s = cudf.Series(["A543","Z756"]) - >>> s.str.replace_with_backrefs('(\\\\d)(\\\\d)', 'V\\\\2\\\\1') + >>> s.str.replace_with_backrefs('(\\d)(\\d)', 'V\\2\\1') 0 AV453 1 ZV576 dtype: object @@ -1198,7 +1195,7 @@ def istimestamp(self, format: str) -> SeriesOrIndex: ) def isfloat(self) -> SeriesOrIndex: - """ + r""" Check whether all characters in each string form floating value. If a string has zero characters, False is returned for @@ -1252,7 +1249,7 @@ def isfloat(self) -> SeriesOrIndex: 4 True 5 False dtype: bool - >>> s = cudf.Series(["this is plain text", "\\t\\n", "9.9", "9.9.9"]) + >>> s = cudf.Series(["this is plain text", "\t\n", "9.9", "9.9.9"]) >>> s.str.isfloat() 0 False 1 False @@ -2242,7 +2239,7 @@ def get(self, i: int = 0) -> SeriesOrIndex: return self._return_or_inplace(libstrings.get(self._column, i)) def get_json_object(self, json_path): - """ + r""" Applies a JSONPath string to an input strings column where each row in the column is a valid json string @@ -2261,7 +2258,7 @@ def get_json_object(self, json_path): >>> import cudf >>> s = cudf.Series( [ - \\"\\"\\" + \"\"\" { "store":{ "book":[ @@ -2280,13 +2277,13 @@ def get_json_object(self, json_path): ] } } - \\"\\"\\" + \"\"\" ]) >>> s - 0 {"store": {\\n "book": [\\n { "cat... + 0 {"store": {\n "book": [\n { "cat... dtype: object >>> s.str.get_json_object("$.store.book") - 0 [\\n { "category": "reference",\\n ... + 0 [\n { "category": "reference",\n ... dtype: object """ @@ -3141,7 +3138,7 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: ) def strip(self, to_strip: str = None) -> SeriesOrIndex: - """ + r""" Remove leading and trailing characters. Strip whitespaces (including newlines) or a set of @@ -3172,11 +3169,11 @@ def strip(self, to_strip: str = None) -> SeriesOrIndex: Examples -------- >>> import cudf - >>> s = cudf.Series(['1. Ant. ', '2. Bee!\\n', '3. Cat?\\t', None]) + >>> s = cudf.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', None]) >>> s 0 1. Ant. - 1 2. Bee!\\n - 2 3. Cat?\\t + 1 2. Bee!\n + 2 3. Cat?\t 3 dtype: object >>> s.str.strip() @@ -3185,7 +3182,7 @@ def strip(self, to_strip: str = None) -> SeriesOrIndex: 2 3. Cat? 3 dtype: object - >>> s.str.strip('123.!? \\n\\t') + >>> s.str.strip('123.!? 
\n\t') 0 Ant 1 Bee 2 Cat @@ -3200,7 +3197,7 @@ def strip(self, to_strip: str = None) -> SeriesOrIndex: ) def lstrip(self, to_strip: str = None) -> SeriesOrIndex: - """ + r""" Remove leading and trailing characters. Strip whitespaces (including newlines) @@ -3231,11 +3228,11 @@ def lstrip(self, to_strip: str = None) -> SeriesOrIndex: Examples -------- >>> import cudf - >>> s = cudf.Series(['1. Ant. ', '2. Bee!\\n', '3. Cat?\\t', None]) + >>> s = cudf.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', None]) >>> s.str.lstrip('123.') 0 Ant. - 1 Bee!\\n - 2 Cat?\\t + 1 Bee!\n + 2 Cat?\t 3 dtype: object """ @@ -3247,7 +3244,7 @@ def lstrip(self, to_strip: str = None) -> SeriesOrIndex: ) def rstrip(self, to_strip: str = None) -> SeriesOrIndex: - """ + r""" Remove leading and trailing characters. Strip whitespaces (including newlines) @@ -3280,14 +3277,14 @@ def rstrip(self, to_strip: str = None) -> SeriesOrIndex: Examples -------- >>> import cudf - >>> s = cudf.Series(['1. Ant. ', '2. Bee!\\n', '3. Cat?\\t', None]) + >>> s = cudf.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', None]) >>> s 0 1. Ant. - 1 2. Bee!\\n - 2 3. Cat?\\t + 1 2. Bee!\n + 2 3. Cat?\t 3 dtype: object - >>> s.str.rstrip('.!? \\n\\t') + >>> s.str.rstrip('.!? \n\t') 0 1. Ant 1 2. Bee 2 3. Cat @@ -3302,7 +3299,7 @@ def rstrip(self, to_strip: str = None) -> SeriesOrIndex: ) def wrap(self, width: int, **kwargs) -> SeriesOrIndex: - """ + r""" Wrap long strings in the Series/Index to be formatted in paragraphs with length less than a given width. @@ -3343,8 +3340,8 @@ def wrap(self, width: int, **kwargs) -> SeriesOrIndex: >>> data = ['line to be wrapped', 'another line to be wrapped'] >>> s = cudf.Series(data) >>> s.str.wrap(12) - 0 line to be\\nwrapped - 1 another line\\nto be\\nwrapped + 0 line to be\nwrapped + 1 another line\nto be\nwrapped dtype: object """ if not is_integer(width): @@ -3578,7 +3575,7 @@ def isempty(self) -> SeriesOrIndex: return self._return_or_inplace((self._column == "").fillna(False)) def isspace(self) -> SeriesOrIndex: - """ + r""" Check whether all characters in each string are whitespace. This is equivalent to running the Python string method @@ -3626,7 +3623,7 @@ def isspace(self) -> SeriesOrIndex: Examples -------- >>> import cudf - >>> s = cudf.Series([' ', '\\t\\r\\n ', '']) + >>> s = cudf.Series([' ', '\t\r\n ', '']) >>> s.str.isspace() 0 True 1 True @@ -4274,7 +4271,7 @@ def normalize_spaces(self) -> SeriesOrIndex: ) def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: - """ + r""" Normalizes strings characters for tokenizing. This uses the normalizer that is built into the @@ -4283,7 +4280,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: - adding padding around punctuation (unicode category starts with "P") as well as certain ASCII symbols like "^" and "$" - adding padding around the CJK Unicode block characters - - changing whitespace (e.g. ``\\t``, ``\\n``, ``\\r``) to space + - changing whitespace (e.g. ``\t``, ``\n``, ``\r``) to space - removing control characters (unicode categories "Cc" and "Cf") If `do_lower_case = true`, lower-casing also removes the accents. 
@@ -4306,7 +4303,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: Examples -------- >>> import cudf - >>> ser = cudf.Series(["héllo, \\tworld","ĂĆCĖÑTED","$99"]) + >>> ser = cudf.Series(["héllo, \tworld","ĂĆCĖÑTED","$99"]) >>> ser.str.normalize_characters() 0 hello , world 1 accented @@ -5523,15 +5520,21 @@ def _binaryop( if isinstance(other, (StringColumn, str, cudf.Scalar)): if op == "__add__": if isinstance(other, cudf.Scalar): - other = utils.scalar_broadcast_to( - other, size=len(self), dtype="object" + other = cast( + StringColumn, + column.full(len(self), other, dtype="object"), ) + + # Explicit types are necessary because mypy infers ColumnBase + # rather than StringColumn and sometimes forgets Scalar. + lhs: Union[cudf.Scalar, StringColumn] + rhs: Union[cudf.Scalar, StringColumn] lhs, rhs = (other, self) if reflect else (self, other) return cast( "column.ColumnBase", libstrings.concatenate( - cudf.DataFrame._from_data(data={0: lhs, 1: rhs}), + [lhs, rhs], sep=cudf.Scalar(""), na_rep=cudf.Scalar(None, "str"), ), diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 15815427aca..810624e9f4e 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -2,7 +2,7 @@ from __future__ import annotations -import datetime as dt +import datetime from typing import Any, Sequence, cast import numpy as np @@ -211,7 +211,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: def normalize_binop_value(self, other) -> ColumnBinaryOperand: if isinstance(other, (ColumnBase, cudf.Scalar)): return other - if isinstance(other, dt.timedelta): + if isinstance(other, datetime.timedelta): other = np.timedelta64(other) elif isinstance(other, pd.Timestamp): other = other.to_datetime64() diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 291e50386cc..34236a8c09e 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -3,7 +3,7 @@ from __future__ import annotations import itertools -from collections.abc import MutableMapping +from collections import abc from functools import cached_property, reduce from typing import ( TYPE_CHECKING, @@ -78,7 +78,7 @@ def _to_flat_dict(d): return {k: v for k, v in _to_flat_dict_inner(d)} -class ColumnAccessor(MutableMapping): +class ColumnAccessor(abc.MutableMapping): """ Parameters ---------- @@ -99,7 +99,7 @@ class ColumnAccessor(MutableMapping): def __init__( self, - data: Union[MutableMapping, ColumnAccessor] = None, + data: Union[abc.MutableMapping, ColumnAccessor] = None, multiindex: bool = False, level_names=None, ): @@ -149,7 +149,7 @@ def _create_unsafe( return obj def __iter__(self): - return self._data.__iter__() + return iter(self._data) def __getitem__(self, key: Any) -> ColumnBase: return self._data[key] @@ -158,7 +158,7 @@ def __setitem__(self, key: Any, value: Any): self.set_by_label(key, value) def __delitem__(self, key: Any): - self._data.__delitem__(key) + del self._data[key] self._clear_cache() def __len__(self) -> int: @@ -213,7 +213,7 @@ def columns(self) -> Tuple[ColumnBase, ...]: return tuple(self.values()) @cached_property - def _grouped_data(self) -> MutableMapping: + def _grouped_data(self) -> abc.MutableMapping: """ If self.multiindex is True, return the underlying mapping as a nested mapping. 
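# The collections.abc changes above (and in categorical.py, cut.py,
# dataframe.py, df_protocol.py, and groupby.py in this patch) matter on
# Python >= 3.10, where the ABC aliases on the bare collections module were
# removed. A minimal sketch of the behavior these imports rely on:
#
#     from collections import abc
#     isinstance(["a", "b"], abc.Sequence)        # True
#     isinstance({"a": 1}, abc.MutableMapping)    # True
#
#     import collections
#     collections.Sequence  # AttributeError on Python 3.10+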
diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 7c585602c23..2ec39043eb2 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -1,4 +1,6 @@ -from collections.abc import Sequence +# Copyright (c) 2021-2022, NVIDIA CORPORATION. + +from collections import abc import cupy import numpy as np @@ -21,21 +23,27 @@ def cut( duplicates: str = "raise", ordered: bool = True, ): + """Bin values into discrete intervals. - """ - Bin values into discrete intervals. Use cut when you need to segment and sort data values into bins. This function is also useful for going from a continuous variable to a categorical variable. + Parameters ---------- x : array-like The input array to be binned. Must be 1-dimensional. bins : int, sequence of scalars, or IntervalIndex The criteria to bin by. - * int : Defines the number of equal-width bins in the - range of x. The range of x is extended by .1% on each - side to include the minimum and maximum values of x. + + * int : Defines the number of equal-width bins in the range of `x`. The + range of `x` is extended by .1% on each side to include the minimum + and maximum values of `x`. + * sequence of scalars : Defines the bin edges allowing for non-uniform + width. No extension of the range of `x` is done. + * IntervalIndex : Defines the exact bins to be used. Note that + IntervalIndex for `bins` must be non-overlapping. + right : bool, default True Indicates whether bins includes the rightmost edge or not. labels : array or False, default None @@ -56,6 +64,7 @@ def cut( Categorical and Series (with Categorical dtype). If True, the resulting categorical will be ordered. If False, the resulting categorical will be unordered (labels must be provided). + Returns ------- out : CategoricalIndex @@ -66,30 +75,38 @@ def cut( For scalar or sequence bins, this is an ndarray with the computed bins. If set duplicates=drop, bins will drop non-unique bin. For an IntervalIndex bins, this is equal to bins. + Examples -------- Discretize into three equal-sized bins. + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3) CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], - ... (5.0, 7.0],(0.994, 3.0]], categories=[(0.994, 3.0], - ... (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category') + (5.0, 7.0], (0.994, 3.0]], categories=[(0.994, 3.0], + (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category') + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) (CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], - ... (5.0, 7.0],(0.994, 3.0]],categories=[(0.994, 3.0], - ... (3.0, 5.0], (5.0, 7.0]],ordered=True, dtype='category'), - array([0.994, 3. , 5. , 7. ])) + (5.0, 7.0], (0.994, 3.0]], categories=[(0.994, 3.0], + (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category'), + array([0.994, 3. , 5. , 7. ])) + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), - ... 3, labels=["bad", "medium", "good"]) + ... 3, labels=["bad", "medium", "good"]) CategoricalIndex(['bad', 'good', 'medium', 'medium', 'good', 'bad'], - ... categories=['bad', 'medium', 'good'],ordered=True, - ... dtype='category') + categories=['bad', 'medium', 'good'],ordered=True, + dtype='category') + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3, - ... labels=["B", "A", "B"], ordered=False) + ... labels=["B", "A", "B"], ordered=False) CategoricalIndex(['B', 'B', 'A', 'A', 'B', 'B'], categories=['A', 'B'], - ... 
ordered=False, dtype='category') + ordered=False, dtype='category') + >>> cudf.cut([0, 1, 1, 2], bins=4, labels=False) array([0, 1, 1, 3], dtype=int32) + Passing a Series as an input returns a Series with categorical dtype: + >>> s = cudf.Series(np.array([2, 4, 6, 8, 10]), ... index=['a', 'b', 'c', 'd', 'e']) >>> cudf.cut(s, 3) @@ -123,7 +140,7 @@ def cut( ) # bins can either be an int, sequence of scalars or an intervalIndex - if isinstance(bins, Sequence): + if isinstance(bins, abc.Sequence): if len(set(bins)) is not len(bins): if duplicates == "raise": raise ValueError( @@ -141,7 +158,7 @@ def cut( # create bins if given an int or single scalar if not isinstance(bins, pd.IntervalIndex): - if not isinstance(bins, (Sequence)): + if not isinstance(bins, (abc.Sequence)): if isinstance( x, (pd.Series, cudf.Series, np.ndarray, cupy.ndarray) ): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 277fd5aae57..0d3b3ee0300 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7,12 +7,13 @@ import itertools import numbers import pickle +import re import sys import warnings -from collections import defaultdict -from collections.abc import Iterable, Mapping, Sequence +from collections import abc, defaultdict from typing import ( Any, + Callable, Dict, List, MutableMapping, @@ -25,6 +26,7 @@ ) import cupy +import numba import numpy as np import pandas as pd import pyarrow as pa @@ -78,7 +80,7 @@ from cudf.core.resample import DataFrameResampler from cudf.core.series import Series from cudf.core.udf.row_function import _get_row_kernel -from cudf.utils import applyutils, docutils, ioutils, queryutils, utils +from cudf.utils import applyutils, docutils, ioutils, queryutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( can_convert_to_column, @@ -1103,7 +1105,7 @@ def __getitem__(self, arg): elif can_convert_to_column(arg): mask = arg if is_list_like(mask): - mask = cudf.utils.utils._create_pandas_series(data=mask) + mask = pd.Series(mask) if mask.dtype == "bool": return self._apply_boolean_mask(mask) else: @@ -1172,9 +1174,7 @@ def __setitem__(self, arg, value): allow_non_unique=True, ) if is_scalar(value): - self._data[arg] = utils.scalar_broadcast_to( - value, len(self) - ) + self._data[arg] = column.full(len(self), value) else: value = as_column(value) self._data[arg] = value @@ -1855,7 +1855,7 @@ def _make_operands_and_index_for_binop( Optional[BaseIndex], ]: # Check built-in types first for speed. - if isinstance(other, (list, dict, Sequence, Mapping)): + if isinstance(other, (list, dict, abc.Sequence, abc.Mapping)): warnings.warn( "Binary operations between host objects such as " f"{type(other)} and cudf.DataFrame are deprecated and will be " @@ -1876,7 +1876,7 @@ def _make_operands_and_index_for_binop( if _is_scalar_or_zero_d_array(other): rhs = {name: other for name in self._data} - elif isinstance(other, (list, Sequence)): + elif isinstance(other, (list, abc.Sequence)): rhs = {name: o for (name, o) in zip(self._data, other)} elif isinstance(other, Series): rhs = dict(zip(other.index.values_host, other.values_host)) @@ -1905,7 +1905,7 @@ def _make_operands_and_index_for_binop( # the fill value. 
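            # (For example, df.add(other, fill_value=0) treats an index label
            # present in only one operand as 0 on the missing side instead of
            # propagating null into the result.)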
left_default = fill_value - if not isinstance(rhs, (dict, Mapping)): + if not isinstance(rhs, (dict, abc.Mapping)): return NotImplemented, None operands = { @@ -2571,8 +2571,24 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): f"{num_cols * (num_cols > 0)}" ) + # TODO: This check is currently necessary because + # _is_scalar_or_zero_d_array below will treat a length 1 pd.Categorical + # as a scalar and attempt to use column.full, which can't handle it. + # Maybe _is_scalar_or_zero_d_array should be changed, or maybe we just + # shouldn't support pd.Categorical at all, but those changes will at + # least require a deprecation cycle because we currently support + # inserting a pd.Categorical. + if isinstance(value, pd.Categorical): + value = cudf.core.column.categorical.pandas_categorical_as_column( + value + ) + if _is_scalar_or_zero_d_array(value): - value = utils.scalar_broadcast_to(value, len(self)) + value = column.full( + len(self), + value, + "str" if libcudf.scalar._is_null_host_scalar(value) else None, + ) if len(self) == 0: if isinstance(value, (pd.Series, Series)): @@ -2658,11 +2674,6 @@ def diff(self, periods=1, axis=0): if axis != 0: raise NotImplementedError("Only axis=0 is supported.") - if not all(is_numeric_dtype(i) for i in self.dtypes): - raise NotImplementedError( - "DataFrame.diff only supports numeric dtypes" - ) - if abs(periods) > len(self): df = cudf.DataFrame._from_data( { @@ -2964,7 +2975,9 @@ def agg(self, aggs, axis=None): if axis == 0 or axis is not None: raise NotImplementedError("axis not implemented yet") - if isinstance(aggs, Iterable) and not isinstance(aggs, (str, dict)): + if isinstance(aggs, abc.Iterable) and not isinstance( + aggs, (str, dict) + ): result = DataFrame() # TODO : Allow simultaneous pass for multi-aggregation as # a future optimization @@ -3000,13 +3013,13 @@ def agg(self, aggs, axis=None): f"'Series' object" ) result[key] = getattr(col, value)() - elif all([isinstance(val, Iterable) for val in aggs.values()]): + elif all([isinstance(val, abc.Iterable) for val in aggs.values()]): idxs = set() for val in aggs.values(): - if isinstance(val, Iterable): - idxs.update(val) - elif isinstance(val, str): + if isinstance(val, str): idxs.add(val) + elif isinstance(val, abc.Iterable): + idxs.update(val) idxs = sorted(list(idxs)) for agg in idxs: if agg is callable: @@ -3020,7 +3033,7 @@ def agg(self, aggs, axis=None): len(idxs), dtype=col.dtype, masked=True ) ans = cudf.Series(data=col_empty, index=idxs) - if isinstance(aggs.get(key), Iterable): + if isinstance(aggs.get(key), abc.Iterable): # TODO : Allow simultaneous pass for multi-aggregation # as a future optimization for agg in aggs.get(key): @@ -3196,17 +3209,42 @@ def transpose(self): Difference from pandas: Not supporting *copy* because default and only behavior is copy=True """ - # Never transpose a MultiIndex - remove the existing columns and - # replace with a RangeIndex. Afterward, reassign. - columns = self.index.copy(deep=False) + index = self._data.to_pandas_index() + columns = self.index.copy(deep=False) if self._num_columns == 0 or self._num_rows == 0: return DataFrame(index=index, columns=columns) + + # No column from index is transposed with libcudf. 
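+        # (The block below first unifies categorical columns: each column is
+        # recoded against the union of all columns' categories so that the
+        # integer codes are comparable, the codes are transposed with
+        # libcudf, and the categorical dtype is reattached to the transposed
+        # columns afterwards.)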
+ source_columns = [*self._columns] + source_dtype = source_columns[0].dtype + if is_categorical_dtype(source_dtype): + if any(not is_categorical_dtype(c.dtype) for c in source_columns): + raise ValueError("Columns must all have the same dtype") + cats = list(c.categories for c in source_columns) + cats = cudf.core.column.concat_columns(cats).unique() + source_columns = [ + col._set_categories(cats, is_unique=True).codes + for col in source_columns + ] + + if any(c.dtype != source_columns[0].dtype for c in source_columns): + raise ValueError("Columns must all have the same dtype") + + result_columns = libcudf.transpose.transpose(source_columns) + + if is_categorical_dtype(source_dtype): + result_columns = [ + codes._with_type_metadata( + cudf.core.dtypes.CategoricalDtype(categories=cats) + ) + for codes in result_columns + ] + # Set the old column names as the new index result = self.__class__._from_data( - # Cython renames the columns to the range [0...ncols] - libcudf.transpose.transpose(self), - as_index(index), + {i: col for i, col in enumerate(result_columns)}, + index=as_index(index), ) # Set the old index as the new column names result.columns = columns @@ -3564,12 +3602,13 @@ def apply( ): """ Apply a function along an axis of the DataFrame. - - Designed to mimic `pandas.DataFrame.apply`. Applies a user - defined function row wise over a dataframe, with true null - handling. Works with UDFs using `core.udf.pipeline.nulludf` - and returns a single series. Uses numba to jit compile the - function to PTX via LLVM. + ``apply`` relies on Numba to JIT compile ``func``. + Thus the allowed operations within ``func`` are limited + to the ones specified + [here](https://numba.pydata.org/numba-doc/latest/cuda/cudapysupported.html). + For more information, see the cuDF guide + to user defined functions found + [here](https://docs.rapids.ai/api/cudf/stable/user_guide/guide-to-udfs.html). Parameters ---------- @@ -3590,7 +3629,7 @@ def apply( Examples -------- - Simple function of a single variable which could be NA + Simple function of a single variable which could be NA: >>> def f(row): ... if row['a'] is cudf.NA: @@ -3606,7 +3645,7 @@ def apply( dtype: int64 Function of multiple variables will operate in - a null aware manner + a null aware manner: >>> def f(row): ... return row['a'] - row['b'] @@ -3622,7 +3661,7 @@ def apply( 3 dtype: int64 - Functions may conditionally return NA as in pandas + Functions may conditionally return NA as in pandas: >>> def f(row): ... if row['a'] + row['b'] > 3: @@ -3641,7 +3680,7 @@ def apply( dtype: int64 Mixed types are allowed, but will return the common - type, rather than object as in pandas + type, rather than object as in pandas: >>> def f(row): ... return row['a'] + row['b'] @@ -3658,7 +3697,7 @@ def apply( Functions may also return scalar values, however the result will be promoted to a safe type regardless of - the data + the data: >>> def f(row): ... if row['a'] > 3: @@ -3675,7 +3714,7 @@ def apply( 2 5.0 dtype: float64 - Ops against N columns are supported generally + Ops against N columns are supported generally: >>> def f(row): ... v, w, x, y, z = ( @@ -3707,6 +3746,68 @@ def apply( return self._apply(func, _get_row_kernel, *args, **kwargs) + def applymap( + self, + func: Callable[[Any], Any], + na_action: Union[str, None] = None, + **kwargs, + ) -> DataFrame: + + """ + Apply a function to a Dataframe elementwise. + + This method applies a function that accepts and returns a scalar + to every element of a DataFrame. 
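+        For example, ``df.applymap(lambda x: x ** 2)`` squares each element;
+        with ``na_action='ignore'``, null elements are returned as null
+        without being passed to ``func``.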
+ + Parameters + ---------- + func : callable + Python function, returns a single value from a single value. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NaN values, without passing them to func. + + Returns + ------- + DataFrame + Transformed DataFrame. + """ + + if kwargs: + raise NotImplementedError( + "DataFrame.applymap does not yet support **kwargs." + ) + + if na_action not in {"ignore", None}: + raise ValueError( + f"na_action must be 'ignore' or None. Got {repr(na_action)}" + ) + + if na_action == "ignore": + devfunc = numba.cuda.jit(device=True)(func) + + # promote to a null-ignoring function + # this code is never run in python, it only + # exists to provide numba with the correct + # bytecode to generate the equivalent PTX + # as a null-ignoring version of the function + def _func(x): # pragma: no cover + if x is cudf.NA: + return cudf.NA + else: + return devfunc(x) + + else: + _func = func + + # TODO: naive implementation + # this could be written as a single kernel + result = {} + for name, col in self._data.items(): + apply_sr = Series._from_data({None: col}) + result[name] = apply_sr.apply(_func) + + return DataFrame._from_data(result, index=self.index) + @_cudf_nvtx_annotate @applyutils.doc_apply() def apply_rows( @@ -3872,19 +3973,16 @@ def partition_by_hash(self, columns, nparts, keep_index=True): ------- partitioned: list of DataFrame """ - idx = ( - 0 - if (self._index is None or keep_index is False) - else self._index._num_columns - ) - key_indices = [self._data.names.index(k) + idx for k in columns] - output_data, output_index, offsets = libcudf.hash.hash_partition( - self, key_indices, nparts, keep_index + key_indices = [self._column_names.index(k) for k in columns] + output_columns, offsets = libcudf.hash.hash_partition( + [*self._columns], key_indices, nparts + ) + outdf = self._from_columns_like_self( + [*(self._index._columns if keep_index else ()), *output_columns], + self._column_names, + self._index_names if keep_index else None, ) - outdf = self.__class__._from_data(output_data, output_index) - outdf._copy_type_metadata(self, include_index=keep_index) - # Slice into partition return [outdf[s:e] for s, e in zip(offsets, offsets[1:] + [None])] @@ -5115,26 +5213,33 @@ def _reduce( if level is not None: raise NotImplementedError("level parameter is not implemented yet") - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" + source = self + if numeric_only: + numeric_cols = ( + name + for name in self._data.names + if is_numeric_dtype(self._data[name]) ) - axis = self._get_axis_from_axis_arg(axis) + source = self._get_columns_by_label(numeric_cols) + if source.empty: + return Series(index=cudf.StringIndex([])) + + axis = source._get_axis_from_axis_arg(axis) if axis == 0: try: result = [ - getattr(self._data[col], op)(**kwargs) - for col in self._data.names + getattr(source._data[col], op)(**kwargs) + for col in source._data.names ] except AttributeError: - raise TypeError(f"cannot perform {op} with type {self.dtype}") + raise TypeError(f"Not all column dtypes support op {op}") return Series._from_data( - {None: result}, as_index(self._data.names) + {None: result}, as_index(source._data.names) ) elif axis == 1: - return self._apply_cupy_method_axis_1(op, **kwargs) + return source._apply_cupy_method_axis_1(op, **kwargs) @_cudf_nvtx_annotate def _scan( @@ -5529,14 +5634,14 @@ def select_dtypes(self, include=None, exclude=None): @ioutils.doc_to_parquet() def to_parquet(self, 
path, *args, **kwargs): """{docstring}""" - from cudf.io import parquet as pq + from cudf.io import parquet - return pq.to_parquet(self, path, *args, **kwargs) + return parquet.to_parquet(self, path, *args, **kwargs) @ioutils.doc_to_feather() def to_feather(self, path, *args, **kwargs): """{docstring}""" - from cudf.io import feather as feather + from cudf.io import feather feather.to_feather(self, path, *args, **kwargs) @@ -5556,7 +5661,7 @@ def to_csv( **kwargs, ): """{docstring}""" - from cudf.io import csv as csv + from cudf.io import csv return csv.to_csv( self, @@ -5576,7 +5681,7 @@ def to_csv( @ioutils.doc_to_orc() def to_orc(self, fname, compression=None, *args, **kwargs): """{docstring}""" - from cudf.io import orc as orc + from cudf.io import orc orc.to_orc(self, fname, compression, *args, **kwargs) @@ -5610,22 +5715,24 @@ def stack(self, level=-1, dropna=True): """ assert level in (None, -1) repeated_index = self.index.repeat(self.shape[1]) - name_index = cudf.DataFrame._from_data({0: self._column_names}).tile( - self.shape[0] + name_index = libcudf.reshape.tile( + [as_column(self._column_names)], self.shape[0] ) - new_index = list(repeated_index._columns) + [name_index._columns[0]] + new_index_columns = [*repeated_index._columns, *name_index] if isinstance(self._index, MultiIndex): index_names = self._index.names + [None] else: - index_names = [None] * len(new_index) + index_names = [None] * len(new_index_columns) new_index = MultiIndex.from_frame( - DataFrame(dict(zip(range(0, len(new_index)), new_index))), + DataFrame._from_data( + dict(zip(range(0, len(new_index_columns)), new_index_columns)) + ), names=index_names, ) # Collect datatypes and cast columns as that type common_type = np.result_type(*self.dtypes) - homogenized = DataFrame( + homogenized = DataFrame._from_data( { c: ( self._data[c].astype(common_type) @@ -5636,9 +5743,15 @@ def stack(self, level=-1, dropna=True): } ) - data_col = libcudf.reshape.interleave_columns(homogenized) + result = Series._from_data( + { + None: libcudf.reshape.interleave_columns( + [*homogenized._columns] + ) + }, + index=new_index, + ) - result = Series(data=data_col, index=new_index) if dropna: return result.dropna() else: @@ -6090,7 +6203,7 @@ def _sample_axis_1( def _from_columns_like_self( self, columns: List[ColumnBase], - column_names: Iterable[str], + column_names: abc.Iterable[str], index_names: Optional[List[str]] = None, ) -> DataFrame: result = super()._from_columns_like_self( @@ -6099,6 +6212,207 @@ def _from_columns_like_self( result._set_column_names_like(self) return result + @_cudf_nvtx_annotate + def interleave_columns(self): + """ + Interleave Series columns of a table into a single column. + + Converts the column major table `cols` into a row major column. + + Parameters + ---------- + cols : input Table containing columns to interleave. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({0: ['A1', 'A2', 'A3'], 1: ['B1', 'B2', 'B3']}) + >>> df + 0 1 + 0 A1 B1 + 1 A2 B2 + 2 A3 B3 + >>> df.interleave_columns() + 0 A1 + 1 B1 + 2 A2 + 3 B2 + 4 A3 + 5 B3 + dtype: object + + Returns + ------- + The interleaved columns as a single column + """ + if ("category" == self.dtypes).any(): + raise ValueError( + "interleave_columns does not support 'category' dtype." 
+ ) + + return self._constructor_sliced._from_data( + {None: libcudf.reshape.interleave_columns([*self._columns])} + ) + + @_cudf_nvtx_annotate + def eval(self, expr: str, inplace: bool = False, **kwargs): + """Evaluate a string describing operations on DataFrame columns. + + Operates on columns only, not specific rows or elements. + + Parameters + ---------- + expr : str + The expression string to evaluate. + inplace : bool, default False + If the expression contains an assignment, whether to perform the + operation inplace and mutate the existing DataFrame. Otherwise, + a new DataFrame is returned. + **kwargs + Not supported. + + Returns + ------- + DataFrame, Series, or None + Series if a single column is returned (the typical use case), + DataFrame if any assignment statements are included in + ``expr``, or None if ``inplace=True``. + + Notes + ----- + Difference from pandas: + * Additional kwargs are not supported. + * Bitwise and logical operators are not dtype-dependent. + Specifically, `&` must be used for bitwise operators on integers, + not `and`, which is specifically for the logical and between + booleans. + * Only numerical types are currently supported. + * Operators generally will not cast automatically. Users are + responsible for casting columns to suitable types before + evaluating a function. + * Multiple assignments to the same name (i.e. a sequence of + assignment statements where later statements are conditioned upon + the output of earlier statements) is not supported. + + Examples + -------- + >>> df = cudf.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + >>> df.eval('A + B') + 0 11 + 1 10 + 2 9 + 3 8 + 4 7 + dtype: int64 + + Assignment is allowed though by default the original DataFrame is not + modified. + + >>> df.eval('C = A + B') + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + Use ``inplace=True`` to modify the original DataFrame. + + >>> df.eval('C = A + B', inplace=True) + >>> df + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + + Multiple columns can be assigned to using multi-line expressions: + + >>> df.eval( + ... ''' + ... C = A + B + ... D = A - B + ... ''' + ... ) + A B C D + 0 1 10 11 -9 + 1 2 8 10 -6 + 2 3 6 9 -3 + 3 4 4 8 0 + 4 5 2 7 3 + """ + if kwargs: + raise ValueError( + "Keyword arguments other than `inplace` are not supported" + ) + + # Have to use a regex match to avoid capturing "==" + includes_assignment = re.search("[^=]=[^=]", expr) is not None + + # Check if there were multiple statements. Filter out empty lines. + statements = tuple(filter(None, expr.strip().split("\n"))) + if len(statements) > 1 and any( + re.search("[^=]=[^=]", st) is None for st in statements + ): + raise ValueError( + "Multi-line expressions are only valid if all expressions " + "contain an assignment." 
+ ) + + if not includes_assignment: + if inplace: + raise ValueError( + "Cannot operate inplace if there is no assignment" + ) + return Series._from_data( + { + None: libcudf.transform.compute_column( + [*self._columns], self._column_names, statements[0] + ) + } + ) + + targets = [] + exprs = [] + for st in statements: + try: + t, e = re.split("[^=]=[^=]", st) + except ValueError as err: + if "too many values" in str(err): + raise ValueError( + f"Statement {st} contains too many assignments ('=')" + ) + raise + targets.append(t.strip()) + exprs.append(e.strip()) + + cols = ( + libcudf.transform.compute_column( + [*self._columns], self._column_names, e + ) + for e in exprs + ) + ret = self if inplace else self.copy(deep=False) + for name, col in zip(targets, cols): + ret._data[name] = col + if not inplace: + return ret + def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 4a30a78bf65..f4ce658bff3 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -1,7 +1,7 @@ # Copyright (c) 2021-2022, NVIDIA CORPORATION. -import collections import enum +from collections import abc from typing import ( Any, Dict, @@ -569,13 +569,13 @@ def get_columns(self) -> Iterable[_CuDFColumn]: ] def select_columns(self, indices: Sequence[int]) -> "_CuDFDataFrame": - if not isinstance(indices, collections.abc.Sequence): + if not isinstance(indices, abc.Sequence): raise ValueError("`indices` is not a sequence") return _CuDFDataFrame(self._df.iloc[:, indices]) def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame": - if not isinstance(names, collections.Sequence): + if not isinstance(names, abc.Sequence): raise ValueError("`names` is not a sequence") return _CuDFDataFrame( diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 21cae5f05b7..585e8b94e80 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -237,7 +237,7 @@ def __eq__(self, other): def __repr__(self): if isinstance(self.element_type, (ListDtype, StructDtype)): - return f"{type(self).__name__}({self.element_type.__repr__()})" + return f"{type(self).__name__}({repr(self.element_type)})" else: return f"{type(self).__name__}({self.element_type})" @@ -681,8 +681,13 @@ def is_interval_dtype(obj): # TODO: Should there be any branch in this function that calls # pd.api.types.is_interval_dtype? return ( - isinstance(obj, cudf.core.dtypes.IntervalDtype) - or isinstance(obj, pd.core.dtypes.dtypes.IntervalDtype) + isinstance( + obj, + ( + cudf.core.dtypes.IntervalDtype, + pd.core.dtypes.dtypes.IntervalDtype, + ), + ) or obj is cudf.core.dtypes.IntervalDtype or ( isinstance(obj, str) and obj == cudf.core.dtypes.IntervalDtype.name diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5185fb05cb4..d0e9e6d94c1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -40,6 +40,7 @@ as_column, build_categorical_column, deserialize_columns, + full, serialize_columns, ) from cudf.core.column_accessor import ColumnAccessor @@ -917,6 +918,12 @@ def scatter_by_map( ------- A list of cudf.DataFrame objects. 
""" + if not isinstance(self, cudf.DataFrame): + warnings.warn( + f"{self.__class__.__name__}.scatter_by_map is deprecated and " + "will be removed.", + FutureWarning, + ) # map_index might be a column name or array, # make it a Column @@ -954,10 +961,16 @@ def scatter_by_map( f"ERROR: map_size must be >= {count} (got {map_size})." ) - data, index, output_offsets = libcudf.partitioning.partition( - self, map_index, map_size, keep_index + partitioned_columns, output_offsets = libcudf.partitioning.partition( + [*(self._index._columns if keep_index else ()), *self._columns], + map_index, + map_size, + ) + partitioned = self._from_columns_like_self( + partitioned_columns, + column_names=self._column_names, + index_names=self._index_names if keep_index else None, ) - partitioned = self.__class__._from_data(data, index) # due to the split limitation mentioned # here: https://github.com/rapidsai/cudf/issues/4607 @@ -967,9 +980,6 @@ def scatter_by_map( result = partitioned._split(output_offsets, keep_index=keep_index) - for frame in result: - frame._copy_type_metadata(self, include_index=keep_index) - if map_size: result += [ self._empty_like(keep_index) @@ -1095,6 +1105,8 @@ def fillna( elif method == "backfill": method = "bfill" + # TODO: This logic should be handled in different subclasses since + # different Frames support different types of values. if isinstance(value, cudf.Series): value = value.reindex(self._data.names) elif isinstance(value, cudf.DataFrame): @@ -1129,7 +1141,13 @@ def fillna( filled_data[col_name] = col.copy(deep=True) return self._mimic_inplace( - self._from_data(data=filled_data), + self._from_data( + data=ColumnAccessor._create_unsafe( + data=filled_data, + multiindex=self._data.multiindex, + level_names=self._data.level_names, + ) + ), inplace=inplace, ) @@ -1209,6 +1227,11 @@ def interpolate( some or all ``NaN`` values """ + if isinstance(self, cudf.BaseIndex): + warnings.warn( + "Index.interpolate is deprecated and will be removed.", + FutureWarning, + ) if method in {"pad", "ffill"} and limit_direction != "forward": raise ValueError( @@ -1261,100 +1284,27 @@ def _quantiles( libcudf.types.NullOrder[key] for key in null_precedence ] - result = self.__class__._from_data( - *libcudf.quantiles.quantiles( - self, + return self._from_columns_like_self( + libcudf.quantiles.quantiles( + [*self._columns], q, interpolation, is_sorted, column_order, null_precedence, - ) - ) - - result._copy_type_metadata(self) - return result - - @_cudf_nvtx_annotate - def rank( - self, - axis=0, - method="average", - numeric_only=None, - na_option="keep", - ascending=True, - pct=False, - ): - """ - Compute numerical data ranks (1 through n) along axis. - By default, equal values are assigned a rank that is the average of the - ranks of those values. - - Parameters - ---------- - axis : {0 or 'index'}, default 0 - Index to direct ranking. - method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' - How to rank the group of records that have the same value - (i.e. ties): - * average: average rank of the group - * min: lowest rank in the group - * max: highest rank in the group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups. - numeric_only : bool, optional - For DataFrame objects, rank only numeric columns if set to True. 
- na_option : {'keep', 'top', 'bottom'}, default 'keep' - How to rank NaN values: - * keep: assign NaN rank to NaN values - * top: assign smallest rank to NaN values if ascending - * bottom: assign highest rank to NaN values if ascending. - ascending : bool, default True - Whether or not the elements should be ranked in ascending order. - pct : bool, default False - Whether or not to display the returned rankings in percentile - form. - - Returns - ------- - same type as caller - Return a Series or DataFrame with data ranks as values. - """ - if method not in {"average", "min", "max", "first", "dense"}: - raise KeyError(method) - - method_enum = libcudf.sort.RankMethod[method.upper()] - if na_option not in {"keep", "top", "bottom"}: - raise ValueError( - "na_option must be one of 'keep', 'top', or 'bottom'" - ) - - if axis not in (0, "index"): - raise NotImplementedError( - f"axis must be `0`/`index`, " - f"axis={axis} is not yet supported in rank" - ) - - source = self - if numeric_only: - numeric_cols = ( - name - for name in self._data.names - if _is_non_decimal_numeric_dtype(self._data[name]) - ) - source = self._get_columns_by_label(numeric_cols) - if source.empty: - return source.astype("float64") - - data, index = libcudf.sort.rank_columns( - source, method_enum, na_option, ascending, pct + ), + column_names=self._column_names, ) - return self._from_data(data, index).astype(np.float64) - @_cudf_nvtx_annotate def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" + if isinstance(self, cudf.BaseIndex): + warnings.warn( + "Index.shift is deprecated and will be removed.", + FutureWarning, + ) + axis = self._get_axis_from_axis_arg(axis) if axis != 0: raise ValueError("Only axis=0 is supported.") @@ -1441,30 +1391,33 @@ def from_arrow(cls, data): dict_indices_table = pa.table(dict_indices) data = data.drop(dict_indices_table.column_names) - cudf_indices_frame, _ = libcudf.interop.from_arrow( - dict_indices_table, dict_indices_table.column_names - ) + indices_columns = libcudf.interop.from_arrow(dict_indices_table) # as dictionary size can vary, it can't be a single table cudf_dictionaries_columns = { name: ColumnBase.from_arrow(dict_dictionaries[name]) for name in dict_dictionaries.keys() } - for name, codes in cudf_indices_frame.items(): - cudf_category_frame[name] = build_categorical_column( + cudf_category_frame = { + name: build_categorical_column( cudf_dictionaries_columns[name], codes, mask=codes.base_mask, size=codes.size, ordered=dict_ordered[name], ) + for name, codes in zip( + dict_indices_table.column_names, indices_columns + ) + } # Handle non-dict arrays - cudf_non_category_frame = ( - {} - if data.num_columns == 0 - else libcudf.interop.from_arrow(data, data.column_names)[0] - ) + cudf_non_category_frame = { + name: col + for name, col in zip( + data.column_names, libcudf.interop.from_arrow(data) + ) + } result = {**cudf_non_category_frame, **cudf_category_frame} @@ -1747,6 +1700,12 @@ def replace( 3 3 8 d 4 4 9 e """ + if isinstance(self, cudf.BaseIndex): + warnings.warn( + "Index.replace is deprecated and will be removed.", + FutureWarning, + ) + if limit is not None: raise NotImplementedError("limit parameter is not implemented yet") @@ -1996,76 +1955,6 @@ def notnull(self): # Alias for notnull notna = notnull - @_cudf_nvtx_annotate - def interleave_columns(self): - """ - Interleave Series columns of a table into a single column. - - Converts the column major table `cols` into a row major column. 
-
- Parameters
- ----------
- cols : input Table containing columns to interleave.
-
- Examples
- --------
- >>> df = DataFrame([['A1', 'A2', 'A3'], ['B1', 'B2', 'B3']])
- >>> df
- 0 [A1, A2, A3]
- 1 [B1, B2, B3]
- >>> df.interleave_columns()
- 0 A1
- 1 B1
- 2 A2
- 3 B2
- 4 A3
- 5 B3
-
- Returns
- -------
- The interleaved columns as a single column
- """
- if ("category" == self.dtypes).any():
- raise ValueError(
- "interleave_columns does not support 'category' dtype."
- )
-
- result = self._constructor_sliced(
- libcudf.reshape.interleave_columns(self)
- )
-
- return result
-
- @_cudf_nvtx_annotate
- def tile(self, count):
- """
- Repeats the rows from `self` DataFrame `count` times to form a
- new DataFrame.
-
- Parameters
- ----------
- self : input Table containing columns to interleave.
- count : Number of times to tile "rows". Must be non-negative.
-
- Examples
- --------
- >>> df = Dataframe([[8, 4, 7], [5, 2, 3]])
- >>> count = 2
- >>> df.tile(df, count)
- 0 1 2
- 0 8 4 7
- 1 5 2 3
- 0 8 4 7
- 1 5 2 3
-
- Returns
- -------
- The table containing the tiled "rows".
- """
- result = self.__class__._from_data(*libcudf.reshape.tile(self, count))
- result._copy_type_metadata(self)
- return result
-
 @_cudf_nvtx_annotate
 def searchsorted(
 self, values, side="left", ascending=True, na_position="last"
@@ -2135,12 +2024,24 @@ def searchsorted(
 scalar_flag = True

 if not isinstance(values, Frame):
- values = as_column(values)
- if values.dtype != self.dtype:
- self = self.astype(values.dtype)
- values = values.as_frame()
+ values = [as_column(values)]
+ else:
+ values = [*values._columns]
+ if len(values) != len(self._data):
+ raise ValueError("Mismatched number of columns to search for.")
+
+ sources = [
+ col
+ if is_dtype_equal(col.dtype, val.dtype)
+ else col.astype(val.dtype)
+ for col, val in zip(self._columns, values)
+ ]
 outcol = libcudf.search.search_sorted(
- self, values, side, ascending=ascending, na_position=na_position
+ sources,
+ values,
+ side,
+ ascending=ascending,
+ na_position=na_position,
 )

 # Return the result as a cupy array if values is non-scalar
@@ -2242,15 +2143,17 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"):

 # Get an int64 column consisting of the indices required to sort self
 # according to the columns specified in by.

- to_sort = (
- self
- if by is None
- else self._get_columns_by_label(list(by), downcast=False)
- )
+ to_sort = [
+ *(
+ self
+ if by is None
+ else self._get_columns_by_label(list(by), downcast=False)
+ )._columns
+ ]

 # If given a scalar need to construct a sequence of length
 # of columns
 if np.isscalar(ascending):
- ascending = [ascending] * to_sort._num_columns
+ ascending = [ascending] * len(to_sort)

 return libcudf.sort.order_by(to_sort, ascending, na_position)

@@ -2309,6 +2212,12 @@ def scale(self):
 4 0.043478
 dtype: float64
 """
+ if isinstance(self, cudf.BaseIndex):
+ warnings.warn(
+ "Index.scale is deprecated and will be removed.",
+ FutureWarning,
+ )
+
 vmin = self.min()
 vmax = self.max()
 scaled = (self - vmin) / (vmax - vmin)
@@ -2404,8 +2313,22 @@ def _is_sorted(self, ascending=None, null_position=None):

 Returns True, if sorted as expected by ``ascending`` and
 ``null_position``, False otherwise.
""" + if ascending is not None and not cudf.api.types.is_list_like( + ascending + ): + raise TypeError( + f"Expected a list-like or None for `ascending`, got " + f"{type(ascending)}" + ) + if null_position is not None and not cudf.api.types.is_list_like( + null_position + ): + raise TypeError( + f"Expected a list-like or None for `null_position`, got " + f"{type(null_position)}" + ) return libcudf.sort.is_sorted( - self, ascending=ascending, null_position=null_position + [*self._columns], ascending=ascending, null_position=null_position ) @_cudf_nvtx_annotate @@ -2425,10 +2348,8 @@ def _split(self, splits): @_cudf_nvtx_annotate def _encode(self): - data, index, indices = libcudf.transform.table_encode(self) - for name, col in data.items(): - data[name] = col._with_type_metadata(self._data[name].dtype) - keys = self.__class__._from_data(data, index) + columns, indices = libcudf.transform.table_encode([*self._columns]) + keys = self._from_columns_like_self(columns) return keys, indices @_cudf_nvtx_annotate @@ -3365,6 +3286,12 @@ def _scan(self, op, axis=None, skipna=True): 2 6 24 3 10 34 """ + if isinstance(self, cudf.BaseIndex): + warnings.warn( + f"Index.{op} is deprecated and will be removed.", + FutureWarning, + ) + cast_to_int = op in ("cumsum", "cumprod") skipna = True if skipna is None else skipna @@ -3402,7 +3329,7 @@ def _scan(self, op, axis=None, skipna=True): # TODO: This will work for Index because it's passing self._index # (which is None), but eventually we may want to remove that parameter # for Index._from_data and simplify. - return self._from_data(results, index=self._index) + return self._from_data(results, self._index) @_cudf_nvtx_annotate @ioutils.doc_to_json() @@ -3429,7 +3356,7 @@ def to_dlpack(self): @_cudf_nvtx_annotate def to_string(self): - """ + r""" Convert to string cuDF uses Pandas internals for efficient string formatting. @@ -3446,9 +3373,9 @@ def to_string(self): >>> df['key'] = [0, 1, 2] >>> df['val'] = [float(i + 10) for i in range(3)] >>> df.to_string() - ' key val\\n0 0 10.0\\n1 1 11.0\\n2 2 12.0' + ' key val\n0 0 10.0\n1 1 11.0\n2 2 12.0' """ - return self.__repr__() + return repr(self) def __str__(self): return self.to_string() @@ -3735,9 +3662,9 @@ def _get_replacement_values_for_columns( values_columns = { col: [value] if _is_non_decimal_numeric_dtype(columns_dtype_map[col]) - else cudf.utils.utils.scalar_broadcast_to( + else full( + len(to_replace), value, - (len(to_replace),), cudf.dtype(type(value)), ) for col in columns_dtype_map diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6b98e82d553..013ae7ad033 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,9 +1,9 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -import collections import itertools import pickle import warnings +from collections import abc from functools import cached_property from typing import Any, Iterable, List, Tuple, Union @@ -207,6 +207,30 @@ def cumcount(self): .reset_index(drop=True) ) + def rank( + self, + method="average", + ascending=True, + na_option="keep", + pct=False, + axis=0, + ): + """ + Return the rank of values within each group. 
+ """ + if not axis == 0: + raise NotImplementedError("Only axis=0 is supported.") + + def rank(x): + return getattr(x, "rank")( + method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) + + return self.agg(rank) + @cached_property def _groupby(self): return libgroupby.GroupBy( @@ -516,7 +540,7 @@ def pipe(self, func, *args, **kwargs): """ return cudf.core.common.pipe(self, func, *args, **kwargs) - def apply(self, function): + def apply(self, function, *args): """Apply a python transformation function over the grouped chunk. Parameters @@ -566,19 +590,20 @@ def mult(df): .. code-block:: >>> df = pd.DataFrame({ - 'a': [1, 1, 2, 2], - 'b': [1, 2, 1, 2], - 'c': [1, 2, 3, 4]}) + ... 'a': [1, 1, 2, 2], + ... 'b': [1, 2, 1, 2], + ... 'c': [1, 2, 3, 4], + ... }) >>> gdf = cudf.from_pandas(df) >>> df.groupby('a').apply(lambda x: x.iloc[[0]]) - a b c - a - 1 0 1 1 1 - 2 2 2 1 3 + a b c + a + 1 0 1 1 1 + 2 2 2 1 3 >>> gdf.groupby('a').apply(lambda x: x.iloc[[0]]) - a b c - 0 1 1 1 - 2 2 1 3 + a b c + 0 1 1 1 + 2 2 1 3 """ if not callable(function): raise TypeError(f"type {type(function)} is not callable") @@ -594,8 +619,7 @@ def mult(df): chunks = [ grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) ] - chunk_results = [function(chk) for chk in chunks] - + chunk_results = [function(chk, *args) for chk in chunks] if not len(chunk_results): return self.obj.head(0) @@ -603,8 +627,11 @@ def mult(df): result = cudf.Series(chunk_results, index=group_names) result.index.names = self.grouping.names elif isinstance(chunk_results[0], cudf.Series): - result = cudf.concat(chunk_results, axis=1).T - result.index.names = self.grouping.names + if isinstance(self.obj, cudf.DataFrame): + result = cudf.concat(chunk_results, axis=1).T + result.index.names = self.grouping.names + else: + result = cudf.concat(chunk_results) else: result = cudf.concat(chunk_results) @@ -1104,16 +1131,11 @@ def _cov_or_corr(self, func, method_name): for i in range(0, len(cols_list), num_cols) ] - def combine_columns(gb_cov_corr, ys): - list_of_columns = [gb_cov_corr._data[y] for y in ys] - frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys) - return interleave_columns(frame) - # interleave: combines the correlation or covariance results for each # column-pair into a single column res = cudf.DataFrame._from_data( { - x: combine_columns(gb_cov_corr, ys) + x: interleave_columns([gb_cov_corr._data[y] for y in ys]) for ys, x in zip(cols_split, column_names) } ) @@ -1581,8 +1603,8 @@ def agg(self, func): return result - def apply(self, func): - result = super().apply(func) + def apply(self, func, *args): + result = super().apply(func, *args) # apply Series name to result result.name = self.obj.name @@ -1637,7 +1659,7 @@ def _handle_by_or_level(self, by=None, level=None): self._handle_series(by) elif isinstance(by, cudf.BaseIndex): self._handle_index(by) - elif isinstance(by, collections.abc.Mapping): + elif isinstance(by, abc.Mapping): self._handle_mapping(by) elif isinstance(by, Grouper): self._handle_grouper(by) @@ -1756,7 +1778,7 @@ def _is_multi_agg(aggs): Returns True if more than one aggregation is performed on any of the columns as specified in `aggs`. 
""" - if isinstance(aggs, collections.abc.Mapping): + if isinstance(aggs, abc.Mapping): return any(is_list_like(agg) for agg in aggs.values()) if is_list_like(aggs): return True diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index aff13025e72..1ed530ae22b 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -76,10 +76,10 @@ def _lexsorted_equal_range( sort_inds = None sort_vals = idx lower_bound = search_sorted( - sort_vals, key_as_table, side="left" + [*sort_vals._data.columns], [*key_as_table._columns], side="left" ).element_indexing(0) upper_bound = search_sorted( - sort_vals, key_as_table, side="right" + [*sort_vals._data.columns], [*key_as_table._columns], side="right" ).element_indexing(0) return lower_bound, upper_bound, sort_inds @@ -1116,7 +1116,7 @@ def __repr__(self): # related issue : https://github.com/pandas-dev/pandas/issues/35389 if isinstance(preprocess, CategoricalIndex): if preprocess.categories.dtype.kind == "f": - output = ( + output = repr( preprocess.astype("str") .to_pandas() .astype( @@ -1127,18 +1127,17 @@ def __repr__(self): ordered=preprocess.dtype.ordered, ) ) - .__repr__() ) break_idx = output.find("ordered=") output = ( output[:break_idx].replace("'", "") + output[break_idx:] ) else: - output = preprocess.to_pandas().__repr__() + output = repr(preprocess.to_pandas()) output = output.replace("nan", cudf._NA_REP) elif preprocess._values.nullable: - output = self._clean_nulls_from_index().to_pandas().__repr__() + output = repr(self._clean_nulls_from_index().to_pandas()) if not isinstance(self, StringIndex): # We should remove all the single quotes @@ -1150,7 +1149,7 @@ def __repr__(self): # of StringIndex and it is valid to have them. output = output.replace("'", "") else: - output = preprocess.to_pandas().__repr__() + output = repr(preprocess.to_pandas()) # Fix and correct the class name of the output # string by finding first occurrence of "(" in the output diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 10736948b57..1361fc56fa0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -818,7 +818,8 @@ def hash_values(self, method="murmur3"): # calculation, necessitating the unfortunate circular reference to the # child class here. 
return cudf.Series._from_data(
- {None: libcudf.hash.hash(self, method)}, index=self.index
+ {None: libcudf.hash.hash([*self._columns], method)},
+ index=self.index,
 )

 def _gather(
@@ -991,6 +992,7 @@ def add_prefix(self, prefix):
 Examples
 --------
 **Series**
+
 >>> s = cudf.Series([1, 2, 3, 4])
 >>> s
 0 1
@@ -1006,6 +1008,7 @@ def add_prefix(self, prefix):
 dtype: int64

 **DataFrame**
+
 >>> df = cudf.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
 >>> df
 A B
@@ -2204,7 +2207,7 @@ def _make_operands_and_index_for_binop(
 Optional[cudf.BaseIndex],
 ]:
 raise NotImplementedError(
- "Binary operations are not supported for {self.__class__}"
+ f"Binary operations are not supported for {self.__class__}"
 )

 def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
@@ -2688,21 +2691,52 @@ def _explode(self, explode_column: Any, ignore_index: bool):
 if not ignore_index and self._index is not None:
 explode_column_num += self._index.nlevels

- data, index = libcudf.lists.explode_outer(
- self, explode_column_num, ignore_index
+ exploded = libcudf.lists.explode_outer(
+ [
+ *(self._index._data.columns if not ignore_index else ()),
+ *self._columns,
+ ],
+ explode_column_num,
 )
- res = self.__class__._from_data(
- ColumnAccessor(
- data,
- multiindex=self._data.multiindex,
- level_names=self._data._level_names,
- ),
- index=index,
+
+ return self._from_columns_like_self(
+ exploded,
+ self._column_names,
+ self._index_names if not ignore_index else None,
 )
- if not ignore_index and self._index is not None:
- res.index.names = self._index.names
- return res
+
+ @_cudf_nvtx_annotate
+ def tile(self, count):
+ """Repeats the rows `count` times to form a new Frame.
+
+ Parameters
+ ----------
+ count : int
+ Number of times to tile the rows. Must be non-negative.
+
+ Examples
+ --------
+ >>> import cudf
+ >>> df = cudf.DataFrame([[8, 4, 7], [5, 2, 3]])
+ >>> count = 2
+ >>> df.tile(count)
+ 0 1 2
+ 0 8 4 7
+ 1 5 2 3
+ 0 8 4 7
+ 1 5 2 3
+
+ Returns
+ -------
+ The indexed frame containing the tiled "rows".
+ """
+ return self._from_columns_like_self(
+ libcudf.reshape.tile(
+ [*self._index._columns, *self._columns], count
+ ),
+ column_names=self._column_names,
+ index_names=self._index_names,
+ )

 @_cudf_nvtx_annotate
 @docutils.doc_apply(
@@ -3543,6 +3577,93 @@ def ge(
 other=other, op="__ge__", fill_value=fill_value, can_reindex=True
 )

+ @_cudf_nvtx_annotate
+ def rank(
+ self,
+ axis=0,
+ method="average",
+ numeric_only=None,
+ na_option="keep",
+ ascending=True,
+ pct=False,
+ ):
+ """
+ Compute numerical data ranks (1 through n) along axis.
+
+ By default, equal values are assigned a rank that is the average of the
+ ranks of those values.
+
+ Parameters
+ ----------
+ axis : {0 or 'index'}, default 0
+ Index to direct ranking.
+ method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
+ How to rank the group of records that have the same value
+ (i.e. ties):
+ * average: average rank of the group
+ * min: lowest rank in the group
+ * max: highest rank in the group
+ * first: ranks assigned in order they appear in the array
+ * dense: like 'min', but rank always increases by 1 between groups.
+ numeric_only : bool, optional
+ For DataFrame objects, rank only numeric columns if set to True.
+ na_option : {'keep', 'top', 'bottom'}, default 'keep'
+ How to rank NaN values:
+ * keep: assign NaN rank to NaN values
+ * top: assign smallest rank to NaN values if ascending
+ * bottom: assign highest rank to NaN values if ascending.
+ ascending : bool, default True + Whether or not the elements should be ranked in ascending order. + pct : bool, default False + Whether or not to display the returned rankings in percentile + form. + + Returns + ------- + same type as caller + Return a Series or DataFrame with data ranks as values. + """ + if isinstance(self, cudf.BaseIndex): + warnings.warn( + "Index.rank is deprecated and will be removed.", + FutureWarning, + ) + + if method not in {"average", "min", "max", "first", "dense"}: + raise KeyError(method) + + method_enum = libcudf.aggregation.RankMethod[method.upper()] + if na_option not in {"keep", "top", "bottom"}: + raise ValueError( + "na_option must be one of 'keep', 'top', or 'bottom'" + ) + + if axis not in (0, "index"): + raise NotImplementedError( + f"axis must be `0`/`index`, " + f"axis={axis} is not yet supported in rank" + ) + + source = self + if numeric_only: + numeric_cols = ( + name + for name in self._data.names + if _is_non_decimal_numeric_dtype(self._data[name]) + ) + source = self._get_columns_by_label(numeric_cols) + if source.empty: + return source.astype("float64") + + result_columns = libcudf.sort.rank_columns( + [*source._columns], method_enum, na_option, ascending, pct + ) + + return self.__class__._from_data( + dict(zip(source._column_names, result_columns)), + index=source._index, + ).astype(np.float64) + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index ead0cd566d9..e1057c3b997 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -2,8 +2,8 @@ from __future__ import annotations -import collections import warnings +from collections import abc from typing import TYPE_CHECKING, Any, Tuple, cast import numpy as np @@ -166,7 +166,7 @@ def _match_categorical_dtypes_both( def _coerce_to_tuple(obj): - if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str): + if isinstance(obj, abc.Iterable) and not isinstance(obj, str): return tuple(obj) else: return (obj,) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index c7e46cf0165..6a495ef8d9a 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -177,15 +177,15 @@ def __init__( ) def perform_merge(self) -> Frame: - left_join_cols = {} - right_join_cols = {} + left_join_cols = [] + right_join_cols = [] for left_key, right_key in zip(self._left_keys, self._right_keys): lcol = left_key.get(self.lhs) rcol = right_key.get(self.rhs) lcol_casted, rcol_casted = _match_join_keys(lcol, rcol, self.how) - left_join_cols[left_key.name] = lcol_casted - right_join_cols[left_key.name] = rcol_casted + left_join_cols.append(lcol_casted) + right_join_cols.append(rcol_casted) # Categorical dtypes must be cast back from the underlying codes # type that was returned by _match_join_keys. 
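The tie-breaking methods enumerated in the `rank` docstring above follow pandas semantics, so they can be illustrated without a GPU. A minimal pandas-only sketch (the expected outputs are standard pandas behavior, not taken from this patch):

    import pandas as pd

    s = pd.Series([7, 7, 9, 2])
    print(s.rank(method="average").tolist())  # [2.5, 2.5, 4.0, 1.0]: ties share the mean rank
    print(s.rank(method="min").tolist())      # [2.0, 2.0, 4.0, 1.0]: ties share the lowest rank
    print(s.rank(method="dense").tolist())    # [2.0, 2.0, 3.0, 1.0]: like 'min', but no gap after ties
    print(s.rank(method="first").tolist())    # [2.0, 3.0, 4.0, 1.0]: ties broken by position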
@@ -201,8 +201,8 @@ def perform_merge(self) -> Frame: right_key.set(self.rhs, rcol_casted, validate=False) left_rows, right_rows = self._joiner( - cudf.core.frame.Frame(left_join_cols), - cudf.core.frame.Frame(right_join_cols), + left_join_cols, + right_join_cols, how=self.how, ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 591ec582a3b..332e8897d3b 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -5,12 +5,12 @@ import itertools import numbers import pickle -from collections.abc import Sequence +from collections import abc from functools import cached_property from numbers import Integral from typing import Any, List, MutableMapping, Tuple, Union -import cupy +import cupy as cp import numpy as np import pandas as pd from pandas._config import get_option @@ -29,11 +29,26 @@ as_index, ) from cudf.utils.docutils import doc_apply -from cudf.utils.utils import ( - NotIterable, - _cudf_nvtx_annotate, - _maybe_indices_to_slice, -) +from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate + + +def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: + """Makes best effort to convert an array of indices into a python slice. + If the conversion is not possible, return input. `indices` are expected + to be valid. + """ + # TODO: improve efficiency by avoiding sync. + if len(indices) == 1: + x = indices[0].item() + return slice(x, x + 1) + if len(indices) == 2: + x1, x2 = indices[0].item(), indices[1].item() + return slice(x1, x2 + 1, x2 - x1) + start, step = indices[0].item(), (indices[1] - indices[0]).item() + stop = start + step * len(indices) + if (indices == cp.arange(start, stop, step)).all(): + return slice(start, stop, step) + return indices class MultiIndex(Frame, BaseIndex, NotIterable): @@ -95,7 +110,7 @@ def __init__( if len(levels) == 0: raise ValueError("Must pass non-zero number of levels/codes") if not isinstance(codes, cudf.DataFrame) and not isinstance( - codes[0], (Sequence, np.ndarray) + codes[0], (abc.Sequence, np.ndarray) ): raise TypeError("Codes is not a Sequence of sequences") @@ -455,7 +470,7 @@ def __repr__(self): else: preprocess = preprocess.to_pandas(nullable=True) - output = preprocess.__repr__() + output = repr(preprocess) output_prefix = self.__class__.__name__ + "(" output = output.lstrip(output_prefix) lines = output.split("\n") @@ -912,7 +927,7 @@ def deserialize(cls, header, frames): def __getitem__(self, index): flatten = isinstance(index, int) - if isinstance(index, (Integral, Sequence)): + if isinstance(index, (Integral, abc.Sequence)): index = np.array(index) elif isinstance(index, slice): start, stop, step = index.indices(len(self)) @@ -1709,7 +1724,7 @@ def get_loc(self, key, method=None, tolerance=None): return true_inds # Not sorted and not unique. Return a boolean mask - mask = cupy.full(self._data.nrows, False) + mask = cp.full(self._data.nrows, False) mask[true_inds] = True return mask diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index a388e2560ee..5977b63777f 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,6 +1,7 @@ # Copyright (c) 2018-2022, NVIDIA CORPORATION. 
import itertools +from collections import abc from typing import Dict, Optional import numpy as np @@ -485,14 +486,14 @@ def melt( 1 b B 3 2 c B 5 """ - assert col_level in (None,) + if col_level is not None: + raise NotImplementedError("col_level != None is not supported yet.") # Arg cleaning - import collections # id_vars if id_vars is not None: - if not isinstance(id_vars, collections.abc.Sequence): + if not isinstance(id_vars, abc.Sequence): id_vars = [id_vars] id_vars = list(id_vars) missing = set(id_vars) - set(frame._column_names) @@ -506,7 +507,7 @@ def melt( # value_vars if value_vars is not None: - if not isinstance(value_vars, collections.abc.Sequence): + if not isinstance(value_vars, abc.Sequence): value_vars = [value_vars] value_vars = list(value_vars) missing = set(value_vars) - set(frame._column_names) @@ -771,10 +772,10 @@ def merge_sorted( Parameters ---------- - objs : list of DataFrame, Series, or Index + objs : list of DataFrame or Series keys : list, default None List of Column names to sort by. If None, all columns used - (Ignored if `index=True`) + (Ignored if `by_index=True`) by_index : bool, default False Use index for sorting. `keys` input will be ignored if True ignore_index : bool, default False @@ -805,18 +806,38 @@ def merge_sorted( if by_index and ignore_index: raise ValueError("`by_index` and `ignore_index` cannot both be True") - result = objs[0].__class__._from_data( - *cudf._lib.merge.merge_sorted( - objs, - keys=keys, - by_index=by_index, - ignore_index=ignore_index, + if by_index: + key_columns_indices = list(range(0, objs[0]._index.nlevels)) + else: + if keys is None: + key_columns_indices = list(range(0, objs[0]._num_columns)) + else: + key_columns_indices = [ + objs[0]._column_names.index(key) for key in keys + ] + if not ignore_index: + key_columns_indices = [ + idx + objs[0]._index.nlevels for idx in key_columns_indices + ] + + columns = [ + [ + *(obj._index._data.columns if not ignore_index else ()), + *obj._columns, + ] + for obj in objs + ] + + return objs[0]._from_columns_like_self( + cudf._lib.merge.merge_sorted( + input_columns=columns, + key_columns_indices=key_columns_indices, ascending=ascending, na_position=na_position, - ) + ), + column_names=objs[0]._column_names, + index_names=None if ignore_index else objs[0]._index_names, ) - result._copy_type_metadata(objs[0]) - return result def _pivot(df, index, columns): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5bf52ed7520..d813db58d1e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6,7 +6,7 @@ import inspect import pickle import warnings -from collections import abc as abc +from collections import abc from shutil import get_terminal_size from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Type, Union @@ -14,11 +14,11 @@ import numpy as np import pandas as pd from pandas._config import get_option +from pandas.core.dtypes.common import is_float import cudf from cudf import _lib as libcudf from cudf._lib.scalar import _is_null_host_scalar -from cudf._lib.transform import bools_to_mask from cudf._typing import ColumnLike, DataFrameOrSeries, ScalarLike from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -42,7 +42,6 @@ arange, as_column, column, - column_empty_like, full, ) from cudf.core.column.categorical import ( @@ -64,15 +63,16 @@ ) from cudf.core.single_column_frame import SingleColumnFrame from cudf.core.udf.scalar_function import _get_scalar_kernel -from cudf.utils import cudautils, 
docutils +from cudf.utils import docutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( can_convert_to_column, find_common_type, is_mixed_with_object_dtype, min_scalar_type, + to_cudf_compatible_scalar, ) -from cudf.utils.utils import _cudf_nvtx_annotate, to_cudf_compatible_scalar +from cudf.utils.utils import _cudf_nvtx_annotate def _append_new_row_inplace(col: ColumnLike, value: ScalarLike): @@ -1064,11 +1064,8 @@ def __repr__(self): ) or isinstance( preprocess._column, cudf.core.column.timedelta.TimeDeltaColumn ): - output = ( - preprocess.astype("O") - .fillna(cudf._NA_REP) - .to_pandas() - .__repr__() + output = repr( + preprocess.astype("O").fillna(cudf._NA_REP).to_pandas() ) elif isinstance( preprocess._column, cudf.core.column.CategoricalColumn @@ -1111,7 +1108,7 @@ def __repr__(self): na_rep=cudf._NA_REP, ) else: - output = preprocess.to_pandas().__repr__() + output = repr(preprocess.to_pandas()) lines = output.split("\n") @@ -2021,9 +2018,15 @@ def _return_sentinel_series(): def apply(self, func, convert_dtype=True, args=(), **kwargs): """ Apply a scalar function to the values of a Series. + Similar to ``pandas.Series.apply``. - Similar to `pandas.Series.apply. Applies a user - defined function elementwise over a series. + ``apply`` relies on Numba to JIT compile ``func``. + Thus the allowed operations within ``func`` are limited + to the ones specified + [here](https://numba.pydata.org/numba-doc/latest/cuda/cudapysupported.html). + For more information, see the cuDF guide to + user defined functions found + [here](https://docs.rapids.ai/api/cudf/stable/user_guide/guide-to-udfs.html). Parameters ---------- @@ -2061,7 +2064,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): 2 4 dtype: int64 - Apply a basic function to a series with nulls + Apply a basic function to a series with nulls: >>> sr = cudf.Series([1,cudf.NA,3]) >>> def f(x): @@ -2073,7 +2076,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): dtype: int64 Use a function that does something conditionally, - based on if the value is or is not null + based on if the value is or is not null: >>> sr = cudf.Series([1,cudf.NA,3]) >>> def f(x): @@ -2091,7 +2094,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): as derived from the UDFs logic. Note that this means the common type will be returned even if such data is passed that would not result in any values of that - dtype. + dtype: >>> sr = cudf.Series([1,cudf.NA,3]) >>> def f(x): @@ -2104,7 +2107,10 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): """ if convert_dtype is not True: raise ValueError("Series.apply only supports convert_dtype=True") - return self._apply(func, _get_scalar_kernel, *args, **kwargs) + + result = self._apply(func, _get_scalar_kernel, *args, **kwargs) + result.name = self.name + return result @_cudf_nvtx_annotate def applymap(self, udf, out_dtype=None): @@ -2963,19 +2969,22 @@ def digitize(self, bins, right=False): @_cudf_nvtx_annotate def diff(self, periods=1): - """Calculate the difference between values at positions i and i - N in - an array and store the output in a new array. + """First discrete difference of element. + + Calculates the difference of a Series element compared with another + element in the Series (default is element in previous row). + + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating difference, + accepts negative values. Returns ------- Series First differences of the Series. 
- Notes - ----- - Diff currently only supports float and integer dtype columns with - no null values. - Examples -------- >>> import cudf @@ -3022,32 +3031,12 @@ def diff(self, periods=1): 5 dtype: int64 """ - if self.has_nulls: - raise AssertionError( - "Diff currently requires columns with no null values" - ) - - if not np.issubdtype(self.dtype, np.number): - raise NotImplementedError( - "Diff currently only supports numeric dtypes" - ) - - # TODO: move this libcudf - input_col = self._column - output_col = column_empty_like(input_col) - output_mask = column_empty_like(input_col, dtype="bool") - if output_col.size > 0: - cudautils.gpu_diff.forall(output_col.size)( - input_col, output_col, output_mask, periods - ) - - output_col = column.build_column( - data=output_col.data, - dtype=output_col.dtype, - mask=bools_to_mask(output_mask), - ) + if not is_integer(periods): + if not (is_float(periods) and periods.is_integer()): + raise ValueError("periods must be an integer") + periods = int(periods) - return Series(output_col, name=self.name, index=self.index) + return self - self.shift(periods=periods) @copy_docstring(SeriesGroupBy) @_cudf_nvtx_annotate @@ -4625,13 +4614,13 @@ def _align_indices(series_list, how="outer", allow_non_unique=False): @_cudf_nvtx_annotate def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): - """Returns a boolean array where two arrays are equal within a tolerance. + r"""Returns a boolean array where two arrays are equal within a tolerance. Two values in ``a`` and ``b`` are considered equal when the following equation is satisfied. .. math:: - |a - b| \\le \\mathrm{atol} + \\mathrm{rtol} |b| + |a - b| \le \mathrm{atol} + \mathrm{rtol} |b| Parameters ---------- diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 003f8ea7fdb..7fa66bd831d 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -52,9 +52,9 @@ def _reduce( if level is not None: raise NotImplementedError("level parameter is not implemented yet") - if numeric_only not in (None, True): + if numeric_only: raise NotImplementedError( - "numeric_only parameter is not implemented yet" + f"Series.{op} does not implement numeric_only" ) try: return getattr(self._column, op)(**kwargs) @@ -81,8 +81,8 @@ def name(self, value): @property # type: ignore @_cudf_nvtx_annotate - def ndim(self): - """Get the dimensionality (always 1 for single-columned frames).""" + def ndim(self): # noqa: D401 + """Number of dimensions of the underlying data, by definition 1.""" return 1 @property # type: ignore diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index 782b74ef4a6..83cceff5c4c 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -1,9 +1,9 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
from __future__ import annotations +import warnings from typing import Union -from warnings import warn import cupy as cp @@ -186,7 +186,7 @@ def __call__( "When truncation is not True, the behaviour currently differs " "from HuggingFace as cudf always returns overflowing tokens" ) - warn(warning_msg) + warnings.warn(warning_msg) if padding != "max_length": error_msg = ( diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 7eea7cedaad..0273227010b 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -57,7 +57,7 @@ def to_numeric(arg, errors="raise", downcast=None): otherwise ndarray Notes - ------- + ----- An important difference from pandas is that this function does not accept mixed numeric/non-numeric type sequences. For example ``[1, 'a']``. A ``TypeError`` will be raised when such input is received, regardless of diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py index cdb9492c695..ed5fc1d6d23 100644 --- a/python/cudf/cudf/core/udf/typing.py +++ b/python/cudf/cudf/core/udf/typing.py @@ -63,7 +63,7 @@ def __hash__(self): Needed so that numba caches type instances with different `value_type` separately. """ - return self.__repr__().__hash__() + return hash(repr(self)) def unify(self, context, other): """ diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index 00a2cb4cee2..644643db83c 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. import cudf @@ -34,12 +34,13 @@ def from_dlpack(pycapsule_obj): tensor is row-major, transpose it before passing it to this function. 
""" - data, _ = libdlpack.from_dlpack(pycapsule_obj) + columns = libdlpack.from_dlpack(pycapsule_obj) + column_names = range(len(columns)) - if len(data) == 1: - return cudf.Series._from_data(data) + if len(columns) == 1: + return cudf.Series._from_columns(columns, column_names=column_names) else: - return cudf.DataFrame._from_data(data) + return cudf.DataFrame._from_columns(columns, column_names=column_names) @ioutils.doc_to_dlpack() @@ -91,4 +92,4 @@ def to_dlpack(cudf_obj): ) gdf = gdf.astype(dtype) - return libdlpack.to_dlpack(gdf) + return libdlpack.to_dlpack([*gdf._columns]) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 4dd9f434097..607d9121630 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -3,7 +3,7 @@ import itertools import re import warnings -from collections.abc import Mapping, Sequence +from collections import abc from contextlib import contextmanager from decimal import Decimal @@ -238,9 +238,9 @@ def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs): else: if len(func_args_and_kwargs) == 1: func_args, func_kwargs = [], {} - if isinstance(func_args_and_kwargs[0], Sequence): + if isinstance(func_args_and_kwargs[0], abc.Sequence): func_args = func_args_and_kwargs[0] - elif isinstance(func_args_and_kwargs[0], Mapping): + elif isinstance(func_args_and_kwargs[0], abc.Mapping): func_kwargs = func_args_and_kwargs[0] else: raise ValueError( @@ -248,12 +248,12 @@ def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs): "either a Sequence or a Mapping" ) elif len(func_args_and_kwargs) == 2: - if not isinstance(func_args_and_kwargs[0], Sequence): + if not isinstance(func_args_and_kwargs[0], abc.Sequence): raise ValueError( "Positional argument at 1st position of " "func_args_and_kwargs should be a sequence." ) - if not isinstance(func_args_and_kwargs[1], Mapping): + if not isinstance(func_args_and_kwargs[1], abc.Mapping): raise ValueError( "Key-word argument at 2nd position of " "func_args_and_kwargs should be a dictionary mapping." @@ -363,6 +363,13 @@ def assert_column_memory_ne( raise AssertionError("lhs and rhs holds the same memory.") +def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs): + # Wrapper around pd.Series using a float64 default dtype for empty data. 
+ if dtype is None and (data is None or len(data) == 0): + dtype = "float64" + return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs) + + parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize( "left_dtype,right_dtype", list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)), diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index c3e25adad77..8134e307a72 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -504,7 +504,11 @@ def rand_dataframe( generator=lambda cardinality=cardinality: [ mimesis.random.random.schoice( string.printable, - meta.get("max_string_length", 1000), + np.random.randint( + low=0, + high=meta.get("max_string_length", 1000), + size=1, + )[0], ) for _ in range(cardinality) ], diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 702136d7c98..b134d2b26e9 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -215,7 +215,7 @@ def assert_column_equal( msg1 = f"{left.ordered}" msg2 = f"{right.ordered}" raise_assert_detail( - "{obj} category", "Orders are different", msg1, msg2 + f"{obj} category", "Orders are different", msg1, msg2 ) if ( diff --git a/python/cudf/cudf/tests/test_api_types.py b/python/cudf/cudf/tests/test_api_types.py index e7cf113f604..c2cd78f88a0 100644 --- a/python/cudf/cudf/tests/test_api_types.py +++ b/python/cudf/cudf/tests/test_api_types.py @@ -3,10 +3,10 @@ import numpy as np import pandas as pd import pytest -from pandas.api import types as ptypes +from pandas.api import types as pd_types import cudf -from cudf.api import types as types +from cudf.api import types @pytest.mark.parametrize( @@ -1035,11 +1035,13 @@ def test_is_decimal_dtype(obj, expect): ), ) def test_pandas_agreement(obj): - assert types.is_categorical_dtype(obj) == ptypes.is_categorical_dtype(obj) - assert types.is_numeric_dtype(obj) == ptypes.is_numeric_dtype(obj) - assert types.is_integer_dtype(obj) == ptypes.is_integer_dtype(obj) - assert types.is_integer(obj) == ptypes.is_integer(obj) - assert types.is_string_dtype(obj) == ptypes.is_string_dtype(obj) + assert types.is_categorical_dtype(obj) == pd_types.is_categorical_dtype( + obj + ) + assert types.is_numeric_dtype(obj) == pd_types.is_numeric_dtype(obj) + assert types.is_integer_dtype(obj) == pd_types.is_integer_dtype(obj) + assert types.is_integer(obj) == pd_types.is_integer(obj) + assert types.is_string_dtype(obj) == pd_types.is_string_dtype(obj) @pytest.mark.parametrize( @@ -1115,7 +1117,7 @@ def test_pandas_agreement(obj): ), ) def test_pandas_agreement_scalar(obj): - assert types.is_scalar(obj) == ptypes.is_scalar(obj) + assert types.is_scalar(obj) == pd_types.is_scalar(obj) # TODO: Add test of interval. 
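The `_create_pandas_series` helper added in the `testing/_utils.py` hunk above exists because constructing an empty `pd.Series` without a dtype changed behavior across pandas versions (newer pandas warns and moves the default toward `object`). A standalone rendition of the same logic, runnable without cuDF; the name and the stability rationale in the comment are my reading of the helper, not stated in the patch:

    import pandas as pd

    def create_pandas_series(data=None, index=None, dtype=None):
        # Pin empty inputs to float64 so test expectations do not drift
        # with the pandas default-dtype deprecation.
        if dtype is None and (data is None or len(data) == 0):
            dtype = "float64"
        return pd.Series(data=data, index=index, dtype=dtype)

    assert create_pandas_series([]).dtype == "float64"
    assert create_pandas_series([1, 2, 3]).dtype == "int64"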
diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index bd322a28a08..c8a9b5d03f5 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from cudf import Series +from cudf import NA, DataFrame, Series from cudf.testing import _utils as utils @@ -58,3 +58,45 @@ def test_applymap_change_out_dtype(): expect = np.array(data, dtype=float) got = out.to_numpy() np.testing.assert_array_equal(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": [4, 5, 6]}, + {"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]}, + {"a": [1, 2, 3], "b": [True, False, True]}, + {"a": [1, NA, 2], "b": [NA, 4, NA]}, + ], +) +@pytest.mark.parametrize( + "func", + [ + lambda x: x + 1, + lambda x: x - 0.5, + lambda x: 2 if x is NA else 2 + (x + 1) / 4.1, + lambda x: 42, + ], +) +@pytest.mark.parametrize("na_action", [None, "ignore"]) +def test_applymap_dataframe(data, func, na_action): + gdf = DataFrame(data) + pdf = gdf.to_pandas(nullable=True) + + expect = pdf.applymap(func, na_action=na_action) + got = gdf.applymap(func, na_action=na_action) + + utils.assert_eq(expect, got, check_dtype=False) + + +def test_applymap_raise_cases(): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + def f(x, some_kwarg=0): + return x + some_kwarg + + with pytest.raises(NotImplementedError): + df.applymap(f, some_kwarg=1) + + with pytest.raises(ValueError): + df.applymap(f, na_action="some_invalid_option") diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 29654fb9556..b4e07ed67c9 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -1,17 +1,36 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. import numpy as np import pandas as pd import pytest import cudf from cudf.testing._utils import assert_eq -from cudf.utils.utils import IS_NEP18_ACTIVE -missing_arrfunc_cond = not IS_NEP18_ACTIVE -missing_arrfunc_reason = "NEP-18 support is not available in NumPy" -# Test implementation based on dask array test -# https://github.com/dask/dask/blob/master/dask/array/tests/test_array_function.py +# To determine if NEP18 is available in the current version of NumPy we simply +# attempt to concatenate an object with `__array_function__` defined and see if +# NumPy invokes the protocol or not. Taken from dask array +# https://github.com/dask/dask/blob/master/dask/array/utils.py#L352-L363 +# TODO: Unclear if this is still necessary. NEP 18 was introduced as the +# default in 1.17 (https://github.com/numpy/numpy/releases/tag/v1.17.0) almost +# 3 years ago, and it was originally introduced one version before in 1.16 +# (although not enabled by default then). Can we safely assume that testers +# will have a sufficiently new version of numpy to run these tests? 
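To make the probe below concrete: under NEP 18 (on by default since NumPy 1.17, as the comment above notes), functions like `np.concatenate` hand the call to an argument's `__array_function__` instead of coercing it to an ndarray, so the `try`/`except ValueError` in the hunk distinguishes dispatch from a coercion failure. A standalone sketch of the same mechanism (plain NumPy; the class name and return value are illustrative):

    import numpy as np

    class Probe:
        # Any object exposing __array_function__ participates in NEP 18 dispatch.
        def __array_function__(self, func, types, args, kwargs):
            return f"dispatched {func.__name__}"

    # With NEP 18 active, NumPy never tries to coerce the probe to an array,
    # so no ValueError is raised and the sentinel value comes back instead.
    print(np.concatenate([Probe()]))  # -> dispatched concatenate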
+class _Test: + def __array_function__(self, *args, **kwargs): + return True + + +try: + np.concatenate([_Test()]) +except ValueError: + missing_arrfunc_cond = True +else: + missing_arrfunc_cond = False + +del _Test + +missing_arrfunc_reason = "NEP-18 support is not available in NumPy" @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 3cc3e4153b1..2017ba06f76 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -646,7 +646,7 @@ def test_concat_two_empty_series(ignore_index, axis): ), ], ) -def test_concat_dataframe_with_multiIndex(df1, df2): +def test_concat_dataframe_with_multiindex(df1, df2): gdf1 = df1 gdf1 = gdf1.set_index(["k1", "k2"]) diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index f06142f4cc9..15dfa111860 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -1,4 +1,6 @@ -from datetime import datetime as dt +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + +import datetime import numpy as np import pandas as pd @@ -41,12 +43,12 @@ def get_string_series(): testdata_all = [ ( cudf_date_series("20010101", "20020215", freq="400h"), - dt.strptime("2001-01-01", "%Y-%m-%d"), + datetime.datetime.strptime("2001-01-01", "%Y-%m-%d"), True, ), ( cudf_date_series("20010101", "20020215", freq="400h"), - dt.strptime("2000-01-01", "%Y-%m-%d"), + datetime.datetime.strptime("2000-01-01", "%Y-%m-%d"), False, ), (cudf_date_series("20010101", "20020215", freq="400h"), 20000101, False), diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index a7fad792bd0..d95fe278469 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -13,7 +13,6 @@ from copy import copy import cupy -import cupy as cp import numpy as np import pandas as pd import pyarrow as pa @@ -2468,35 +2467,6 @@ def test_arrow_handle_no_index_name(pdf, gdf): assert_eq(expect, got) -@pytest.mark.parametrize("num_rows", [1, 3, 10, 100]) -@pytest.mark.parametrize("num_bins", [1, 2, 4, 20]) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) -@pytest.mark.parametrize("series_bins", [True, False]) -def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): - data = np.random.randint(0, 100, num_rows).astype(dtype) - bins = np.unique(np.sort(np.random.randint(2, 95, num_bins).astype(dtype))) - s = cudf.Series(data) - if series_bins: - s_bins = cudf.Series(bins) - indices = s.digitize(s_bins, right) - else: - indices = s.digitize(bins, right) - np.testing.assert_array_equal( - np.digitize(data, bins, right), indices.to_numpy() - ) - - -def test_series_digitize_invalid_bins(): - s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") - bins = cudf.Series([2, None, None, 50, 90], dtype="int32") - - with pytest.raises( - ValueError, match="`bins` cannot contain null entries." 
- ): - _ = s.digitize(bins) - - def test_pandas_non_contiguious(): arr1 = np.random.sample([5000, 10]) assert arr1.flags["C_CONTIGUOUS"] is True @@ -3692,9 +3662,7 @@ def test_all(data): # Pandas treats `None` in object type columns as True for some reason, so # replacing with `False` if np.array(data).ndim <= 1: - pdata = cudf.utils.utils._create_pandas_series(data=data).replace( - [None], False - ) + pdata = pd.Series(data=data).replace([None], False) gdata = cudf.Series.from_pandas(pdata) else: pdata = pd.DataFrame(data, columns=["a", "b"]).replace([None], False) @@ -3745,7 +3713,7 @@ def test_all(data): @pytest.mark.parametrize("axis", [0, 1]) def test_any(data, axis): if np.array(data).ndim <= 1: - pdata = cudf.utils.utils._create_pandas_series(data=data) + pdata = pd.Series(data=data) gdata = cudf.Series.from_pandas(pdata) if axis == 1: @@ -4215,7 +4183,7 @@ def test_create_dataframe_column(): ], ) def test_series_values_host_property(data): - pds = cudf.utils.utils._create_pandas_series(data=data) + pds = pd.Series(data=data) gds = cudf.Series(data) np.testing.assert_array_equal(pds.values, gds.values_host) @@ -4238,7 +4206,7 @@ def test_series_values_host_property(data): ], ) def test_series_values_property(data): - pds = cudf.utils.utils._create_pandas_series(data=data) + pds = pd.Series(data=data) gds = cudf.Series(data) gds_vals = gds.values assert isinstance(gds_vals, cupy.ndarray) @@ -7332,7 +7300,7 @@ def test_sample_axis_0( @pytest.mark.parametrize("replace", [True, False]) @pytest.mark.parametrize( - "random_state_lib", [cp.random.RandomState, np.random.RandomState] + "random_state_lib", [cupy.random.RandomState, np.random.RandomState] ) def test_sample_reproducibility(replace, random_state_lib): df = cudf.DataFrame({"a": cupy.arange(0, 1024)}) @@ -7384,7 +7352,7 @@ def test_oversample_without_replace(n, frac, axis): ) -@pytest.mark.parametrize("random_state", [None, cp.random.RandomState(42)]) +@pytest.mark.parametrize("random_state", [None, cupy.random.RandomState(42)]) def test_sample_unsupported_arguments(random_state): df = cudf.DataFrame({"float": [0.05, 0.2, 0.3, 0.2, 0.25]}) with pytest.raises( @@ -9098,7 +9066,7 @@ def test_groupby_cov_for_pandas_bug_case(): ], ) @pytest.mark.parametrize("periods", (-5, -1, 0, 1, 5)) -def test_diff_dataframe_numeric_dtypes(data, periods): +def test_diff_numeric_dtypes(data, periods): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -9137,7 +9105,7 @@ def test_diff_decimal_dtypes(precision, scale, dtype): ) -def test_diff_dataframe_invalid_axis(): +def test_diff_invalid_axis(): gdf = cudf.DataFrame(np.array([1.123, 2.343, 5.890, 0.0])) with pytest.raises(NotImplementedError, match="Only axis=0 is supported."): gdf.diff(periods=1, axis=1) @@ -9152,16 +9120,30 @@ def test_diff_dataframe_invalid_axis(): "string_col": ["a", "b", "c", "d", "e"], }, ["a", "b", "c", "d", "e"], - [np.nan, None, np.nan, None], ], ) -def test_diff_dataframe_non_numeric_dypes(data): +def test_diff_unsupported_dtypes(data): gdf = cudf.DataFrame(data) with pytest.raises( - NotImplementedError, - match="DataFrame.diff only supports numeric dtypes", + TypeError, + match=r"unsupported operand type\(s\)", ): - gdf.diff(periods=2, axis=0) + gdf.diff() + + +def test_diff_many_dtypes(): + pdf = pd.DataFrame( + { + "dates": pd.date_range("2020-01-01", "2020-01-06", freq="D"), + "bools": [True, True, True, False, True, True], + "floats": [1.0, 2.0, 3.5, np.nan, 5.0, -1.7], + "ints": [1, 2, 3, 3, 4, 5], + "nans_nulls": [np.nan, None, None, np.nan, np.nan, None], + } + 
) + gdf = cudf.from_pandas(pdf) + assert_eq(pdf.diff(), gdf.diff()) + assert_eq(pdf.diff(periods=2), gdf.diff(periods=2)) def test_dataframe_assign_cp_np_array(): @@ -9230,3 +9212,144 @@ def test_dataframe_pct_change(data, periods, fill_method): expected = pdf.pct_change(periods=periods, fill_method=fill_method) assert_eq(expected, actual) + + +def test_mean_timeseries(): + gdf = cudf.datasets.timeseries() + pdf = gdf.to_pandas() + + expected = pdf.mean(numeric_only=True) + actual = gdf.mean(numeric_only=True) + + assert_eq(expected, actual) + + with pytest.raises(TypeError): + gdf.mean() + + +@pytest.mark.parametrize( + "data", + [ + { + "a": [1, 2, 3, 4, 5], + "b": ["a", "b", "c", "d", "e"], + "c": [1.0, 2.0, 3.0, 4.0, 5.0], + } + ], +) +def test_std_different_dtypes(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + expected = pdf.std(numeric_only=True) + actual = gdf.std(numeric_only=True) + + assert_eq(expected, actual) + + with pytest.raises(TypeError): + gdf.std() + + +@pytest.mark.parametrize( + "data", + [ + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], + "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], + } + ], +) +def test_empty_numeric_only(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + expected = pdf.prod(numeric_only=True) + actual = gdf.prod(numeric_only=True) + assert_eq(expected, actual) + + +@pytest.fixture +def df_eval(): + N = 10 + int_max = 10 + rng = cupy.random.default_rng(0) + return cudf.DataFrame( + { + "a": rng.integers(N, size=int_max), + "b": rng.integers(N, size=int_max), + "c": rng.integers(N, size=int_max), + "d": rng.integers(N, size=int_max), + } + ) + + +# Note that for now expressions do not automatically handle casting, so inputs +# need to be casted appropriately +@pytest.mark.parametrize( + "expr, dtype", + [ + ("a", int), + ("+a", int), + ("a + b", int), + ("a == b", int), + ("a / b", float), + ("a * b", int), + ("a > b", int), + ("a > b > c", int), + ("a > b < c", int), + ("a & b", int), + ("a & b | c", int), + ("sin(a)", float), + ("exp(sin(abs(a)))", float), + ("sqrt(floor(a))", float), + ("ceil(arctanh(a))", float), + ("(a + b) - (c * d)", int), + ("~a", int), + ("(a > b) and (c > d)", int), + ("(a > b) or (c > d)", int), + ("not (a > b)", int), + ("a + 1", int), + ("a + 1.0", float), + ("-a + 1", int), + ("+a + 1", int), + ("e = a + 1", int), + ( + """ + e = log(cos(a)) + 1.0 + f = abs(c) - exp(d) + """, + float, + ), + ("a_b_are_equal = (a == b)", int), + ], +) +def test_dataframe_eval(df_eval, expr, dtype): + df_eval = df_eval.astype(dtype) + expect = df_eval.to_pandas().eval(expr) + got = df_eval.eval(expr) + # In the specific case where the evaluated expression is a unary function + # of a single column with no nesting, pandas will retain the name. This + # level of compatibility is out of scope for now. 
+ assert_eq(expect, got, check_names=False) + + # Test inplace + if re.search("[^=]=[^=]", expr) is not None: + pdf_eval = df_eval.to_pandas() + pdf_eval.eval(expr, inplace=True) + df_eval.eval(expr, inplace=True) + assert_eq(pdf_eval, df_eval) + + +@pytest.mark.parametrize( + "expr", + [ + """ + e = a + b + a == b + """, + "a_b_are_equal = (a == b) = c", + ], +) +def test_dataframe_eval_errors(df_eval, expr): + with pytest.raises(ValueError): + df_eval.eval(expr) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 964ac9e5457..8be338e787a 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1,7 +1,6 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. import datetime -import datetime as dt import operator import re @@ -219,8 +218,8 @@ def test_sort_datetime(): def test_issue_165(): df_pandas = pd.DataFrame() - start_date = dt.datetime.strptime("2000-10-21", "%Y-%m-%d") - data = [(start_date + dt.timedelta(days=x)) for x in range(6)] + start_date = datetime.datetime.strptime("2000-10-21", "%Y-%m-%d") + data = [(start_date + datetime.timedelta(days=x)) for x in range(6)] df_pandas["dates"] = data df_pandas["num"] = [1, 2, 3, 4, 5, 6] df_cudf = DataFrame.from_pandas(df_pandas) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 3e7891ba0af..3277e52edb3 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.testing._utils import assert_eq +from cudf.testing._utils import _create_pandas_series, assert_eq @pytest.mark.parametrize( @@ -22,7 +22,7 @@ @pytest.mark.parametrize("inplace", [True, False]) def test_dropna_series(data, nulls, inplace): - psr = cudf.utils.utils._create_pandas_series(data=data) + psr = _create_pandas_series(data) if len(data) > 0: if nulls == "one": diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index e8a695570f0..98061f4e977 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -1,6 +1,6 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -import itertools as it +import itertools import random import numpy as np @@ -9,7 +9,11 @@ import cudf from cudf import concat -from cudf.testing._utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import ( + _create_pandas_series, + assert_eq, + assert_exceptions_equal, +) # TODO: PANDAS 1.0 support # Revisit drop_duplicates() tests to update parameters like ignore_index. 
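For reference, the `keep` semantics exercised by the drop-duplicates tests in this file match pandas, so they can be sanity-checked on the CPU. A small pandas-only example (outputs are standard pandas behavior, not taken from this patch):

    import pandas as pd

    s = pd.Series([1, 2, 2, 3, 2])
    print(s.drop_duplicates(keep="first").tolist())  # [1, 2, 3]: keep first occurrence
    print(s.drop_duplicates(keep="last").tolist())   # [1, 3, 2]: keep last occurrence
    print(s.drop_duplicates(keep=False).tolist())    # [1, 3]: drop every duplicated value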
@@ -59,7 +63,7 @@ def test_duplicated_with_misspelled_column_name(subset): ], ) def test_drop_duplicates_series(data, keep): - pds = cudf.utils.utils._create_pandas_series(data) + pds = _create_pandas_series(data) gds = cudf.from_pandas(pds) assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep)) @@ -280,7 +284,7 @@ def test_drop_duplicates_empty(df): @pytest.mark.parametrize("num_columns", [3, 4, 5]) def test_dataframe_drop_duplicates_numeric_method(num_columns): - comb = list(it.permutations(range(num_columns), num_columns)) + comb = list(itertools.permutations(range(num_columns), num_columns)) shuf = list(comb) random.Random(num_columns).shuffle(shuf) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index eba37c1f5af..9e87fdbd3be 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -292,6 +292,40 @@ def foo(df): assert_groupby_results_equal(expect, got) +def create_test_groupby_apply_args_params(): + def f1(df, k): + df["out"] = df["val1"] + df["val2"] + k + return df + + def f2(df, k, L): + df["out"] = df["val1"] - df["val2"] + (k / L) + return df + + def f3(df, k, L, m): + df["out"] = ((k * df["val1"]) + (L * df["val2"])) / m + return df + + return [(f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] + + +@pytest.mark.parametrize("func,args", create_test_groupby_apply_args_params()) +def test_groupby_apply_args(func, args): + np.random.seed(0) + df = DataFrame() + nelem = 20 + df["key1"] = np.random.randint(0, 3, nelem) + df["key2"] = np.random.randint(0, 2, nelem) + df["val1"] = np.random.random(nelem) + df["val2"] = np.random.random(nelem) + + expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False) + got_grpby = df.groupby(["key1", "key2"]) + + expect = expect_grpby.apply(func, *args) + got = got_grpby.apply(func, *args) + assert_groupby_results_equal(expect, got) + + def test_groupby_apply_grouped(): np.random.seed(0) df = DataFrame() @@ -1595,7 +1629,38 @@ def test_groupby_pipe(): assert_groupby_results_equal(expected, actual) -def test_groupby_apply_return_scalars(): +def create_test_groupby_apply_return_scalars_params(): + def f0(x): + x = x[~x["B"].isna()] + ticker = x.shape[0] + full = ticker / 10 + return full + + def f1(x, k): + x = x[~x["B"].isna()] + ticker = x.shape[0] + full = ticker / k + return full + + def f2(x, k, L): + x = x[~x["B"].isna()] + ticker = x.shape[0] + full = L * (ticker / k) + return full + + def f3(x, k, L, m): + x = x[~x["B"].isna()] + ticker = x.shape[0] + full = L * (ticker / k) % m + return full + + return [(f0, ()), (f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] + + +@pytest.mark.parametrize( + "func,args", create_test_groupby_apply_return_scalars_params() +) +def test_groupby_apply_return_scalars(func, args): pdf = pd.DataFrame( { "A": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5], @@ -1615,30 +1680,52 @@ def test_groupby_apply_return_scalars(): ) gdf = cudf.from_pandas(pdf) - def custom_map_func(x): - x = x[~x["B"].isna()] - ticker = x.shape[0] - full = ticker / 10 - return full - - expected = pdf.groupby("A").apply(lambda x: custom_map_func(x)) - actual = gdf.groupby("A").apply(lambda x: custom_map_func(x)) + expected = pdf.groupby("A").apply(func, *args) + actual = gdf.groupby("A").apply(func, *args) assert_groupby_results_equal(expected, actual) +def create_test_groupby_apply_return_series_dataframe_params(): + def f0(x): + return x - x.max() + + def f1(x): + return x.min() - x.max() + + def f2(x): + return x.min() + + def 
f3(x, k): + return x - x.max() + k + + def f4(x, k, L): + return x.min() - x.max() + (k / L) + + def f5(x, k, L, m): + return m * x.min() + (k / L) + + return [ + (f0, ()), + (f1, ()), + (f2, ()), + (f3, (42,)), + (f4, (42, 119)), + (f5, (41, 119, 212.1)), + ] + + @pytest.mark.parametrize( - "cust_func", - [lambda x: x - x.max(), lambda x: x.min() - x.max(), lambda x: x.min()], + "func,args", create_test_groupby_apply_return_series_dataframe_params() ) -def test_groupby_apply_return_series_dataframe(cust_func): +def test_groupby_apply_return_series_dataframe(func, args): pdf = pd.DataFrame( {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]} ) gdf = cudf.from_pandas(pdf) - expected = pdf.groupby(["key"]).apply(cust_func) - actual = gdf.groupby(["key"]).apply(cust_func) + expected = pdf.groupby(["key"]).apply(func, *args) + actual = gdf.groupby(["key"]).apply(func, *args) assert_groupby_results_equal(expected, actual) @@ -1724,6 +1811,50 @@ def test_groupby_2keys_scan(nelem, func): assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype) +@pytest.mark.parametrize("nelem", [100, 1000]) +@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) +@pytest.mark.parametrize("pct", [False, True]) +def test_groupby_2keys_rank(nelem, method, ascending, na_option, pct): + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + ], + rows=nelem, + use_threads=False, + ) + pdf = t.to_pandas() + pdf.columns = ["x", "y", "z"] + gdf = cudf.from_pandas(pdf) + expect_df = pdf.groupby(["x", "y"], sort=True).rank( + method=method, ascending=ascending, na_option=na_option, pct=pct + ) + got_df = gdf.groupby(["x", "y"], sort=True).rank( + method=method, ascending=ascending, na_option=na_option, pct=pct + ) + + assert_groupby_results_equal(got_df, expect_df, check_dtype=False) + + +def test_groupby_rank_fails(): + gdf = cudf.DataFrame( + {"x": [1, 2, 3, 4], "y": [1, 2, 3, 4], "z": [1, 2, 3, 4]} + ) + with pytest.raises(NotImplementedError): + gdf.groupby(["x", "y"]).rank(method="min", axis=1) + gdf = cudf.DataFrame( + { + "a": [1, 1, 1, 2, 2, 2], + "b": [[1, 2], [3, None, 5], None, [], [7, 8], [9]], + } + ) + with pytest.raises(NotImplementedError): + gdf.groupby(["a"]).rank(method="min", axis=1) + + def test_groupby_mix_agg_scan(): err_msg = "Cannot perform both aggregation and scan in one operation" func = ["cumsum", "sum"] @@ -2213,6 +2344,22 @@ def foo(x): assert_groupby_results_equal(expect, got) +@pytest.mark.parametrize( + "func,args", + [ + (lambda x, k: x + k, (42,)), + (lambda x, k, L: x + k - L, (42, 191)), + (lambda x, k, L, m: (x + k) / (L * m), (42, 191, 99.9)), + ], +) +def test_groupby_apply_series_args(func, args): + + got = make_frame(DataFrame, 100).groupby("x").y.apply(func, *args) + expect = make_frame(pd.DataFrame, 100).groupby("x").y.apply(func, *args) + + assert_groupby_results_equal(expect, got) + + @pytest.mark.parametrize("label", [None, "left", "right"]) @pytest.mark.parametrize("closed", [None, "left", "right"]) def test_groupby_freq_week(label, closed): diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py index de4303a34a8..8730cb187b5 100644 --- a/python/cudf/cudf/tests/test_hdfs.py +++ 
b/python/cudf/cudf/tests/test_hdfs.py @@ -3,12 +3,12 @@ import os from io import BytesIO -import fastavro as fa +import fastavro import numpy as np import pandas as pd import pyarrow as pa import pytest -from pyarrow import orc as orc +from pyarrow import orc import cudf from cudf.testing._utils import assert_eq @@ -253,7 +253,7 @@ def test_read_avro(datadir, hdfs, test_url): got = cudf.read_avro(hd_fpath) with open(fname, mode="rb") as f: - expect = pd.DataFrame.from_records(fa.reader(f)) + expect = pd.DataFrame.from_records(fastavro.reader(f)) for col in expect.columns: expect[col] = expect[col].astype(got[col].dtype) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 37286c65341..05830f79880 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -28,6 +28,7 @@ SIGNED_INTEGER_TYPES, SIGNED_TYPES, UNSIGNED_TYPES, + _create_pandas_series, assert_column_memory_eq, assert_column_memory_ne, assert_eq, @@ -973,9 +974,7 @@ def test_index_equal_misc(data, other): actual = gd_data.equals(np.array(gd_other)) assert_eq(expected, actual) - expected = pd_data.equals( - cudf.utils.utils._create_pandas_series(data=pd_other) - ) + expected = pd_data.equals(_create_pandas_series(pd_other)) actual = gd_data.equals(cudf.Series(gd_other)) assert_eq(expected, actual) @@ -2433,7 +2432,7 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): ], ) def test_isin_index(data, values): - psr = cudf.utils.utils._create_pandas_series(data=data) + psr = _create_pandas_series(data) gsr = cudf.Series.from_pandas(psr) got = gsr.index.isin(values) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 740c32a8a26..225aa0cd6bc 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -726,7 +726,7 @@ def test_dataframe_take(ntake): @pytest.mark.parametrize("ntake", [1, 2, 8, 9]) -def test_dataframe_take_with_multiIndex(ntake): +def test_dataframe_take_with_multiindex(ntake): np.random.seed(0) df = cudf.DataFrame( index=cudf.MultiIndex( diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index cf53a3525ef..09eee3520e5 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -11,6 +11,7 @@ import cudf from cudf import NA from cudf._lib.copying import get_element +from cudf.api.types import is_scalar from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, @@ -425,7 +426,7 @@ def test_contains_invalid(data, scalar): @pytest.mark.parametrize( - "data, scalar, expect", + "data, search_key, expect", [ ( [[1, 2, 3], [], [3, 4, 5]], @@ -448,6 +449,16 @@ def test_contains_invalid(data, scalar): "y", [3, -1], ), + ( + [["h", "a", None], ["t", "g"]], + ["a", "b"], + [1, -1], + ), + ( + [None, ["h", "i"], ["p", "k", "z"]], + ["x", None, "z"], + [None, None, 2], + ), ( [["d", None, "e"], [None, "f"], []], cudf.Scalar(cudf.NA, "O"), @@ -460,15 +471,21 @@ def test_contains_invalid(data, scalar): ), ], ) -def test_index(data, scalar, expect): +def test_index(data, search_key, expect): sr = cudf.Series(data) expect = cudf.Series(expect, dtype="int32") - got = sr.list.index(cudf.Scalar(scalar, sr.dtype.element_type)) + if is_scalar(search_key): + got = sr.list.index(cudf.Scalar(search_key, sr.dtype.element_type)) + else: + got = sr.list.index( + cudf.Series(search_key, dtype=sr.dtype.element_type) + ) + assert_eq(expect, got) @pytest.mark.parametrize( - "data, scalar", + "data, 
search_key", [ ( [[9, None, 8], [], [7, 6, 5]], @@ -478,16 +495,42 @@ def test_index(data, scalar, expect): [["a", "b", "c"], None, [None, "d"]], 2, ), + ( + [["e", "s"], ["t", "w"]], + [5, 6], + ), ], ) -def test_index_invalid(data, scalar): +def test_index_invalid_type(data, search_key): sr = cudf.Series(data) with pytest.raises( TypeError, match="Type/Scale of search key does not " "match list column element type.", ): - sr.list.index(scalar) + sr.list.index(search_key) + + +@pytest.mark.parametrize( + "data, search_key", + [ + ( + [[5, 8], [2, 6]], + [8, 2, 4], + ), + ( + [["h", "j"], ["p", None], ["t", "z"]], + ["j", "a"], + ), + ], +) +def test_index_invalid_length(data, search_key): + sr = cudf.Series(data) + with pytest.raises( + RuntimeError, + match="Number of search keys must match list column size.", + ): + sr.list.index(search_key) @pytest.mark.parametrize( @@ -759,3 +802,15 @@ def test_listcol_setitem_retain_dtype(): # prior to this fix: https://github.com/rapidsai/cudf/pull/10151/ df2 = df1.copy() assert df2["a"].dtype == df["a"].dtype + + +def test_list_astype(): + s = cudf.Series([[1, 2], [3, 4]]) + s2 = s.list.astype("float64") + assert s2.dtype == cudf.ListDtype("float64") + assert_eq(s.list.leaves.astype("float64"), s2.list.leaves) + + s = cudf.Series([[[1, 2], [3]], [[5, 6], None]]) + s2 = s.list.astype("string") + assert s2.dtype == cudf.ListDtype(cudf.ListDtype("string")) + assert_eq(s.list.leaves.astype("string"), s2.list.leaves) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index eaef002f37d..38225b3efb9 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
""" Test related to MultiIndex @@ -688,7 +688,7 @@ def test_multiindex_copy_sem(data, levels, codes, names): # Test same behavior when used on DataFrame gdf.index = gmi_copy pdf.index = pmi_copy - assert gdf.__repr__() == pdf.__repr__() + assert repr(gdf) == repr(pdf) @pytest.mark.parametrize( @@ -1131,7 +1131,7 @@ def test_multiindex_values_host(): ), ], ) -def test_multiIndex_fillna(gdi, fill_value, expected): +def test_multiindex_fillna(gdi, fill_value, expected): assert_eq(expected, gdi.fillna(fill_value)) @@ -1173,7 +1173,7 @@ def test_multiIndex_fillna(gdi, fill_value, expected): ), ], ) -def test_multiIndex_empty(pdi): +def test_multiindex_empty(pdi): gdi = cudf.from_pandas(pdi) assert_eq(pdi.empty, gdi.empty) @@ -1217,7 +1217,7 @@ def test_multiIndex_empty(pdi): ), ], ) -def test_multiIndex_size(pdi): +def test_multiindex_size(pdi): gdi = cudf.from_pandas(pdi) assert_eq(pdi.size, gdi.size) @@ -1375,7 +1375,7 @@ def test_multiindex_sort_values(pmidx, ascending, return_indexer): ], ) @pytest.mark.parametrize("ascending", [True, False]) -def test_multiIndex_argsort(pdi, ascending): +def test_multiindex_argsort(pdi, ascending): gdi = cudf.from_pandas(pdi) if not ascending: @@ -1562,7 +1562,7 @@ def test_multiindex_indexing(key): assert_eq(gi[key], pi[key], exact=False) -def test_multiIndex_duplicate_names(): +def test_multiindex_duplicate_names(): gi = cudf.MultiIndex( levels=[["a", "b"], ["b", "a"]], codes=[[0, 0], [0, 1]], @@ -1699,7 +1699,7 @@ def test_intersection_mulitIndex(idx1, idx2, sort): None, ], ) -def test_pickle_roundtrip_multiIndex(names): +def test_pickle_roundtrip_multiindex(names): df = cudf.DataFrame( { "one": [1, 2, 3], @@ -1745,7 +1745,7 @@ def test_pickle_roundtrip_multiIndex(names): "is_interval", ], ) -def test_multiIndex_type_methods(pidx, func): +def test_multiindex_type_methods(pidx, func): gidx = cudf.from_pandas(pidx) expected = getattr(pidx, func)() diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 5082fb08b92..c3969bf6c14 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -11,7 +11,7 @@ import pandas as pd import pyarrow as pa import pyarrow.orc -import pyorc as po +import pyorc import pytest import cudf @@ -307,7 +307,7 @@ def test_orc_read_skiprows(tmpdir): {"a": [1, 0, 1, 0, None, 1, 1, 1, 0, None, 0, 0, 1, 1, 1, 1]}, dtype=pd.BooleanDtype(), ) - writer = po.Writer(buff, po.Struct(a=po.Boolean())) + writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean())) tuples = list( map( lambda x: (None,) if x[0] is pd.NA else x, @@ -931,29 +931,35 @@ def generate_list_struct_buff(size=100_000): buff = BytesIO() schema = { - "lvl3_list": po.Array(po.Array(po.Array(po.BigInt()))), - "lvl1_list": po.Array(po.BigInt()), - "lvl1_struct": po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}), - "lvl2_struct": po.Struct( + "lvl3_list": pyorc.Array(pyorc.Array(pyorc.Array(pyorc.BigInt()))), + "lvl1_list": pyorc.Array(pyorc.BigInt()), + "lvl1_struct": pyorc.Struct( + **{"a": pyorc.BigInt(), "b": pyorc.BigInt()} + ), + "lvl2_struct": pyorc.Struct( **{ - "a": po.BigInt(), - "lvl1_struct": po.Struct( - **{"c": po.BigInt(), "d": po.BigInt()} + "a": pyorc.BigInt(), + "lvl1_struct": pyorc.Struct( + **{"c": pyorc.BigInt(), "d": pyorc.BigInt()} ), } ), - "list_nests_struct": po.Array( - po.Array(po.Struct(**{"a": po.BigInt(), "b": po.BigInt()})) + "list_nests_struct": pyorc.Array( + pyorc.Array( + pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}) + ) ), - "struct_nests_list": po.Struct( + 
"struct_nests_list": pyorc.Struct( **{ - "struct": po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}), - "list": po.Array(po.BigInt()), + "struct": pyorc.Struct( + **{"a": pyorc.BigInt(), "b": pyorc.BigInt()} + ), + "list": pyorc.Array(pyorc.BigInt()), } ), } - schema = po.Struct(**schema) + schema = pyorc.Struct(**schema) lvl3_list = [ rd.choice( @@ -1019,7 +1025,7 @@ def generate_list_struct_buff(size=100_000): } ) - writer = po.Writer(buff, schema, stripe_size=1024) + writer = pyorc.Writer(buff, schema, stripe_size=1024) tuples = list( map( lambda x: (None,) if x[0] is pd.NA else x, @@ -1101,15 +1107,17 @@ def gen_map_buff(size=10000): buff = BytesIO() schema = { - "lvl1_map": po.Map(key=po.String(), value=po.BigInt()), - "lvl2_map": po.Map(key=po.String(), value=po.Array(po.BigInt())), - "lvl2_struct_map": po.Map( - key=po.String(), - value=po.Struct(**{"a": po.BigInt(), "b": po.BigInt()}), + "lvl1_map": pyorc.Map(key=pyorc.String(), value=pyorc.BigInt()), + "lvl2_map": pyorc.Map( + key=pyorc.String(), value=pyorc.Array(pyorc.BigInt()) + ), + "lvl2_struct_map": pyorc.Map( + key=pyorc.String(), + value=pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}), ), } - schema = po.Struct(**schema) + schema = pyorc.Struct(**schema) lvl1_map = [ rd.choice( @@ -1186,8 +1194,8 @@ def gen_map_buff(size=10000): "lvl2_struct_map": lvl2_struct_map, } ) - writer = po.Writer( - buff, schema, stripe_size=1024, compression=po.CompressionKind.NONE + writer = pyorc.Writer( + buff, schema, stripe_size=1024, compression=pyorc.CompressionKind.NONE ) tuples = list( map( @@ -1479,8 +1487,9 @@ def test_statistics_sum_overflow(): minint64 = np.iinfo(np.int64).min buff = BytesIO() - with po.Writer( - buff, po.Struct(a=po.BigInt(), b=po.BigInt(), c=po.BigInt()) + with pyorc.Writer( + buff, + pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt(), c=pyorc.BigInt()), ) as writer: writer.write((maxint64, minint64, minint64)) writer.write((1, -1, 1)) @@ -1497,20 +1506,20 @@ def test_statistics_sum_overflow(): def test_empty_statistics(): buff = BytesIO() - orc_schema = po.Struct( - a=po.BigInt(), - b=po.Double(), - c=po.String(), - d=po.Decimal(11, 2), - e=po.Date(), - f=po.Timestamp(), - g=po.Boolean(), - h=po.Binary(), - i=po.BigInt(), + orc_schema = pyorc.Struct( + a=pyorc.BigInt(), + b=pyorc.Double(), + c=pyorc.String(), + d=pyorc.Decimal(11, 2), + e=pyorc.Date(), + f=pyorc.Timestamp(), + g=pyorc.Boolean(), + h=pyorc.Binary(), + i=pyorc.BigInt(), # One column with non null value, else cudf/pyorc readers crash ) data = tuple([None] * (len(orc_schema.fields) - 1) + [1]) - with po.Writer(buff, orc_schema) as writer: + with pyorc.Writer(buff, orc_schema) as writer: writer.write(data) got = cudf.io.orc.read_orc_statistics([buff]) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 14e81d6ad30..08311f89148 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -1371,3 +1371,17 @@ def test_replace_nulls(gsr, old, new, expected): expected.sort_values().reset_index(drop=True), actual.sort_values().reset_index(drop=True), ) + + +def test_fillna_columns_multiindex(): + columns = pd.MultiIndex.from_tuples([("a", "b"), ("d", "e")]) + pdf = pd.DataFrame( + {"0": [1, 2, None, 3, None], "1": [None, None, None, None, 4]} + ) + pdf.columns = columns + gdf = cudf.from_pandas(pdf) + + expected = pdf.fillna(10) + actual = gdf.fillna(10) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_repr.py 
b/python/cudf/cudf/tests/test_repr.py index 8f2e4811e36..c4985639173 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -41,7 +41,7 @@ def test_null_series(nrows, dtype): ps = sr.to_pandas() pd.options.display.max_rows = int(nrows) - psrepr = ps.__repr__() + psrepr = repr(ps) psrepr = psrepr.replace("NaN", "") psrepr = psrepr.replace("NaT", "") psrepr = psrepr.replace("None", "") @@ -49,7 +49,7 @@ def test_null_series(nrows, dtype): psrepr = psrepr.replace("UInt", "uint") elif "Int" in psrepr: psrepr = psrepr.replace("Int", "int") - assert psrepr.split() == sr.__repr__().split() + assert psrepr.split() == repr(sr).split() pd.reset_option("display.max_rows") @@ -72,11 +72,13 @@ def test_null_dataframe(ncols): gdf[dtype] = sr pdf = gdf.to_pandas() pd.options.display.max_columns = int(ncols) - pdfrepr = pdf.__repr__() - pdfrepr = pdfrepr.replace("NaN", "") - pdfrepr = pdfrepr.replace("NaT", "") - pdfrepr = pdfrepr.replace("None", "") - assert pdfrepr.split() == gdf.__repr__().split() + pdf_repr = ( + repr(pdf) + .replace("NaN", "") + .replace("NaT", "") + .replace("None", "") + ) + assert pdf_repr.split() == repr(gdf).split() pd.reset_option("display.max_columns") @@ -87,7 +89,7 @@ def test_full_series(nrows, dtype): ps = pd.Series(np.random.randint(0, 100, size)).astype(dtype) sr = cudf.from_pandas(ps) pd.options.display.max_rows = nrows - assert ps.__repr__() == sr.__repr__() + assert repr(ps) == repr(sr) pd.reset_option("display.max_rows") @@ -121,8 +123,8 @@ def test_integer_dataframe(x): gdf = cudf.DataFrame({"x": x}) pdf = gdf.to_pandas() pd.options.display.max_columns = 1 - assert gdf.__repr__() == pdf.__repr__() - assert gdf.T.__repr__() == pdf.T.__repr__() + assert repr(gdf) == repr(pdf) + assert repr(gdf.T) == repr(pdf.T) pd.reset_option("display.max_columns") @@ -134,9 +136,9 @@ def test_integer_dataframe(x): @settings(deadline=None) def test_integer_series(x): sr = cudf.Series(x) - ps = cudf.utils.utils._create_pandas_series(data=x) + ps = pd.Series(data=x) - assert sr.__repr__() == ps.__repr__() + assert repr(sr) == repr(ps) @given(st.lists(st.floats())) @@ -144,15 +146,15 @@ def test_integer_series(x): def test_float_dataframe(x): gdf = cudf.DataFrame({"x": cudf.Series(x, nan_as_null=False)}) pdf = gdf.to_pandas() - assert gdf.__repr__() == pdf.__repr__() + assert repr(gdf) == repr(pdf) @given(st.lists(st.floats())) @settings(deadline=None) def test_float_series(x): sr = cudf.Series(x, nan_as_null=False) - ps = cudf.utils.utils._create_pandas_series(data=x) - assert sr.__repr__() == ps.__repr__() + ps = pd.Series(data=x) + assert repr(sr) == repr(ps) @pytest.fixture @@ -176,12 +178,12 @@ def mixed_gdf(mixed_pdf): def test_mixed_dataframe(mixed_pdf, mixed_gdf): - assert mixed_gdf.__repr__() == mixed_pdf.__repr__() + assert repr(mixed_gdf) == repr(mixed_pdf) def test_mixed_series(mixed_pdf, mixed_gdf): for col in mixed_gdf.columns: - assert mixed_gdf[col].__repr__() == mixed_pdf[col].__repr__() + assert repr(mixed_gdf[col]) == repr(mixed_pdf[col]) def test_MI(): @@ -204,11 +206,9 @@ def test_MI(): pd.options.display.max_columns = 0 gdf = gdf.set_index(cudf.MultiIndex(levels=levels, codes=codes)) pdf = gdf.to_pandas() - gdfT = gdf.T - pdfT = pdf.T - assert gdf.__repr__() == pdf.__repr__() - assert gdf.index.__repr__() == pdf.index.__repr__() - assert gdfT.__repr__() == pdfT.__repr__() + assert repr(gdf) == repr(pdf) + assert repr(gdf.index) == repr(pdf.index) + assert repr(gdf.T) == repr(pdf.T) pd.reset_option("display.max_rows") 
pd.reset_option("display.max_columns") @@ -224,9 +224,9 @@ def test_groupby_MI(nrows, ncols): pdg = pdf.groupby(["a", "b"], sort=True).count() pd.options.display.max_rows = nrows pd.options.display.max_columns = ncols - assert gdg.__repr__() == pdg.__repr__() - assert gdg.index.__repr__() == pdg.index.__repr__() - assert gdg.T.__repr__() == pdg.T.__repr__() + assert repr(gdg) == repr(pdg) + assert repr(gdg.index) == repr(pdg.index) + assert repr(gdg.T) == repr(pdg.T) pd.reset_option("display.max_rows") pd.reset_option("display.max_columns") @@ -241,7 +241,7 @@ def test_generic_index(length, dtype): ) gsr = cudf.Series.from_pandas(psr) - assert psr.index.__repr__() == gsr.index.__repr__() + assert repr(psr.index) == repr(gsr.index) @pytest.mark.parametrize( @@ -290,8 +290,8 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): sliced_gdf = gdf[slice] sliced_pdf = pdf[slice] - expected_repr = sliced_pdf.__repr__().replace("None", "") - actual_repr = sliced_gdf.__repr__() + expected_repr = repr(sliced_pdf).replace("None", "") + actual_repr = repr(sliced_gdf) assert expected_repr == actual_repr pd.reset_option("display.max_rows") @@ -392,7 +392,7 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): ) def test_generic_index_null(index, expected_repr): - actual_repr = index.__repr__() + actual_repr = repr(index) assert expected_repr == actual_repr @@ -475,19 +475,19 @@ def test_dataframe_null_index_repr(df, pandas_special_case): gdf = cudf.from_pandas(pdf) expected_repr = ( - pdf.__repr__() + repr(pdf) .replace("NaN", "") .replace("NaT", "") .replace("None", "") ) - actual_repr = gdf.__repr__() + actual_repr = repr(gdf) if pandas_special_case: # Pandas inconsistently print StringIndex null values # as `None` at some places and `NaN` at few other places # Whereas cudf is consistent with strings `null` values # to be printed as `None` everywhere. - actual_repr = gdf.__repr__().replace("None", "") + actual_repr = repr(gdf).replace("None", "") assert expected_repr.split() == actual_repr.split() @@ -554,19 +554,19 @@ def test_series_null_index_repr(sr, pandas_special_case): gsr = cudf.from_pandas(psr) expected_repr = ( - psr.__repr__() + repr(psr) .replace("NaN", "") .replace("NaT", "") .replace("None", "") ) - actual_repr = gsr.__repr__() + actual_repr = repr(gsr) if pandas_special_case: # Pandas inconsistently print StringIndex null values # as `None` at some places and `NaN` at few other places # Whereas cudf is consistent with strings `null` values # to be printed as `None` everywhere. 
- actual_repr = gsr.__repr__().replace("None", "") + actual_repr = repr(gsr).replace("None", "") assert expected_repr.split() == actual_repr.split() @@ -607,9 +607,9 @@ def test_timedelta_series_s_us_repr(data, dtype): psr = sr.to_pandas() expected = ( - psr.__repr__().replace("timedelta64[ns]", dtype).replace("NaT", "") + repr(psr).replace("timedelta64[ns]", dtype).replace("NaT", "") ) - actual = sr.__repr__() + actual = repr(sr) assert expected.split() == actual.split() @@ -886,7 +886,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ) def test_timedelta_series_ns_ms_repr(ser, expected_repr): expected = expected_repr - actual = ser.__repr__() + actual = repr(ser) assert expected.split() == actual.split() @@ -1042,7 +1042,7 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): ], ) def test_timedelta_dataframe_repr(df, expected_repr): - actual_repr = df.__repr__() + actual_repr = repr(df) assert actual_repr.split() == expected_repr.split() @@ -1105,7 +1105,7 @@ def test_timedelta_dataframe_repr(df, expected_repr): def test_timedelta_index_repr(index, expected_repr): if not PANDAS_GE_110: pytest.xfail(reason="pandas >= 1.1 requried") - actual_repr = index.__repr__() + actual_repr = repr(index) assert actual_repr.split() == expected_repr.split() @@ -1132,11 +1132,11 @@ def test_timedelta_index_repr(index, expected_repr): ], ) @pytest.mark.parametrize("max_seq_items", [None, 1, 2, 5, 10, 100]) -def test_multiIndex_repr(pmi, max_seq_items): +def test_multiindex_repr(pmi, max_seq_items): pd.set_option("display.max_seq_items", max_seq_items) gmi = cudf.from_pandas(pmi) - assert gmi.__repr__() == pmi.__repr__() + assert repr(gmi) == repr(pmi) pd.reset_option("display.max_seq_items") @@ -1377,8 +1377,8 @@ def test_multiIndex_repr(pmi, max_seq_items): ), ], ) -def test_multiIndex_null_repr(gdi, expected_repr): - actual_repr = gdi.__repr__() +def test_multiindex_null_repr(gdi, expected_repr): + actual_repr = repr(gdi) assert actual_repr.split() == expected_repr.split() @@ -1401,7 +1401,7 @@ def test_categorical_series_with_nan_repr(): """ ) - assert series.__repr__().split() == expected_repr.split() + assert repr(series).split() == expected_repr.split() sliced_expected_repr = textwrap.dedent( """ @@ -1414,7 +1414,7 @@ def test_categorical_series_with_nan_repr(): """ ) - assert series[2:].__repr__().split() == sliced_expected_repr.split() + assert repr(series[2:]).split() == sliced_expected_repr.split() def test_categorical_dataframe_with_nan_repr(): @@ -1434,7 +1434,7 @@ def test_categorical_dataframe_with_nan_repr(): """ ) - assert df.__repr__().split() == expected_repr.split() + assert repr(df).split() == expected_repr.split() def test_categorical_index_with_nan_repr(): @@ -1449,21 +1449,21 @@ def test_categorical_index_with_nan_repr(): "categories=[1.0, 2.0, 10.0, NaN], ordered=False, dtype='category')" ) - assert cat_index.__repr__() == expected_repr + assert repr(cat_index) == expected_repr sliced_expected_repr = ( "CategoricalIndex([NaN, 10.0, NaN, ], " "categories=[1.0, 2.0, 10.0, NaN], ordered=False, dtype='category')" ) - assert cat_index[2:].__repr__() == sliced_expected_repr + assert repr(cat_index[2:]) == sliced_expected_repr def test_empty_series_name(): ps = pd.Series([], name="abc", dtype="int") gs = cudf.from_pandas(ps) - assert ps.__repr__() == gs.__repr__() + assert repr(ps) == repr(gs) def test_repr_struct_after_concat(): @@ -1493,4 +1493,4 @@ def test_repr_struct_after_concat(): ) pdf = df.to_pandas() - assert df.__repr__() == pdf.__repr__() + assert repr(df) == 
repr(pdf) diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 87d1faf33ca..397d7f1c277 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -9,7 +9,7 @@ import cudf from cudf.core._compat import PANDAS_GE_110 -from cudf.testing._utils import assert_eq +from cudf.testing._utils import _create_pandas_series, assert_eq from cudf.testing.dataset_generator import rand_dataframe @@ -58,7 +58,7 @@ def test_rolling_series_basic(data, index, agg, nulls, center): elif nulls == "all": data = [np.nan] * len(data) - psr = cudf.utils.utils._create_pandas_series(data=data, index=index) + psr = _create_pandas_series(data, index=index) gsr = cudf.Series(psr) for window_size in range(1, len(data) + 1): for min_periods in range(1, window_size + 1): @@ -322,7 +322,7 @@ def test_rolling_getitem_window(): @pytest.mark.parametrize("center", [True, False]) def test_rollling_series_numba_udf_basic(data, index, center): - psr = cudf.utils.utils._create_pandas_series(data=data, index=index) + psr = _create_pandas_series(data, index=index) gsr = cudf.from_pandas(psr) def some_func(A): diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index e8382681820..0aceb07bd35 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -1,7 +1,6 @@ # Copyright (c) 2021-2022, NVIDIA CORPORATION. import datetime -import datetime as dt import re from decimal import Decimal @@ -11,7 +10,6 @@ import pytest import cudf -from cudf import Scalar as pycudf_scalar from cudf._lib.copying import get_element from cudf.testing._utils import ( ALL_TYPES, @@ -297,9 +295,9 @@ def test_date_duration_scalars(value): actual = s.value - if isinstance(value, dt.datetime): + if isinstance(value, datetime.datetime): expected = np.datetime64(value) - elif isinstance(value, dt.timedelta): + elif isinstance(value, datetime.timedelta): expected = np.timedelta64(value) elif isinstance(value, pd.Timestamp): expected = value.to_datetime64() @@ -344,7 +342,7 @@ def test_scalar_invalid_implicit_conversion(cls, dtype): cls(pd.NA) except TypeError as e: with pytest.raises(TypeError, match=re.escape(str(e))): - slr = pycudf_scalar(None, dtype=dtype) + slr = cudf.Scalar(None, dtype=dtype) cls(slr) @@ -354,7 +352,7 @@ def test_scalar_invalid_implicit_conversion(cls, dtype): [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], ) def test_device_scalar_direct_construction(value, decimal_type): - value = cudf.utils.utils.to_cudf_compatible_scalar(value) + value = cudf.utils.dtypes.to_cudf_compatible_scalar(value) dtype = ( value.dtype @@ -378,7 +376,7 @@ def test_device_scalar_direct_construction(value, decimal_type): @pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES) def test_construct_from_scalar(value): - value = cudf.utils.utils.to_cudf_compatible_scalar(value) + value = cudf.utils.dtypes.to_cudf_compatible_scalar(value) x = cudf.Scalar( value, value.dtype if not isinstance(value, Decimal) else None ) diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py index cd029d02d79..d3433a589a7 100644 --- a/python/cudf/cudf/tests/test_search.py +++ b/python/cudf/cudf/tests/test_search.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
import cupy import numpy as np import pandas as pd @@ -73,6 +73,14 @@ def test_searchsorted_dataframe(side, multiindex): assert result == [2, 0, 4, 1] +def test_search_sorted_dataframe_unequal_number_of_columns(): + values = cudf.DataFrame({"a": [1, 0, 5, 1]}) + base = cudf.DataFrame({"a": [1, 0, 5, 1], "b": ["x", "z", "w", "a"]}) + + with pytest.raises(ValueError, match="Mismatch number of columns"): + base.searchsorted(values) + + @pytest.mark.parametrize("side", ["left", "right"]) def test_searchsorted_categorical(side): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 6f0f77f0aa2..d755ed58724 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -16,8 +16,10 @@ from cudf.testing._utils import ( NUMERIC_TYPES, TIMEDELTA_TYPES, + _create_pandas_series, assert_eq, assert_exceptions_equal, + gen_rand, ) @@ -380,7 +382,7 @@ def test_series_tolist(data): [[], [None, None], ["a"], ["a", "b", "c"] * 500, [1.0, 2.0, 0.3] * 57], ) def test_series_size(data): - psr = cudf.utils.utils._create_pandas_series(data=data) + psr = _create_pandas_series(data) gsr = cudf.Series(data) assert_eq(psr.size, gsr.size) @@ -1593,7 +1595,7 @@ def test_series_nunique_index(data): ) def test_isin_numeric(data, values): index = np.random.randint(0, 100, len(data)) - psr = cudf.utils.utils._create_pandas_series(data=data, index=index) + psr = _create_pandas_series(data, index=index) gsr = cudf.Series.from_pandas(psr, nan_as_null=False) expected = psr.isin(values) @@ -1653,7 +1655,7 @@ def test_fill_new_category(): ], ) def test_isin_datetime(data, values): - psr = cudf.utils.utils._create_pandas_series(data=data) + psr = _create_pandas_series(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -1689,7 +1691,7 @@ def test_isin_datetime(data, values): ], ) def test_isin_string(data, values): - psr = cudf.utils.utils._create_pandas_series(data=data) + psr = _create_pandas_series(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) @@ -1718,9 +1720,95 @@ def test_isin_string(data, values): ], ) def test_isin_categorical(data, values): - psr = cudf.utils.utils._create_pandas_series(data=data) + psr = _create_pandas_series(data) gsr = cudf.Series.from_pandas(psr) got = gsr.isin(values) expected = psr.isin(values) assert_eq(got, expected) + + +@pytest.mark.parametrize("dtype", NUMERIC_TYPES) +@pytest.mark.parametrize("period", [-1, -5, -10, -20, 0, 1, 5, 10, 20]) +@pytest.mark.parametrize("data_empty", [False, True]) +def test_diff(dtype, period, data_empty): + if data_empty: + data = None + else: + if dtype == np.int8: + # to keep data in range + data = gen_rand(dtype, 100000, low=-2, high=2) + else: + data = gen_rand(dtype, 100000) + + gs = cudf.Series(data, dtype=dtype) + ps = pd.Series(data, dtype=dtype) + + expected_outcome = ps.diff(period) + diffed_outcome = gs.diff(period).astype(expected_outcome.dtype) + + if data_empty: + assert_eq(diffed_outcome, expected_outcome, check_index_type=False) + else: + assert_eq(diffed_outcome, expected_outcome) + + +@pytest.mark.parametrize( + "data", + [ + ["a", "b", "c", "d", "e"], + ], +) +def test_diff_unsupported_dtypes(data): + gs = cudf.Series(data) + with pytest.raises( + TypeError, + match=r"unsupported operand type\(s\)", + ): + gs.diff() + + +@pytest.mark.parametrize( + "data", + [ + pd.date_range("2020-01-01", "2020-01-06", freq="D"), + [True, True, True, False, True, True], + [1.0, 2.0, 3.5, 4.0, 5.0, -1.7], + [1, 2, 3, 3, 4, 5], + [np.nan, None, None, 
np.nan, np.nan, None], + ], +) +def test_diff_many_dtypes(data): + ps = pd.Series(data) + gs = cudf.from_pandas(ps) + assert_eq(ps.diff(), gs.diff()) + assert_eq(ps.diff(periods=2), gs.diff(periods=2)) + + +@pytest.mark.parametrize("num_rows", [1, 100]) +@pytest.mark.parametrize("num_bins", [1, 10]) +@pytest.mark.parametrize("right", [True, False]) +@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) +@pytest.mark.parametrize("series_bins", [True, False]) +def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): + data = np.random.randint(0, 100, num_rows).astype(dtype) + bins = np.unique(np.sort(np.random.randint(2, 95, num_bins).astype(dtype))) + s = cudf.Series(data) + if series_bins: + s_bins = cudf.Series(bins) + indices = s.digitize(s_bins, right) + else: + indices = s.digitize(bins, right) + np.testing.assert_array_equal( + np.digitize(data, bins, right), indices.to_numpy() + ) + + +def test_series_digitize_invalid_bins(): + s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") + bins = cudf.Series([2, None, None, 50, 90], dtype="int32") + + with pytest.raises( + ValueError, match="`bins` cannot contain null entries." + ): + _ = s.digitize(bins) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 977a01952db..f134849663d 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -9,7 +9,11 @@ import cudf from cudf.datasets import randomdata -from cudf.testing._utils import assert_eq, assert_exceptions_equal +from cudf.testing._utils import ( + _create_pandas_series, + assert_eq, + assert_exceptions_equal, +) params_dtypes = [np.int32, np.uint32, np.float32, np.float64] methods = ["min", "max", "sum", "mean", "var", "std"] @@ -217,7 +221,7 @@ def test_approx_quantiles_int(): ) def test_misc_quantiles(data, q): - pdf_series = cudf.utils.utils._create_pandas_series(data=data) + pdf_series = _create_pandas_series(data) gdf_series = cudf.Series(data) expected = pdf_series.quantile(q.get() if isinstance(q, cp.ndarray) else q) @@ -239,13 +243,10 @@ def test_misc_quantiles(data, q): cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), cudf.Series([]), cudf.Series([-3]), - randomdata( - nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} - ), ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_kurtosis(data, null_flag): +def test_kurtosis_series(data, null_flag): pdata = data.to_pandas() if null_flag and len(data) > 2: @@ -262,8 +263,13 @@ def test_kurtosis(data, null_flag): expected = pdata.kurt() np.testing.assert_array_almost_equal(got, expected) + got = data.kurt(numeric_only=False) + got = got if np.isscalar(got) else got.to_numpy() + expected = pdata.kurt(numeric_only=False) + np.testing.assert_array_almost_equal(got, expected) + with pytest.raises(NotImplementedError): - data.kurt(numeric_only=False) + data.kurt(numeric_only=True) @pytest.mark.parametrize( @@ -280,13 +286,10 @@ def test_kurtosis(data, null_flag): cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), cudf.Series([]), cudf.Series([-3]), - randomdata( - nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} - ), ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_skew(data, null_flag): +def test_skew_series(data, null_flag): pdata = data.to_pandas() if null_flag and len(data) > 2: @@ -298,8 +301,13 @@ def test_skew(data, null_flag): got = got if np.isscalar(got) else got.to_numpy() np.testing.assert_array_almost_equal(got, 
expected) + got = data.skew(numeric_only=False) + expected = pdata.skew(numeric_only=False) + got = got if np.isscalar(got) else got.to_numpy() + np.testing.assert_array_almost_equal(got, expected) + with pytest.raises(NotImplementedError): - data.skew(numeric_only=False) + data.skew(numeric_only=True) @pytest.mark.parametrize("dtype", params_dtypes) @@ -480,14 +488,14 @@ def test_df_corr(method): ) @pytest.mark.parametrize("skipna", [True, False, None]) def test_nans_stats(data, ops, skipna): - psr = cudf.utils.utils._create_pandas_series(data=data) + psr = _create_pandas_series(data) gsr = cudf.Series(data, nan_as_null=False) assert_eq( getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) ) - psr = cudf.utils.utils._create_pandas_series(data=data) + psr = _create_pandas_series(data) gsr = cudf.Series(data, nan_as_null=False) # Since there is no concept of `nan_as_null` in pandas, # nulls will be returned in the operations. So only @@ -541,3 +549,62 @@ def test_cov_corr_invalid_dtypes(gsr): rfunc_args_and_kwargs=([gsr],), compare_error_message=False, ) + + +@pytest.mark.parametrize( + "data", + [ + randomdata( + nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} + ), + ], +) +@pytest.mark.parametrize("null_flag", [False, True]) +def test_kurtosis_df(data, null_flag): + pdata = data.to_pandas() + + if null_flag and len(data) > 2: + data.iloc[[0, 2]] = None + pdata.iloc[[0, 2]] = None + + got = data.kurtosis() + got = got if np.isscalar(got) else got.to_numpy() + expected = pdata.kurtosis() + np.testing.assert_array_almost_equal(got, expected) + + got = data.kurt() + got = got if np.isscalar(got) else got.to_numpy() + expected = pdata.kurt() + np.testing.assert_array_almost_equal(got, expected) + + got = data.kurt(numeric_only=True) + got = got if np.isscalar(got) else got.to_numpy() + expected = pdata.kurt(numeric_only=True) + np.testing.assert_array_almost_equal(got, expected) + + +@pytest.mark.parametrize( + "data", + [ + randomdata( + nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} + ), + ], +) +@pytest.mark.parametrize("null_flag", [False, True]) +def test_skew_df(data, null_flag): + pdata = data.to_pandas() + + if null_flag and len(data) > 2: + data.iloc[[0, 2]] = None + pdata.iloc[[0, 2]] = None + + got = data.skew() + expected = pdata.skew() + got = got if np.isscalar(got) else got.to_numpy() + np.testing.assert_array_almost_equal(got, expected) + + got = data.skew(numeric_only=True) + expected = pdata.skew(numeric_only=True) + got = got if np.isscalar(got) else got.to_numpy() + np.testing.assert_array_almost_equal(got, expected) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index b68a7562b6b..438f46d4266 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -367,9 +367,12 @@ def func(row): @pytest.mark.parametrize( - "data", [cudf.Series([1, 2, 3]), cudf.Series([1, cudf.NA, 3])] + "data,name", + [([1, 2, 3], None), ([1, cudf.NA, 3], None), ([1, 2, 3], "test_name")], ) -def test_series_apply_basic(data): +def test_series_apply_basic(data, name): + data = cudf.Series(data, name=name) + def func(x): return x + 1 diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 4796402f14d..fb6e35f4f58 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -14,27 +14,6 @@ # -@cuda.jit -def gpu_diff(in_col, out_col, out_mask, N): - """Calculate the difference 
between values at positions i and i - N in an - array and store the output in a new array. - """ - i = cuda.grid(1) - - if N > 0: - if i < in_col.size: - out_col[i] = in_col[i] - in_col[i - N] - out_mask[i] = True - if i < N: - out_mask[i] = False - else: - if i <= (in_col.size + N): - out_col[i] = in_col[i] - in_col[i - N] - out_mask[i] = True - if i >= (in_col.size + N) and i < in_col.size: - out_mask[i] = False - - # Find segments diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 4cd1738996f..35c6fdc73f8 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,6 +1,6 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -import datetime as dt +import datetime from collections import namedtuple from decimal import Decimal @@ -259,9 +259,9 @@ def to_cudf_compatible_scalar(val, dtype=None): ) or cudf.api.types.is_string_dtype(dtype): dtype = "str" - if isinstance(val, dt.datetime): + if isinstance(val, datetime.datetime): val = np.datetime64(val) - elif isinstance(val, dt.timedelta): + elif isinstance(val, datetime.timedelta): val = np.timedelta64(val) elif isinstance(val, pd.Timestamp): val = val.to_datetime64() diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py index a722d350ef4..ab3adc1651a 100644 --- a/python/cudf/cudf/utils/gpu_utils.py +++ b/python/cudf/cudf/utils/gpu_utils.py @@ -55,6 +55,12 @@ def validate_setup(): raise e # If there is no GPU detected, set `gpus_count` to -1 gpus_count = -1 + except RuntimeError as e: + # getDeviceCount() can raise a RuntimeError + # when ``libcuda.so`` is missing. + # We don't want this to propagate up to the user. + warnings.warn(str(e)) + return if gpus_count > 0: # Cupy throws RunTimeException to get GPU count, diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index cdaaff6b2af..25b3d517e1c 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -1,7 +1,7 @@ # Copyright (c) 2018-2022, NVIDIA CORPORATION. import ast -import datetime as dt +import datetime from typing import Any, Dict import numpy as np @@ -232,7 +232,7 @@ def query_execute(df, expr, callenv): name = name[len(ENVREF_PREFIX) :] try: val = envdict[name] - if isinstance(val, dt.datetime): + if isinstance(val, datetime.datetime): val = np.datetime64(val) except KeyError: msg = "{!r} not defined in the calling environment" diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 1de6a1a01ec..70273f1d949 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -1,6 +1,5 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -import decimal import functools import hashlib import os @@ -8,9 +7,7 @@ from functools import partial from typing import FrozenSet, Set, Union -import cupy as cp import numpy as np -import pandas as pd from nvtx import annotate import rmm @@ -18,7 +15,6 @@ import cudf from cudf.core import column from cudf.core.buffer import Buffer -from cudf.utils.dtypes import to_cudf_compatible_scalar # The size of the mask in bytes mask_dtype = cudf.dtype(np.int32) @@ -176,41 +172,6 @@ def wrapper(*args, **kwargs): return wrapper -# TODO: We should evaluate whether calls to this could be more easily replaced -# with column.full, which appears to be significantly faster in simple cases. 
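For context on the TODO above, here is a minimal sketch of the `column.full` pattern it recommends, reusing the same internal call that appears in the removed `scalar_broadcast_to` helper below. The import path and keyword names are taken from this codebase's internals, not a public API:

    from cudf.core import column

    # Broadcast the scalar 1.5 to a 4-row column in one call, the
    # replacement the TODO suggests for scalar_broadcast_to.
    col = column.full(size=4, fill_value=1.5, dtype="float64")
    assert col.size == 4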
-def scalar_broadcast_to(scalar, size, dtype=None): - - if isinstance(size, (tuple, list)): - size = size[0] - - if cudf._lib.scalar._is_null_host_scalar(scalar): - if dtype is None: - dtype = "object" - return column.column_empty(size, dtype=dtype, masked=True) - - if isinstance(scalar, pd.Categorical): - if dtype is None: - return _categorical_scalar_broadcast_to(scalar, size) - else: - return scalar_broadcast_to(scalar.categories[0], size).astype( - dtype - ) - - if isinstance(scalar, decimal.Decimal): - if dtype is None: - dtype = cudf.Decimal128Dtype._from_decimal(scalar) - - out_col = column.column_empty(size, dtype=dtype) - if out_col.size != 0: - out_col[:] = scalar - return out_col - - scalar = to_cudf_compatible_scalar(scalar, dtype=dtype) - dtype = scalar.dtype - - return cudf.core.column.full(size=size, fill_value=scalar, dtype=dtype) - - def initfunc(f): """ Decorator for initialization functions that should @@ -228,19 +189,6 @@ def wrapper(*args, **kwargs): return wrapper -# taken from dask array -# https://github.com/dask/dask/blob/master/dask/array/utils.py#L352-L363 -def _is_nep18_active(): - class A: - def __array_function__(self, *args, **kwargs): - return True - - try: - return np.concatenate([A()]) - except ValueError: - return False - - @initfunc def set_allocator( allocator="default", @@ -274,9 +222,6 @@ def set_allocator( ) -IS_NEP18_ACTIVE = _is_nep18_active() - - class GetAttrGetItemMixin: """This mixin changes `__getattr__` to attempt a `__getitem__` call. @@ -406,92 +351,6 @@ def search_range(start, stop, x, step=1, side="left"): return max(min(length, i), 0) -def _categorical_scalar_broadcast_to(cat_scalar, size): - if isinstance(cat_scalar, (cudf.Series, pd.Series)): - cats = cat_scalar.cat.categories - code = cat_scalar.cat.codes[0] - ordered = cat_scalar.cat.ordered - else: - # handles pd.Categorical, cudf.categorical.CategoricalColumn - cats = cat_scalar.categories - code = cat_scalar.codes[0] - ordered = cat_scalar.ordered - - cats = column.as_column(cats) - codes = scalar_broadcast_to(code, size) - - return column.build_categorical_column( - categories=cats, - codes=codes, - mask=codes.base_mask, - size=codes.size, - offset=codes.offset, - ordered=ordered, - ) - - -def _create_pandas_series( - data=None, index=None, dtype=None, name=None, copy=False, fastpath=False -): - """ - Wrapper to create a Pandas Series. If the length of data is 0 and - dtype is not passed, this wrapper defaults the dtype to `float64`. - - Parameters - ---------- - data : array-like, Iterable, dict, or scalar value - Contains data stored in Series. If data is a dict, argument - order is maintained. - index : array-like or Index (1d) - Values must be hashable and have the same length as data. - Non-unique index values are allowed. Will default to - RangeIndex (0, 1, 2, …, n) if not provided. - If data is dict-like and index is None, then the keys - in the data are used as the index. If the index is not None, - the resulting Series is reindexed with the index values. - dtype : str, numpy.dtype, or ExtensionDtype, optional - Data type for the output Series. If not specified, this - will be inferred from data. See the user guide for more usages. - name : str, optional - The name to give to the Series. - copy : bool, default False - Copy input data. 
- - Returns - ------- - pd.Series - """ - if (data is None or len(data) == 0) and dtype is None: - dtype = "float64" - return pd.Series( - data=data, - index=index, - dtype=dtype, - name=name, - copy=copy, - fastpath=fastpath, - ) - - -def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: - """Makes best effort to convert an array of indices into a python slice. - If the conversion is not possible, return input. `indices` are expected - to be valid. - """ - # TODO: improve efficiency by avoiding sync. - if len(indices) == 1: - x = indices[0].item() - return slice(x, x + 1) - if len(indices) == 2: - x1, x2 = indices[0].item(), indices[1].item() - return slice(x1, x2 + 1, x2 - x1) - start, step = indices[0].item(), (indices[1] - indices[0]).item() - stop = start + step * len(indices) - if (indices == cp.arange(start, stop, step)).all(): - return slice(start, stop, step) - return indices - - def _get_color_for_nvtx(name): m = hashlib.sha256() m.update(name.encode()) diff --git a/python/cudf/setup.py b/python/cudf/setup.py index 2ec9909dd6f..a447fcfe027 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -47,7 +47,8 @@ "pytest", "pytest-benchmark", "pytest-xdist", - "hypothesis" "mimesis", + "hypothesis", + "mimesis", "fastavro>=0.22.9", "python-snappy>=0.6.0", "pyorc", diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index 3c11fe3ffbb..f5df0e261c9 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -79,7 +79,7 @@ def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): ex = set(columns) - set(schema) if ex: raise ValueError( - "Requested columns ({ex}) not in schema ({set(schema)})" + f"Requested columns ({ex}) not in schema ({set(schema)})" ) else: columns = list(schema) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 042759f68cf..b201626becf 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -177,65 +177,98 @@ def read_partition( strings_to_cats = kwargs.get("strings_to_categorical", False) read_kwargs = kwargs.get("read", {}) read_kwargs.update(open_file_options or {}) - - # Assume multi-piece read - paths = [] - rgs = [] - last_partition_keys = None - dfs = [] - - for i, piece in enumerate(pieces): - - (path, row_group, partition_keys) = piece - row_group = None if row_group == [None] else row_group - - if i > 0 and partition_keys != last_partition_keys: - dfs.append( - cls._read_paths( - paths, - fs, - columns=read_columns, - row_groups=rgs if rgs else None, - strings_to_categorical=strings_to_cats, - partitions=partitions, - partitioning=partitioning, - partition_keys=last_partition_keys, - **read_kwargs, + check_file_size = read_kwargs.pop("check_file_size", None) + + # Wrap reading logic in a `try` block so that we can + # inform the user that the `read_parquet` partition + # size is too large for the available memory + try: + + # Assume multi-piece read + paths = [] + rgs = [] + last_partition_keys = None + dfs = [] + + for i, piece in enumerate(pieces): + + (path, row_group, partition_keys) = piece + row_group = None if row_group == [None] else row_group + + # File-size check to help "protect" users from change + # to up-stream `split_row_groups` default. 
We only
+            # check the file size if this partition corresponds
+            # to a full file, and `check_file_size` is defined
+            if check_file_size and len(pieces) == 1 and row_group is None:
+                file_size = fs.size(path)
+                if file_size > check_file_size:
+                    warnings.warn(
+                        f"A large parquet file ({file_size}B) is being "
+                        f"used to create a DataFrame partition in "
+                        f"read_parquet. This may cause out of memory "
+                        f"exceptions in operations downstream. See the "
+                        f"notes on split_row_groups in the read_parquet "
+                        f"documentation. Setting split_row_groups "
+                        f"explicitly will silence this warning."
+                    )
+
+            if i > 0 and partition_keys != last_partition_keys:
+                dfs.append(
+                    cls._read_paths(
+                        paths,
+                        fs,
+                        columns=read_columns,
+                        row_groups=rgs if rgs else None,
+                        strings_to_categorical=strings_to_cats,
+                        partitions=partitions,
+                        partitioning=partitioning,
+                        partition_keys=last_partition_keys,
+                        **read_kwargs,
+                    )
                 )
+                paths = rgs = []
+                last_partition_keys = None
+            paths.append(path)
+            rgs.append(
+                [row_group]
+                if not isinstance(row_group, list)
+                and row_group is not None
+                else row_group
             )
-            paths = rgs = []
-            last_partition_keys = None
-        paths.append(path)
-        rgs.append(
-            [row_group]
-            if not isinstance(row_group, list) and row_group is not None
-            else row_group
-        )
-        last_partition_keys = partition_keys
+            last_partition_keys = partition_keys

-        dfs.append(
-            cls._read_paths(
-                paths,
-                fs,
-                columns=read_columns,
-                row_groups=rgs if rgs else None,
-                strings_to_categorical=strings_to_cats,
-                partitions=partitions,
-                partitioning=partitioning,
-                partition_keys=last_partition_keys,
-                **read_kwargs,
+            dfs.append(
+                cls._read_paths(
+                    paths,
+                    fs,
+                    columns=read_columns,
+                    row_groups=rgs if rgs else None,
+                    strings_to_categorical=strings_to_cats,
+                    partitions=partitions,
+                    partitioning=partitioning,
+                    partition_keys=last_partition_keys,
+                    **read_kwargs,
+                )
             )
-        )

-        df = cudf.concat(dfs) if len(dfs) > 1 else dfs[0]
-
-        # Re-set "object" dtypes align with pa schema
-        set_object_dtypes_from_pa_schema(df, schema)
+            df = cudf.concat(dfs) if len(dfs) > 1 else dfs[0]

-        if index and (index[0] in df.columns):
-            df = df.set_index(index[0])
-        elif index is False and df.index.names != (None,):
-            # If index=False, we shouldn't have a named index
-            df.reset_index(inplace=True)
+            # Re-set "object" dtypes to align with the pa schema
+            set_object_dtypes_from_pa_schema(df, schema)
+
+            if index and (index[0] in df.columns):
+                df = df.set_index(index[0])
+            elif index is False and df.index.names != (None,):
+                # If index=False, we shouldn't have a named index
+                df.reset_index(inplace=True)
+
+        except MemoryError as err:
+            raise MemoryError(
+                "Parquet data was larger than the available GPU memory!\n\n"
+                "See the notes on split_row_groups in the read_parquet "
+                "documentation.\n\n"
+                "Original Error: " + str(err)
+            ) from err

         return df

@@ -349,25 +382,34 @@ def set_object_dtypes_from_pa_schema(df, schema):
             df._data[col_name] = col.astype(typ)


-def read_parquet(
-    path,
-    columns=None,
-    split_row_groups=None,
-    row_groups_per_part=None,
-    **kwargs,
-):
+def read_parquet(path, columns=None, **kwargs):
     """Read parquet files into a Dask DataFrame

-    Calls ``dask.dataframe.read_parquet`` to cordinate the execution of
-    ``cudf.read_parquet``, and ultimately read multiple partitions into
-    a single Dask dataframe. The Dask version must supply an
-    ``ArrowDatasetEngine`` class to support full functionality.
-    See ``cudf.read_parquet`` and Dask documentation for further details.
+    Calls ``dask.dataframe.read_parquet`` with ``engine=CudfEngine``
+    to coordinate the execution of ``cudf.read_parquet``, and to
+    ultimately create a ``dask_cudf.DataFrame`` collection.
+
+    See the ``dask.dataframe.read_parquet`` documentation for
+    all available options.

     Examples
     --------
-    >>> import dask_cudf
-    >>> df = dask_cudf.read_parquet("/path/to/dataset/")  # doctest: +SKIP
+    >>> from dask_cudf import read_parquet
+    >>> df = read_parquet("/path/to/dataset/")  # doctest: +SKIP
+
+    When dealing with one or more large parquet files having an
+    in-memory footprint greater than ~15% of device memory, the
+    ``split_row_groups`` argument should be used to map Parquet
+    **row-groups** to DataFrame partitions (instead of **files** to
+    partitions). For example, the following code will map each
+    row-group to a distinct partition:
+
+    >>> df = read_parquet(..., split_row_groups=True)  # doctest: +SKIP
+
+    To map **multiple** row-groups to each partition, an integer can be
+    passed to ``split_row_groups`` to specify the **maximum** number of
+    row-groups allowed in each output partition:
+
+    >>> df = read_parquet(..., split_row_groups=10)  # doctest: +SKIP

     See Also
     --------
@@ -376,22 +418,24 @@
     if isinstance(columns, str):
         columns = [columns]

-    if row_groups_per_part:
-        warnings.warn(
-            "row_groups_per_part is deprecated. "
-            "Pass an integer value to split_row_groups instead.",
-            FutureWarning,
-        )
-        if split_row_groups is None:
-            split_row_groups = row_groups_per_part
-
-    return dd.read_parquet(
-        path,
-        columns=columns,
-        split_row_groups=split_row_groups,
-        engine=CudfEngine,
-        **kwargs,
-    )
+    # Set "check_file_size" option to determine whether we
+    # should check the parquet-file size. This check is meant
+    # to "protect" users from `split_row_groups` default changes
+    check_file_size = kwargs.pop("check_file_size", 500_000_000)
+    if (
+        check_file_size
+        and ("split_row_groups" not in kwargs)
+        and ("chunksize" not in kwargs)
+    ):
+        # User is not specifying `split_row_groups` or `chunksize`,
+        # so we should warn them if/when a file is ~>0.5GB on disk.
+        # They can set `split_row_groups` explicitly to silence/skip
+        # this check
+        if "read" not in kwargs:
+            kwargs["read"] = {}
+        kwargs["read"]["check_file_size"] = check_file_size
+
+    return dd.read_parquet(path, columns=columns, engine=CudfEngine, **kwargs)


 to_parquet = partial(dd.to_parquet, engine=CudfEngine)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index d9b8ee4595a..ef5741b0539 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -36,42 +36,55 @@
 ddf = dd.from_pandas(df, npartitions=npartitions)


-@pytest.mark.parametrize("stats", [True, False])
-def test_roundtrip_from_dask(tmpdir, stats):
+# Helper function to make it easier to handle the
+# upcoming deprecation of `gather_statistics`.
+# See: https://github.com/dask/dask/issues/8937 +# TODO: This function should be used to switch to +# the "new" `calculate_divisions` kwarg (for newer +# Dask versions) once it is introduced +def _divisions(setting): + return {"gather_statistics": setting} + + +@pytest.mark.parametrize("write_metadata_file", [True, False]) +@pytest.mark.parametrize("divisions", [True, False]) +def test_roundtrip_from_dask(tmpdir, divisions, write_metadata_file): tmpdir = str(tmpdir) - ddf.to_parquet(tmpdir, engine="pyarrow") + ddf.to_parquet( + tmpdir, write_metadata_file=write_metadata_file, engine="pyarrow" + ) files = sorted( (os.path.join(tmpdir, f) for f in os.listdir(tmpdir)), key=natural_sort_key, ) # Read list of parquet files - ddf2 = dask_cudf.read_parquet(files, gather_statistics=stats) - dd.assert_eq(ddf, ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(files, **_divisions(divisions)) + dd.assert_eq(ddf, ddf2, check_divisions=divisions) # Specify columns=['x'] ddf2 = dask_cudf.read_parquet( - files, columns=["x"], gather_statistics=stats + files, columns=["x"], **_divisions(divisions) ) - dd.assert_eq(ddf[["x"]], ddf2, check_divisions=stats) + dd.assert_eq(ddf[["x"]], ddf2, check_divisions=divisions) # Specify columns='y' - ddf2 = dask_cudf.read_parquet(files, columns="y", gather_statistics=stats) - dd.assert_eq(ddf[["y"]], ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(files, columns="y", **_divisions(divisions)) + dd.assert_eq(ddf[["y"]], ddf2, check_divisions=divisions) # Now include metadata - ddf2 = dask_cudf.read_parquet(tmpdir, gather_statistics=stats) - dd.assert_eq(ddf, ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(tmpdir, **_divisions(divisions)) + dd.assert_eq(ddf, ddf2, check_divisions=divisions) # Specify columns=['x'] (with metadata) ddf2 = dask_cudf.read_parquet( - tmpdir, columns=["x"], gather_statistics=stats + tmpdir, columns=["x"], **_divisions(divisions) ) - dd.assert_eq(ddf[["x"]], ddf2, check_divisions=stats) + dd.assert_eq(ddf[["x"]], ddf2, check_divisions=divisions) # Specify columns='y' (with metadata) - ddf2 = dask_cudf.read_parquet(tmpdir, columns="y", gather_statistics=stats) - dd.assert_eq(ddf[["y"]], ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(tmpdir, columns="y", **_divisions(divisions)) + dd.assert_eq(ddf[["y"]], ddf2, check_divisions=divisions) def test_roundtrip_from_dask_index_false(tmpdir): @@ -99,8 +112,8 @@ def test_roundtrip_from_dask_cudf(tmpdir, write_meta): gddf = dask_cudf.from_dask_dataframe(ddf) gddf.to_parquet(tmpdir, write_metadata_file=write_meta) - gddf2 = dask_cudf.read_parquet(tmpdir) - dd.assert_eq(gddf, gddf2, check_divisions=write_meta) + gddf2 = dask_cudf.read_parquet(tmpdir, **_divisions(True)) + dd.assert_eq(gddf, gddf2) def test_roundtrip_none_rangeindex(tmpdir): @@ -161,21 +174,21 @@ def test_dask_timeseries_from_pandas(tmpdir): @pytest.mark.parametrize("index", [False, None]) -@pytest.mark.parametrize("stats", [False, True]) -def test_dask_timeseries_from_dask(tmpdir, index, stats): +@pytest.mark.parametrize("divisions", [False, True]) +def test_dask_timeseries_from_dask(tmpdir, index, divisions): fn = str(tmpdir) ddf2 = dask.datasets.timeseries(freq="D") ddf2.to_parquet(fn, engine="pyarrow", write_index=index) - read_df = dask_cudf.read_parquet(fn, index=index, gather_statistics=stats) + read_df = dask_cudf.read_parquet(fn, index=index, **_divisions(divisions)) dd.assert_eq( - ddf2, read_df, check_divisions=(stats and index), check_index=index + ddf2, read_df, 
+ ddf2, read_df, check_divisions=(divisions and index), check_index=index ) @pytest.mark.parametrize("index", [False, None]) -@pytest.mark.parametrize("stats", [False, True]) -def test_dask_timeseries_from_daskcudf(tmpdir, index, stats): +@pytest.mark.parametrize("divisions", [False, True]) +def test_dask_timeseries_from_daskcudf(tmpdir, index, divisions): fn = str(tmpdir) ddf2 = dask_cudf.from_cudf( @@ -183,9 +196,9 @@ ) ddf2.name = ddf2.name.astype("object") ddf2.to_parquet(fn, write_index=index) - read_df = dask_cudf.read_parquet(fn, index=index, gather_statistics=stats) + read_df = dask_cudf.read_parquet(fn, index=index, **_divisions(divisions)) dd.assert_eq( - ddf2, read_df, check_divisions=(stats and index), check_index=index + ddf2, read_df, check_divisions=(divisions and index), check_index=index ) @@ -212,17 +225,23 @@ def test_filters(tmpdir): ddf.to_parquet(tmp_path, engine="pyarrow") - a = dask_cudf.read_parquet(tmp_path, filters=[("x", ">", 4)]) + a = dask_cudf.read_parquet( + tmp_path, filters=[("x", ">", 4)], split_row_groups=True + ) assert a.npartitions == 3 assert (a.x > 3).all().compute() - b = dask_cudf.read_parquet(tmp_path, filters=[("y", "==", "c")]) + b = dask_cudf.read_parquet( + tmp_path, filters=[("y", "==", "c")], split_row_groups=True + ) assert b.npartitions == 1 b = b.compute().to_pandas() assert (b.y == "c").all() c = dask_cudf.read_parquet( - tmp_path, filters=[("y", "==", "c"), ("x", ">", 6)] + tmp_path, + filters=[("y", "==", "c"), ("x", ">", 6)], + split_row_groups=True, ) assert c.npartitions <= 1 assert not len(c) @@ -237,13 +256,17 @@ def test_filters_at_row_group_level(tmpdir): ddf.to_parquet(tmp_path, engine="pyarrow", row_group_size=10 / 5) - a = dask_cudf.read_parquet(tmp_path, filters=[("x", "==", 1)]) + a = dask_cudf.read_parquet( + tmp_path, filters=[("x", "==", 1)], split_row_groups=True + ) assert a.npartitions == 1 assert (a.shape[0] == 2).compute() ddf.to_parquet(tmp_path, engine="pyarrow", row_group_size=1) - b = dask_cudf.read_parquet(tmp_path, filters=[("x", "==", 1)]) + b = dask_cudf.read_parquet( + tmp_path, filters=[("x", "==", 1)], split_row_groups=True + ) assert b.npartitions == 1 assert (b.shape[0] == 1).compute() @@ -341,7 +364,7 @@ def test_chunksize(tmpdir, chunksize, metadata): path, chunksize=chunksize, split_row_groups=True, - gather_statistics=True, + **_divisions(True), ) ddf2.compute(scheduler="synchronous") @@ -360,8 +383,8 @@ def test_chunksize(tmpdir, chunksize, metadata): path, chunksize=chunksize, split_row_groups=True, - gather_statistics=True, aggregate_files=True, + **_divisions(True), ) dd.assert_eq(ddf1, ddf3, check_divisions=False) @@ -382,7 +405,7 @@ @pytest.mark.parametrize("row_groups", [1, 3, 10, 12]) @pytest.mark.parametrize("index", [False, True]) -def test_row_groups_per_part(tmpdir, row_groups, index): +def test_split_row_groups(tmpdir, row_groups, index): nparts = 2 df_size = 100 row_group_size = 5 @@ -410,7 +433,7 @@ ddf2 = dask_cudf.read_parquet( str(tmpdir), - row_groups_per_part=row_groups, + split_row_groups=row_groups, ) dd.assert_eq(ddf1, ddf2, check_divisions=False) @@ -448,9 +471,9 @@ def test_create_metadata_file(tmpdir, partition_on): # with the _metadata file present ddf2 = dask_cudf.read_parquet( tmpdir, - gather_statistics=True, split_row_groups=False, index="myindex", + **_divisions(True), ) if partition_on: ddf1 = df1.sort_values("b") @@ -481,7 +504,7 @@ def test_create_metadata_file_inconsistent_schema(tmpdir):
# New pyarrow-dataset base can handle an inconsistent # schema (even without a _metadata file), but computing # and dtype validation may fail - ddf1 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True) + ddf1 = dask_cudf.read_parquet(str(tmpdir), **_divisions(True)) # Add global metadata file. # Dask-CuDF can do this without requiring schema @@ -490,7 +513,7 @@ # Check that we can still read the ddf # with the _metadata file present - ddf2 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True) + ddf2 = dask_cudf.read_parquet(str(tmpdir), **_divisions(True)) # Check that the result is the same with and # without the _metadata file. Note that we must @@ -538,3 +561,12 @@ def test_cudf_list_struct_write(tmpdir): ddf.to_parquet(temp_file) new_ddf = dask_cudf.read_parquet(temp_file) dd.assert_eq(df, new_ddf) + + +def test_check_file_size(tmpdir): + # Test simple file-size check to help warn users + # of upstream change to `split_row_groups` default + fn = str(tmpdir.join("test.parquet")) + cudf.DataFrame({"a": np.arange(1000)}).to_parquet(fn) + with pytest.warns(match="large parquet file"): + dask_cudf.read_parquet(fn, check_file_size=1).compute() diff --git a/python/dask_cudf/dask_cudf/tests/test_applymap.py b/python/dask_cudf/dask_cudf/tests/test_applymap.py new file mode 100644 index 00000000000..929f00ec296 --- /dev/null +++ b/python/dask_cudf/dask_cudf/tests/test_applymap.py @@ -0,0 +1,29 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +import pytest +from pandas import NA + +from dask import dataframe as dd + +from dask_cudf.tests.utils import _make_random_frame + + +@pytest.mark.parametrize( + "func", + [ + lambda x: x + 1, + lambda x: x - 0.5, + lambda x: 2 if x is NA else 2 + (x + 1) / 4.1, + lambda x: 42, + ], +) +@pytest.mark.parametrize("has_na", [True, False]) +def test_applymap_basic(func, has_na): + size = 2000 + pdf, dgdf = _make_random_frame(size, include_na=has_na) + + dpdf = dd.from_pandas(pdf, npartitions=dgdf.npartitions) + + expect = dpdf.applymap(func) + got = dgdf.applymap(func) + dd.assert_eq(expect, got, check_dtype=False) diff --git a/python/dask_cudf/dask_cudf/tests/test_binops.py b/python/dask_cudf/dask_cudf/tests/test_binops.py index 64b7cc85971..87bd401accd 100644 --- a/python/dask_cudf/dask_cudf/tests/test_binops.py +++ b/python/dask_cudf/dask_cudf/tests/test_binops.py @@ -1,3 +1,5 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + import operator import numpy as np @@ -8,6 +10,8 @@ import cudf + +from dask_cudf.tests.utils import _make_random_frame + def _make_empty_frame(npartitions=2): df = pd.DataFrame({"x": [], "y": []}) @@ -16,15 +20,6 @@ return dgf -def _make_random_frame(nelem, npartitions=2): - df = pd.DataFrame( - {"x": np.random.random(size=nelem), "y": np.random.random(size=nelem)} - ) - gdf = cudf.DataFrame.from_pandas(df) - dgf = dd.from_pandas(gdf, npartitions=npartitions) - return df, dgf - - def _make_random_frame_float(nelem, npartitions=2): df = pd.DataFrame( { diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py new file mode 100644 index 00000000000..88a2116fb0a --- /dev/null +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -0,0 +1,21 @@ +# Copyright (c) 2022, NVIDIA CORPORATION.
+ +import numpy as np +import pandas as pd + +import dask.dataframe as dd + +import cudf + + +def _make_random_frame(nelem, npartitions=2, include_na=False): + df = pd.DataFrame( + {"x": np.random.random(size=nelem), "y": np.random.random(size=nelem)} + ) + + if include_na: + df["x"][::2] = pd.NA + + gdf = cudf.DataFrame.from_pandas(df) + dgf = dd.from_pandas(gdf, npartitions=npartitions) + return df, dgf
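The `_divisions` helper added to test_parquet.py above hard-codes the `gather_statistics` kwarg for now; its TODO anticipates switching to a `calculate_divisions` kwarg once newer Dask releases expose it. A minimal sketch of what that switch could look like follows; the cutover version is a placeholder assumption, not a confirmed release.

```python
# Hypothetical sketch of the version-gated helper anticipated by the
# TODO in test_parquet.py. "2022.6.0" is a placeholder cutover; the
# real threshold depends on the Dask release that replaces
# `gather_statistics` with `calculate_divisions` (see dask/dask#8937).
from packaging.version import Version

import dask


def _divisions(setting):
    if Version(dask.__version__) >= Version("2022.6.0"):
        # Newer Dask: read_parquet takes `calculate_divisions` instead.
        return {"calculate_divisions": setting}
    return {"gather_statistics": setting}
```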
\n", - " Comm: tcp://127.0.0.1:41341\n", + " Comm: tcp://127.0.0.1:44033\n", " \n", " Total threads: 1\n", @@ -6166,31 +6201,31 @@ "
\n", - " Dashboard: http://127.0.0.1:39963/status\n", + " Dashboard: http://127.0.0.1:45225/status\n", " \n", - " Memory: 22.89 GiB\n", + " Memory: 62.82 GiB\n", "
\n", - " Nanny: tcp://127.0.0.1:33675\n", + " Nanny: tcp://127.0.0.1:46529\n", "
\n", - " Local directory: /home/ashwin/workspace/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-phx0wjv_\n", + " Local directory: /home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-zlsacw8_\n", "
\n", - " GPU: Quadro GV100\n", + " GPU: NVIDIA RTX A6000\n", " \n", - " GPU memory: 31.74 GiB\n", + " GPU memory: 47.54 GiB\n", "