diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 00000000000..043a93e6ff9 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,27 @@ +--- +Checks: + 'modernize-*, + -modernize-use-equals-default, + -modernize-concat-nested-namespaces, + -modernize-use-trailing-return-type' + + # -modernize-use-equals-default # auto-fix is broken (doesn't insert =default correctly) + # -modernize-concat-nested-namespaces # auto-fix is broken (can delete code) + # -modernize-use-trailing-return-type # just a preference + +WarningsAsErrors: '' +HeaderFilterRegex: '' +AnalyzeTemporaryDtors: false +FormatStyle: none +CheckOptions: + - key: modernize-loop-convert.MaxCopySize + value: '16' + - key: modernize-loop-convert.MinConfidence + value: reasonable + - key: modernize-pass-by-value.IncludeStyle + value: llvm + - key: modernize-replace-auto-ptr.IncludeStyle + value: llvm + - key: modernize-use-nullptr.NullMacros + value: 'NULL' +... diff --git a/CHANGELOG.md b/CHANGELOG.md index 68ff9abc9ea..6d4bdfb8d98 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# cuDF 22.04.00 (Date TBD) + +Please see https://github.com/rapidsai/cudf/releases/tag/v22.04.00a for the latest changes to this development branch. + # cuDF 22.02.00 (Date TBD) Please see https://github.com/rapidsai/cudf/releases/tag/v22.02.00a for the latest changes to this development branch. diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index eef8a04067d..00000000000 --- a/Dockerfile +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. 
- -# An integration test & dev container which builds and installs cuDF from main -ARG CUDA_VERSION=11.0 -ARG CUDA_SHORT_VERSION=${CUDA_VERSION} -ARG LINUX_VERSION=ubuntu18.04 -FROM nvidia/cuda:${CUDA_VERSION}-devel-${LINUX_VERSION} -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/lib -ENV DEBIAN_FRONTEND=noninteractive - -ARG CC=9 -ARG CXX=9 -RUN apt update -y --fix-missing && \ - apt upgrade -y && \ - apt install -y --no-install-recommends software-properties-common && \ - add-apt-repository ppa:ubuntu-toolchain-r/test && \ - apt update -y --fix-missing && \ - apt install -y --no-install-recommends \ - git \ - gcc-${CC} \ - g++-${CXX} \ - tzdata && \ - apt-get autoremove -y && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Install conda -ADD https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh /miniconda.sh -RUN sh /miniconda.sh -b -p /conda && /conda/bin/conda update -n base conda -ENV PATH=${PATH}:/conda/bin -# Enables "source activate conda" -SHELL ["/bin/bash", "-c"] - -# Build cuDF conda env -ARG CUDA_SHORT_VERSION -ARG PYTHON_VERSION -ENV PYTHON_VERSION=$PYTHON_VERSION -ARG NUMBA_VERSION -ENV NUMBA_VERSION=$NUMBA_VERSION -ARG NUMPY_VERSION -ENV NUMPY_VERSION=$NUMPY_VERSION -ARG PANDAS_VERSION -ENV PANDAS_VERSION=$PANDAS_VERSION -ARG PYARROW_VERSION -ENV PYARROW_VERSION=$PYARROW_VERSION -ARG CYTHON_VERSION -ENV CYTHON_VERSION=$CYTHON_VERSION -ARG CMAKE_VERSION -ENV CMAKE_VERSION=$CMAKE_VERSION -ARG CUDF_REPO=https://github.com/rapidsai/cudf -ENV CUDF_REPO=$CUDF_REPO -ARG CUDF_BRANCH=main -ENV CUDF_BRANCH=$CUDF_BRANCH - -# Add everything from the local build context -ADD . 
/cudf/ - -# Checks if local build context has the source, if not clone it then run a bash script to modify -# the environment file based on versions set in build args -RUN ls -la /cudf -RUN if [ -f /cudf/docker/package_versions.sh ]; \ - then /cudf/docker/package_versions.sh /cudf/conda/environments/cudf_dev_cuda${CUDA_SHORT_VERSION}.yml && \ - conda env create --name cudf --file /cudf/conda/environments/cudf_dev_cuda${CUDA_SHORT_VERSION}.yml ; \ - else rm -rf /cudf && \ - git clone --recurse-submodules -b ${CUDF_BRANCH} ${CUDF_REPO} /cudf && \ - /cudf/docker/package_versions.sh /cudf/conda/environments/cudf_dev_cuda${CUDA_SHORT_VERSION}.yml && \ - conda env create --name cudf --file /cudf/conda/environments/cudf_dev_cuda${CUDA_SHORT_VERSION}.yml ; \ - fi - -ENV CC=/opts/conda/envs/rapids/bin/gcc-${CC} -ENV CXX=/opts/conda/envs/rapids/bin/g++-${CXX} - -# libcudf & cudf build/install -RUN source activate cudf && \ - cd /cudf/ && \ - ./build.sh libcudf cudf diff --git a/build.sh b/build.sh index 45074a6645f..c2eba134c35 100755 --- a/build.sh +++ b/build.sh @@ -230,6 +230,7 @@ if buildAll || hasArg libcudf; then fi echo "$MSG" python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${LIB_BUILD_DIR}/ninja_log.html + cp ${LIB_BUILD_DIR}/.ninja_log ${LIB_BUILD_DIR}/ninja.log fi if [[ ${INSTALL_TARGET} != "" ]]; then diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index 534ac19ee98..178bdab0154 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -37,7 +37,7 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/" export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" # Dask & Distributed git tag -export DASK_DISTRIBUTED_GIT_TAG='main' +export DASK_DISTRIBUTED_GIT_TAG='2022.01.0' function remove_libcudf_kernel_cache_dir { EXITCODE=$? 
diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 13f7f0e6267..9fb86b0b3c5 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -14,7 +14,7 @@ LANG=C.UTF-8 . /opt/conda/etc/profile.d/conda.sh conda activate rapids -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/cmake-format-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.04/cmake-format-rapids-cmake.json export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index f23296038f2..6f19f174da0 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -85,6 +85,7 @@ if [ "$BUILD_LIBCUDF" == '1' ]; then gpuci_logger "Copying build metrics results" mkdir -p "$WORKSPACE/build-metrics" cp "$LIBCUDF_BUILD_DIR/ninja_log.html" "$WORKSPACE/build-metrics/BuildMetrics.html" + cp "$LIBCUDF_BUILD_DIR/ninja.log" "$WORKSPACE/build-metrics/ninja.log" fi gpuci_logger "Build conda pkg for libcudf_kafka" diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh index 746c0005a47..1699fc16a47 100755 --- a/ci/cpu/prebuild.sh +++ b/ci/cpu/prebuild.sh @@ -3,31 +3,11 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
set -e -DEFAULT_CUDA_VER="11.5" - -#Always upload cudf Python package +#Always upload cudf packages export UPLOAD_CUDF=1 - -#Upload libcudf once per CUDA -if [[ "$PYTHON" == "3.7" ]]; then - export UPLOAD_LIBCUDF=1 -else - export UPLOAD_LIBCUDF=0 -fi - -# upload cudf_kafka for all versions of Python -if [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then - export UPLOAD_CUDF_KAFKA=1 -else - export UPLOAD_CUDF_KAFKA=0 -fi - -#We only want to upload libcudf_kafka once per python/CUDA combo -if [[ "$PYTHON" == "3.7" ]] && [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then - export UPLOAD_LIBCUDF_KAFKA=1 -else - export UPLOAD_LIBCUDF_KAFKA=0 -fi +export UPLOAD_LIBCUDF=1 +export UPLOAD_CUDF_KAFKA=1 +export UPLOAD_LIBCUDF_KAFKA=1 if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then #If project flash is not activate, always build both diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 39a39c46eff..d5fb7451769 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -31,10 +31,10 @@ export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # Dask & Distributed git tag -export DASK_DISTRIBUTED_GIT_TAG='main' +export DASK_DISTRIBUTED_GIT_TAG='2022.01.0' # ucx-py version -export UCX_PY_VERSION='0.24.*' +export UCX_PY_VERSION='0.25.*' ################################################################################ # TRAP - Setup trap for removing jitify cache diff --git a/ci/gpu/java.sh b/ci/gpu/java.sh index b048470d155..ab5202fa9f7 100755 --- a/ci/gpu/java.sh +++ b/ci/gpu/java.sh @@ -31,7 +31,7 @@ export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # ucx-py version -export UCX_PY_VERSION='0.24.*' +export UCX_PY_VERSION='0.25.*' ################################################################################ # TRAP - Setup trap for removing jitify cache diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 
1105b9c194d..5575b69c226 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -42,10 +42,13 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' # rapids-cmake version sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake +# cmake-format rapids-cmake definitions +sed_runner 's/'"branch-.*\/cmake-format-rapids-cmake.json"'/'"branch-${NEXT_SHORT_TAG}\/cmake-format-rapids-cmake.json"'/g' ci/checks/style.sh + # doxyfile update sed_runner 's/PROJECT_NUMBER = .*/PROJECT_NUMBER = '${NEXT_FULL_TAG}'/g' cpp/doxygen/Doxyfile -# RTD update +# sphinx docs update sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/cudf/source/conf.py sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cudf/source/conf.py diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index bbbc754e850..b9577d937d9 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -10,7 +10,7 @@ dependencies: - clang=11.1.0 - clang-tools=11.1.0 - cupy>=9.5.0,<11.0.0a0 - - rmm=22.02.* + - rmm=22.04.* - cmake>=3.20.1 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 @@ -33,6 +33,7 @@ dependencies: - ipython - pandoc=<2.0.0 - cudatoolkit=11.5 + - cuda-python >=11.5,<12.0 - pip - flake8=3.8.3 - black=19.10 @@ -41,8 +42,8 @@ dependencies: - pydocstyle=6.1.1 - typing_extensions - pre-commit - - dask>=2021.11.1 - - distributed>=2021.11.1 + - dask>=2021.11.1,<=2022.01.0 + - distributed>=2021.11.1,<=2022.01.0 - streamz - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index a20749bc8c9..bd1412bc611 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -52,7 +52,7 @@ requirements: - packaging - cachetools - ptxcompiler # [linux64] # CUDA enhanced compatibility. 
See https://github.com/rapidsai/ptxcompiler - + - cuda-python >=11.5,<12.0 test: # [linux64] requires: # [linux64] - cudatoolkit {{ cuda_version }}.* # [linux64] diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index acb56c464e4..56f2730db7a 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -35,7 +35,7 @@ requirements: run: - python - libcudf_kafka {{ version }} - - python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version_numeric }}* + - python-confluent-kafka >=1.7.0,<1.8.0a0 - cudf {{ version }} test: # [linux64] diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 8bcdd1ec61e..2e8badc3a54 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -26,15 +26,15 @@ build: requirements: host: - python - - python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version_numeric }}* + - python-confluent-kafka >=1.7.0,<1.8.0a0 - cudf_kafka {{ version }} run: - python - streamz - cudf {{ version }} - - dask>=2021.11.1,<=2021.11.2 - - distributed>=2021.11.1,<=2021.11.2 - - python-confluent-kafka >=1.7.0,<1.8.0a0=py{{ py_version_numeric }}* + - dask>=2021.11.1,<=2022.01.0 + - distributed>=2021.11.1,<=2022.01.0 + - python-confluent-kafka >=1.7.0,<1.8.0a0 - cudf_kafka {{ version }} test: # [linux64] diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index fd34ff4112d..225d77729df 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -27,14 +27,14 @@ requirements: host: - python - cudf {{ version }} - - dask>=2021.11.1 - - distributed>=2021.11.1 + - dask>=2021.11.1,<=2022.01.0 + - distributed>=2021.11.1,<=2022.01.0 - cudatoolkit {{ cuda_version }} run: - python - cudf {{ version }} - - dask>=2021.11.1 - - distributed>=2021.11.1 + - dask>=2021.11.1,<=2022.01.0 + - distributed>=2021.11.1,<=2022.01.0 - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} test: 
# [linux64] diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2f51f582e12..90e94ffcc7b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF) project( CUDF - VERSION 22.02.00 + VERSION 22.04.00 LANGUAGES C CXX CUDA ) @@ -186,6 +186,8 @@ add_library( src/binaryop/compiled/Mod.cu src/binaryop/compiled/Mul.cu src/binaryop/compiled/NullEquals.cu + src/binaryop/compiled/NullLogicalOr.cu + src/binaryop/compiled/NullLogicalAnd.cu src/binaryop/compiled/NullMax.cu src/binaryop/compiled/NullMin.cu src/binaryop/compiled/PMod.cu @@ -322,11 +324,16 @@ add_library( src/jit/parser.cpp src/jit/type.cpp src/join/conditional_join.cu - src/join/mixed_join.cu src/join/cross_join.cu src/join/hash_join.cu src/join/join.cu src/join/join_utils.cu + src/join/mixed_join.cu + src/join/mixed_join_kernels.cu + src/join/mixed_join_kernels_semi.cu + src/join/mixed_join_semi.cu + src/join/mixed_join_size_kernels.cu + src/join/mixed_join_size_kernels_semi.cu src/join/semi_join.cu src/lists/contains.cu src/lists/combine/concatenate_list_elements.cu @@ -418,13 +425,11 @@ add_library( src/strings/copying/concatenate.cu src/strings/copying/copying.cu src/strings/copying/shift.cu + src/strings/count_matches.cu src/strings/extract/extract.cu src/strings/extract/extract_all.cu src/strings/filling/fill.cu src/strings/filter_chars.cu - src/strings/findall.cu - src/strings/find.cu - src/strings/find_multiple.cu src/strings/padding.cu src/strings/json/json_path.cu src/strings/regex/regcomp.cpp @@ -434,6 +439,10 @@ add_library( src/strings/replace/multi_re.cu 
src/strings/replace/replace.cu src/strings/replace/replace_re.cu + src/strings/search/findall.cu + src/strings/search/findall_record.cu + src/strings/search/find.cu + src/strings/search/find_multiple.cu src/strings/split/partition.cu src/strings/split/split.cu src/strings/split/split_record.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 370f84fc14a..57592de59af 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -14,7 +14,7 @@ find_package(Threads REQUIRED) -add_library(cudf_datagen STATIC common/generate_benchmark_input.cpp) +add_library(cudf_datagen STATIC common/generate_input.cpp) target_compile_features(cudf_datagen PUBLIC cxx_std_17 cuda_std_17) target_compile_options( @@ -39,7 +39,7 @@ target_include_directories( # Use an OBJECT library so we only compile these helper source files only once add_library( cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/base_fixture.cpp" - synchronization/synchronization.cpp io/cuio_benchmark_common.cpp + synchronization/synchronization.cpp io/cuio_common.cpp ) target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen) add_custom_command( @@ -86,194 +86,180 @@ endfunction() # ################################################################################################## # * column benchmarks ----------------------------------------------------------------------------- -ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate_benchmark.cpp) +ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate.cpp) # ################################################################################################## # * gather benchmark ------------------------------------------------------------------------------ -ConfigureBench(GATHER_BENCH copying/gather_benchmark.cu) +ConfigureBench(GATHER_BENCH copying/gather.cu) # ################################################################################################## # * scatter benchmark 
----------------------------------------------------------------------------- -ConfigureBench(SCATTER_BENCH copying/scatter_benchmark.cu) +ConfigureBench(SCATTER_BENCH copying/scatter.cu) # ################################################################################################## # * lists scatter benchmark ----------------------------------------------------------------------- -ConfigureBench(SCATTER_LISTS_BENCH lists/copying/scatter_lists_benchmark.cu) +ConfigureBench(SCATTER_LISTS_BENCH lists/copying/scatter_lists.cu) # ################################################################################################## # * contiguous_split benchmark ------------------------------------------------------------------- -ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu) +ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split.cu) # ################################################################################################## # * shift benchmark ------------------------------------------------------------------------------- -ConfigureBench(SHIFT_BENCH copying/shift_benchmark.cu) +ConfigureBench(SHIFT_BENCH copying/shift.cu) # ################################################################################################## # * copy-if-else benchmark # ----------------------------------------------------------------------------- -ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else_benchmark.cpp) +ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else.cpp) # ################################################################################################## # * transpose benchmark --------------------------------------------------------------------------- -ConfigureBench(TRANSPOSE_BENCH transpose/transpose_benchmark.cu) +ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cu) # ################################################################################################## # * apply_boolean_mask benchmark 
------------------------------------------------------------------ -ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask_benchmark.cpp) +ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp) # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- -ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchmark.cpp) +ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp) # ################################################################################################## # * join benchmark -------------------------------------------------------------------------------- -ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) -ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) +ConfigureBench(JOIN_BENCH join/left_join.cu join/conditional_join.cu) +ConfigureNVBench(JOIN_NVBENCH join/join.cu) # ################################################################################################## # * iterator benchmark ---------------------------------------------------------------------------- -ConfigureBench(ITERATOR_BENCH iterator/iterator_benchmark.cu) +ConfigureBench(ITERATOR_BENCH iterator/iterator.cu) # ################################################################################################## # * search benchmark ------------------------------------------------------------------------------ -ConfigureBench(SEARCH_BENCH search/search_benchmark.cpp) +ConfigureBench(SEARCH_BENCH search/search.cpp) # ################################################################################################## # * sort benchmark -------------------------------------------------------------------------------- -ConfigureBench( - SORT_BENCH sort/rank_benchmark.cpp sort/sort_benchmark.cpp 
sort/sort_strings_benchmark.cpp -) +ConfigureBench(SORT_BENCH sort/rank.cpp sort/sort.cpp sort/sort_strings.cpp) # ################################################################################################## # * quantiles benchmark # -------------------------------------------------------------------------------- -ConfigureBench(QUANTILES_BENCH quantiles/quantiles_benchmark.cpp) +ConfigureBench(QUANTILES_BENCH quantiles/quantiles.cpp) # ################################################################################################## # * type_dispatcher benchmark --------------------------------------------------------------------- -ConfigureBench(TYPE_DISPATCHER_BENCH type_dispatcher/type_dispatcher_benchmark.cu) +ConfigureBench(TYPE_DISPATCHER_BENCH type_dispatcher/type_dispatcher.cu) # ################################################################################################## # * reduction benchmark --------------------------------------------------------------------------- ConfigureBench( - REDUCTION_BENCH reduction/anyall_benchmark.cpp reduction/dictionary_benchmark.cpp - reduction/reduce_benchmark.cpp reduction/scan_benchmark.cpp reduction/minmax_benchmark.cpp + REDUCTION_BENCH reduction/anyall.cpp reduction/dictionary.cpp reduction/reduce.cpp + reduction/scan.cpp reduction/minmax.cpp ) # ################################################################################################## # * reduction benchmark --------------------------------------------------------------------------- -ConfigureBench(REPLACE_BENCH replace/clamp_benchmark.cpp replace/nans_benchmark.cpp) +ConfigureBench(REPLACE_BENCH replace/clamp.cpp replace/nans.cpp) # ################################################################################################## # * filling benchmark ----------------------------------------------------------------------------- -ConfigureBench(FILL_BENCH filling/repeat_benchmark.cpp) +ConfigureBench(FILL_BENCH filling/repeat.cpp) # 
################################################################################################## # * groupby benchmark ----------------------------------------------------------------------------- ConfigureBench( - GROUPBY_BENCH - groupby/group_sum_benchmark.cu - groupby/group_nth_benchmark.cu - groupby/group_shift_benchmark.cu - groupby/group_struct_benchmark.cu - groupby/group_no_requests_benchmark.cu - groupby/group_scan_benchmark.cu + GROUPBY_BENCH groupby/group_sum.cu groupby/group_nth.cu groupby/group_shift.cu + groupby/group_struct.cu groupby/group_no_requests.cu groupby/group_scan.cu ) # ################################################################################################## # * hashing benchmark ----------------------------------------------------------------------------- -ConfigureBench(HASHING_BENCH hashing/hash_benchmark.cpp hashing/partition_benchmark.cpp) +ConfigureBench(HASHING_BENCH hashing/hash.cpp hashing/partition.cpp) # ################################################################################################## # * merge benchmark ------------------------------------------------------------------------------- -ConfigureBench(MERGE_BENCH merge/merge_benchmark.cpp) +ConfigureBench(MERGE_BENCH merge/merge.cpp) # ################################################################################################## # * null_mask benchmark --------------------------------------------------------------------------- -ConfigureBench(NULLMASK_BENCH null_mask/set_null_mask_benchmark.cpp) +ConfigureBench(NULLMASK_BENCH null_mask/set_null_mask.cpp) # ################################################################################################## # * parquet writer chunks benchmark --------------------------------------------------------------- -ConfigureBench(PARQUET_WRITER_CHUNKS_BENCH io/parquet/parquet_writer_chunks_benchmark.cpp) +ConfigureBench(PARQUET_WRITER_CHUNKS_BENCH io/parquet/parquet_writer_chunks.cpp) # 
################################################################################################## # * parquet reader benchmark ---------------------------------------------------------------------- -ConfigureBench(PARQUET_READER_BENCH io/parquet/parquet_reader_benchmark.cpp) +ConfigureBench(PARQUET_READER_BENCH io/parquet/parquet_reader.cpp) # ################################################################################################## # * orc reader benchmark -------------------------------------------------------------------------- -ConfigureBench(ORC_READER_BENCH io/orc/orc_reader_benchmark.cpp) +ConfigureBench(ORC_READER_BENCH io/orc/orc_reader.cpp) # ################################################################################################## # * csv reader benchmark -------------------------------------------------------------------------- -ConfigureBench(CSV_READER_BENCH io/csv/csv_reader_benchmark.cpp) +ConfigureBench(CSV_READER_BENCH io/csv/csv_reader.cpp) # ################################################################################################## # * parquet writer benchmark ---------------------------------------------------------------------- -ConfigureBench(PARQUET_WRITER_BENCH io/parquet/parquet_writer_benchmark.cpp) +ConfigureBench(PARQUET_WRITER_BENCH io/parquet/parquet_writer.cpp) # ################################################################################################## # * orc writer benchmark -------------------------------------------------------------------------- -ConfigureBench(ORC_WRITER_BENCH io/orc/orc_writer_benchmark.cpp) +ConfigureBench(ORC_WRITER_BENCH io/orc/orc_writer.cpp) # ################################################################################################## # * csv writer benchmark -------------------------------------------------------------------------- -ConfigureBench(CSV_WRITER_BENCH io/csv/csv_writer_benchmark.cpp) +ConfigureBench(CSV_WRITER_BENCH io/csv/csv_writer.cpp) # 
################################################################################################## # * ast benchmark --------------------------------------------------------------------------------- -ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) +ConfigureBench(AST_BENCH ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- -ConfigureBench( - BINARYOP_BENCH binaryop/binaryop_benchmark.cpp binaryop/compiled_binaryop_benchmark.cpp -) +ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- ConfigureBench( - TEXT_BENCH - text/ngrams_benchmark.cpp - text/normalize_benchmark.cpp - text/normalize_spaces_benchmark.cpp - text/replace_benchmark.cpp - text/subword_benchmark.cpp - text/tokenize_benchmark.cpp + TEXT_BENCH text/ngrams.cpp text/normalize.cpp text/normalize_spaces.cpp text/replace.cpp + text/subword.cpp text/tokenize.cpp ) # ################################################################################################## # * strings benchmark ------------------------------------------------------------------- ConfigureBench( STRINGS_BENCH - string/case_benchmark.cpp - string/combine_benchmark.cpp - string/contains_benchmark.cpp - string/convert_datetime_benchmark.cpp - string/convert_durations_benchmark.cpp - string/convert_fixed_point_benchmark.cpp - string/convert_numerics_benchmark.cpp - string/copy_benchmark.cpp - string/extract_benchmark.cpp - string/factory_benchmark.cu - string/filter_benchmark.cpp - string/find_benchmark.cpp - string/repeat_strings_benchmark.cpp - string/replace_benchmark.cpp - string/replace_re_benchmark.cpp - string/split_benchmark.cpp - 
string/substring_benchmark.cpp - string/translate_benchmark.cpp - string/url_decode_benchmark.cpp + string/case.cpp + string/combine.cpp + string/contains.cpp + string/convert_datetime.cpp + string/convert_durations.cpp + string/convert_fixed_point.cpp + string/convert_numerics.cpp + string/copy.cpp + string/extract.cpp + string/factory.cu + string/filter.cpp + string/find.cpp + string/repeat_strings.cpp + string/replace.cpp + string/replace_re.cpp + string/split.cpp + string/substring.cpp + string/translate.cpp + string/url_decode.cpp ) # ################################################################################################## # * json benchmark ------------------------------------------------------------------- -ConfigureBench(JSON_BENCH string/json_benchmark.cpp) +ConfigureBench(JSON_BENCH string/json.cpp) # ################################################################################################## # * io benchmark --------------------------------------------------------------------- -ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split_benchmark.cpp) +ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split.cpp) add_custom_target( run_benchmarks diff --git a/cpp/benchmarks/ast/transform_benchmark.cpp b/cpp/benchmarks/ast/transform.cpp similarity index 100% rename from cpp/benchmarks/ast/transform_benchmark.cpp rename to cpp/benchmarks/ast/transform.cpp diff --git a/cpp/benchmarks/binaryop/binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/binaryop.cpp similarity index 100% rename from cpp/benchmarks/binaryop/binaryop_benchmark.cpp rename to cpp/benchmarks/binaryop/binaryop.cpp diff --git a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp similarity index 100% rename from cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp rename to cpp/benchmarks/binaryop/compiled_binaryop.cpp diff --git a/cpp/benchmarks/column/concatenate_benchmark.cpp 
b/cpp/benchmarks/column/concatenate.cpp similarity index 100% rename from cpp/benchmarks/column/concatenate_benchmark.cpp rename to cpp/benchmarks/column/concatenate.cpp diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_input.cpp similarity index 99% rename from cpp/benchmarks/common/generate_benchmark_input.cpp rename to cpp/benchmarks/common/generate_input.cpp index 995cea13c27..3147b21e0d7 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_input.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "generate_benchmark_input.hpp" +#include "generate_input.hpp" #include "random_distribution_factory.hpp" #include @@ -31,6 +31,7 @@ #include #include +#include #include #include #include diff --git a/cpp/benchmarks/common/generate_benchmark_input.hpp b/cpp/benchmarks/common/generate_input.hpp similarity index 98% rename from cpp/benchmarks/common/generate_benchmark_input.hpp rename to cpp/benchmarks/common/generate_input.hpp index 3dbc6561839..8261341ccfb 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -22,7 +22,7 @@ #include /** - * @file generate_benchmark_input.hpp + * @file generate_input.hpp * @brief Contains declarations of functions that generate columns filled with random data. * * Also includes the data profile descriptor classes. 
@@ -300,8 +300,8 @@ class data_profile { auto get_bool_probability() const { return bool_probability; } auto get_null_frequency() const { return null_frequency; }; - auto get_cardinality() const { return cardinality; }; - auto get_avg_run_length() const { return avg_run_length; }; + [[nodiscard]] auto get_cardinality() const { return cardinality; }; + [[nodiscard]] auto get_avg_run_length() const { return avg_run_length; }; // Users should pass integral values for bounds when setting the parameters for types that have // discrete distributions (integers, strings, lists). Otherwise the call with have no effect. diff --git a/cpp/benchmarks/common/random_distribution_factory.hpp b/cpp/benchmarks/common/random_distribution_factory.hpp index 65dc8b4dd4d..48e6855c39a 100644 --- a/cpp/benchmarks/common/random_distribution_factory.hpp +++ b/cpp/benchmarks/common/random_distribution_factory.hpp @@ -16,7 +16,7 @@ #pragma once -#include "generate_benchmark_input.hpp" +#include "generate_input.hpp" #include #include diff --git a/cpp/benchmarks/copying/contiguous_split_benchmark.cu b/cpp/benchmarks/copying/contiguous_split.cu similarity index 96% rename from cpp/benchmarks/copying/contiguous_split_benchmark.cu rename to cpp/benchmarks/copying/contiguous_split.cu index 55e1360efc8..bb6a9320c4a 100644 --- a/cpp/benchmarks/copying/contiguous_split_benchmark.cu +++ b/cpp/benchmarks/copying/contiguous_split.cu @@ -51,10 +51,12 @@ void BM_contiguous_split_common(benchmark::State& state, std::vector> columns(src_cols.size()); std::transform(src_cols.begin(), src_cols.end(), columns.begin(), [](T& in) { auto ret = in.release(); - ret->null_count(); + // computing the null count is not a part of the benchmark's target code path, and we want the + // property to be pre-computed so that we measure the performance of only the intended code path + [[maybe_unused]] auto const nulls = ret->null_count(); return ret; }); - cudf::table src_table(std::move(columns)); + auto const src_table = 
cudf::table(std::move(columns)); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 diff --git a/cpp/benchmarks/copying/copy_if_else_benchmark.cpp b/cpp/benchmarks/copying/copy_if_else.cpp similarity index 97% rename from cpp/benchmarks/copying/copy_if_else_benchmark.cpp rename to cpp/benchmarks/copying/copy_if_else.cpp index 513e4f4c179..6f3ba34e373 100644 --- a/cpp/benchmarks/copying/copy_if_else_benchmark.cpp +++ b/cpp/benchmarks/copying/copy_if_else.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/copying/gather_benchmark.cu b/cpp/benchmarks/copying/gather.cu similarity index 98% rename from cpp/benchmarks/copying/gather_benchmark.cu rename to cpp/benchmarks/copying/gather.cu index f075e9c486e..eaa201a0678 100644 --- a/cpp/benchmarks/copying/gather_benchmark.cu +++ b/cpp/benchmarks/copying/gather.cu @@ -39,7 +39,7 @@ template void BM_gather(benchmark::State& state) { const cudf::size_type source_size{(cudf::size_type)state.range(0)}; - const cudf::size_type n_cols = (cudf::size_type)state.range(1); + const auto n_cols = (cudf::size_type)state.range(1); // Every element is valid auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); diff --git a/cpp/benchmarks/copying/scatter_benchmark.cu b/cpp/benchmarks/copying/scatter.cu similarity index 98% rename from cpp/benchmarks/copying/scatter_benchmark.cu rename to cpp/benchmarks/copying/scatter.cu index 0c24dd50a13..a9ab376c8c3 100644 --- a/cpp/benchmarks/copying/scatter_benchmark.cu +++ b/cpp/benchmarks/copying/scatter.cu @@ -40,7 +40,7 @@ template void BM_scatter(benchmark::State& state) { const cudf::size_type source_size{(cudf::size_type)state.range(0)}; - const cudf::size_type n_cols = (cudf::size_type)state.range(1); + const auto n_cols = (cudf::size_type)state.range(1); // Every element is valid auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); 
diff --git a/cpp/benchmarks/copying/shift_benchmark.cu b/cpp/benchmarks/copying/shift.cu similarity index 100% rename from cpp/benchmarks/copying/shift_benchmark.cu rename to cpp/benchmarks/copying/shift.cu diff --git a/cpp/benchmarks/filling/repeat_benchmark.cpp b/cpp/benchmarks/filling/repeat.cpp similarity index 100% rename from cpp/benchmarks/filling/repeat_benchmark.cpp rename to cpp/benchmarks/filling/repeat.cpp diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index 8476a137c12..5f23cbbafdd 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -29,9 +29,12 @@ namespace { // memory resource factory helpers inline auto make_cuda() { return std::make_shared(); } -inline auto make_pool() +inline auto make_pool_instance() { - return rmm::mr::make_owning_wrapper(make_cuda()); + static rmm::mr::cuda_memory_resource cuda_mr; + static auto pool_mr = + std::make_shared>(&cuda_mr); + return pool_mr; } } // namespace @@ -68,13 +71,19 @@ inline auto make_pool() */ class benchmark : public ::benchmark::Fixture { public: - virtual void SetUp(const ::benchmark::State& state) + benchmark() : ::benchmark::Fixture() { - mr = make_pool(); + const char* env_iterations = std::getenv("CUDF_BENCHMARK_ITERATIONS"); + if (env_iterations != nullptr) { this->Iterations(std::max(0L, atol(env_iterations))); } + } + + void SetUp(const ::benchmark::State& state) override + { + mr = make_pool_instance(); rmm::mr::set_current_device_resource(mr.get()); // set default resource to pool } - virtual void TearDown(const ::benchmark::State& state) + void TearDown(const ::benchmark::State& state) override { // reset default resource to the initial resource rmm::mr::set_current_device_resource(nullptr); @@ -82,8 +91,8 @@ class benchmark : public ::benchmark::Fixture { } // eliminate partial override warnings (see benchmark/benchmark.h) - virtual void SetUp(::benchmark::State& st) { 
SetUp(const_cast(st)); } - virtual void TearDown(::benchmark::State& st) + void SetUp(::benchmark::State& st) override { SetUp(const_cast(st)); } + void TearDown(::benchmark::State& st) override { TearDown(const_cast(st)); } @@ -102,7 +111,10 @@ class memory_stats_logger { ~memory_stats_logger() { rmm::mr::set_current_device_resource(existing_mr); } - size_t peak_memory_usage() const noexcept { return statistics_mr.get_bytes_counter().peak; } + [[nodiscard]] size_t peak_memory_usage() const noexcept + { + return statistics_mr.get_bytes_counter().peak; + } private: rmm::mr::device_memory_resource* existing_mr; diff --git a/cpp/benchmarks/groupby/group_benchmark_common.hpp b/cpp/benchmarks/groupby/group_common.hpp similarity index 100% rename from cpp/benchmarks/groupby/group_benchmark_common.hpp rename to cpp/benchmarks/groupby/group_common.hpp diff --git a/cpp/benchmarks/groupby/group_no_requests_benchmark.cu b/cpp/benchmarks/groupby/group_no_requests.cu similarity index 98% rename from cpp/benchmarks/groupby/group_no_requests_benchmark.cu rename to cpp/benchmarks/groupby/group_no_requests.cu index 209155862bd..750e0c6d3b3 100644 --- a/cpp/benchmarks/groupby/group_no_requests_benchmark.cu +++ b/cpp/benchmarks/groupby/group_no_requests.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/groupby/group_nth_benchmark.cu b/cpp/benchmarks/groupby/group_nth.cu similarity index 97% rename from cpp/benchmarks/groupby/group_nth_benchmark.cu rename to cpp/benchmarks/groupby/group_nth.cu index 107b3839c4c..daeb88f6dee 100644 --- a/cpp/benchmarks/groupby/group_nth_benchmark.cu +++ b/cpp/benchmarks/groupby/group_nth.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/groupby/group_scan_benchmark.cu b/cpp/benchmarks/groupby/group_scan.cu similarity index 98% rename from cpp/benchmarks/groupby/group_scan_benchmark.cu rename to cpp/benchmarks/groupby/group_scan.cu index 
d9849e53498..9a6d7b51429 100644 --- a/cpp/benchmarks/groupby/group_scan_benchmark.cu +++ b/cpp/benchmarks/groupby/group_scan.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/groupby/group_shift_benchmark.cu b/cpp/benchmarks/groupby/group_shift.cu similarity index 97% rename from cpp/benchmarks/groupby/group_shift_benchmark.cu rename to cpp/benchmarks/groupby/group_shift.cu index 6b0710f4044..29bc99f6b61 100644 --- a/cpp/benchmarks/groupby/group_shift_benchmark.cu +++ b/cpp/benchmarks/groupby/group_shift.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/groupby/group_struct_benchmark.cu b/cpp/benchmarks/groupby/group_struct.cu similarity index 98% rename from cpp/benchmarks/groupby/group_struct_benchmark.cu rename to cpp/benchmarks/groupby/group_struct.cu index 702983a63bf..355c7cbab6c 100644 --- a/cpp/benchmarks/groupby/group_struct_benchmark.cu +++ b/cpp/benchmarks/groupby/group_struct.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include #include diff --git a/cpp/benchmarks/groupby/group_sum_benchmark.cu b/cpp/benchmarks/groupby/group_sum.cu similarity index 98% rename from cpp/benchmarks/groupby/group_sum_benchmark.cu rename to cpp/benchmarks/groupby/group_sum.cu index 63f9aa02070..4a33ddeacd4 100644 --- a/cpp/benchmarks/groupby/group_sum_benchmark.cu +++ b/cpp/benchmarks/groupby/group_sum.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/hashing/hash_benchmark.cpp b/cpp/benchmarks/hashing/hash.cpp similarity index 97% rename from cpp/benchmarks/hashing/hash_benchmark.cpp rename to cpp/benchmarks/hashing/hash.cpp index 4ccb0bfad9d..e2ad38230a2 100644 --- a/cpp/benchmarks/hashing/hash_benchmark.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/hashing/partition_benchmark.cpp b/cpp/benchmarks/hashing/partition.cpp similarity index 100% rename from cpp/benchmarks/hashing/partition_benchmark.cpp rename to cpp/benchmarks/hashing/partition.cpp diff --git a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp b/cpp/benchmarks/io/csv/csv_reader.cpp similarity index 90% rename from cpp/benchmarks/io/csv/csv_reader_benchmark.cpp rename to cpp/benchmarks/io/csv/csv_reader.cpp index 77bf4b03a14..241ba4d5954 100644 --- a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp +++ b/cpp/benchmarks/io/csv/csv_reader.cpp @@ -16,9 +16,9 @@ #include -#include +#include #include -#include +#include #include #include @@ -35,17 +35,15 @@ class CsvRead : public cudf::benchmark { void BM_csv_read_varying_input(benchmark::State& state) { - auto const data_types = get_type_or_group(state.range(0)); - io_type const source_type = static_cast(state.range(1)); + auto const data_types = get_type_or_group(state.range(0)); + auto const source_type = static_cast(state.range(1)); auto const tbl = create_random_table(data_types, num_cols, table_size_bytes{data_size}); auto const 
view = tbl->view(); cuio_source_sink_pair source_sink(source_type); cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view) - .include_header(true) - .rows_per_chunk(1 << 14); // TODO: remove once default is sensible + cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true); cudf_io::write_csv(options); cudf_io::csv_reader_options const read_options = @@ -59,6 +57,7 @@ void BM_csv_read_varying_input(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } void BM_csv_read_varying_options(benchmark::State& state) @@ -79,23 +78,22 @@ void BM_csv_read_varying_options(benchmark::State& state) auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); - std::vector csv_data; + cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(cudf_io::sink_info{&csv_data}, view) + cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view) .include_header(true) - .line_terminator("\r\n") - .rows_per_chunk(1 << 14); // TODO: remove once default is sensible + .line_terminator("\r\n"); cudf_io::write_csv(options); cudf_io::csv_reader_options read_options = - cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_data.data(), csv_data.size()}) + cudf_io::csv_reader_options::builder(source_sink.make_source_info()) .use_cols_indexes(cols_to_read) .thousands('\'') .windowslinetermination(true) .comment('#') .prefix("BM_"); - size_t const chunk_size = csv_data.size() / num_chunks; + size_t const chunk_size = source_sink.size() / num_chunks; cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ 
: state) { @@ -132,6 +130,7 @@ void BM_csv_read_varying_options(benchmark::State& state) auto const data_processed = data_size * cols_to_read.size() / view.num_columns(); state.SetBytesProcessed(data_processed * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define CSV_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \ diff --git a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp b/cpp/benchmarks/io/csv/csv_writer.cpp similarity index 89% rename from cpp/benchmarks/io/csv/csv_writer_benchmark.cpp rename to cpp/benchmarks/io/csv/csv_writer.cpp index 9baab6b2571..413a269bcb2 100644 --- a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp +++ b/cpp/benchmarks/io/csv/csv_writer.cpp @@ -16,9 +16,9 @@ #include -#include +#include #include -#include +#include #include #include @@ -35,8 +35,8 @@ class CsvWrite : public cudf::benchmark { void BM_csv_write_varying_inout(benchmark::State& state) { - auto const data_types = get_type_or_group(state.range(0)); - io_type const sink_type = static_cast(state.range(1)); + auto const data_types = get_type_or_group(state.range(0)); + auto const sink_type = static_cast(state.range(1)); auto const tbl = create_random_table(data_types, num_cols, table_size_bytes{data_size}); auto const view = tbl->view(); @@ -46,14 +46,13 @@ void BM_csv_write_varying_inout(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view) - .include_header(true) - .rows_per_chunk(1 << 14); // TODO: remove once default is sensible + cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true); cudf_io::write_csv(options); } state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = 
mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } void BM_csv_write_varying_options(benchmark::State& state) @@ -71,12 +70,12 @@ void BM_csv_write_varying_options(benchmark::State& state) auto const view = tbl->view(); std::string const na_per(na_per_len, '#'); - std::vector csv_data; + cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(cudf_io::sink_info{&csv_data}, view) + cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view) .include_header(true) .na_rep(na_per) .rows_per_chunk(rows_per_chunk); @@ -85,6 +84,7 @@ void BM_csv_write_varying_options(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define CSV_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \ diff --git a/cpp/benchmarks/io/cuio_benchmark_common.cpp b/cpp/benchmarks/io/cuio_common.cpp similarity index 89% rename from cpp/benchmarks/io/cuio_benchmark_common.cpp rename to cpp/benchmarks/io/cuio_common.cpp index 627ac9ccc04..3743be8bd5a 100644 --- a/cpp/benchmarks/io/cuio_benchmark_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -14,8 +14,9 @@ * limitations under the License. 
*/ -#include +#include +#include #include #include @@ -53,13 +54,25 @@ cudf_io::source_info cuio_source_sink_pair::make_source_info() cudf_io::sink_info cuio_source_sink_pair::make_sink_info() { switch (type) { - case io_type::VOID: return cudf_io::sink_info(); + case io_type::VOID: return cudf_io::sink_info(&void_sink); case io_type::FILEPATH: return cudf_io::sink_info(file_name); case io_type::HOST_BUFFER: return cudf_io::sink_info(&buffer); default: CUDF_FAIL("invalid output type"); } } +size_t cuio_source_sink_pair::size() +{ + switch (type) { + case io_type::VOID: return void_sink.bytes_written(); + case io_type::FILEPATH: + return static_cast( + std::ifstream(file_name, std::ifstream::ate | std::ifstream::binary).tellg()); + case io_type::HOST_BUFFER: return buffer.size(); + default: CUDF_FAIL("invalid output type"); + } +} + std::vector dtypes_for_column_selection(std::vector const& data_types, column_selection col_sel) { diff --git a/cpp/benchmarks/io/cuio_benchmark_common.hpp b/cpp/benchmarks/io/cuio_common.hpp similarity index 92% rename from cpp/benchmarks/io/cuio_benchmark_common.hpp rename to cpp/benchmarks/io/cuio_common.hpp index 7107585dbcc..c74ee191d4e 100644 --- a/cpp/benchmarks/io/cuio_benchmark_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -39,6 +39,15 @@ std::string random_file_in_dir(std::string const& dir_path); * @brief Class to create a coupled `source_info` and `sink_info` of given type. 
*/ class cuio_source_sink_pair { + class bytes_written_only_sink : public cudf::io::data_sink { + size_t _bytes_written = 0; + + public: + void host_write(void const* data, size_t size) override { _bytes_written += size; } + void flush() override {} + size_t bytes_written() override { return _bytes_written; } + }; + public: cuio_source_sink_pair(io_type type); ~cuio_source_sink_pair() @@ -66,12 +75,15 @@ class cuio_source_sink_pair { */ cudf::io::sink_info make_sink_info(); + [[nodiscard]] size_t size(); + private: static temp_directory const tmpdir; io_type const type; std::vector buffer; std::string const file_name; + bytes_written_only_sink void_sink; }; /** diff --git a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp b/cpp/benchmarks/io/orc/orc_reader.cpp similarity index 91% rename from cpp/benchmarks/io/orc/orc_reader_benchmark.cpp rename to cpp/benchmarks/io/orc/orc_reader.cpp index 6ab8d8d09c0..bb4a0ce72d8 100644 --- a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp +++ b/cpp/benchmarks/io/orc/orc_reader.cpp @@ -16,9 +16,9 @@ #include -#include +#include #include -#include +#include #include #include @@ -40,7 +40,7 @@ void BM_orc_read_varying_input(benchmark::State& state) cudf::size_type const run_length = state.range(2); cudf_io::compression_type const compression = state.range(3) ? 
cudf_io::compression_type::SNAPPY : cudf_io::compression_type::NONE; - io_type const source_type = static_cast(state.range(4)); + auto const source_type = static_cast(state.range(4)); data_profile table_data_profile; table_data_profile.set_cardinality(cardinality); @@ -66,13 +66,13 @@ void BM_orc_read_varying_input(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } -std::vector get_col_names(std::vector const& orc_data) +std::vector get_col_names(cudf_io::source_info const& source) { cudf_io::orc_reader_options const read_options = - cudf_io::orc_reader_options::builder(cudf_io::source_info{orc_data.data(), orc_data.size()}) - .num_rows(1); + cudf_io::orc_reader_options::builder(source).num_rows(1); return cudf_io::read_orc(read_options).metadata.column_names; } @@ -99,14 +99,15 @@ void BM_orc_read_varying_options(benchmark::State& state) auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); - std::vector orc_data; + cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); cudf_io::orc_writer_options options = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{&orc_data}, view); + cudf_io::orc_writer_options::builder(source_sink.make_sink_info(), view); cudf_io::write_orc(options); - auto const cols_to_read = select_column_names(get_col_names(orc_data), col_sel); + auto const cols_to_read = + select_column_names(get_col_names(source_sink.make_source_info()), col_sel); cudf_io::orc_reader_options read_options = - cudf_io::orc_reader_options::builder(cudf_io::source_info{orc_data.data(), orc_data.size()}) + cudf_io::orc_reader_options::builder(source_sink.make_source_info()) .columns(cols_to_read) .use_index(use_index) .use_np_dtypes(use_np_dtypes) @@ -148,6 +149,7 @@ void BM_orc_read_varying_options(benchmark::State& 
state) auto const data_processed = data_size * cols_to_read.size() / view.num_columns(); state.SetBytesProcessed(data_processed * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define ORC_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \ diff --git a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp similarity index 84% rename from cpp/benchmarks/io/orc/orc_writer_benchmark.cpp rename to cpp/benchmarks/io/orc/orc_writer.cpp index 933b3d02e08..50ae76e867c 100644 --- a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,11 +14,12 @@ * limitations under the License. */ +#include "cudf/io/types.hpp" #include -#include +#include #include -#include +#include #include #include @@ -40,7 +41,7 @@ void BM_orc_write_varying_inout(benchmark::State& state) cudf::size_type const run_length = state.range(2); cudf_io::compression_type const compression = state.range(3) ? 
cudf_io::compression_type::SNAPPY : cudf_io::compression_type::NONE; - io_type const sink_type = static_cast(state.range(4)); + auto const sink_type = static_cast(state.range(4)); data_profile table_data_profile; table_data_profile.set_cardinality(cardinality); @@ -61,12 +62,19 @@ void BM_orc_write_varying_inout(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } void BM_orc_write_varying_options(benchmark::State& state) { - auto const compression = static_cast(state.range(0)); - auto const enable_stats = state.range(1) != 0; + auto const compression = static_cast(state.range(0)); + auto const stats_freq = [&] { + switch (state.range(2)) { + case 0: return cudf::io::STATISTICS_NONE; + case 1: return cudf::io::ORC_STATISTICS_STRIPE; + default: return cudf::io::ORC_STATISTICS_ROW_GROUP; + } + }(); auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), @@ -85,12 +93,13 @@ void BM_orc_write_varying_options(benchmark::State& state) cudf_io::orc_writer_options const options = cudf_io::orc_writer_options::builder(source_sink.make_sink_info(), view) .compression(compression) - .enable_statistics(enable_stats); + .enable_statistics(stats_freq); cudf_io::write_orc(options); } state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define ORC_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \ @@ -113,6 +122,8 @@ BENCHMARK_DEFINE_F(OrcWrite, writer_options) BENCHMARK_REGISTER_F(OrcWrite, writer_options) ->ArgsProduct({{int32_t(cudf::io::compression_type::NONE), int32_t(cudf::io::compression_type::SNAPPY)}, - {0, 1}}) + {int32_t{cudf::io::STATISTICS_NONE}, + int32_t{cudf::io::ORC_STATISTICS_STRIPE}, + 
int32_t{cudf::io::ORC_STATISTICS_ROW_GROUP}}}) ->Unit(benchmark::kMillisecond) ->UseManualTime(); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_reader.cpp similarity index 91% rename from cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp rename to cpp/benchmarks/io/parquet/parquet_reader.cpp index 888102c03be..d7a3a668bd1 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader.cpp @@ -16,9 +16,9 @@ #include -#include +#include #include -#include +#include #include #include @@ -40,7 +40,7 @@ void BM_parq_read_varying_input(benchmark::State& state) cudf::size_type const run_length = state.range(2); cudf_io::compression_type const compression = state.range(3) ? cudf_io::compression_type::SNAPPY : cudf_io::compression_type::NONE; - io_type const source_type = static_cast(state.range(4)); + auto const source_type = static_cast(state.range(4)); data_profile table_data_profile; table_data_profile.set_cardinality(cardinality); @@ -66,14 +66,13 @@ void BM_parq_read_varying_input(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } -std::vector get_col_names(std::vector const& parquet_data) +std::vector get_col_names(cudf::io::source_info const& source) { cudf_io::parquet_reader_options const read_options = - cudf_io::parquet_reader_options::builder( - cudf_io::source_info{parquet_data.data(), parquet_data.size()}) - .num_rows(1); + cudf_io::parquet_reader_options::builder(source).num_rows(1); return cudf_io::read_parquet(read_options).metadata.column_names; } @@ -100,15 +99,15 @@ void BM_parq_read_varying_options(benchmark::State& state) auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); - std::vector 
parquet_data; + cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); cudf_io::parquet_writer_options options = - cudf_io::parquet_writer_options::builder(cudf_io::sink_info{&parquet_data}, view); + cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view); cudf_io::write_parquet(options); - auto const cols_to_read = select_column_names(get_col_names(parquet_data), col_sel); + auto const cols_to_read = + select_column_names(get_col_names(source_sink.make_source_info()), col_sel); cudf_io::parquet_reader_options read_options = - cudf_io::parquet_reader_options::builder( - cudf_io::source_info{parquet_data.data(), parquet_data.size()}) + cudf_io::parquet_reader_options::builder(source_sink.make_source_info()) .columns(cols_to_read) .convert_strings_to_categories(str_to_categories) .use_pandas_metadata(use_pandas_metadata) @@ -150,6 +149,7 @@ void BM_parq_read_varying_options(benchmark::State& state) auto const data_processed = data_size * cols_to_read.size() / view.num_columns(); state.SetBytesProcessed(data_processed * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define PARQ_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \ diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp similarity index 95% rename from cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp rename to cpp/benchmarks/io/parquet/parquet_writer.cpp index 1af7e206692..8287c27f804 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -16,9 +16,9 @@ #include -#include +#include #include -#include +#include #include #include @@ -40,7 +40,7 @@ void BM_parq_write_varying_inout(benchmark::State& state) cudf::size_type const run_length = state.range(2); cudf_io::compression_type const compression = state.range(3) ? 
cudf_io::compression_type::SNAPPY : cudf_io::compression_type::NONE; - io_type const sink_type = static_cast(state.range(4)); + auto const sink_type = static_cast(state.range(4)); data_profile table_data_profile; table_data_profile.set_cardinality(cardinality); @@ -61,6 +61,7 @@ void BM_parq_write_varying_inout(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } void BM_parq_write_varying_options(benchmark::State& state) @@ -93,6 +94,7 @@ void BM_parq_write_varying_options(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define PARQ_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \ diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp similarity index 89% rename from cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp rename to cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp index 0041af80a15..98eaba213e5 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_chunks_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp @@ -23,8 +23,9 @@ #include #include -#include +#include #include +#include #include #include @@ -48,15 +49,17 @@ void PQ_write(benchmark::State& state) cudf::table_view view = tbl->view(); auto mem_stats_logger = cudf::memory_stats_logger(); + cuio_source_sink_pair source_sink(io_type::VOID); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::parquet_writer_options opts = - cudf_io::parquet_writer_options::builder(cudf_io::sink_info(), view); + cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view); cudf_io::write_parquet(opts); } 
state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0)); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } void PQ_write_chunked(benchmark::State& state) @@ -71,10 +74,11 @@ void PQ_write_chunked(benchmark::State& state) } auto mem_stats_logger = cudf::memory_stats_logger(); + cuio_source_sink_pair source_sink(io_type::VOID); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::chunked_parquet_writer_options opts = - cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info()); + cudf_io::chunked_parquet_writer_options::builder(source_sink.make_sink_info()); cudf_io::parquet_chunked_writer writer(opts); std::for_each(tables.begin(), tables.end(), [&writer](std::unique_ptr const& tbl) { writer.write(*tbl); @@ -84,6 +88,7 @@ void PQ_write_chunked(benchmark::State& state) state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0)); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define PWBM_BENCHMARK_DEFINE(name, size, num_columns) \ diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp similarity index 98% rename from cpp/benchmarks/io/text/multibyte_split_benchmark.cpp rename to cpp/benchmarks/io/text/multibyte_split.cpp index cb8a61caa57..09c011cada1 100644 --- a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -#include +#include #include -#include +#include #include #include diff --git a/cpp/benchmarks/iterator/iterator_benchmark.cu b/cpp/benchmarks/iterator/iterator.cu similarity index 100% rename from cpp/benchmarks/iterator/iterator_benchmark.cu rename to cpp/benchmarks/iterator/iterator.cu diff --git a/cpp/benchmarks/join/conditional_join_benchmark.cu b/cpp/benchmarks/join/conditional_join.cu similarity index 99% rename from cpp/benchmarks/join/conditional_join_benchmark.cu rename to cpp/benchmarks/join/conditional_join.cu index bf078ff51eb..69fb28d29b2 100644 --- a/cpp/benchmarks/join/conditional_join_benchmark.cu +++ b/cpp/benchmarks/join/conditional_join.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include template class ConditionalJoin : public cudf::benchmark { diff --git a/cpp/benchmarks/join/join_nvbench.cu b/cpp/benchmarks/join/join.cu similarity index 99% rename from cpp/benchmarks/join/join_nvbench.cu rename to cpp/benchmarks/join/join.cu index ffb21d8594d..55a1e524479 100644 --- a/cpp/benchmarks/join/join_nvbench.cu +++ b/cpp/benchmarks/join/join.cu @@ -15,7 +15,7 @@ */ #include -#include +#include void skip_helper(nvbench::state& state) { diff --git a/cpp/benchmarks/join/join_benchmark_common.hpp b/cpp/benchmarks/join/join_common.hpp similarity index 100% rename from cpp/benchmarks/join/join_benchmark_common.hpp rename to cpp/benchmarks/join/join_common.hpp diff --git a/cpp/benchmarks/join/join_benchmark.cu b/cpp/benchmarks/join/left_join.cu similarity index 99% rename from cpp/benchmarks/join/join_benchmark.cu rename to cpp/benchmarks/join/left_join.cu index 72d9b541232..e332b70e30a 100644 --- a/cpp/benchmarks/join/join_benchmark.cu +++ b/cpp/benchmarks/join/left_join.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include template class Join : public cudf::benchmark { diff --git a/cpp/benchmarks/lists/copying/scatter_lists_benchmark.cu b/cpp/benchmarks/lists/copying/scatter_lists.cu similarity index 98% rename from cpp/benchmarks/lists/copying/scatter_lists_benchmark.cu rename to cpp/benchmarks/lists/copying/scatter_lists.cu index 49007fda7a3..22e4be9ce9d 100644 --- a/cpp/benchmarks/lists/copying/scatter_lists_benchmark.cu +++ b/cpp/benchmarks/lists/copying/scatter_lists.cu @@ -45,7 +45,7 @@ void BM_lists_scatter(::benchmark::State& state) const size_type base_size{(size_type)state.range(0)}; const size_type num_elements_per_row{(size_type)state.range(1)}; - const size_type num_rows = (size_type)ceil(double(base_size) / num_elements_per_row); + const auto num_rows = (size_type)ceil(double(base_size) / num_elements_per_row); auto source_base_col = make_fixed_width_column( data_type{type_to_id()}, base_size, mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/benchmarks/merge/merge_benchmark.cpp b/cpp/benchmarks/merge/merge.cpp similarity index 100% rename from cpp/benchmarks/merge/merge_benchmark.cpp rename to cpp/benchmarks/merge/merge.cpp diff --git a/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp b/cpp/benchmarks/null_mask/set_null_mask.cpp similarity index 100% rename from cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp rename to cpp/benchmarks/null_mask/set_null_mask.cpp diff --git a/cpp/benchmarks/quantiles/quantiles_benchmark.cpp b/cpp/benchmarks/quantiles/quantiles.cpp similarity index 98% rename from cpp/benchmarks/quantiles/quantiles_benchmark.cpp rename to cpp/benchmarks/quantiles/quantiles.cpp index fa602304dec..3ecb436d7fa 100644 --- a/cpp/benchmarks/quantiles/quantiles_benchmark.cpp +++ b/cpp/benchmarks/quantiles/quantiles.cpp @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/benchmarks/reduction/anyall_benchmark.cpp b/cpp/benchmarks/reduction/anyall.cpp similarity index 100% rename from 
cpp/benchmarks/reduction/anyall_benchmark.cpp rename to cpp/benchmarks/reduction/anyall.cpp diff --git a/cpp/benchmarks/reduction/dictionary_benchmark.cpp b/cpp/benchmarks/reduction/dictionary.cpp similarity index 100% rename from cpp/benchmarks/reduction/dictionary_benchmark.cpp rename to cpp/benchmarks/reduction/dictionary.cpp diff --git a/cpp/benchmarks/reduction/minmax_benchmark.cpp b/cpp/benchmarks/reduction/minmax.cpp similarity index 100% rename from cpp/benchmarks/reduction/minmax_benchmark.cpp rename to cpp/benchmarks/reduction/minmax.cpp diff --git a/cpp/benchmarks/reduction/reduce_benchmark.cpp b/cpp/benchmarks/reduction/reduce.cpp similarity index 100% rename from cpp/benchmarks/reduction/reduce_benchmark.cpp rename to cpp/benchmarks/reduction/reduce.cpp diff --git a/cpp/benchmarks/reduction/scan_benchmark.cpp b/cpp/benchmarks/reduction/scan.cpp similarity index 97% rename from cpp/benchmarks/reduction/scan_benchmark.cpp rename to cpp/benchmarks/reduction/scan.cpp index b2d8fcfc004..05c15a4fcb5 100644 --- a/cpp/benchmarks/reduction/scan_benchmark.cpp +++ b/cpp/benchmarks/reduction/scan.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/replace/clamp_benchmark.cpp b/cpp/benchmarks/replace/clamp.cpp similarity index 98% rename from cpp/benchmarks/replace/clamp_benchmark.cpp rename to cpp/benchmarks/replace/clamp.cpp index 4d9da4aca6d..dd8b06227bc 100644 --- a/cpp/benchmarks/replace/clamp_benchmark.cpp +++ b/cpp/benchmarks/replace/clamp.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/replace/nans_benchmark.cpp b/cpp/benchmarks/replace/nans.cpp similarity index 97% rename from cpp/benchmarks/replace/nans_benchmark.cpp rename to cpp/benchmarks/replace/nans.cpp index a337ae5e7ad..3faf217956b 100644 --- a/cpp/benchmarks/replace/nans_benchmark.cpp +++ b/cpp/benchmarks/replace/nans.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git 
a/cpp/benchmarks/search/search_benchmark.cpp b/cpp/benchmarks/search/search.cpp similarity index 100% rename from cpp/benchmarks/search/search_benchmark.cpp rename to cpp/benchmarks/search/search.cpp diff --git a/cpp/benchmarks/sort/rank_benchmark.cpp b/cpp/benchmarks/sort/rank.cpp similarity index 97% rename from cpp/benchmarks/sort/rank_benchmark.cpp rename to cpp/benchmarks/sort/rank.cpp index 60be95b9112..826740dae55 100644 --- a/cpp/benchmarks/sort/rank_benchmark.cpp +++ b/cpp/benchmarks/sort/rank.cpp @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/benchmarks/sort/sort_benchmark.cpp b/cpp/benchmarks/sort/sort.cpp similarity index 98% rename from cpp/benchmarks/sort/sort_benchmark.cpp rename to cpp/benchmarks/sort/sort.cpp index fe68ddd0051..e4c1af159aa 100644 --- a/cpp/benchmarks/sort/sort_benchmark.cpp +++ b/cpp/benchmarks/sort/sort.cpp @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/benchmarks/sort/sort_strings_benchmark.cpp b/cpp/benchmarks/sort/sort_strings.cpp similarity index 96% rename from cpp/benchmarks/sort/sort_strings_benchmark.cpp rename to cpp/benchmarks/sort/sort_strings.cpp index f5effcafcfb..8adeef21a79 100644 --- a/cpp/benchmarks/sort/sort_strings_benchmark.cpp +++ b/cpp/benchmarks/sort/sort_strings.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask_benchmark.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp similarity index 100% rename from cpp/benchmarks/stream_compaction/apply_boolean_mask_benchmark.cpp rename to cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates.cpp similarity index 100% rename from cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp rename to cpp/benchmarks/stream_compaction/drop_duplicates.cpp diff 
--git a/cpp/benchmarks/string/case_benchmark.cpp b/cpp/benchmarks/string/case.cpp similarity index 96% rename from cpp/benchmarks/string/case_benchmark.cpp rename to cpp/benchmarks/string/case.cpp index 508ae49e093..0f1653af2c6 100644 --- a/cpp/benchmarks/string/case_benchmark.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/string/combine_benchmark.cpp b/cpp/benchmarks/string/combine.cpp similarity index 97% rename from cpp/benchmarks/string/combine_benchmark.cpp rename to cpp/benchmarks/string/combine.cpp index 7dabd32e874..8983646b6f1 100644 --- a/cpp/benchmarks/string/combine_benchmark.cpp +++ b/cpp/benchmarks/string/combine.cpp @@ -17,7 +17,7 @@ #include "string_bench_args.hpp" #include -#include +#include #include #include diff --git a/cpp/benchmarks/string/contains_benchmark.cpp b/cpp/benchmarks/string/contains.cpp similarity index 96% rename from cpp/benchmarks/string/contains_benchmark.cpp rename to cpp/benchmarks/string/contains.cpp index 79bdda77634..fbcfabb4532 100644 --- a/cpp/benchmarks/string/contains_benchmark.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include @@ -46,7 +46,7 @@ static void BM_contains(benchmark::State& state, contains_type ct) cudf::strings::count_re(input, "\\d+"); break; case contains_type::findall: // returns occurrences of matches - cudf::strings::findall_re(input, "\\d+"); + cudf::strings::findall(input, "\\d+"); break; } } diff --git a/cpp/benchmarks/string/convert_datetime_benchmark.cpp b/cpp/benchmarks/string/convert_datetime.cpp similarity index 98% rename from cpp/benchmarks/string/convert_datetime_benchmark.cpp rename to cpp/benchmarks/string/convert_datetime.cpp index dc7e891286a..af51b504ee8 100644 --- a/cpp/benchmarks/string/convert_datetime_benchmark.cpp +++ b/cpp/benchmarks/string/convert_datetime.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include 
diff --git a/cpp/benchmarks/string/convert_durations_benchmark.cpp b/cpp/benchmarks/string/convert_durations.cpp similarity index 100% rename from cpp/benchmarks/string/convert_durations_benchmark.cpp rename to cpp/benchmarks/string/convert_durations.cpp diff --git a/cpp/benchmarks/string/convert_fixed_point_benchmark.cpp b/cpp/benchmarks/string/convert_fixed_point.cpp similarity index 98% rename from cpp/benchmarks/string/convert_fixed_point_benchmark.cpp rename to cpp/benchmarks/string/convert_fixed_point.cpp index 482104be436..5c050592c7b 100644 --- a/cpp/benchmarks/string/convert_fixed_point_benchmark.cpp +++ b/cpp/benchmarks/string/convert_fixed_point.cpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/benchmarks/string/convert_numerics_benchmark.cpp b/cpp/benchmarks/string/convert_numerics.cpp similarity index 98% rename from cpp/benchmarks/string/convert_numerics_benchmark.cpp rename to cpp/benchmarks/string/convert_numerics.cpp index 86f4d413974..02ccb17e74a 100644 --- a/cpp/benchmarks/string/convert_numerics_benchmark.cpp +++ b/cpp/benchmarks/string/convert_numerics.cpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/benchmarks/string/copy_benchmark.cpp b/cpp/benchmarks/string/copy.cpp similarity index 98% rename from cpp/benchmarks/string/copy_benchmark.cpp rename to cpp/benchmarks/string/copy.cpp index 23a70215015..d40b0e069bc 100644 --- a/cpp/benchmarks/string/copy_benchmark.cpp +++ b/cpp/benchmarks/string/copy.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/string/extract_benchmark.cpp b/cpp/benchmarks/string/extract.cpp similarity index 98% rename from cpp/benchmarks/string/extract_benchmark.cpp rename to cpp/benchmarks/string/extract.cpp index 7ed083d9571..b4034ff054a 100644 --- a/cpp/benchmarks/string/extract_benchmark.cpp +++ b/cpp/benchmarks/string/extract.cpp @@ -17,7 +17,7 @@ #include "string_bench_args.hpp" 
#include -#include +#include #include #include diff --git a/cpp/benchmarks/string/factory_benchmark.cu b/cpp/benchmarks/string/factory.cu similarity index 98% rename from cpp/benchmarks/string/factory_benchmark.cu rename to cpp/benchmarks/string/factory.cu index bae08431b51..2a88def1871 100644 --- a/cpp/benchmarks/string/factory_benchmark.cu +++ b/cpp/benchmarks/string/factory.cu @@ -17,7 +17,7 @@ #include "string_bench_args.hpp" #include -#include +#include #include #include diff --git a/cpp/benchmarks/string/filter_benchmark.cpp b/cpp/benchmarks/string/filter.cpp similarity index 98% rename from cpp/benchmarks/string/filter_benchmark.cpp rename to cpp/benchmarks/string/filter.cpp index 97228122c42..fb030c2ccc2 100644 --- a/cpp/benchmarks/string/filter_benchmark.cpp +++ b/cpp/benchmarks/string/filter.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/string/find_benchmark.cpp b/cpp/benchmarks/string/find.cpp similarity index 98% rename from cpp/benchmarks/string/find_benchmark.cpp rename to cpp/benchmarks/string/find.cpp index 8e570a55440..167e9bc1348 100644 --- a/cpp/benchmarks/string/find_benchmark.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/string/json_benchmark.cpp b/cpp/benchmarks/string/json.cpp similarity index 98% rename from cpp/benchmarks/string/json_benchmark.cpp rename to cpp/benchmarks/string/json.cpp index c6a6b757951..1ade4d01e1e 100644 --- a/cpp/benchmarks/string/json_benchmark.cpp +++ b/cpp/benchmarks/string/json.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/string/repeat_strings_benchmark.cpp b/cpp/benchmarks/string/repeat_strings.cpp similarity index 99% rename from cpp/benchmarks/string/repeat_strings_benchmark.cpp rename to cpp/benchmarks/string/repeat_strings.cpp index 56f342f6824..86b8525023f 100644 --- 
a/cpp/benchmarks/string/repeat_strings_benchmark.cpp +++ b/cpp/benchmarks/string/repeat_strings.cpp @@ -17,7 +17,7 @@ #include "string_bench_args.hpp" #include -#include +#include #include #include diff --git a/cpp/benchmarks/string/replace_benchmark.cpp b/cpp/benchmarks/string/replace.cpp similarity index 98% rename from cpp/benchmarks/string/replace_benchmark.cpp rename to cpp/benchmarks/string/replace.cpp index 0d785fd25aa..9be2e3a8627 100644 --- a/cpp/benchmarks/string/replace_benchmark.cpp +++ b/cpp/benchmarks/string/replace.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/string/replace_re_benchmark.cpp b/cpp/benchmarks/string/replace_re.cpp similarity index 98% rename from cpp/benchmarks/string/replace_re_benchmark.cpp rename to cpp/benchmarks/string/replace_re.cpp index 18ec28371e3..c106953bf69 100644 --- a/cpp/benchmarks/string/replace_re_benchmark.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -17,7 +17,7 @@ #include "string_bench_args.hpp" #include -#include +#include #include #include diff --git a/cpp/benchmarks/string/split_benchmark.cpp b/cpp/benchmarks/string/split.cpp similarity index 98% rename from cpp/benchmarks/string/split_benchmark.cpp rename to cpp/benchmarks/string/split.cpp index cab477754a6..fc879d1d0eb 100644 --- a/cpp/benchmarks/string/split_benchmark.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/string/substring_benchmark.cpp b/cpp/benchmarks/string/substring.cpp similarity index 98% rename from cpp/benchmarks/string/substring_benchmark.cpp rename to cpp/benchmarks/string/substring.cpp index e8a66f7b323..8864fffc40b 100644 --- a/cpp/benchmarks/string/substring_benchmark.cpp +++ b/cpp/benchmarks/string/substring.cpp @@ -17,7 +17,7 @@ #include "string_bench_args.hpp" #include -#include +#include #include #include diff --git a/cpp/benchmarks/string/translate_benchmark.cpp 
b/cpp/benchmarks/string/translate.cpp similarity index 98% rename from cpp/benchmarks/string/translate_benchmark.cpp rename to cpp/benchmarks/string/translate.cpp index 49396b0ce71..98688fa14fc 100644 --- a/cpp/benchmarks/string/translate_benchmark.cpp +++ b/cpp/benchmarks/string/translate.cpp @@ -17,7 +17,7 @@ #include "string_bench_args.hpp" #include -#include +#include #include #include diff --git a/cpp/benchmarks/string/url_decode_benchmark.cpp b/cpp/benchmarks/string/url_decode.cpp similarity index 100% rename from cpp/benchmarks/string/url_decode_benchmark.cpp rename to cpp/benchmarks/string/url_decode.cpp diff --git a/cpp/benchmarks/text/ngrams_benchmark.cpp b/cpp/benchmarks/text/ngrams.cpp similarity index 97% rename from cpp/benchmarks/text/ngrams_benchmark.cpp rename to cpp/benchmarks/text/ngrams.cpp index 52f55249631..7c39ebbb1bb 100644 --- a/cpp/benchmarks/text/ngrams_benchmark.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include diff --git a/cpp/benchmarks/text/normalize_benchmark.cpp b/cpp/benchmarks/text/normalize.cpp similarity index 98% rename from cpp/benchmarks/text/normalize_benchmark.cpp rename to cpp/benchmarks/text/normalize.cpp index f041547d021..ac8e92b3376 100644 --- a/cpp/benchmarks/text/normalize_benchmark.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include diff --git a/cpp/benchmarks/text/normalize_spaces_benchmark.cpp b/cpp/benchmarks/text/normalize_spaces.cpp similarity index 97% rename from cpp/benchmarks/text/normalize_spaces_benchmark.cpp rename to cpp/benchmarks/text/normalize_spaces.cpp index 6260bb02c55..34749b579b9 100644 --- a/cpp/benchmarks/text/normalize_spaces_benchmark.cpp +++ b/cpp/benchmarks/text/normalize_spaces.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include diff --git a/cpp/benchmarks/text/replace_benchmark.cpp b/cpp/benchmarks/text/replace.cpp similarity index 
100% rename from cpp/benchmarks/text/replace_benchmark.cpp rename to cpp/benchmarks/text/replace.cpp diff --git a/cpp/benchmarks/text/subword_benchmark.cpp b/cpp/benchmarks/text/subword.cpp similarity index 100% rename from cpp/benchmarks/text/subword_benchmark.cpp rename to cpp/benchmarks/text/subword.cpp diff --git a/cpp/benchmarks/text/tokenize_benchmark.cpp b/cpp/benchmarks/text/tokenize.cpp similarity index 98% rename from cpp/benchmarks/text/tokenize_benchmark.cpp rename to cpp/benchmarks/text/tokenize.cpp index cd6428a9406..fa3f816db59 100644 --- a/cpp/benchmarks/text/tokenize_benchmark.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include diff --git a/cpp/benchmarks/transpose/transpose_benchmark.cu b/cpp/benchmarks/transpose/transpose.cu similarity index 100% rename from cpp/benchmarks/transpose/transpose_benchmark.cu rename to cpp/benchmarks/transpose/transpose.cu diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu similarity index 97% rename from cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu rename to cpp/benchmarks/type_dispatcher/type_dispatcher.cu index 8e51bcca63d..90097889a86 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -170,11 +170,11 @@ void launch_kernel(cudf::mutable_table_view input, T** d_ptr, int work_per_threa template void type_dispatcher_benchmark(::benchmark::State& state) { - const cudf::size_type source_size = static_cast(state.range(1)); + const auto source_size = static_cast(state.range(1)); - const cudf::size_type n_cols = static_cast(state.range(0)); + const auto n_cols = static_cast(state.range(0)); - const cudf::size_type work_per_thread = static_cast(state.range(2)); + const auto work_per_thread = static_cast(state.range(2)); auto data = cudf::detail::make_counting_transform_iterator(0, [](auto 
i) { return i; }); diff --git a/cpp/cmake/libcudacxx.patch b/cpp/cmake/libcudacxx.patch deleted file mode 100644 index 3cdc40ef084..00000000000 --- a/cpp/cmake/libcudacxx.patch +++ /dev/null @@ -1,21 +0,0 @@ -diff --git a/include/cuda/std/detail/__config b/include/cuda/std/detail/__config -index d55a43688..654142d7e 100644 ---- a/include/cuda/std/detail/__config -+++ b/include/cuda/std/detail/__config -@@ -23,7 +23,7 @@ - #define _LIBCUDACXX_CUDACC_VER_MINOR __CUDACC_VER_MINOR__ - #define _LIBCUDACXX_CUDACC_VER_BUILD __CUDACC_VER_BUILD__ - #define _LIBCUDACXX_CUDACC_VER \ -- _LIBCUDACXX_CUDACC_VER_MAJOR * 10000 + _LIBCUDACXX_CUDACC_VER_MINOR * 100 + \ -+ _LIBCUDACXX_CUDACC_VER_MAJOR * 100000 + _LIBCUDACXX_CUDACC_VER_MINOR * 1000 + \ - _LIBCUDACXX_CUDACC_VER_BUILD - - #define _LIBCUDACXX_HAS_NO_LONG_DOUBLE -@@ -64,7 +64,7 @@ - # endif - #endif - --#if defined(_LIBCUDACXX_COMPILER_MSVC) || (defined(_LIBCUDACXX_CUDACC_VER) && (_LIBCUDACXX_CUDACC_VER < 110500)) -+#if defined(_LIBCUDACXX_COMPILER_MSVC) || (defined(_LIBCUDACXX_CUDACC_VER) && (_LIBCUDACXX_CUDACC_VER < 1105000)) - # define _LIBCUDACXX_HAS_NO_INT128 - #endif diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 16e7a58b020..c964c85156c 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -21,7 +21,7 @@ function(find_and_configure_cucollections) cuco 0.0 GLOBAL_TARGETS cuco::cuco CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 193de1aa74f5721717f991ca757dc610c852bb17 + GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" ) diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 0917adcd764..4b2917bc11e 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -16,11 +16,7 @@ function(find_and_configure_libcudacxx) 
include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - rapids_cpm_libcudacxx( - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports PATCH_COMMAND patch --reject-file=- -p1 -N < - ${CUDF_SOURCE_DIR}/cmake/libcudacxx.patch || true - ) + rapids_cpm_libcudacxx(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) set(LIBCUDACXX_INCLUDE_DIR "${libcudacxx_SOURCE_DIR}/include" diff --git a/cpp/docs/BENCHMARKING.md b/cpp/docs/BENCHMARKING.md index ed99ff5f1be..8794c90d1db 100644 --- a/cpp/docs/BENCHMARKING.md +++ b/cpp/docs/BENCHMARKING.md @@ -1,19 +1,24 @@ # Unit Benchmarking in libcudf -Unit benchmarks in libcudf are written using [Google Benchmark](https://github.com/google/benchmark). +Unit benchmarks in libcudf are written using [NVBench](https://github.com/NVIDIA/nvbench). +While many existing benchmarks are written using +[Google Benchmark](https://github.com/google/benchmark), new benchmarks should use NVBench. -Google Benchmark provides many options for specifying ranges of parameters to benchmarks to test -with varying parameters, as well as to control the time unit reported, among other options. Refer to -other benchmarks in `cpp/benchmarks` to understand the options. +The NVBench library is similar to Google Benchmark, but has several quality of life improvements +when doing GPU benchmarking such as displaying the fraction of peak memory bandwidth achieved and +details about the GPU hardware. + +Both NVBench and Google Benchmark provide many options for specifying ranges of parameters to +benchmark, as well as to control the time unit reported, among other options. Refer to existing +benchmarks in `cpp/benchmarks` to understand the options. ## Directory and File Naming The naming of unit benchmark directories and source files should be consistent with the feature being benchmarked. For example, the benchmarks for APIs in `copying.hpp` should live in -`cudf/cpp/benchmarks/copying`. 
Each feature (or set of related features) should have its own -benchmark source file named `_benchmark.cu/cpp`. For example, -`cudf/cpp/src/copying/scatter.cu` has benchmarks in -`cudf/cpp/benchmarks/copying/scatter_benchmark.cu`. +`cpp/benchmarks/copying`. Each feature (or set of related features) should have its own +benchmark source file named `.cu/cpp`. For example, `cpp/src/copying/scatter.cu` has +benchmarks in `cpp/benchmarks/copying/scatter.cu`. In the interest of improving compile time, whenever possible, test source files should be `.cpp` files because `nvcc` is slower than `gcc` in compiling host code. Note that `thrust::device_vector` diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md index 5e465ed6991..eeebe38d873 100644 --- a/cpp/docs/DEVELOPER_GUIDE.md +++ b/cpp/docs/DEVELOPER_GUIDE.md @@ -347,7 +347,9 @@ implemented using asynchronous APIs on the default stream (e.g., stream 0). The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API has the same parameters as the -public API, plus a `rmm::cuda_stream_view` parameter at the end defaulted to +public API, plus a `rmm::cuda_stream_view` parameter at the end with no default value. If the +detail API also accepts a memory resource parameter, the stream parameter should be ideally placed +just *before* the memory resource. The public API will call the detail API and provide `rmm::cuda_stream_default`. The implementation should be wholly contained in the `detail` API definition and use only asynchronous versions of CUDA APIs with the stream parameter. 
@@ -362,14 +364,14 @@ void external_function(...); // cpp/include/cudf/detail/header.hpp namespace detail{ -void external_function(..., rmm::cuda_stream_view stream = rmm::cuda_stream_default) +void external_function(..., rmm::cuda_stream_view stream) } // namespace detail // cudf/src/implementation.cpp namespace detail{ - // defaulted stream parameter + // Use the stream parameter in the detail implementation. void external_function(..., rmm::cuda_stream_view stream){ - // implementation uses stream w/ async APIs + // Implementation uses the stream with async APIs. rmm::device_buffer buff(...,stream); CUDA_TRY(cudaMemcpyAsync(...,stream.value())); kernel<<<..., stream>>>(...); @@ -378,8 +380,8 @@ namespace detail{ } // namespace detail void external_function(...){ - CUDF_FUNC_RANGE(); // Auto generates NVTX range for lifetime of this function - detail::external_function(...); + CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function. + detail::external_function(..., rmm::cuda_stream_default); } ``` diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 6a556bb4b34..3f98209852d 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 22.02.00 +PROJECT_NUMBER = 22.04.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -2168,7 +2168,7 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/22.02 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/22.04 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. 
See section "Linking to diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index 9bb021f1429..40718c27988 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -14,7 +14,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-22.02) +set(CUDF_TAG branch-22.04) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 374af536dc5..23587f49334 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -99,9 +99,9 @@ class aggregation { Kind kind; ///< The aggregation to perform virtual ~aggregation() = default; - virtual bool is_equal(aggregation const& other) const { return kind == other.kind; } - virtual size_t do_hash() const { return std::hash{}(kind); } - virtual std::unique_ptr clone() const = 0; + [[nodiscard]] virtual bool is_equal(aggregation const& other) const { return kind == other.kind; } + [[nodiscard]] virtual size_t do_hash() const { return std::hash{}(kind); } + [[nodiscard]] virtual std::unique_ptr clone() const = 0; // override functions for compound aggregations virtual std::vector> get_simple_aggregations( @@ -118,7 +118,7 @@ class aggregation { */ class rolling_aggregation : public virtual aggregation { public: - ~rolling_aggregation() = default; + ~rolling_aggregation() override = default; protected: rolling_aggregation() {} @@ -130,7 +130,7 @@ class rolling_aggregation : public virtual aggregation { */ class groupby_aggregation : public virtual aggregation { public: - ~groupby_aggregation() = default; + ~groupby_aggregation() override = default; protected: groupby_aggregation() {} @@ -141,7 +141,7 @@ class groupby_aggregation : public virtual aggregation { */ class groupby_scan_aggregation : public virtual aggregation { public: - ~groupby_scan_aggregation() = default; + 
~groupby_scan_aggregation() override = default; protected: groupby_scan_aggregation() {} diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh index ecd46ec2c23..2bfe1b03dd3 100644 --- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh +++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh @@ -76,7 +76,7 @@ struct expression_result { subclass().template set_value(index, result); } - __device__ inline bool is_valid() const { return subclass().is_valid(); } + [[nodiscard]] __device__ inline bool is_valid() const { return subclass().is_valid(); } __device__ inline T value() const { return subclass().value(); } }; @@ -110,7 +110,7 @@ struct value_expression_result /** * @brief Returns true if the underlying data is valid and false otherwise. */ - __device__ inline bool is_valid() const + [[nodiscard]] __device__ inline bool is_valid() const { if constexpr (has_nulls) { return _obj.has_value(); } return true; @@ -174,7 +174,7 @@ struct mutable_column_expression_result /** * @brief Not implemented for this specialization. */ - __device__ inline bool is_valid() const + [[nodiscard]] __device__ inline bool is_valid() const { // Not implemented since it would require modifying the API in the parent class to accept an // index. @@ -186,7 +186,7 @@ struct mutable_column_expression_result /** * @brief Not implemented for this specialization. */ - __device__ inline mutable_column_device_view value() const + [[nodiscard]] __device__ inline mutable_column_device_view value() const { // Not implemented since it would require modifying the API in the parent class to accept an // index. 
diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index 4f73cb1ef6e..0b54dc7e4f0 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -166,7 +166,7 @@ class expression_parser { * * @return cudf::data_type */ - cudf::data_type output_type() const; + [[nodiscard]] cudf::data_type output_type() const; /** * @brief Visit a literal expression. @@ -206,10 +206,10 @@ class expression_parser { */ class intermediate_counter { public: - intermediate_counter() : used_values(), max_used(0) {} + intermediate_counter() : used_values() {} cudf::size_type take(); void give(cudf::size_type value); - cudf::size_type get_max_used() const { return max_used; } + [[nodiscard]] cudf::size_type get_max_used() const { return max_used; } private: /** @@ -221,10 +221,10 @@ class expression_parser { * * @return cudf::size_type Smallest value not already in the container. */ - cudf::size_type find_first_missing() const; + [[nodiscard]] cudf::size_type find_first_missing() const; std::vector used_values; - cudf::size_type max_used; + cudf::size_type max_used{0}; }; expression_device_view device_expression_data; ///< The collection of data required to evaluate diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index 20aaa42fb68..eb98e0e0bee 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -38,14 +38,14 @@ class expression_parser; struct expression { virtual cudf::size_type accept(detail::expression_parser& visitor) const = 0; - bool may_evaluate_null(table_view const& left, rmm::cuda_stream_view stream) const + [[nodiscard]] bool may_evaluate_null(table_view const& left, rmm::cuda_stream_view stream) const { return may_evaluate_null(left, left, stream); } - virtual bool may_evaluate_null(table_view const& left, - table_view const& right, - rmm::cuda_stream_view stream) const = 0; 
+ [[nodiscard]] virtual bool may_evaluate_null(table_view const& left, + table_view const& right, + rmm::cuda_stream_view stream) const = 0; virtual ~expression() {} }; @@ -173,14 +173,17 @@ class literal : public expression { * * @return cudf::data_type */ - cudf::data_type get_data_type() const { return get_value().type(); } + [[nodiscard]] cudf::data_type get_data_type() const { return get_value().type(); } /** * @brief Get the value object. * * @return cudf::detail::fixed_width_scalar_device_view_base */ - cudf::detail::fixed_width_scalar_device_view_base get_value() const { return value; } + [[nodiscard]] cudf::detail::fixed_width_scalar_device_view_base get_value() const + { + return value; + } /** * @brief Accepts a visitor class. @@ -190,9 +193,9 @@ class literal : public expression { */ cudf::size_type accept(detail::expression_parser& visitor) const override; - bool may_evaluate_null(table_view const& left, - table_view const& right, - rmm::cuda_stream_view stream) const override + [[nodiscard]] bool may_evaluate_null(table_view const& left, + table_view const& right, + rmm::cuda_stream_view stream) const override { return !is_valid(stream); } @@ -202,7 +205,10 @@ class literal : public expression { * * @return bool */ - bool is_valid(rmm::cuda_stream_view stream) const { return scalar.is_valid(stream); } + [[nodiscard]] bool is_valid(rmm::cuda_stream_view stream) const + { + return scalar.is_valid(stream); + } private: cudf::scalar const& scalar; @@ -232,14 +238,14 @@ class column_reference : public expression { * * @return cudf::size_type */ - cudf::size_type get_column_index() const { return column_index; } + [[nodiscard]] cudf::size_type get_column_index() const { return column_index; } /** * @brief Get the table source. * * @return table_reference */ - table_reference get_table_source() const { return table_source; } + [[nodiscard]] table_reference get_table_source() const { return table_source; } /** * @brief Get the data type. 
@@ -247,7 +253,7 @@ class column_reference : public expression { * @param table Table used to determine types. * @return cudf::data_type */ - cudf::data_type get_data_type(table_view const& table) const + [[nodiscard]] cudf::data_type get_data_type(table_view const& table) const { return table.column(get_column_index()).type(); } @@ -259,7 +265,8 @@ class column_reference : public expression { * @param right_table Right table used to determine types. * @return cudf::data_type */ - cudf::data_type get_data_type(table_view const& left_table, table_view const& right_table) const + [[nodiscard]] cudf::data_type get_data_type(table_view const& left_table, + table_view const& right_table) const { auto const table = [&] { if (get_table_source() == table_reference::LEFT) { @@ -281,9 +288,9 @@ class column_reference : public expression { */ cudf::size_type accept(detail::expression_parser& visitor) const override; - bool may_evaluate_null(table_view const& left, - table_view const& right, - rmm::cuda_stream_view stream) const override + [[nodiscard]] bool may_evaluate_null(table_view const& left, + table_view const& right, + rmm::cuda_stream_view stream) const override { return (table_source == table_reference::LEFT ? left : right).column(column_index).has_nulls(); } @@ -327,7 +334,7 @@ class operation : public expression { * * @return ast_operator */ - ast_operator get_operator() const { return op; } + [[nodiscard]] ast_operator get_operator() const { return op; } /** * @brief Get the operands. 
@@ -344,9 +351,9 @@ class operation : public expression { */ cudf::size_type accept(detail::expression_parser& visitor) const override; - bool may_evaluate_null(table_view const& left, - table_view const& right, - rmm::cuda_stream_view stream) const override + [[nodiscard]] bool may_evaluate_null(table_view const& left, + table_view const& right, + rmm::cuda_stream_view stream) const override { return std::any_of(operands.cbegin(), operands.cend(), diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index a514010c1f0..daf55c0befe 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -72,7 +72,11 @@ enum class binary_operator : int32_t { ///< operand when one is null; or invalid when both are null GENERIC_BINARY, ///< generic binary operator to be generated with input ///< ptx code - INVALID_BINARY ///< invalid operation + NULL_LOGICAL_AND, ///< operator && with Spark rules: (null, null) is null, (null, true) is null, + ///< (null, false) is false, and (valid, valid) == LOGICAL_AND(valid, valid) + NULL_LOGICAL_OR, ///< operator || with Spark rules: (null, null) is null, (null, true) is true, + ///< (null, false) is null, and (valid, valid) == LOGICAL_OR(valid, valid) + INVALID_BINARY ///< invalid operation }; /** * @brief Performs a binary operation between a scalar and a column. 
diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index 8decce7f260..7869f9bd2aa 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -122,12 +122,12 @@ class column { /** * @brief Returns the column's logical element type */ - data_type type() const noexcept { return _type; } + [[nodiscard]] data_type type() const noexcept { return _type; } /** * @brief Returns the number of elements */ - size_type size() const noexcept { return _size; } + [[nodiscard]] size_type size() const noexcept { return _size; } /** * @brief Returns the count of null elements. @@ -137,7 +137,7 @@ class column { * first invocation of `null_count()` will compute and store the count of null * elements indicated by the `null_mask` (if it exists). */ - size_type null_count() const; + [[nodiscard]] size_type null_count() const; /** * @brief Sets the column's null value indicator bitmask to `new_null_mask`. @@ -199,7 +199,7 @@ class column { * @return true The column can hold null values * @return false The column cannot hold null values */ - bool nullable() const noexcept { return (_null_mask.size() > 0); } + [[nodiscard]] bool nullable() const noexcept { return (_null_mask.size() > 0); } /** * @brief Indicates whether the column contains null elements. 
@@ -207,12 +207,12 @@ class column { * @return true One or more elements are null * @return false Zero elements are null */ - bool has_nulls() const noexcept { return (null_count() > 0); } + [[nodiscard]] bool has_nulls() const noexcept { return (null_count() > 0); } /** * @brief Returns the number of child columns */ - size_type num_children() const noexcept { return _children.size(); } + [[nodiscard]] size_type num_children() const noexcept { return _children.size(); } /** * @brief Returns a reference to the specified child @@ -228,7 +228,10 @@ class column { * @param child_index Index of the desired child * @return column const& Const reference to the desired child */ - column const& child(size_type child_index) const noexcept { return *_children[child_index]; }; + [[nodiscard]] column const& child(size_type child_index) const noexcept + { + return *_children[child_index]; + }; /** * @brief Wrapper for the contents of a column. @@ -264,7 +267,7 @@ class column { * * @return column_view The immutable, non-owning view */ - column_view view() const; + [[nodiscard]] column_view view() const; /** * @brief Implicit conversion operator to a `column_view`. diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index b29df1852b2..d2332ef9026 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -139,12 +139,12 @@ class alignas(16) column_device_view_base { /** * @brief Returns the number of elements in the column. 
*/ - __host__ __device__ size_type size() const noexcept { return _size; } + [[nodiscard]] __host__ __device__ size_type size() const noexcept { return _size; } /** * @brief Returns the element type */ - __host__ __device__ data_type type() const noexcept { return _type; } + [[nodiscard]] __host__ __device__ data_type type() const noexcept { return _type; } /** * @brief Indicates whether the column can contain null elements, i.e., if it @@ -155,7 +155,7 @@ class alignas(16) column_device_view_base { * @return true The bitmask is allocated * @return false The bitmask is not allocated */ - __host__ __device__ bool nullable() const noexcept { return nullptr != _null_mask; } + [[nodiscard]] __host__ __device__ bool nullable() const noexcept { return nullptr != _null_mask; } /** * @brief Returns raw pointer to the underlying bitmask allocation. @@ -164,13 +164,16 @@ class alignas(16) column_device_view_base { * * @note If `null_count() == 0`, this may return `nullptr`. */ - __host__ __device__ bitmask_type const* null_mask() const noexcept { return _null_mask; } + [[nodiscard]] __host__ __device__ bitmask_type const* null_mask() const noexcept + { + return _null_mask; + } /** * @brief Returns the index of the first element relative to the base memory * allocation, i.e., what is returned from `head()`. 
*/ - __host__ __device__ size_type offset() const noexcept { return _offset; } + [[nodiscard]] __host__ __device__ size_type offset() const noexcept { return _offset; } /** * @brief Returns whether the specified element holds a valid value (i.e., not @@ -186,7 +189,7 @@ class alignas(16) column_device_view_base { * @return true The element is valid * @return false The element is null */ - __device__ bool is_valid(size_type element_index) const noexcept + [[nodiscard]] __device__ bool is_valid(size_type element_index) const noexcept { return not nullable() or is_valid_nocheck(element_index); } @@ -203,7 +206,7 @@ class alignas(16) column_device_view_base { * @return true The element is valid * @return false The element is null */ - __device__ bool is_valid_nocheck(size_type element_index) const noexcept + [[nodiscard]] __device__ bool is_valid_nocheck(size_type element_index) const noexcept { return bit_is_set(_null_mask, offset() + element_index); } @@ -221,7 +224,7 @@ class alignas(16) column_device_view_base { * @return true The element is null * @return false The element is valid */ - __device__ bool is_null(size_type element_index) const noexcept + [[nodiscard]] __device__ bool is_null(size_type element_index) const noexcept { return not is_valid(element_index); } @@ -237,7 +240,7 @@ class alignas(16) column_device_view_base { * @return true The element is null * @return false The element is valid */ - __device__ bool is_null_nocheck(size_type element_index) const noexcept + [[nodiscard]] __device__ bool is_null_nocheck(size_type element_index) const noexcept { return not is_valid_nocheck(element_index); } @@ -251,7 +254,7 @@ class alignas(16) column_device_view_base { * @param word_index The index of the word to get * @return bitmask word for the given word_index */ - __device__ bitmask_type get_mask_word(size_type word_index) const noexcept + [[nodiscard]] __device__ bitmask_type get_mask_word(size_type word_index) const noexcept { return 
null_mask()[word_index]; } @@ -476,7 +479,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * For columns with null elements, use `make_null_replacement_iterator`. */ template ())> - const_iterator begin() const + [[nodiscard]] const_iterator begin() const { return const_iterator{count_it{0}, detail::value_accessor{*this}}; } @@ -494,7 +497,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * For columns with null elements, use `make_null_replacement_iterator`. */ template ())> - const_iterator end() const + [[nodiscard]] const_iterator end() const { return const_iterator{count_it{size()}, detail::value_accessor{*this}}; } @@ -602,7 +605,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { template ())> - const_pair_iterator pair_begin() const + [[nodiscard]] const_pair_iterator pair_begin() const { return const_pair_iterator{count_it{0}, detail::pair_accessor{*this}}; @@ -632,7 +635,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { template ())> - const_pair_rep_iterator pair_rep_begin() const + [[nodiscard]] const_pair_rep_iterator pair_rep_begin() const { return const_pair_rep_iterator{count_it{0}, detail::pair_rep_accessor{*this}}; @@ -673,7 +676,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { template ())> - const_pair_iterator pair_end() const + [[nodiscard]] const_pair_iterator pair_end() const { return const_pair_iterator{count_it{size()}, detail::pair_accessor{*this}}; @@ -693,7 +696,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { template ())> - const_pair_rep_iterator pair_rep_end() const + [[nodiscard]] const_pair_rep_iterator pair_rep_end() const { return const_pair_rep_iterator{count_it{size()}, detail::pair_rep_accessor{*this}}; @@ -743,7 +746,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @param 
child_index The index of the desired child * @return column_view The requested child `column_view` */ - __device__ column_device_view child(size_type child_index) const noexcept + [[nodiscard]] __device__ column_device_view child(size_type child_index) const noexcept { return d_children[child_index]; } @@ -751,7 +754,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { /** * @brief Returns a span containing the children of this column */ - __device__ device_span children() const noexcept + [[nodiscard]] __device__ device_span children() const noexcept { return device_span(d_children, _num_children); } @@ -761,7 +764,10 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * * @return The number of child columns */ - __host__ __device__ size_type num_child_columns() const noexcept { return _num_children; } + [[nodiscard]] __host__ __device__ size_type num_child_columns() const noexcept + { + return _num_children; + } protected: column_device_view* d_children{}; ///< Array of `column_device_view` @@ -907,7 +913,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * * @note If `null_count() == 0`, this may return `nullptr`. 
*/ - __host__ __device__ bitmask_type* null_mask() const noexcept + [[nodiscard]] __host__ __device__ bitmask_type* null_mask() const noexcept { return const_cast(detail::column_device_view_base::null_mask()); } @@ -957,7 +963,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * @param child_index The index of the desired child * @return column_view The requested child `column_view` */ - __device__ mutable_column_device_view child(size_type child_index) const noexcept + [[nodiscard]] __device__ mutable_column_device_view child(size_type child_index) const noexcept { return d_children[child_index]; } diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 3f335509da8..325f023f283 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -123,17 +123,17 @@ class column_view_base { /** * @brief Returns the number of elements in the column */ - size_type size() const noexcept { return _size; } + [[nodiscard]] size_type size() const noexcept { return _size; } /** * @brief Returns true if `size()` returns zero, or false otherwise */ - size_type is_empty() const noexcept { return size() == 0; } + [[nodiscard]] size_type is_empty() const noexcept { return size() == 0; } /** * @brief Returns the element `data_type` */ - data_type type() const noexcept { return _type; } + [[nodiscard]] data_type type() const noexcept { return _type; } /** * @brief Indicates if the column can contain null elements, i.e., if it has @@ -144,7 +144,7 @@ class column_view_base { * @return true The bitmask is allocated * @return false The bitmask is not allocated */ - bool nullable() const noexcept { return nullptr != _null_mask; } + [[nodiscard]] bool nullable() const noexcept { return nullptr != _null_mask; } /** * @brief Returns the count of null elements @@ -154,7 +154,7 @@ class column_view_base { * first invocation of `null_count()` will compute and store the count of 
null * elements indicated by the `null_mask` (if it exists). */ - size_type null_count() const; + [[nodiscard]] size_type null_count() const; /** * @brief Returns the count of null elements in the range [begin, end) @@ -169,7 +169,7 @@ class column_view_base { * @param[in] begin The starting index of the range (inclusive). * @param[in] end The index of the last element in the range (exclusive). */ - size_type null_count(size_type begin, size_type end) const; + [[nodiscard]] size_type null_count(size_type begin, size_type end) const; /** * @brief Indicates if the column contains null elements, @@ -178,7 +178,7 @@ class column_view_base { * @return true One or more elements are null * @return false All elements are valid */ - bool has_nulls() const { return null_count() > 0; } + [[nodiscard]] bool has_nulls() const { return null_count() > 0; } /** * @brief Indicates if the column contains null elements in the range @@ -192,7 +192,10 @@ class column_view_base { * @return true One or more elements are null in the range [begin, end) * @return false All elements are valid in the range [begin, end) */ - bool has_nulls(size_type begin, size_type end) const { return null_count(begin, end) > 0; } + [[nodiscard]] bool has_nulls(size_type begin, size_type end) const + { + return null_count(begin, end) > 0; + } /** * @brief Returns raw pointer to the underlying bitmask allocation. @@ -201,13 +204,13 @@ class column_view_base { * * @note If `null_count() == 0`, this may return `nullptr`. */ - bitmask_type const* null_mask() const noexcept { return _null_mask; } + [[nodiscard]] bitmask_type const* null_mask() const noexcept { return _null_mask; } /** * @brief Returns the index of the first element relative to the base memory * allocation, i.e., what is returned from `head()`. 
*/ - size_type offset() const noexcept { return _offset; } + [[nodiscard]] size_type offset() const noexcept { return _offset; } protected: data_type _type{type_id::EMPTY}; ///< Element type @@ -352,12 +355,15 @@ class column_view : public detail::column_view_base { * @param child_index The index of the desired child * @return column_view The requested child `column_view` */ - column_view child(size_type child_index) const noexcept { return _children[child_index]; } + [[nodiscard]] column_view child(size_type child_index) const noexcept + { + return _children[child_index]; + } /** * @brief Returns the number of child columns. */ - size_type num_children() const noexcept { return _children.size(); } + [[nodiscard]] size_type num_children() const noexcept { return _children.size(); } /** * @brief Returns iterator to the beginning of the ordered sequence of child column-views. @@ -524,7 +530,7 @@ class mutable_column_view : public detail::column_view_base { * * @note If `null_count() == 0`, this may return `nullptr`. */ - bitmask_type* null_mask() const noexcept + [[nodiscard]] bitmask_type* null_mask() const noexcept { return const_cast(detail::column_view_base::null_mask()); } @@ -544,7 +550,7 @@ class mutable_column_view : public detail::column_view_base { * @param child_index The index of the desired child * @return mutable_column_view The requested child `mutable_column_view` */ - mutable_column_view child(size_type child_index) const noexcept + [[nodiscard]] mutable_column_view child(size_type child_index) const noexcept { return mutable_children[child_index]; } @@ -552,7 +558,7 @@ class mutable_column_view : public detail::column_view_base { /** * @brief Returns the number of child columns. */ - size_type num_children() const noexcept { return mutable_children.size(); } + [[nodiscard]] size_type num_children() const noexcept { return mutable_children.size(); } /** * @brief Returns iterator to the beginning of the ordered sequence of child column-views. 
diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 81dddbd284a..850a11426af 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -553,8 +553,8 @@ struct packed_columns { struct metadata { metadata() = default; metadata(std::vector&& v) : data_(std::move(v)) {} - uint8_t const* data() const { return data_.data(); } - size_t size() const { return data_.size(); } + [[nodiscard]] uint8_t const* data() const { return data_.data(); } + [[nodiscard]] size_t size() const { return data_.size(); } private: std::vector data_; diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 3674efbcc7b..fbf315776f4 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -148,7 +148,7 @@ class sum_aggregation final : public rolling_aggregation, public: sum_aggregation() : aggregation(SUM) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -167,7 +167,7 @@ class product_aggregation final : public groupby_aggregation { public: product_aggregation() : aggregation(PRODUCT) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -188,7 +188,7 @@ class min_aggregation final : public rolling_aggregation, public: min_aggregation() : aggregation(MIN) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -209,7 +209,7 @@ class max_aggregation final : public rolling_aggregation, public: max_aggregation() : aggregation(MAX) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -230,7 +230,7 @@ class count_aggregation final : public rolling_aggregation, public: 
count_aggregation(aggregation::Kind kind) : aggregation(kind) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -249,7 +249,7 @@ class any_aggregation final : public aggregation { public: any_aggregation() : aggregation(ANY) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -268,7 +268,7 @@ class all_aggregation final : public aggregation { public: all_aggregation() : aggregation(ALL) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -287,7 +287,7 @@ class sum_of_squares_aggregation final : public groupby_aggregation { public: sum_of_squares_aggregation() : aggregation(SUM_OF_SQUARES) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -306,7 +306,7 @@ class mean_aggregation final : public rolling_aggregation, public groupby_aggreg public: mean_aggregation() : aggregation(MEAN) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -325,7 +325,7 @@ class m2_aggregation : public groupby_aggregation { public: m2_aggregation() : aggregation{M2} {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -344,14 +344,17 @@ class std_var_aggregation : public rolling_aggregation, public groupby_aggregati public: size_type _ddof; ///< Delta degrees of freedom - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); return _ddof == other._ddof; } - size_t do_hash() const override { return 
this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } protected: std_var_aggregation(aggregation::Kind k, size_type ddof) : rolling_aggregation(k), _ddof{ddof} @@ -359,7 +362,7 @@ class std_var_aggregation : public rolling_aggregation, public groupby_aggregati CUDF_EXPECTS(k == aggregation::STD or k == aggregation::VARIANCE, "std_var_aggregation can accept only STD, VARIANCE"); } - size_type hash_impl() const { return std::hash{}(_ddof); } + [[nodiscard]] size_type hash_impl() const { return std::hash{}(_ddof); } }; /** @@ -372,7 +375,7 @@ class var_aggregation final : public std_var_aggregation { { } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -394,7 +397,7 @@ class std_aggregation final : public std_var_aggregation { { } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -413,7 +416,7 @@ class median_aggregation final : public groupby_aggregation { public: median_aggregation() : aggregation(MEDIAN) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -437,7 +440,7 @@ class quantile_aggregation final : public groupby_aggregation { std::vector _quantiles; ///< Desired quantile(s) interpolation _interpolation; ///< Desired interpolation - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } @@ -447,9 +450,12 @@ class quantile_aggregation final : public groupby_aggregation { std::equal(_quantiles.begin(), _quantiles.end(), other._quantiles.begin()); } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const 
override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -478,7 +484,7 @@ class argmax_aggregation final : public rolling_aggregation, public groupby_aggr public: argmax_aggregation() : aggregation(ARGMAX) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -497,7 +503,7 @@ class argmin_aggregation final : public rolling_aggregation, public groupby_aggr public: argmin_aggregation() : aggregation(ARGMIN) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -521,16 +527,19 @@ class nunique_aggregation final : public groupby_aggregation { null_policy _null_handling; ///< include or exclude nulls - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); return _null_handling == other._null_handling; } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -558,16 +567,19 @@ class nth_element_aggregation final : public groupby_aggregation { size_type _n; ///< nth index to return null_policy _null_handling; ///< include or exclude nulls - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); return _n == other._n and _null_handling == 
other._null_handling; } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -592,7 +604,7 @@ class row_number_aggregation final : public rolling_aggregation { public: row_number_aggregation() : aggregation(ROW_NUMBER) {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -611,7 +623,7 @@ class rank_aggregation final : public rolling_aggregation, public groupby_scan_a public: rank_aggregation() : aggregation{RANK} {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -630,7 +642,7 @@ class dense_rank_aggregation final : public rolling_aggregation, public groupby_ public: dense_rank_aggregation() : aggregation{DENSE_RANK} {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -654,16 +666,19 @@ class collect_list_aggregation final : public rolling_aggregation, public groupb null_policy _null_handling; ///< include or exclude nulls - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); return (_null_handling == other._null_handling); } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -698,7 
+713,7 @@ class collect_set_aggregation final : public rolling_aggregation, public groupby nan_equality _nans_equal; ///< whether to consider NaNs as equal value (applicable only to ///< floating point types) - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); @@ -706,9 +721,12 @@ class collect_set_aggregation final : public rolling_aggregation, public groupby _nans_equal == other._nans_equal); } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -737,16 +755,19 @@ class lead_lag_aggregation final : public rolling_aggregation { { } - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); return (row_offset == other.row_offset); } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -760,7 +781,7 @@ class lead_lag_aggregation final : public rolling_aggregation { size_type row_offset; private: - size_t hash_impl() const { return std::hash()(row_offset); } + [[nodiscard]] size_t hash_impl() const { return std::hash()(row_offset); } }; /** @@ -782,7 +803,7 @@ class udf_aggregation final : public rolling_aggregation { "udf_aggregation can accept only PTX, CUDA"); } - 
bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); @@ -790,9 +811,12 @@ class udf_aggregation final : public rolling_aggregation { _function_name == other._function_name and _output_type == other._output_type); } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -809,7 +833,7 @@ class udf_aggregation final : public rolling_aggregation { data_type _output_type; protected: - size_t hash_impl() const + [[nodiscard]] size_t hash_impl() const { return std::hash{}(_source) ^ std::hash{}(_operator_name) ^ std::hash{}(_function_name) ^ @@ -824,7 +848,7 @@ class merge_lists_aggregation final : public groupby_aggregation { public: explicit merge_lists_aggregation() : aggregation{MERGE_LISTS} {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -850,16 +874,19 @@ class merge_sets_aggregation final : public groupby_aggregation { nan_equality _nans_equal; ///< whether to consider NaNs as equal value (applicable only to ///< floating point types) - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); return (_nulls_equal == other._nulls_equal && _nans_equal == other._nans_equal); } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ 
hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -884,7 +911,7 @@ class merge_m2_aggregation final : public groupby_aggregation { public: explicit merge_m2_aggregation() : aggregation{MERGE_M2} {} - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -908,9 +935,12 @@ class covariance_aggregation final : public groupby_aggregation { size_type _min_periods; size_type _ddof; - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -940,16 +970,19 @@ class correlation_aggregation final : public groupby_aggregation { correlation_type _type; size_type _min_periods; - bool is_equal(aggregation const& _other) const override + [[nodiscard]] bool is_equal(aggregation const& _other) const override { if (!this->aggregation::is_equal(_other)) { return false; } auto const& other = dynamic_cast(_other); return (_type == other._type); } - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -979,7 +1012,7 @@ class tdigest_aggregation final : public groupby_aggregation { int const max_centroids; - std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -1003,7 +1036,7 @@ class merge_tdigest_aggregation final : public groupby_aggregation { int const max_centroids; - std::unique_ptr 
clone() const override + [[nodiscard]] std::unique_ptr clone() const override { return std::make_unique(*this); } diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp index 41f5c19f06a..4409d7e0d73 100644 --- a/cpp/include/cudf/detail/aggregation/result_cache.hpp +++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp @@ -49,11 +49,11 @@ class result_cache { result_cache(size_t num_columns) : _cache(num_columns) {} - bool has_result(column_view const& input, aggregation const& agg) const; + [[nodiscard]] bool has_result(column_view const& input, aggregation const& agg) const; void add_result(column_view const& input, aggregation const& agg, std::unique_ptr&& col); - column_view get_result(column_view const& input, aggregation const& agg) const; + [[nodiscard]] column_view get_result(column_view const& input, aggregation const& agg) const; std::unique_ptr release_result(column_view const& input, aggregation const& agg); diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh index ee5cb5c265d..1debef17db7 100644 --- a/cpp/include/cudf/detail/merge.cuh +++ b/cpp/include/cudf/detail/merge.cuh @@ -77,8 +77,8 @@ struct tagged_element_relational_comparator { { } - __device__ weak_ordering compare(index_type lhs_tagged_index, - index_type rhs_tagged_index) const noexcept + [[nodiscard]] __device__ weak_ordering compare(index_type lhs_tagged_index, + index_type rhs_tagged_index) const noexcept { auto const [l_side, l_indx] = lhs_tagged_index; auto const [r_side, r_indx] = rhs_tagged_index; diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp index 6f32e3190bf..751b7c00e8a 100644 --- a/cpp/include/cudf/detail/structs/utilities.hpp +++ b/cpp/include/cudf/detail/structs/utilities.hpp @@ -106,17 +106,17 @@ class flattened_table { /** * @brief Getter for the flattened columns, as a `table_view`. 
*/ - table_view flattened_columns() const { return _flattened_columns; } + [[nodiscard]] table_view flattened_columns() const { return _flattened_columns; } /** * @brief Getter for the cudf::order of the table_view's columns. */ - std::vector orders() const { return _orders; } + [[nodiscard]] std::vector orders() const { return _orders; } /** * @brief Getter for the cudf::null_order of the table_view's columns. */ - std::vector null_orders() const { return _null_orders; } + [[nodiscard]] std::vector null_orders() const { return _null_orders; } /** * @brief Conversion to `table_view`, to fetch flattened columns. diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index 6380e76fdfa..b8ea228383d 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -62,8 +62,8 @@ struct genericAtomicOperationImpl { { using T_int = unsigned int; - T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); - T_int shift = ((reinterpret_cast(addr) & 3) * 8); + auto* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); + T_int shift = ((reinterpret_cast(addr) & 3) * 8); T_int old = *address_uint32; T_int assumed; @@ -87,7 +87,7 @@ struct genericAtomicOperationImpl { { using T_int = unsigned int; bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = + auto* address_uint32 = reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 
0 : 2)); T_int old = *address_uint32; @@ -322,8 +322,8 @@ struct typesAtomicCASImpl { { using T_int = unsigned int; - T_int shift = ((reinterpret_cast(addr) & 3) * 8); - T_int* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); + T_int shift = ((reinterpret_cast(addr) & 3) * 8); + auto* address_uint32 = reinterpret_cast(addr - (reinterpret_cast(addr) & 3)); // the 'target_value' in `old` can be different from `compare` // because other thread may update the value @@ -355,7 +355,7 @@ struct typesAtomicCASImpl { using T_int = unsigned int; bool is_32_align = (reinterpret_cast(addr) & 2) ? false : true; - T_int* address_uint32 = + auto* address_uint32 = reinterpret_cast(reinterpret_cast(addr) - (is_32_align ? 0 : 2)); T_int old = *address_uint32; diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index c35d24ddeac..b5ca5a3590e 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -16,12 +16,16 @@ #pragma once +#include + #include #include #include #include #include +#include + using hash_value_type = uint32_t; namespace cudf { @@ -86,12 +90,12 @@ struct MurmurHash3_32 { MurmurHash3_32() = default; constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {} - __device__ inline uint32_t rotl32(uint32_t x, int8_t r) const + [[nodiscard]] __device__ inline uint32_t rotl32(uint32_t x, int8_t r) const { return (x << r) | (x >> (32 - r)); } - __device__ inline uint32_t fmix32(uint32_t h) const + [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const { h ^= h >> 16; h *= 0x85ebca6b; @@ -337,17 +341,21 @@ struct SparkMurmurHash3_32 { template result_type __device__ inline compute(TKey const& key) const { - constexpr int len = sizeof(TKey); - int8_t const* const data = reinterpret_cast(&key); - constexpr int nblocks = len / 4; + return compute_bytes(reinterpret_cast(&key), sizeof(TKey)); + } + + result_type 
__device__ compute_bytes(std::byte const* const data, cudf::size_type const len) const + { + constexpr cudf::size_type block_size = sizeof(uint32_t) / sizeof(std::byte); + cudf::size_type const nblocks = len / block_size; + uint32_t h1 = m_seed; + constexpr uint32_t c1 = 0xcc9e2d51; + constexpr uint32_t c2 = 0x1b873593; - uint32_t h1 = m_seed; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; //---------- - // body - uint32_t const* const blocks = reinterpret_cast(data + nblocks * 4); - for (int i = -nblocks; i; i++) { + // Process all four-byte chunks + uint32_t const* const blocks = reinterpret_cast(data); + for (cudf::size_type i = 0; i < nblocks; i++) { uint32_t k1 = blocks[i]; k1 *= c1; k1 = rotl32(k1, 15); @@ -357,9 +365,14 @@ struct SparkMurmurHash3_32 { h1 = h1 * 5 + 0xe6546b64; } //---------- - // byte by byte tail processing - for (int i = nblocks * 4; i < len; i++) { - int32_t k1 = data[i]; + // Process remaining bytes that do not fill a four-byte chunk using Spark's approach + // (does not conform to normal MurmurHash3) + for (cudf::size_type i = nblocks * 4; i < len; i++) { + // We require a two-step cast to get the k1 value from the byte. First, + // we must cast to a signed int8_t. Then, the sign bit is preserved when + // casting to uint32_t under 2's complement. Java preserves the + // signedness when casting byte-to-int, but C++ does not. 
+ uint32_t k1 = static_cast(std::to_integer(data[i])); k1 *= c1; k1 = rotl32(k1, 15); k1 *= c2; @@ -427,7 +440,42 @@ template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()( numeric::decimal128 const& key) const { - return this->compute<__int128_t>(key.value()); + // Generates the Spark MurmurHash3 hash value, mimicking the conversion: + // java.math.BigDecimal.valueOf(unscaled_value, _scale).unscaledValue().toByteArray() + // https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala#L381 + __int128_t const val = key.value(); + constexpr cudf::size_type key_size = sizeof(__int128_t); + std::byte const* data = reinterpret_cast(&val); + + // Small negative values start with 0xff..., small positive values start with 0x00... + bool const is_negative = val < 0; + std::byte const zero_value = is_negative ? std::byte{0xff} : std::byte{0x00}; + + // If the value can be represented with a shorter than 16-byte integer, the + // leading bytes of the little-endian value are truncated and are not hashed. + auto const reverse_begin = thrust::reverse_iterator(data + key_size); + auto const reverse_end = thrust::reverse_iterator(data); + auto const first_nonzero_byte = + thrust::find_if_not(thrust::seq, reverse_begin, reverse_end, [zero_value](std::byte const& v) { + return v == zero_value; + }).base(); + // Max handles special case of 0 and -1 which would shorten to 0 length otherwise + cudf::size_type length = + std::max(1, static_cast(thrust::distance(data, first_nonzero_byte))); + + // Preserve the 2's complement sign bit by adding a byte back on if necessary. + // e.g. 0x0000ff would shorten to 0x00ff. The 0x00 byte is retained to + // preserve the sign bit, rather than leaving an "f" at the front which would + // change the sign bit. However, 0x00007f would shorten to 0x7f. No extra byte + // is needed because the leftmost bit matches the sign bit. 
Similarly for + // negative values: 0xffff00 --> 0xff00 and 0xffff80 --> 0x80. + if ((length < key_size) && (is_negative ^ bool(data[length - 1] & std::byte{0x80}))) { ++length; } + + // Convert to big endian by reversing the range of nonzero bytes. Only those bytes are hashed. + __int128_t big_endian_value = 0; + auto big_endian_data = reinterpret_cast(&big_endian_value); + thrust::reverse_copy(thrust::seq, data, data + length, big_endian_data); + return this->compute_bytes(big_endian_data, length); } template <> @@ -480,7 +528,7 @@ hash_value_type __device__ inline SparkMurmurHash3_32::operat //---------- // Spark's byte by byte tail processing for (int i = nblocks * 4; i < len; i++) { - int32_t k1 = data[i]; + uint32_t k1 = data[i]; k1 *= c1; k1 = rotl32(k1, 15); k1 *= c2; diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp index 42f8310040e..33e29e70304 100644 --- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp +++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp @@ -59,33 +59,33 @@ class dictionary_column_view : private column_view { /** * @brief Returns the parent column. */ - column_view parent() const noexcept; + [[nodiscard]] column_view parent() const noexcept; /** * @brief Returns the column of indices */ - column_view indices() const noexcept; + [[nodiscard]] column_view indices() const noexcept; /** * @brief Returns a column_view combining the indices data * with offset, size, and nulls from the parent. */ - column_view get_indices_annotated() const noexcept; + [[nodiscard]] column_view get_indices_annotated() const noexcept; /** * @brief Returns the column of keys */ - column_view keys() const noexcept; + [[nodiscard]] column_view keys() const noexcept; /** * @brief Returns the `data_type` of the keys child column. 
*/ - data_type keys_type() const noexcept; + [[nodiscard]] data_type keys_type() const noexcept; /** * @brief Returns the number of rows in the keys column. */ - size_type keys_size() const noexcept; + [[nodiscard]] size_type keys_size() const noexcept; }; /** @} */ // end of group diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 727dce0db9d..6a85428d8f0 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -82,7 +82,8 @@ template = 0 && "integer exponentiation with negative exponent is not possible."); - if (exponent == 0) return static_cast(1); + if (exponent == 0) { return static_cast(1); } + auto extra = static_cast(1); auto square = static_cast(Base); while (exponent > 1) { @@ -146,12 +147,9 @@ CUDF_HOST_DEVICE inline constexpr T left_shift(T const& val, scale_type const& s template CUDF_HOST_DEVICE inline constexpr T shift(T const& val, scale_type const& scale) { - if (scale == 0) - return val; - else if (scale > 0) - return right_shift(val, scale); - else - return left_shift(val, scale); + if (scale == 0) { return val; } + if (scale > 0) { return right_shift(val, scale); } + return left_shift(val, scale); } } // namespace detail @@ -193,7 +191,7 @@ struct scaled_integer { */ template class fixed_point { - Rep _value; + Rep _value{}; scale_type _scale; public: @@ -258,7 +256,7 @@ class fixed_point { * @brief Default constructor that constructs `fixed_point` number with a * value and scale of zero */ - CUDF_HOST_DEVICE inline fixed_point() : _value{0}, _scale{scale_type{0}} {} + CUDF_HOST_DEVICE inline fixed_point() : _scale{scale_type{0}} {} /** * @brief Explicit conversion operator for casting to floating point types @@ -543,7 +541,7 @@ class fixed_point { */ CUDF_HOST_DEVICE inline fixed_point rescaled(scale_type scale) const { - if (scale == _scale) return *this; + if (scale == _scale) { return *this; } Rep const value = 
detail::shift(_value, scale_type{scale - _scale}); return fixed_point{scaled_integer{value, scale}}; } @@ -563,10 +561,9 @@ class fixed_point { auto const sign = _value < 0 ? std::string("-") : std::string(); return sign + detail::to_string(av / n) + std::string(".") + zeros + detail::to_string(av % n); - } else { - auto const zeros = std::string(_scale, '0'); - return detail::to_string(_value) + zeros; } + auto const zeros = std::string(_scale, '0'); + return detail::to_string(_value) + zeros; } }; @@ -628,12 +625,9 @@ CUDF_HOST_DEVICE inline auto multiplication_overflow(T lhs, T rhs) { auto const min = cuda::std::numeric_limits::min(); auto const max = cuda::std::numeric_limits::max(); - if (rhs > 0) - return lhs > max / rhs || lhs < min / rhs; - else if (rhs < -1) - return lhs > min / rhs || lhs < max / rhs; - else - return rhs == -1 && lhs == min; + if (rhs > 0) { return lhs > max / rhs || lhs < min / rhs; } + if (rhs < -1) { return lhs > min / rhs || lhs < max / rhs; } + return rhs == -1 && lhs == min; } // PLUS Operation diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp index 4e8bd65672f..0e00d14291d 100644 --- a/cpp/include/cudf/io/avro.hpp +++ b/cpp/include/cudf/io/avro.hpp @@ -74,22 +74,22 @@ class avro_reader_options { /** * @brief Returns source info. */ - source_info const& get_source() const { return _source; } + [[nodiscard]] source_info const& get_source() const { return _source; } /** * @brief Returns names of the columns to be read. */ - std::vector get_columns() const { return _columns; } + [[nodiscard]] std::vector get_columns() const { return _columns; } /** * @brief Returns number of rows to skip from the start. */ - size_type get_skip_rows() const { return _skip_rows; } + [[nodiscard]] size_type get_skip_rows() const { return _skip_rows; } /** * @brief Returns number of rows to read. 
*/ - size_type get_num_rows() const { return _num_rows; } + [[nodiscard]] size_type get_num_rows() const { return _num_rows; } /** * @brief Set names of the column to be read. diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 89719cb7f67..44ede9b0d63 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -159,27 +159,27 @@ class csv_reader_options { /** * @brief Returns source info. */ - source_info const& get_source() const { return _source; } + [[nodiscard]] source_info const& get_source() const { return _source; } /** * @brief Returns compression format of the source. */ - compression_type get_compression() const { return _compression; } + [[nodiscard]] compression_type get_compression() const { return _compression; } /** * @brief Returns number of bytes to skip from source start. */ - std::size_t get_byte_range_offset() const { return _byte_range_offset; } + [[nodiscard]] std::size_t get_byte_range_offset() const { return _byte_range_offset; } /** * @brief Returns number of bytes to read. */ - std::size_t get_byte_range_size() const { return _byte_range_size; } + [[nodiscard]] std::size_t get_byte_range_size() const { return _byte_range_size; } /** * @brief Returns number of bytes to read with padding. */ - std::size_t get_byte_range_size_with_padding() const + [[nodiscard]] std::size_t get_byte_range_size_with_padding() const { if (_byte_range_size == 0) { return 0; @@ -191,7 +191,7 @@ class csv_reader_options { /** * @brief Returns number of bytes to pad when reading. */ - std::size_t get_byte_range_padding() const + [[nodiscard]] std::size_t get_byte_range_padding() const { auto const num_names = _names.size(); auto const num_dtypes = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); @@ -213,127 +213,139 @@ class csv_reader_options { /** * @brief Returns names of the columns. 
*/ - std::vector const& get_names() const { return _names; } + [[nodiscard]] std::vector const& get_names() const { return _names; } /** * @brief Returns prefix to be used for column ID. */ - std::string get_prefix() const { return _prefix; } + [[nodiscard]] std::string get_prefix() const { return _prefix; } /** * @brief Whether to rename duplicate column names. */ - bool is_enabled_mangle_dupe_cols() const { return _mangle_dupe_cols; } + [[nodiscard]] bool is_enabled_mangle_dupe_cols() const { return _mangle_dupe_cols; } /** * @brief Returns names of the columns to be read. */ - std::vector const& get_use_cols_names() const { return _use_cols_names; } + [[nodiscard]] std::vector const& get_use_cols_names() const + { + return _use_cols_names; + } /** * @brief Returns indexes of columns to read. */ - std::vector const& get_use_cols_indexes() const { return _use_cols_indexes; } + [[nodiscard]] std::vector const& get_use_cols_indexes() const { return _use_cols_indexes; } /** * @brief Returns number of rows to read. */ - size_type get_nrows() const { return _nrows; } + [[nodiscard]] size_type get_nrows() const { return _nrows; } /** * @brief Returns number of rows to skip from start. */ - size_type get_skiprows() const { return _skiprows; } + [[nodiscard]] size_type get_skiprows() const { return _skiprows; } /** * @brief Returns number of rows to skip from end. */ - size_type get_skipfooter() const { return _skipfooter; } + [[nodiscard]] size_type get_skipfooter() const { return _skipfooter; } /** * @brief Returns header row index. */ - size_type get_header() const { return _header; } + [[nodiscard]] size_type get_header() const { return _header; } /** * @brief Returns line terminator. */ - char get_lineterminator() const { return _lineterminator; } + [[nodiscard]] char get_lineterminator() const { return _lineterminator; } /** * @brief Returns field delimiter. 
*/ - char get_delimiter() const { return _delimiter; } + [[nodiscard]] char get_delimiter() const { return _delimiter; } /** * @brief Returns numeric data thousands separator. */ - char get_thousands() const { return _thousands; } + [[nodiscard]] char get_thousands() const { return _thousands; } /** * @brief Returns decimal point character. */ - char get_decimal() const { return _decimal; } + [[nodiscard]] char get_decimal() const { return _decimal; } /** * @brief Returns comment line start character. */ - char get_comment() const { return _comment; } + [[nodiscard]] char get_comment() const { return _comment; } /** * @brief Whether to treat `\r\n` as line terminator. */ - bool is_enabled_windowslinetermination() const { return _windowslinetermination; } + [[nodiscard]] bool is_enabled_windowslinetermination() const { return _windowslinetermination; } /** * @brief Whether to treat whitespace as field delimiter. */ - bool is_enabled_delim_whitespace() const { return _delim_whitespace; } + [[nodiscard]] bool is_enabled_delim_whitespace() const { return _delim_whitespace; } /** * @brief Whether to skip whitespace after the delimiter. */ - bool is_enabled_skipinitialspace() const { return _skipinitialspace; } + [[nodiscard]] bool is_enabled_skipinitialspace() const { return _skipinitialspace; } /** * @brief Whether to ignore empty lines or parse line values as invalid. */ - bool is_enabled_skip_blank_lines() const { return _skip_blank_lines; } + [[nodiscard]] bool is_enabled_skip_blank_lines() const { return _skip_blank_lines; } /** * @brief Returns quoting style. */ - quote_style get_quoting() const { return _quoting; } + [[nodiscard]] quote_style get_quoting() const { return _quoting; } /** * @brief Returns quoting character. */ - char get_quotechar() const { return _quotechar; } + [[nodiscard]] char get_quotechar() const { return _quotechar; } /** * @brief Whether a quote inside a value is double-quoted. 
*/ - bool is_enabled_doublequote() const { return _doublequote; } + [[nodiscard]] bool is_enabled_doublequote() const { return _doublequote; } /** * @brief Returns names of columns to read as datetime. */ - std::vector const& get_parse_dates_names() const { return _parse_dates_names; } + [[nodiscard]] std::vector const& get_parse_dates_names() const + { + return _parse_dates_names; + } /** * @brief Returns indexes of columns to read as datetime. */ - std::vector const& get_parse_dates_indexes() const { return _parse_dates_indexes; } + [[nodiscard]] std::vector const& get_parse_dates_indexes() const + { + return _parse_dates_indexes; + } /** * @brief Returns names of columns to read as hexadecimal. */ - std::vector const& get_parse_hex_names() const { return _parse_hex_names; } + [[nodiscard]] std::vector const& get_parse_hex_names() const + { + return _parse_hex_names; + } /** * @brief Returns indexes of columns to read as hexadecimal. */ - std::vector const& get_parse_hex_indexes() const { return _parse_hex_indexes; } + [[nodiscard]] std::vector const& get_parse_hex_indexes() const { return _parse_hex_indexes; } /** * @brief Returns per-column types. @@ -1277,52 +1289,52 @@ class csv_writer_options { /** * @brief Returns sink used for writer output. */ - sink_info const& get_sink(void) const { return _sink; } + [[nodiscard]] sink_info const& get_sink() const { return _sink; } /** * @brief Returns table that would be written to output. */ - table_view const& get_table(void) const { return _table; } + [[nodiscard]] table_view const& get_table() const { return _table; } /** * @brief Returns optional associated metadata. */ - table_metadata const* get_metadata(void) const { return _metadata; } + [[nodiscard]] table_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns string to used for null entries. 
*/ - std::string get_na_rep(void) const { return _na_rep; } + [[nodiscard]] std::string get_na_rep() const { return _na_rep; } /** * @brief Whether to write headers to csv. */ - bool is_enabled_include_header(void) const { return _include_header; } + [[nodiscard]] bool is_enabled_include_header() const { return _include_header; } /** * @brief Returns maximum number of rows to process for each file write. */ - size_type get_rows_per_chunk(void) const { return _rows_per_chunk; } + [[nodiscard]] size_type get_rows_per_chunk() const { return _rows_per_chunk; } /** * @brief Returns character used for separating lines. */ - std::string get_line_terminator(void) const { return _line_terminator; } + [[nodiscard]] std::string get_line_terminator() const { return _line_terminator; } /** * @brief Returns character used for separating lines. */ - char get_inter_column_delimiter(void) const { return _inter_column_delimiter; } + [[nodiscard]] char get_inter_column_delimiter() const { return _inter_column_delimiter; } /** * @brief Returns string used for values != 0 in INT8 types. */ - std::string get_true_value(void) const { return _true_value; } + [[nodiscard]] std::string get_true_value() const { return _true_value; } /** * @brief Returns string used for values == 0 in INT8 types. */ - std::string get_false_value(void) const { return _false_value; } + [[nodiscard]] std::string get_false_value() const { return _false_value; } // Setter /** diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 2c1966ee6ba..6d4c8ec9b8c 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -120,7 +120,7 @@ class data_sink { * * @return bool If this writer supports device_write() calls. */ - virtual bool supports_device_write() const { return false; } + [[nodiscard]] virtual bool supports_device_write() const { return false; } /** * @brief Estimates whether a direct device write would be more optimal for the given size. 
@@ -128,7 +128,10 @@ class data_sink { * @param size Number of bytes to write * @return whether the device write is expected to be more performant for the given size */ - virtual bool is_device_write_preferred(size_t size) const { return supports_device_write(); } + [[nodiscard]] virtual bool is_device_write_preferred(size_t size) const + { + return supports_device_write(); + } /** * @brief Append the buffer content to the sink from a gpu address diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 627ec29a496..18ab8aad088 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -52,12 +52,12 @@ class datasource { /** * @brief Returns the buffer size in bytes. */ - virtual size_t size() const = 0; + [[nodiscard]] virtual size_t size() const = 0; /** * @brief Returns the address of the data in the buffer. */ - virtual uint8_t const* data() const = 0; + [[nodiscard]] virtual uint8_t const* data() const = 0; /** * @brief Base class destructor @@ -155,7 +155,7 @@ class datasource { * * @return bool Whether this source supports device_read() calls */ - virtual bool supports_device_read() const { return false; } + [[nodiscard]] virtual bool supports_device_read() const { return false; } /** * @brief Estimates whether a direct device read would be more optimal for the given size. @@ -163,7 +163,10 @@ class datasource { * @param size Number of bytes to read * @return whether the device read is expected to be more performant for the given size */ - virtual bool is_device_read_preferred(size_t size) const { return supports_device_read(); } + [[nodiscard]] virtual bool is_device_read_preferred(size_t size) const + { + return supports_device_read(); + } /** * @brief Returns a device buffer with a subset of data from the source. 
@@ -243,31 +246,31 @@ class datasource { * * @return size_t The size of the source data in bytes */ - virtual size_t size() const = 0; + [[nodiscard]] virtual size_t size() const = 0; /** * @brief Returns whether the source contains any data. * * @return bool True if there is data, False otherwise */ - virtual bool is_empty() const { return size() == 0; } + [[nodiscard]] virtual bool is_empty() const { return size() == 0; } /** * @brief Implementation for non owning buffer where datasource holds buffer until destruction. */ class non_owning_buffer : public buffer { public: - non_owning_buffer() : _data(0), _size(0) {} + non_owning_buffer() {} non_owning_buffer(uint8_t* data, size_t size) : _data(data), _size(size) {} - size_t size() const override { return _size; } + [[nodiscard]] size_t size() const override { return _size; } - uint8_t const* data() const override { return _data; } + [[nodiscard]] uint8_t const* data() const override { return _data; } private: - uint8_t* const _data; - size_t const _size; + uint8_t* const _data{nullptr}; + size_t const _size{0}; }; /** @@ -297,9 +300,12 @@ class datasource { { } - size_t size() const override { return _size; } + [[nodiscard]] size_t size() const override { return _size; } - uint8_t const* data() const override { return static_cast(_data_ptr); } + [[nodiscard]] uint8_t const* data() const override + { + return static_cast(_data_ptr); + } private: Container _data; @@ -330,8 +336,8 @@ class arrow_io_source : public datasource { : arrow_buffer(arrow_buffer) { } - size_t size() const override { return arrow_buffer->size(); } - uint8_t const* data() const override { return arrow_buffer->data(); } + [[nodiscard]] size_t size() const override { return arrow_buffer->size(); } + [[nodiscard]] uint8_t const* data() const override { return arrow_buffer->data(); } }; public: @@ -393,7 +399,7 @@ class arrow_io_source : public datasource { /** * @brief Returns the size of the data in the `arrow` source. 
*/ - size_t size() const override + [[nodiscard]] size_t size() const override { auto result = arrow_file->GetSize(); CUDF_EXPECTS(result.ok(), "Cannot get file size"); diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 5f34803f28e..727c24a4431 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -111,7 +111,7 @@ class json_reader_options { /** * @brief Returns source info. */ - source_info const& get_source() const { return _source; } + [[nodiscard]] source_info const& get_source() const { return _source; } /** * @brief Returns data types of the columns. diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index b3a2f6bcbbb..c2187f056cf 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -72,7 +72,6 @@ class orc_reader_options { // Columns that should be read as Decimal128 std::vector _decimal128_columns; - bool _enable_decimal128 = true; friend orc_reader_options_builder; @@ -102,12 +101,12 @@ class orc_reader_options { /** * @brief Returns source info. */ - source_info const& get_source() const { return _source; } + [[nodiscard]] source_info const& get_source() const { return _source; } /** * @brief Returns names of the columns to read. */ - std::vector const& get_columns() const { return _columns; } + [[nodiscard]] std::vector const& get_columns() const { return _columns; } /** * @brief Returns vector of vectors, stripes to read for each input source @@ -152,11 +151,6 @@ class orc_reader_options { */ std::vector const& get_decimal128_columns() const { return _decimal128_columns; } - /** - * @brief Whether to use row index to speed-up reading. 
- */ - bool is_enabled_decimal128() const { return _enable_decimal128; } - // Setters /** @@ -226,18 +220,13 @@ class orc_reader_options { * * @param val Vector of fully qualified column names. */ - void set_decimal_cols_as_float(std::vector val) + [[deprecated( + "Decimal to float conversion is deprecated and will be remove in future release")]] void + set_decimal_cols_as_float(std::vector val) { _decimal_cols_as_float = std::move(val); } - /** - * @brief Enable/Disable the use of decimal128 type - * - * @param use Boolean value to enable/disable. - */ - void enable_decimal128(bool use) { _enable_decimal128 = use; } - /** * @brief Set columns that should be read as 128-bit Decimal * @@ -357,7 +346,10 @@ class orc_reader_options_builder { * @param val Vector of column names. * @return this for chaining. */ - orc_reader_options_builder& decimal_cols_as_float(std::vector val) + [[deprecated( + "Decimal to float conversion is deprecated and will be remove in future " + "release")]] orc_reader_options_builder& + decimal_cols_as_float(std::vector val) { options._decimal_cols_as_float = std::move(val); return *this; @@ -375,17 +367,6 @@ class orc_reader_options_builder { return *this; } - /** - * @brief Enable/Disable use of decimal128 type - * - * @param use Boolean value to enable/disable. - */ - orc_reader_options_builder& decimal128(bool use) - { - options.enable_decimal128(use); - return *this; - } - /** * @brief move orc_reader_options member once it's built. */ @@ -434,6 +415,18 @@ table_with_metadata read_orc( */ class orc_writer_options_builder; +/** + * @brief Constants to disambiguate statistics terminology for ORC. + * + * ORC refers to its finest granularity of row-grouping as "row group", + * which corresponds to Parquet "pages". + * Similarly, ORC's "stripe" corresponds to a Parquet "row group". + * The following constants disambiguate the terminology for the statistics + * collected at each level. 
+ */ +static constexpr statistics_freq ORC_STATISTICS_STRIPE = statistics_freq::STATISTICS_ROWGROUP; +static constexpr statistics_freq ORC_STATISTICS_ROW_GROUP = statistics_freq::STATISTICS_PAGE; + /** * @brief Settings to use for `write_orc()`. */ @@ -442,8 +435,8 @@ class orc_writer_options { sink_info _sink; // Specify the compression format to use compression_type _compression = compression_type::AUTO; - // Enable writing column statistics - bool _enable_statistics = true; + // Specify frequency of statistics collection + statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP; // Maximum size of each stripe (unless smaller than a single row group) size_t _stripe_size_bytes = default_stripe_size_bytes; // Maximum number of rows in stripe (unless smaller than a single row group) @@ -491,27 +484,35 @@ class orc_writer_options { /** * @brief Returns sink info. */ - sink_info const& get_sink() const { return _sink; } + [[nodiscard]] sink_info const& get_sink() const { return _sink; } /** * @brief Returns compression type. */ - compression_type get_compression() const { return _compression; } + [[nodiscard]] compression_type get_compression() const { return _compression; } /** * @brief Whether writing column statistics is enabled/disabled. */ - bool is_enabled_statistics() const { return _enable_statistics; } + [[nodiscard]] bool is_enabled_statistics() const + { + return _stats_freq != statistics_freq::STATISTICS_NONE; + } + + /** + * @brief Returns frequency of statistics collection. + */ + [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; } /** * @brief Returns maximum stripe size, in bytes. */ - auto get_stripe_size_bytes() const { return _stripe_size_bytes; } + [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; } /** * @brief Returns maximum stripe size, in rows. 
*/ - auto get_stripe_size_rows() const { return _stripe_size_rows; } + [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; } /** * @brief Returns the row index stride. @@ -525,17 +526,20 @@ class orc_writer_options { /** * @brief Returns table to be written to output. */ - table_view get_table() const { return _table; } + [[nodiscard]] table_view get_table() const { return _table; } /** * @brief Returns associated metadata. */ - table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. */ - std::map const& get_key_value_metadata() const { return _user_data; } + [[nodiscard]] std::map const& get_key_value_metadata() const + { + return _user_data; + } // Setters @@ -547,11 +551,16 @@ class orc_writer_options { void set_compression(compression_type comp) { _compression = comp; } /** - * @brief Enable/Disable writing column statistics. + * @brief Choose granularity of statistics collection. + * + * The granularity can be set to: + * - cudf::io::STATISTICS_NONE: No statistics are collected. + * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe. + * - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group. * - * @param val Boolean value to enable/disable statistics. + * @param val Frequency of statistics collection. */ - void enable_statistics(bool val) { _enable_statistics = val; } + void enable_statistics(statistics_freq val) { _stats_freq = val; } /** * @brief Sets the maximum stripe size, in bytes. @@ -644,14 +653,19 @@ class orc_writer_options_builder { } /** - * @brief Enable/Disable writing column statistics. + * @brief Choose granularity of column statistics to be written * - * @param val Boolean value to enable/disable. + * The granularity can be set to: + * - cudf::io::STATISTICS_NONE: No statistics are collected. 
+ * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe. + * - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group. + * + * @param val Level of statistics collection. * @return this for chaining. */ - orc_writer_options_builder& enable_statistics(bool val) + orc_writer_options_builder& enable_statistics(statistics_freq val) { - options._enable_statistics = val; + options._stats_freq = val; return *this; } @@ -772,8 +786,8 @@ class chunked_orc_writer_options { sink_info _sink; // Specify the compression format to use compression_type _compression = compression_type::AUTO; - // Enable writing column statistics - bool _enable_statistics = true; + // Specify granularity of statistics collection + statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP; // Maximum size of each stripe (unless smaller than a single row group) size_t _stripe_size_bytes = default_stripe_size_bytes; // Maximum number of rows in stripe (unless smaller than a single row group) @@ -814,27 +828,27 @@ class chunked_orc_writer_options { /** * @brief Returns sink info. */ - sink_info const& get_sink() const { return _sink; } + [[nodiscard]] sink_info const& get_sink() const { return _sink; } /** * @brief Returns compression type. */ - compression_type get_compression() const { return _compression; } + [[nodiscard]] compression_type get_compression() const { return _compression; } /** - * @brief Whether writing column statistics is enabled/disabled. + * @brief Returns granularity of statistics collection. */ - bool is_enabled_statistics() const { return _enable_statistics; } + [[nodiscard]] statistics_freq get_statistics_freq() const { return _stats_freq; } /** * @brief Returns maximum stripe size, in bytes. */ - auto get_stripe_size_bytes() const { return _stripe_size_bytes; } + [[nodiscard]] auto get_stripe_size_bytes() const { return _stripe_size_bytes; } /** * @brief Returns maximum stripe size, in rows. 
*/ - auto get_stripe_size_rows() const { return _stripe_size_rows; } + [[nodiscard]] auto get_stripe_size_rows() const { return _stripe_size_rows; } /** * @brief Returns the row index stride. @@ -848,12 +862,15 @@ class chunked_orc_writer_options { /** * @brief Returns associated metadata. */ - table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. */ - std::map const& get_key_value_metadata() const { return _user_data; } + [[nodiscard]] std::map const& get_key_value_metadata() const + { + return _user_data; + } // Setters @@ -865,11 +882,16 @@ class chunked_orc_writer_options { void set_compression(compression_type comp) { _compression = comp; } /** - * @brief Enable/Disable writing column statistics. + * @brief Choose granularity of statistics collection * - * @param val Boolean value to enable/disable. + * The granularity can be set to: + * - cudf::io::STATISTICS_NONE: No statistics are collected. + * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe. + * - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group. + * + * @param val Frequency of statistics collection. */ - void enable_statistics(bool val) { _enable_statistics = val; } + void enable_statistics(statistics_freq val) { _stats_freq = val; } /** * @brief Sets the maximum stripe size, in bytes. @@ -952,14 +974,19 @@ class chunked_orc_writer_options_builder { } /** - * @brief Enable/Disable writing column statistics. + * @brief Choose granularity of statistics collection + * + * The granularity can be set to: + * - cudf::io::STATISTICS_NONE: No statistics are collected. + * - cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe. + * - cudf::io::ORC_STATISTICS_ROWGROUP: Statistics are collected for each ORC row group. 
* - * @param val Boolean value to enable/disable. + * @param val Frequency of statistics collection. * @return this for chaining. */ - chunked_orc_writer_options_builder& enable_statistics(bool val) + chunked_orc_writer_options_builder& enable_statistics(statistics_freq val) { - options._enable_statistics = val; + options._stats_freq = val; return *this; } diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 740f7a8b2db..2ceac947c8d 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -96,33 +96,36 @@ class parquet_reader_options { /** * @brief Returns source info. */ - source_info const& get_source() const { return _source; } + [[nodiscard]] source_info const& get_source() const { return _source; } /** * @brief Returns true/false depending on whether strings should be converted to categories or * not. */ - bool is_enabled_convert_strings_to_categories() const { return _convert_strings_to_categories; } + [[nodiscard]] bool is_enabled_convert_strings_to_categories() const + { + return _convert_strings_to_categories; + } /** * @brief Returns true/false depending whether to use pandas metadata or not while reading. */ - bool is_enabled_use_pandas_metadata() const { return _use_pandas_metadata; } + [[nodiscard]] bool is_enabled_use_pandas_metadata() const { return _use_pandas_metadata; } /** * @brief Returns number of rows to skip from the start. */ - size_type get_skip_rows() const { return _skip_rows; } + [[nodiscard]] size_type get_skip_rows() const { return _skip_rows; } /** * @brief Returns number of rows to read. */ - size_type get_num_rows() const { return _num_rows; } + [[nodiscard]] size_type get_num_rows() const { return _num_rows; } /** * @brief Returns names of column to be read. */ - std::vector const& get_columns() const { return _columns; } + [[nodiscard]] std::vector const& get_columns() const { return _columns; } /** * @brief Returns list of individual row groups to be read. 
@@ -421,32 +424,32 @@ class parquet_writer_options { /** * @brief Returns sink info. */ - sink_info const& get_sink() const { return _sink; } + [[nodiscard]] sink_info const& get_sink() const { return _sink; } /** * @brief Returns compression format used. */ - compression_type get_compression() const { return _compression; } + [[nodiscard]] compression_type get_compression() const { return _compression; } /** * @brief Returns level of statistics requested in output file. */ - statistics_freq get_stats_level() const { return _stats_level; } + [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; } /** * @brief Returns table_view. */ - table_view get_table() const { return _table; } + [[nodiscard]] table_view get_table() const { return _table; } /** * @brief Returns partitions. */ - std::vector const& get_partitions() const { return _partitions; } + [[nodiscard]] std::vector const& get_partitions() const { return _partitions; } /** * @brief Returns associated metadata. */ - table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. @@ -801,22 +804,22 @@ class chunked_parquet_writer_options { /** * @brief Returns sink info. */ - sink_info const& get_sink() const { return _sink; } + [[nodiscard]] sink_info const& get_sink() const { return _sink; } /** * @brief Returns compression format used. */ - compression_type get_compression() const { return _compression; } + [[nodiscard]] compression_type get_compression() const { return _compression; } /** * @brief Returns level of statistics requested in output file. */ - statistics_freq get_stats_level() const { return _stats_level; } + [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; } /** * @brief Returns metadata information. 
*/ - table_input_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] table_input_metadata const* get_metadata() const { return _metadata; } /** * @brief Returns Key-Value footer metadata information. diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index e65afa04fe5..5e6dda5a514 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -36,8 +36,8 @@ namespace text { */ class device_data_chunk { public: - virtual char const* data() const = 0; - virtual std::size_t size() const = 0; + [[nodiscard]] virtual char const* data() const = 0; + [[nodiscard]] virtual std::size_t size() const = 0; virtual operator device_span() const = 0; }; @@ -76,7 +76,7 @@ class data_chunk_reader { */ class data_chunk_source { public: - virtual std::unique_ptr create_reader() const = 0; + [[nodiscard]] virtual std::unique_ptr create_reader() const = 0; }; } // namespace text diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index 6b95de53ee7..aeb4b7fff53 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -41,8 +41,8 @@ class device_span_data_chunk : public device_data_chunk { public: device_span_data_chunk(device_span data) : _data(data) {} - char const* data() const override { return _data.data(); } - std::size_t size() const override { return _data.size(); } + [[nodiscard]] char const* data() const override { return _data.data(); } + [[nodiscard]] std::size_t size() const override { return _data.size(); } operator device_span() const override { return _data; } private: @@ -53,8 +53,8 @@ class device_uvector_data_chunk : public device_data_chunk { public: device_uvector_data_chunk(rmm::device_uvector&& data) : _data(std::move(data)) {} - char const* data() const override { return 
_data.data(); } - std::size_t size() const override { return _data.size(); } + [[nodiscard]] char const* data() const override { return _data.data(); } + [[nodiscard]] std::size_t size() const override { return _data.size(); } operator device_span() const override { return _data; } private: @@ -171,7 +171,7 @@ class device_span_data_chunk_reader : public data_chunk_reader { class file_data_chunk_source : public data_chunk_source { public: file_data_chunk_source(std::string filename) : _filename(filename) {} - std::unique_ptr create_reader() const override + [[nodiscard]] std::unique_ptr create_reader() const override { return std::make_unique( std::make_unique(_filename, std::ifstream::in)); @@ -187,7 +187,7 @@ class file_data_chunk_source : public data_chunk_source { class string_data_chunk_source : public data_chunk_source { public: string_data_chunk_source(std::string const& data) : _data(data) {} - std::unique_ptr create_reader() const override + [[nodiscard]] std::unique_ptr create_reader() const override { return std::make_unique(std::make_unique(_data)); } @@ -202,7 +202,7 @@ class string_data_chunk_source : public data_chunk_source { class device_span_data_chunk_source : public data_chunk_source { public: device_span_data_chunk_source(device_span data) : _data(data) {} - std::unique_ptr create_reader() const override + [[nodiscard]] std::unique_ptr create_reader() const override { return std::make_unique(_data); } diff --git a/cpp/include/cudf/io/text/detail/multistate.hpp b/cpp/include/cudf/io/text/detail/multistate.hpp index d3c8909ab51..e7136ac69a5 100644 --- a/cpp/include/cudf/io/text/detail/multistate.hpp +++ b/cpp/include/cudf/io/text/detail/multistate.hpp @@ -37,7 +37,7 @@ struct multistate { * @brief Creates a segment which represents (0, 0] */ - constexpr multistate_segment() : _data(0) {} + constexpr multistate_segment() = default; /** * @brief Creates a segment which represents (head, tail] * @@ -52,15 +52,15 @@ struct multistate { /** * @brief 
Get's the (head, ____] value from the segment. */ - constexpr uint8_t get_head() const { return _data & 0b1111; } + [[nodiscard]] constexpr uint8_t get_head() const { return _data & 0b1111; } /** * @brief Get's the (____, tail] value from the segment. */ - constexpr uint8_t get_tail() const { return _data >> 4; } + [[nodiscard]] constexpr uint8_t get_tail() const { return _data >> 4; } private: - uint8_t _data; + uint8_t _data{0}; }; public: @@ -87,12 +87,12 @@ struct multistate { /** * @brief get's the number of segments this multistate represents */ - constexpr uint8_t size() const { return _size; } + [[nodiscard]] constexpr uint8_t size() const { return _size; } /** * @brief get's the highest (____, tail] value this multistate represents */ - constexpr uint8_t max_tail() const + [[nodiscard]] constexpr uint8_t max_tail() const { uint8_t maximum = 0; @@ -106,12 +106,12 @@ struct multistate { /** * @brief get's the Nth (head, ____] value state this multistate represents */ - constexpr uint8_t get_head(uint8_t idx) const { return _segments[idx].get_head(); } + [[nodiscard]] constexpr uint8_t get_head(uint8_t idx) const { return _segments[idx].get_head(); } /** * @brief get's the Nth (____, tail] value state this multistate represents */ - constexpr uint8_t get_tail(uint8_t idx) const { return _segments[idx].get_tail(); } + [[nodiscard]] constexpr uint8_t get_tail(uint8_t idx) const { return _segments[idx].get_tail(); } private: uint8_t _size = 0; diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index d14fe15b0a9..06d15276a68 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -161,13 +161,13 @@ struct trie { /** * @brief Gets the number of nodes contained in this trie. */ - cudf::size_type size() const { return _nodes.size(); } + [[nodiscard]] cudf::size_type size() const { return _nodes.size(); } /** * @brief A pessimistic count of duplicate tokens in the trie. 
Used to determine the maximum * possible stack size required to compute matches of this trie in parallel. */ - cudf::size_type max_duplicate_tokens() const { return _max_duplicate_tokens; } + [[nodiscard]] cudf::size_type max_duplicate_tokens() const { return _max_duplicate_tokens; } /** * @brief Create a trie which represents the given pattern. @@ -255,7 +255,7 @@ struct trie { cudf::detail::make_device_uvector_sync(trie_nodes, stream, mr)}; } - trie_device_view view() const { return trie_device_view{_nodes}; } + [[nodiscard]] trie_device_view view() const { return trie_device_view{_nodes}; } }; } // namespace detail diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 8f06de99f05..7e4ab5b8d9d 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -155,14 +155,8 @@ struct source_info { source_info() = default; - explicit source_info(std::vector const& file_paths) - : _type(io_type::FILEPATH), _filepaths(file_paths) - { - } - explicit source_info(std::string const& file_path) - : _type(io_type::FILEPATH), _filepaths({file_path}) - { - } + explicit source_info(std::vector const& file_paths) : _filepaths(file_paths) {} + explicit source_info(std::string const& file_path) : _filepaths({file_path}) {} explicit source_info(std::vector const& host_buffers) : _type(io_type::HOST_BUFFER), _buffers(host_buffers) @@ -182,11 +176,11 @@ struct source_info { { } - auto type() const { return _type; } - auto const& filepaths() const { return _filepaths; } - auto const& buffers() const { return _buffers; } - auto const& files() const { return _files; } - auto const& user_sources() const { return _user_sources; } + [[nodiscard]] auto type() const { return _type; } + [[nodiscard]] auto const& filepaths() const { return _filepaths; } + [[nodiscard]] auto const& buffers() const { return _buffers; } + [[nodiscard]] auto const& files() const { return _files; } + [[nodiscard]] auto const& user_sources() const { return _user_sources; 
} private: io_type _type = io_type::FILEPATH; @@ -200,7 +194,7 @@ struct source_info { */ struct sink_info { sink_info() = default; - sink_info(size_t num_sinks) : _type(io_type::VOID), _num_sinks(num_sinks) {} + sink_info(size_t num_sinks) : _num_sinks(num_sinks) {} explicit sink_info(std::vector const& file_paths) : _type(io_type::FILEPATH), _num_sinks(file_paths.size()), _filepaths(file_paths) @@ -226,11 +220,11 @@ struct sink_info { { } - auto type() const { return _type; } - auto num_sinks() const { return _num_sinks; } - auto const& filepaths() const { return _filepaths; } - auto const& buffers() const { return _buffers; } - auto const& user_sinks() const { return _user_sinks; } + [[nodiscard]] auto type() const { return _type; } + [[nodiscard]] auto num_sinks() const { return _num_sinks; } + [[nodiscard]] auto const& filepaths() const { return _filepaths; } + [[nodiscard]] auto const& buffers() const { return _buffers; } + [[nodiscard]] auto const& user_sinks() const { return _user_sinks; } private: io_type _type = io_type::VOID; @@ -344,51 +338,51 @@ class column_in_metadata { * @param i Index of the child to get * @return this for chaining */ - column_in_metadata const& child(size_type i) const { return children[i]; } + [[nodiscard]] column_in_metadata const& child(size_type i) const { return children[i]; } /** * @brief Get the name of this column */ - std::string get_name() const { return _name; } + [[nodiscard]] std::string get_name() const { return _name; } /** * @brief Get whether nullability has been explicitly set for this column. */ - bool is_nullability_defined() const { return _nullable.has_value(); } + [[nodiscard]] bool is_nullability_defined() const { return _nullable.has_value(); } /** * @brief Gets the explicitly set nullability for this column. * @throws If nullability is not explicitly defined for this column. * Check using `is_nullability_defined()` first. 
*/ - bool nullable() const { return _nullable.value(); } + [[nodiscard]] bool nullable() const { return _nullable.value(); } /** * @brief If this is the metadata of a list column, returns whether it is to be encoded as a map. */ - bool is_map() const { return _list_column_is_map; } + [[nodiscard]] bool is_map() const { return _list_column_is_map; } /** * @brief Get whether to encode this timestamp column using deprecated int96 physical type */ - bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; } + [[nodiscard]] bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; } /** * @brief Get whether precision has been set for this decimal column */ - bool is_decimal_precision_set() const { return _decimal_precision.has_value(); } + [[nodiscard]] bool is_decimal_precision_set() const { return _decimal_precision.has_value(); } /** * @brief Get the decimal precision that was set for this column. * @throws If decimal precision was not set for this column. * Check using `is_decimal_precision_set()` first. */ - uint8_t get_decimal_precision() const { return _decimal_precision.value(); } + [[nodiscard]] uint8_t get_decimal_precision() const { return _decimal_precision.value(); } /** * @brief Get the number of children of this column */ - size_type num_children() const { return children.size(); } + [[nodiscard]] size_type num_children() const { return children.size(); } }; class table_input_metadata { diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 30400074c50..f6efea5f2bb 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -607,9 +607,10 @@ class hash_join { * @return The exact number of output when performing an inner join between two tables with * `build` and `probe` as the the join keys . 
*/ - std::size_t inner_join_size(cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + [[nodiscard]] std::size_t inner_join_size( + cudf::table_view const& probe, + null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * Returns the exact number of matches (rows) when performing a left join with the specified probe @@ -622,9 +623,10 @@ class hash_join { * @return The exact number of output when performing a left join between two tables with `build` * and `probe` as the the join keys . */ - std::size_t left_join_size(cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + [[nodiscard]] std::size_t left_join_size( + cudf::table_view const& probe, + null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * Returns the exact number of matches (rows) when performing a full join with the specified probe @@ -1037,6 +1039,109 @@ mixed_full_join( std::optional>> output_size_data = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns an index vector corresponding to all rows in the left tables + * where the columns of the equality table are equal and the predicate + * evaluates to true on the conditional tables. + * + * If the provided predicate returns NULL for a pair of rows (left, right), the + * left row is not included in the output. It is the user's responsiblity to + * choose a suitable compare_nulls value AND use appropriate null-safe + * operators in the expression. + * + * If the provided output size or per-row counts are incorrect, behavior is undefined. 
+ * + * @code{.pseudo} + * left_equality: {{0, 1, 2}} + * right_equality: {{1, 2, 3}} + * left_conditional: {{4, 4, 4}} + * right_conditional: {{3, 4, 5}} + * Expression: Left.Column_0 > Right.Column_0 + * Result: {1} + * @endcode + * + * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not + * match. + * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not + * match. + * + * @param left_equality The left table used for the equality join. + * @param right_equality The right table used for the equality join. + * @param left_conditional The left table used for the conditional join. + * @param right_conditional The right table used for the conditional join. + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether or not null values join to each other or not. + * @param output_size_data An optional pair of values indicating the exact output size and the + * number of matches for each row in the larger of the two input tables, left or right (may be + * precomputed using the corresponding mixed_full_join_size API). + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a mixed full join between the four input tables. 
+ */ +std::unique_ptr> mixed_left_semi_join( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + std::optional>> output_size_data = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns an index vector corresponding to all rows in the left tables + * for which there is no row in the right tables where the columns of the + * equality table are equal and the predicate evaluates to true on the + * conditional tables. + * + * If the provided predicate returns NULL for a pair of rows (left, right), the + * left row is not included in the output. It is the user's responsibility to + * choose a suitable compare_nulls value AND use appropriate null-safe + * operators in the expression. + * + * If the provided output size or per-row counts are incorrect, behavior is undefined. + * + * @code{.pseudo} + * left_equality: {{0, 1, 2}} + * right_equality: {{1, 2, 3}} + * left_conditional: {{4, 4, 4}} + * right_conditional: {{3, 4, 5}} + * Expression: Left.Column_0 > Right.Column_0 + * Result: {0, 2} + * @endcode + * + * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not + * match. + * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not + * match. + * + * @param left_equality The left table used for the equality join. + * @param right_equality The right table used for the equality join. + * @param left_conditional The left table used for the conditional join. + * @param right_conditional The right table used for the conditional join. + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether or not null values join to each other or not. 
+ * @param output_size_data An optional pair of values indicating the exact output size and the + * number of matches for each row in the larger of the two input tables, left or right (may be + * precomputed using the corresponding mixed_left_anti_join_size API). + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A vector `left_indices` that can be used to construct + * the result of performing a mixed left anti join between the four input tables. + */ +std::unique_ptr> mixed_left_anti_join( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + std::optional>> output_size_data = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns the exact number of matches (rows) when performing a * mixed inner join between the specified tables where the columns of the @@ -1123,6 +1228,90 @@ std::pair>> mixed_le null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns the exact number of matches (rows) when performing a mixed + * left semi join between the specified tables where the columns of the + * equality table are equal and the predicate evaluates to true on the + * conditional tables. + * + * If the provided predicate returns NULL for a pair of rows (left, right), + * that pair is not included in the output. It is the user's responsibility to + * choose a suitable compare_nulls value AND use appropriate null-safe + * operators in the expression. + * + * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not + * match. 
+ * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not + * match. + * + * @param left_equality The left table used for the equality join. + * @param right_equality The right table used for the equality join. + * @param left_conditional The left table used for the conditional join. + * @param right_conditional The right table used for the conditional join. + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether or not null values join to each other or not. + * @param output_size An optional pair of values indicating the exact output size and the number of + * matches for each row in the larger of the two input tables, left or right (may be precomputed + * using the corresponding mixed_inner_join_size API). + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair containing the size that would result from performing the + * requested join and the number of matches for each row in one of the two + * tables. Which of the two tables is an implementation detail and should not + * be relied upon, simply passed to the corresponding `mixed_left_semi_join` API as + * is. + */ +std::pair>> mixed_left_semi_join_size( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the exact number of matches (rows) when performing a mixed + * left anti join between the specified tables. + * + * If the provided predicate returns NULL for a pair of rows (left, right), + * that pair is not included in the output. It is the user's responsibility to + * choose a suitable compare_nulls value AND use appropriate null-safe + * operators in the expression. 
+ * + * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not + * match. + * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not + * match. + * + * @param left_equality The left table used for the equality join. + * @param right_equality The right table used for the equality join. + * @param left_conditional The left table used for the conditional join. + * @param right_conditional The right table used for the conditional join. + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether or not null values join to each other or not. + * @param output_size An optional pair of values indicating the exact output size and the number of + * matches for each row in the larger of the two input tables, left or right (may be precomputed + * using the corresponding mixed_inner_join_size API). + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair containing the size that would result from performing the + * requested join and the number of matches for each row in one of the two + * tables. Which of the two tables is an implementation detail and should not + * be relied upon, simply passed to the corresponding `mixed_left_anti_join` API as + * is. 
+ */ +std::pair>> mixed_left_anti_join_size( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns the exact number of matches (rows) when performing a * conditional inner join between the specified tables where the predicate diff --git a/cpp/include/cudf/lists/detail/scatter_helper.cuh b/cpp/include/cudf/lists/detail/scatter_helper.cuh index bdf68037944..34747f4a2c7 100644 --- a/cpp/include/cudf/lists/detail/scatter_helper.cuh +++ b/cpp/include/cudf/lists/detail/scatter_helper.cuh @@ -91,17 +91,17 @@ struct unbound_list_view { /** * @brief Returns number of elements in this list row. */ - __device__ inline size_type size() const { return _size; } + [[nodiscard]] __device__ inline size_type size() const { return _size; } /** * @brief Returns whether this row came from the `scatter()` source or target */ - __device__ inline label_type label() const { return _label; } + [[nodiscard]] __device__ inline label_type label() const { return _label; } /** * @brief Returns the index in the source/target column */ - __device__ inline size_type row_index() const { return _row_index; } + [[nodiscard]] __device__ inline size_type row_index() const { return _row_index; } /** * @brief Binds to source/target column (depending on SOURCE/TARGET labels), @@ -111,7 +111,7 @@ struct unbound_list_view { * @param scatter_target Target column for the scatter operation * @return A (bound) list_view for the row that this object represents */ - __device__ inline list_device_view bind_to_column( + [[nodiscard]] __device__ inline list_device_view bind_to_column( lists_column_device_view const& scatter_source, lists_column_device_view const& scatter_target) const { diff --git 
a/cpp/include/cudf/lists/list_device_view.cuh b/cpp/include/cudf/lists/list_device_view.cuh index 5071f046e0c..e4803f98e68 100644 --- a/cpp/include/cudf/lists/list_device_view.cuh +++ b/cpp/include/cudf/lists/list_device_view.cuh @@ -69,7 +69,7 @@ class list_device_view { * The offset of this element as stored in the child column (i.e. 5) * may be fetched using this method. */ - __device__ inline size_type element_offset(size_type idx) const + [[nodiscard]] __device__ inline size_type element_offset(size_type idx) const { cudf_assert(idx >= 0 && idx < size() && "idx out of bounds"); return begin_offset + idx; @@ -91,7 +91,7 @@ class list_device_view { /** * @brief Checks whether element is null at specified index in the list row. */ - __device__ inline bool is_null(size_type idx) const + [[nodiscard]] __device__ inline bool is_null(size_type idx) const { cudf_assert(idx >= 0 && idx < size() && "Index out of bounds."); auto element_offset = begin_offset + idx; @@ -101,17 +101,20 @@ class list_device_view { /** * @brief Checks whether this list row is null. */ - __device__ inline bool is_null() const { return lists_column.is_null(_row_index); } + [[nodiscard]] __device__ inline bool is_null() const { return lists_column.is_null(_row_index); } /** * @brief Fetches the number of elements in this list row. */ - __device__ inline size_type size() const { return _size; } + [[nodiscard]] __device__ inline size_type size() const { return _size; } /** * @brief Fetches the lists_column_device_view that contains this list. */ - __device__ inline lists_column_device_view const& get_column() const { return lists_column; } + [[nodiscard]] __device__ inline lists_column_device_view const& get_column() const + { + return lists_column; + } template struct pair_accessor; @@ -141,7 +144,7 @@ class list_device_view { * 2. 
`p.second == false` */ template - __device__ inline const_pair_iterator pair_begin() const + [[nodiscard]] __device__ inline const_pair_iterator pair_begin() const { return const_pair_iterator{thrust::counting_iterator(0), pair_accessor{*this}}; } @@ -151,7 +154,7 @@ class list_device_view { * list_device_view. */ template - __device__ inline const_pair_iterator pair_end() const + [[nodiscard]] __device__ inline const_pair_iterator pair_end() const { return const_pair_iterator{thrust::counting_iterator(size()), pair_accessor{*this}}; @@ -173,7 +176,7 @@ class list_device_view { * 2. `p.second == false` */ template - __device__ inline const_pair_rep_iterator pair_rep_begin() const + [[nodiscard]] __device__ inline const_pair_rep_iterator pair_rep_begin() const { return const_pair_rep_iterator{thrust::counting_iterator(0), pair_rep_accessor{*this}}; @@ -184,7 +187,7 @@ class list_device_view { * list_device_view. */ template - __device__ inline const_pair_rep_iterator pair_rep_end() const + [[nodiscard]] __device__ inline const_pair_rep_iterator pair_rep_end() const { return const_pair_rep_iterator{thrust::counting_iterator(size()), pair_rep_accessor{*this}}; diff --git a/cpp/include/cudf/lists/lists_column_device_view.cuh b/cpp/include/cudf/lists/lists_column_device_view.cuh index aff088a7f44..e48707ec298 100644 --- a/cpp/include/cudf/lists/lists_column_device_view.cuh +++ b/cpp/include/cudf/lists/lists_column_device_view.cuh @@ -46,12 +46,12 @@ class lists_column_device_view { /** * @brief Fetches number of rows in the lists column */ - CUDF_HOST_DEVICE inline cudf::size_type size() const { return underlying.size(); } + [[nodiscard]] CUDF_HOST_DEVICE inline cudf::size_type size() const { return underlying.size(); } /** * @brief Fetches the offsets column of the underlying list column. 
*/ - __device__ inline column_device_view offsets() const + [[nodiscard]] __device__ inline column_device_view offsets() const { return underlying.child(lists_column_view::offsets_column_index); } @@ -59,7 +59,7 @@ class lists_column_device_view { /** * @brief Fetches the child column of the underlying list column. */ - __device__ inline column_device_view child() const + [[nodiscard]] __device__ inline column_device_view child() const { return underlying.child(lists_column_view::child_column_index); } @@ -67,19 +67,22 @@ class lists_column_device_view { /** * @brief Indicates whether the list column is nullable. */ - __device__ inline bool nullable() const { return underlying.nullable(); } + [[nodiscard]] __device__ inline bool nullable() const { return underlying.nullable(); } /** * @brief Indicates whether the row (i.e. list) at the specified * index is null. */ - __device__ inline bool is_null(size_type idx) const { return underlying.is_null(idx); } + [[nodiscard]] __device__ inline bool is_null(size_type idx) const + { + return underlying.is_null(idx); + } /** * @brief Fetches the offset of the underlying column_device_view, * in case it is a sliced/offset column. */ - __device__ inline size_type offset() const { return underlying.offset(); } + [[nodiscard]] __device__ inline size_type offset() const { return underlying.offset(); } private: column_device_view underlying; diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index b055a050bf8..d09bc2c935f 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -63,21 +63,21 @@ class lists_column_view : private column_view { /** * @brief Returns the parent column. 
*/ - column_view parent() const; + [[nodiscard]] column_view parent() const; /** * @brief Returns the internal column of offsets * * @throw cudf::logic error if this is an empty column */ - column_view offsets() const; + [[nodiscard]] column_view offsets() const; /** * @brief Returns the internal child column * * @throw cudf::logic error if this is an empty column */ - column_view child() const; + [[nodiscard]] column_view child() const; /** * @brief Returns the internal child column, applying any offset from the root. @@ -89,14 +89,14 @@ class lists_column_view : private column_view { * * @throw cudf::logic error if this is an empty column */ - column_view get_sliced_child(rmm::cuda_stream_view stream) const; + [[nodiscard]] column_view get_sliced_child(rmm::cuda_stream_view stream) const; /** * @brief Return first offset (accounting for column offset) * * @return int32_t const* Pointer to the first offset */ - offset_iterator offsets_begin() const noexcept + [[nodiscard]] offset_iterator offsets_begin() const noexcept { return offsets().begin() + offset(); } @@ -111,7 +111,10 @@ class lists_column_view : private column_view { * * @return int32_t const* Pointer to one past the last offset */ - offset_iterator offsets_end() const noexcept { return offsets_begin() + size() + 1; } + [[nodiscard]] offset_iterator offsets_end() const noexcept + { + return offsets_begin() + size() + 1; + } }; /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/rolling/range_window_bounds.hpp b/cpp/include/cudf/rolling/range_window_bounds.hpp index a4f0a51eac7..4d31bb98f9c 100644 --- a/cpp/include/cudf/rolling/range_window_bounds.hpp +++ b/cpp/include/cudf/rolling/range_window_bounds.hpp @@ -56,12 +56,12 @@ struct range_window_bounds { * @return true If window is unbounded * @return false If window is of finite bounds */ - bool is_unbounded() const { return _is_unbounded; } + [[nodiscard]] bool is_unbounded() const { return _is_unbounded; } /** * @brief Returns 
the underlying scalar value for the bounds */ - scalar const& range_scalar() const { return *_range_scalar; } + [[nodiscard]] scalar const& range_scalar() const { return *_range_scalar; } range_window_bounds(range_window_bounds const&) = default; // Required to return (by copy) from functions. diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index dc2df368bae..0db729aec28 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -52,7 +52,7 @@ class scalar { /** * @brief Returns the scalar's logical value type. */ - data_type type() const noexcept; + [[nodiscard]] data_type type() const noexcept; /** * @brief Updates the validity of the value. @@ -72,7 +72,7 @@ class scalar { * @return true Value is valid. * @return false Value is invalid/null. */ - bool is_valid(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + [[nodiscard]] bool is_valid(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * @brief Returns a raw pointer to the validity bool in device memory. @@ -82,7 +82,7 @@ class scalar { /** * @brief Returns a const raw pointer to the validity bool in device memory. 
*/ - bool const* validity_data() const; + [[nodiscard]] bool const* validity_data() const; protected: data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar @@ -128,7 +128,7 @@ class fixed_width_scalar : public scalar { public: using value_type = T; - ~fixed_width_scalar() = default; + ~fixed_width_scalar() override = default; fixed_width_scalar(fixed_width_scalar&& other) = default; fixed_width_scalar& operator=(fixed_width_scalar const& other) = delete; @@ -278,7 +278,7 @@ class fixed_point_scalar : public scalar { using value_type = T; fixed_point_scalar() = delete; - ~fixed_point_scalar() = default; + ~fixed_point_scalar() override = default; fixed_point_scalar(fixed_point_scalar&& other) = default; fixed_point_scalar& operator=(fixed_point_scalar const& other) = delete; @@ -392,7 +392,7 @@ class string_scalar : public scalar { using value_type = cudf::string_view; string_scalar() = delete; - ~string_scalar() = default; + ~string_scalar() override = default; string_scalar(string_scalar&& other) = default; // string_scalar(string_scalar const& other) = delete; @@ -479,24 +479,25 @@ class string_scalar : public scalar { * * @param stream CUDA stream used for device memory operations. */ - std::string to_string(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + [[nodiscard]] std::string to_string( + rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * @brief Get the value of the scalar as a string_view. * * @param stream CUDA stream used for device memory operations. */ - value_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + [[nodiscard]] value_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * @brief Returns the size of the string in bytes. */ - size_type size() const; + [[nodiscard]] size_type size() const; /** * @brief Returns a raw pointer to the string in device memory. 
*/ - const char* data() const; + [[nodiscard]] const char* data() const; protected: rmm::device_buffer _data{}; ///< device memory containing the string @@ -647,7 +648,7 @@ class duration_scalar : public chrono_scalar { class list_scalar : public scalar { public: list_scalar() = delete; - ~list_scalar() = default; + ~list_scalar() override = default; list_scalar(list_scalar&& other) = default; list_scalar& operator=(list_scalar const& other) = delete; @@ -695,7 +696,7 @@ class list_scalar : public scalar { /** * @brief Returns a non-owning, immutable view to underlying device data. */ - column_view view() const; + [[nodiscard]] column_view view() const; private: cudf::column _data; @@ -707,7 +708,7 @@ class list_scalar : public scalar { class struct_scalar : public scalar { public: struct_scalar() = delete; - ~struct_scalar() = default; + ~struct_scalar() override = default; struct_scalar(struct_scalar&& other) = default; struct_scalar& operator=(struct_scalar const& other) = delete; struct_scalar& operator=(struct_scalar&& other) = delete; @@ -765,7 +766,7 @@ class struct_scalar : public scalar { /** * @brief Returns a non-owning, immutable view to underlying device data. */ - table_view view() const; + [[nodiscard]] table_view view() const; private: table _data; diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh index 56afa150dfc..ae658da9f9b 100644 --- a/cpp/include/cudf/scalar/scalar_device_view.cuh +++ b/cpp/include/cudf/scalar/scalar_device_view.cuh @@ -37,7 +37,7 @@ class scalar_device_view_base { /** * @brief Returns the value type */ - __host__ __device__ data_type type() const noexcept { return _type; } + [[nodiscard]] __host__ __device__ data_type type() const noexcept { return _type; } /** * @brief Returns whether the scalar holds a valid value (i.e., not null). 
@@ -45,7 +45,7 @@ class scalar_device_view_base { * @return true The element is valid * @return false The element is null */ - __device__ bool is_valid() const noexcept { return *_is_valid; } + [[nodiscard]] __device__ bool is_valid() const noexcept { return *_is_valid; } /** * @brief Updates the validity of the value @@ -260,17 +260,23 @@ class string_scalar_device_view : public detail::scalar_device_view_base { /** * @brief Returns string_view of the value of this scalar. */ - __device__ ValueType value() const noexcept { return ValueType{this->data(), _size}; } + [[nodiscard]] __device__ ValueType value() const noexcept + { + return ValueType{this->data(), _size}; + } /** * @brief Returns a raw pointer to the value in device memory */ - __device__ char const* data() const noexcept { return static_cast(_data); } + [[nodiscard]] __device__ char const* data() const noexcept + { + return static_cast(_data); + } /** * @brief Returns the size of the string in bytes. */ - __device__ size_type size() const noexcept { return _size; } + [[nodiscard]] __device__ size_type size() const noexcept { return _size; } private: const char* _data{}; ///< Pointer to device memory containing the value diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp index 4cfd0b75cd4..0964e713592 100644 --- a/cpp/include/cudf/strings/find_multiple.hpp +++ b/cpp/include/cudf/strings/find_multiple.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,30 +27,32 @@ namespace strings { */ /** - * @brief Returns a column with character position values where each + * @brief Returns a lists column with character position values where each * of the target strings are found in each string. 
* - * The size of the output column is targets.size() * strings.size(). - * output[i] contains the position of target[i % targets.size()] in string[i/targets.size()] + * The size of the output column is `input.size()`. + * Each row of the output column is of size `targets.size()`. + * + * `output[i,j]` contains the position of `targets[j]` in `input[i]` * * @code{.pseudo} * Example: - * s = ["abc","def"] - * t = ["a","c","e"] - * r = find_multiple(s,t) - * r is now [ 0, 2,-1, // for "abc": "a" at pos 0, "c" at pos 2, "e" not found - * -1,-1, 1 ] // for "def": "a" and "b" not found, "e" at pos 1 + * s = ["abc", "def"] + * t = ["a", "c", "e"] + * r = find_multiple(s, t) + * r is now {[ 0, 2,-1], // for "abc": "a" at pos 0, "c" at pos 2, "e" not found + * [-1,-1, 1 ]} // for "def": "a" and "b" not found, "e" at pos 1 * @endcode * - * @throw cudf::logic_error targets is empty or contains nulls + * @throw cudf::logic_error if `targets` is empty or contains nulls * - * @param strings Strings instance for this operation. + * @param input Strings instance for this operation. * @param targets Strings to search for in each string. * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New integer column with character position values. + * @return Lists column with character position values. */ std::unique_ptr find_multiple( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index 6c3139747af..4207cddbafb 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -52,7 +52,37 @@ namespace strings { * @param mr Device memory resource used to allocate the returned table's device memory. * @return New table of strings columns. */ -std::unique_ptr findall_re( +std::unique_ptr
findall( + strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a lists column of strings for each matching occurrence of the + * regex pattern within each string. + * + * @code{.pseudo} + * Example: + * s = ["bunny", "rabbit", "hare", "dog"] + * r = findall_record(s, "[ab]"") + * r is now a lists column like: + * [ ["b"] + * ["a","b","b"] + * ["a"] + * null ] + * @endcode + * + * A null output row results if the pattern is not found in the corresponding row + * input string. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation. + * @param pattern Regex pattern to match within each string. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New lists column of strings. + */ +std::unique_ptr findall_record( strings_column_view const& strings, std::string const& pattern, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp index 8435c47eaf5..f6645f2e029 100644 --- a/cpp/include/cudf/strings/json.hpp +++ b/cpp/include/cudf/strings/json.hpp @@ -48,7 +48,10 @@ class get_json_object_options { * @brief Returns true/false depending on whether single-quotes for representing strings * are allowed. 
*/ - CUDF_HOST_DEVICE inline bool get_allow_single_quotes() const { return allow_single_quotes; } + [[nodiscard]] CUDF_HOST_DEVICE inline bool get_allow_single_quotes() const + { + return allow_single_quotes; + } /** * @brief Returns true/false depending on whether individually returned string values have @@ -72,7 +75,7 @@ class get_json_object_options { * * @endcode */ - CUDF_HOST_DEVICE inline bool get_strip_quotes_from_single_strings() const + [[nodiscard]] CUDF_HOST_DEVICE inline bool get_strip_quotes_from_single_strings() const { return strip_quotes_from_single_strings; } diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp index edba01b174f..f6bf12af967 100644 --- a/cpp/include/cudf/strings/repeat_strings.hpp +++ b/cpp/include/cudf/strings/repeat_strings.hpp @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace strings { /** diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index a2c4eba1636..0e904958d15 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -20,6 +20,8 @@ #include #include +#include + namespace cudf { namespace strings { /** diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 43a90997c86..24c8bfea2be 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -44,7 +44,7 @@ namespace detail { */ __device__ inline size_type characters_in_string(const char* str, size_type bytes) { - if ((str == 0) || (bytes == 0)) return 0; + if ((str == nullptr) || (bytes == 0)) return 0; auto ptr = reinterpret_cast(str); #ifndef CUDF_JIT_UDF return thrust::count_if( @@ -271,9 +271,9 @@ __device__ inline int string_view::compare(const string_view& in) const __device__ inline int string_view::compare(const char* data, size_type bytes) const { - size_type const len1 = size_bytes(); - const unsigned char* 
ptr1 = reinterpret_cast(this->data()); - const unsigned char* ptr2 = reinterpret_cast(data); + size_type const len1 = size_bytes(); + const auto* ptr1 = reinterpret_cast(this->data()); + const auto* ptr2 = reinterpret_cast(data); if ((ptr1 == ptr2) && (bytes == len1)) return 0; size_type idx = 0; for (; (idx < len1) && (idx < bytes); ++idx) { diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 22409ab3dc7..f88f573ac0c 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -51,20 +51,20 @@ class string_view { /** * @brief Return the number of bytes in this string */ - CUDF_HOST_DEVICE inline size_type size_bytes() const { return _bytes; } + CUDF_HOST_DEVICE [[nodiscard]] inline size_type size_bytes() const { return _bytes; } /** * @brief Return the number of characters in this string */ - __device__ inline size_type length() const; + __device__ [[nodiscard]] inline size_type length() const; /** * @brief Return a pointer to the internal device array */ - CUDF_HOST_DEVICE inline const char* data() const { return _data; } + CUDF_HOST_DEVICE [[nodiscard]] inline const char* data() const { return _data; } /** * @brief Return true if string has no characters */ - CUDF_HOST_DEVICE inline bool empty() const { return size_bytes() == 0; } + CUDF_HOST_DEVICE [[nodiscard]] inline bool empty() const { return size_bytes() == 0; } /** * @brief Handy iterator for navigating through encoded characters. 
@@ -96,8 +96,8 @@ class string_view { __device__ inline bool operator>(const const_iterator&) const; __device__ inline bool operator>=(const const_iterator&) const; __device__ inline char_utf8 operator*() const; - __device__ inline size_type position() const; - __device__ inline size_type byte_offset() const; + [[nodiscard]] __device__ inline size_type position() const; + [[nodiscard]] __device__ inline size_type byte_offset() const; private: const char* p{}; @@ -109,11 +109,11 @@ class string_view { /** * @brief Return new iterator pointing to the beginning of this string */ - __device__ inline const_iterator begin() const; + __device__ [[nodiscard]] inline const_iterator begin() const; /** * @brief Return new iterator pointing past the end of this string */ - __device__ inline const_iterator end() const; + __device__ [[nodiscard]] inline const_iterator end() const; /** * @brief Return single UTF-8 character at the given character position @@ -126,7 +126,7 @@ class string_view { * * @param pos Character position */ - __device__ inline size_type byte_offset(size_type pos) const; + __device__ [[nodiscard]] inline size_type byte_offset(size_type pos) const; /** * @brief Comparing target string with this string. Each character is compared @@ -141,7 +141,7 @@ class string_view { * not match is greater in the arg string, or all compared characters * match but the arg string is longer. */ - __device__ inline int compare(const string_view& str) const; + __device__ [[nodiscard]] inline int compare(const string_view& str) const; /** * @brief Comparing target string with this string. Each character is compared * as a UTF-8 code-point value. @@ -193,9 +193,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if str is not found in this string. 
*/ - __device__ inline size_type find(const string_view& str, - size_type pos = 0, - size_type count = -1) const; + __device__ [[nodiscard]] inline size_type find(const string_view& str, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the first occurrence where the * array str is found in this string within the character range [pos,pos+n). @@ -221,9 +221,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. */ - __device__ inline size_type find(char_utf8 character, - size_type pos = 0, - size_type count = -1) const; + __device__ [[nodiscard]] inline size_type find(char_utf8 character, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the last occurrence where the * argument str is found in this string within the character range [pos,pos+n). @@ -234,9 +234,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. */ - __device__ inline size_type rfind(const string_view& str, - size_type pos = 0, - size_type count = -1) const; + __device__ [[nodiscard]] inline size_type rfind(const string_view& str, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the last occurrence where the * array str is found in this string within the character range [pos,pos+n). @@ -262,9 +262,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. */ - __device__ inline size_type rfind(char_utf8 character, - size_type pos = 0, - size_type count = -1) const; + __device__ [[nodiscard]] inline size_type rfind(char_utf8 character, + size_type pos = 0, + size_type count = -1) const; /** * @brief Return a sub-string of this string. 
The original string and device @@ -274,7 +274,7 @@ class string_view { * @param length Number of characters from start to include in the sub-string. * @return New instance pointing to a subset of the characters within this instance. */ - __device__ inline string_view substr(size_type start, size_type length) const; + __device__ [[nodiscard]] inline string_view substr(size_type start, size_type length) const; /** * @brief Return minimum value associated with the string type @@ -300,7 +300,7 @@ class string_view { /** * @brief Default constructor represents an empty string. */ - CUDF_HOST_DEVICE inline string_view() : _data(""), _bytes(0), _length(0) {} + CUDF_HOST_DEVICE inline string_view() : _data("") {} /** * @brief Create instance from existing device char array. @@ -330,7 +330,7 @@ class string_view { * @param bytepos Byte position from start of _data. * @return The character position for the specified byte. */ - __device__ inline size_type character_offset(size_type bytepos) const; + __device__ [[nodiscard]] inline size_type character_offset(size_type bytepos) const; }; namespace strings { diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index fb3b9387a9b..aab898932de 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -58,14 +58,14 @@ class strings_column_view : private column_view { /** * @brief Returns the parent column. */ - column_view parent() const; + [[nodiscard]] column_view parent() const; /** * @brief Returns the internal column of offsets * * @throw cudf::logic error if this is an empty column */ - column_view offsets() const; + [[nodiscard]] column_view offsets() const; /** * @brief Return an iterator for the offsets child column. @@ -74,7 +74,7 @@ class strings_column_view : private column_view { * * @return Iterator pointing to the first offset value. 
*/ - offset_iterator offsets_begin() const; + [[nodiscard]] offset_iterator offsets_begin() const; /** * @brief Return an end iterator for the offsets child column. @@ -83,14 +83,14 @@ class strings_column_view : private column_view { * * @return Iterator pointing 1 past the last offset value. */ - offset_iterator offsets_end() const; + [[nodiscard]] offset_iterator offsets_end() const; /** * @brief Returns the internal column of chars * * @throw cudf::logic error if this is an empty column */ - column_view chars() const; + [[nodiscard]] column_view chars() const; /** * @brief Returns the number of bytes in the chars child column. @@ -98,7 +98,7 @@ class strings_column_view : private column_view { * This accounts for empty columns but does not reflect a sliced parent column * view (i.e.: non-zero offset or reduced row count). */ - size_type chars_size() const noexcept; + [[nodiscard]] size_type chars_size() const noexcept; /** * @brief Return an iterator for the chars child column. @@ -111,7 +111,7 @@ class strings_column_view : private column_view { * * @return Iterator pointing to the first char byte. */ - chars_iterator chars_begin() const; + [[nodiscard]] chars_iterator chars_begin() const; /** * @brief Return an end iterator for the offsets child column. @@ -121,7 +121,7 @@ class strings_column_view : private column_view { * * @return Iterator pointing 1 past the last char byte. */ - chars_iterator chars_end() const; + [[nodiscard]] chars_iterator chars_end() const; }; //! Strings column APIs. 
diff --git a/cpp/include/cudf/structs/structs_column_view.hpp b/cpp/include/cudf/structs/structs_column_view.hpp index a25dce9c278..329c24cfe0a 100644 --- a/cpp/include/cudf/structs/structs_column_view.hpp +++ b/cpp/include/cudf/structs/structs_column_view.hpp @@ -60,7 +60,7 @@ class structs_column_view : public column_view { * * @throw cudf::logic error if this is an empty column */ - column_view get_sliced_child(int index) const; + [[nodiscard]] column_view get_sliced_child(int index) const; }; // class structs_column_view; /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index 3c4b4dda61e..4a3c31d08e9 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -71,18 +71,18 @@ class table { /** * @brief Returns the number of columns in the table */ - size_type num_columns() const noexcept { return _columns.size(); } + [[nodiscard]] size_type num_columns() const noexcept { return _columns.size(); } /** * @brief Returns the number of rows */ - size_type num_rows() const noexcept { return _num_rows; } + [[nodiscard]] size_type num_rows() const noexcept { return _num_rows; } /** * @brief Returns an immutable, non-owning `table_view` of the contents of *this `table`. 
*/ - table_view view() const; + [[nodiscard]] table_view view() const; /** * @brief Conversion operator to an immutable, non-owning `table_view` of the @@ -141,7 +141,7 @@ class table { * @return A table_view consisting of columns from the original table * specified by the elements of `column_indices` */ - table_view select(std::vector const& column_indices) const + [[nodiscard]] table_view select(std::vector const& column_indices) const { return select(column_indices.begin(), column_indices.end()); }; @@ -166,7 +166,7 @@ class table { * @param i Index of the desired column * @return A const reference to the desired column */ - column const& get_column(cudf::size_type i) const { return *(_columns.at(i)); } + [[nodiscard]] column const& get_column(cudf::size_type i) const { return *(_columns.at(i)); } private: std::vector> _columns{}; diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 2404fe88a9c..ce61e8853b6 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -61,9 +61,9 @@ class table_device_view_base { return _columns[column_index]; } - __host__ __device__ size_type num_columns() const noexcept { return _num_columns; } + [[nodiscard]] __host__ __device__ size_type num_columns() const noexcept { return _num_columns; } - __host__ __device__ size_type num_rows() const noexcept { return _num_rows; } + [[nodiscard]] __host__ __device__ size_type num_rows() const noexcept { return _num_rows; } void destroy(); diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index 8abd7aed8e9..77b9e539506 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -87,7 +87,7 @@ class table_view_base { /** * @brief Returns an iterator to the first view in the `table`. 
*/ - const_iterator begin() const noexcept { return std::begin(_columns); } + [[nodiscard]] const_iterator begin() const noexcept { return std::begin(_columns); } /** * @brief Returns an iterator one past the last column view in the `table`. @@ -103,7 +103,7 @@ class table_view_base { * `end()` acts as a place holder. Attempting to dereference it results in * undefined behavior. */ - const_iterator end() const noexcept { return std::end(_columns); } + [[nodiscard]] const_iterator end() const noexcept { return std::end(_columns); } /** * @brief Returns a reference to the view of the specified column @@ -119,17 +119,17 @@ class table_view_base { /** * @brief Returns the number of columns */ - size_type num_columns() const noexcept { return _columns.size(); } + [[nodiscard]] size_type num_columns() const noexcept { return _columns.size(); } /** * @brief Returns the number of rows */ - size_type num_rows() const noexcept { return _num_rows; } + [[nodiscard]] size_type num_rows() const noexcept { return _num_rows; } /** * @brief Returns true if `num_columns()` returns zero, or false otherwise */ - size_type is_empty() const noexcept { return num_columns() == 0; } + [[nodiscard]] size_type is_empty() const noexcept { return num_columns() == 0; } table_view_base() = default; @@ -208,7 +208,7 @@ class table_view : public detail::table_view_base { * @return A table_view consisting of columns from the original table * specified by the elements of `column_indices` */ - table_view select(std::vector const& column_indices) const; + [[nodiscard]] table_view select(std::vector const& column_indices) const; }; /** @@ -227,7 +227,7 @@ class mutable_table_view : public detail::table_view_base { mutable_table_view() = default; - mutable_column_view& column(size_type column_index) const + [[nodiscard]] mutable_column_view& column(size_type column_index) const { return const_cast(table_view_base::column(column_index)); } diff --git a/cpp/include/cudf/tdigest/tdigest_column_view.cuh 
b/cpp/include/cudf/tdigest/tdigest_column_view.cuh index c7513452387..696657191ca 100644 --- a/cpp/include/cudf/tdigest/tdigest_column_view.cuh +++ b/cpp/include/cudf/tdigest/tdigest_column_view.cuh @@ -82,28 +82,28 @@ class tdigest_column_view : private column_view { /** * @brief Returns the parent column. */ - column_view parent() const; + [[nodiscard]] column_view parent() const; /** * @brief Returns the column of centroids */ - lists_column_view centroids() const; + [[nodiscard]] lists_column_view centroids() const; /** * @brief Returns the internal column of mean values */ - column_view means() const; + [[nodiscard]] column_view means() const; /** * @brief Returns the internal column of weight values */ - column_view weights() const; + [[nodiscard]] column_view weights() const; /** * @brief Returns an iterator that returns the size of each tdigest * in the column (each row is 1 digest) */ - auto size_begin() const + [[nodiscard]] auto size_begin() const { return cudf::detail::make_counting_transform_iterator( 0, tdigest_size{centroids().offsets_begin()}); @@ -113,13 +113,13 @@ class tdigest_column_view : private column_view { * @brief Returns the first min value for the column. Each row corresponds * to the minimum value for the accompanying digest. */ - double const* min_begin() const; + [[nodiscard]] double const* min_begin() const; /** * @brief Returns the first max value for the column. Each row corresponds * to the maximum value for the accompanying digest. 
*/ - double const* max_begin() const; + [[nodiscard]] double const* max_begin() const; }; } // namespace tdigest diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 459a4182aa0..6222b2e680e 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -268,12 +268,12 @@ class data_type { /** * @brief Returns the type identifier */ - constexpr type_id id() const noexcept { return _id; } + [[nodiscard]] constexpr type_id id() const noexcept { return _id; } /** * @brief Returns the scale (for fixed_point types) */ - constexpr int32_t scale() const noexcept { return _fixed_point_scale; } + [[nodiscard]] constexpr int32_t scale() const noexcept { return _fixed_point_scale; } private: type_id _id{type_id::EMPTY}; diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 766fe93b9d1..0ac41b2c4a1 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -54,7 +54,7 @@ class span_base { static constexpr std::size_t extent = Extent; - constexpr span_base() noexcept : _data(nullptr), _size(0) {} + constexpr span_base() noexcept {} constexpr span_base(pointer data, size_type size) : _data(data), _size(size) {} // constexpr span_base(pointer begin, pointer end) : _data(begin), _size(end - begin) {} constexpr span_base(span_base const& other) noexcept = default; @@ -71,9 +71,9 @@ class span_base { constexpr iterator end() const noexcept { return _data + _size; } constexpr pointer data() const noexcept { return _data; } - constexpr size_type size() const noexcept { return _size; } - constexpr size_type size_bytes() const noexcept { return sizeof(T) * _size; } - constexpr bool empty() const noexcept { return _size == 0; } + [[nodiscard]] constexpr size_type size() const noexcept { return _size; } + [[nodiscard]] constexpr size_type size_bytes() const noexcept { return sizeof(T) * _size; } + [[nodiscard]] constexpr bool empty() const noexcept { return _size == 0; } /** * 
@brief Obtains a subspan consisting of the first N elements of the sequence @@ -98,8 +98,8 @@ class span_base { } private: - pointer _data; - size_type _size; + pointer _data{nullptr}; + size_type _size{0}; }; } // namespace detail @@ -251,7 +251,7 @@ class base_2dspan { constexpr auto data() const noexcept { return _data; } constexpr auto size() const noexcept { return _size; } constexpr auto count() const noexcept { return size().first * size().second; } - constexpr bool is_empty() const noexcept { return count() == 0; } + [[nodiscard]] constexpr bool is_empty() const noexcept { return count() == 0; } static constexpr size_t flatten_index(size_t row, size_t column, size_type size) noexcept { @@ -263,8 +263,11 @@ class base_2dspan { return {this->data() + flatten_index(row, 0, this->size()), this->size().second}; } - constexpr RowType front() const { return (*this)[0]; } - constexpr RowType back() const { return (*this)[size().first - 1]; } + [[nodiscard]] constexpr RowType front() const { return (*this)[0]; } + [[nodiscard]] constexpr RowType back() const + { + return (*this)[size().first - 1]; + } constexpr base_2dspan subspan(size_t first_row, size_t num_rows) const noexcept { diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp index 87e4c94070b..d078bf90a8a 100644 --- a/cpp/include/cudf_test/cudf_gtest.hpp +++ b/cpp/include/cudf_test/cudf_gtest.hpp @@ -79,7 +79,7 @@ using Templates0 = Templates<>; template struct TypeList { - typedef Types type; + using type = Types; }; template diff --git a/cpp/include/cudf_test/file_utilities.hpp b/cpp/include/cudf_test/file_utilities.hpp index 8e242e5a4f3..6c21d8dfad2 100644 --- a/cpp/include/cudf_test/file_utilities.hpp +++ b/cpp/include/cudf_test/file_utilities.hpp @@ -58,5 +58,5 @@ class temp_directory { * * @return string path of the temporary directory */ - const std::string& path() const { return _path; } + [[nodiscard]] const std::string& path() const { return _path; } }; diff 
--git a/cpp/include/nvtext/detail/load_hash_file.hpp b/cpp/include/nvtext/detail/load_hash_file.hpp index b105c5c280e..9f4640f1daf 100644 --- a/cpp/include/nvtext/detail/load_hash_file.hpp +++ b/cpp/include/nvtext/detail/load_hash_file.hpp @@ -21,8 +21,8 @@ #include -#include -#include +#include +#include namespace nvtext { namespace detail { diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp index 2b09ec66203..9d75295cd39 100644 --- a/cpp/include/nvtext/subword_tokenize.hpp +++ b/cpp/include/nvtext/subword_tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,9 +19,6 @@ #include #include -#include -#include - namespace nvtext { /** @@ -43,6 +40,8 @@ struct hashed_vocabulary { std::unique_ptr table; // uint64 std::unique_ptr bin_coefficients; // uint64 std::unique_ptr bin_offsets; // uint16 + std::unique_ptr cp_metadata; // uint32 + std::unique_ptr aux_cp_table; // uint64 }; /** diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index e6abba207d9..bdb7e8afcf9 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -22,7 +22,7 @@ include(rapids-find) project( CUDA_KAFKA - VERSION 22.02.00 + VERSION 22.04.00 LANGUAGES CXX ) diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py new file mode 100644 index 00000000000..3a1a663e231 --- /dev/null +++ b/cpp/scripts/run-clang-tidy.py @@ -0,0 +1,254 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function +import re +import os +import subprocess +import argparse +import json +import multiprocessing as mp +import shutil + + +EXPECTED_VERSION = "11.1.0" +VERSION_REGEX = re.compile(r" LLVM version ([0-9.]+)") +GPU_ARCH_REGEX = re.compile(r"sm_(\d+)") +SPACES = re.compile(r"\s+") +SEPARATOR = "-" * 16 + + +def parse_args(): + argparser = argparse.ArgumentParser("Runs clang-tidy on a project") + argparser.add_argument("-cdb", type=str, + # TODO This is a hack, needs to be fixed + default="cpp/build/cuda-11.5.0/clang-tidy/release/compile_commands.clangd.json", + help="Path to cmake-generated compilation database" + " file. It is always found inside the root of the " + "cmake build folder. 
So make sure that `cmake` has " + "been run once before running this script!") + argparser.add_argument("-exe", type=str, default="clang-tidy", + help="Path to clang-tidy exe") + argparser.add_argument("-ignore", type=str, default="[.]cu$|examples/kmeans/", + help="Regex used to ignore files from checking") + argparser.add_argument("-select", type=str, default=None, + help="Regex used to select files for checking") + argparser.add_argument("-j", type=int, default=-1, + help="Number of parallel jobs to launch.") + args = argparser.parse_args() + if args.j <= 0: + args.j = mp.cpu_count() + args.ignore_compiled = re.compile(args.ignore) if args.ignore else None + args.select_compiled = re.compile(args.select) if args.select else None + ret = subprocess.check_output("%s --version" % args.exe, shell=True) + ret = ret.decode("utf-8") + version = VERSION_REGEX.search(ret) + if version is None: + raise Exception("Failed to figure out clang-tidy version!") + version = version.group(1) + if version != EXPECTED_VERSION: + raise Exception("clang-tidy exe must be v%s found '%s'" % \ + (EXPECTED_VERSION, version)) + if not os.path.exists(args.cdb): + raise Exception("Compilation database '%s' missing" % args.cdb) + return args + + +def get_all_commands(cdb): + with open(cdb, "r") as fp: + return json.load(fp) + + +def get_gpu_archs(command): + archs = [] + for loc in range(len(command)): + if command[loc] != "-gencode": + continue + arch_flag = command[loc + 1] + match = GPU_ARCH_REGEX.search(arch_flag) + if match is not None: + archs.append("--cuda-gpu-arch=sm_%s" % match.group(1)) + return archs + + +def get_index(arr, item): + try: + return arr.index(item) + except: + return -1 + + +def remove_item(arr, item): + loc = get_index(arr, item) + if loc >= 0: + del arr[loc] + return loc + + +def remove_item_plus_one(arr, item): + loc = get_index(arr, item) + if loc >= 0: + del arr[loc + 1] + del arr[loc] + return loc + + +def get_clang_includes(exe): + dir = 
os.getenv("CONDA_PREFIX") + if dir is None: + ret = subprocess.check_output("which %s 2>&1" % exe, shell=True) + ret = ret.decode("utf-8") + dir = os.path.dirname(os.path.dirname(ret)) + header = os.path.join(dir, "include", "ClangHeaders") + return ["-I", header] + + +def get_tidy_args(cmd, exe): + command, file = cmd["command"], cmd["file"] + is_cuda = file.endswith(".cu") + command = re.split(SPACES, command) + # compiler is always clang++! + command[0] = "clang++" + # remove compilation and output targets from the original command + remove_item_plus_one(command, "-c") + remove_item_plus_one(command, "-o") + if is_cuda: + # replace nvcc's "-gencode ..." with clang's "--cuda-gpu-arch ..." + archs = get_gpu_archs(command) + command.extend(archs) + while True: + loc = remove_item_plus_one(command, "-gencode") + if loc < 0: + break + # "-x cuda" is the right usage in clang + loc = get_index(command, "-x") + if loc >= 0: + command[loc + 1] = "cuda" + remove_item_plus_one(command, "-ccbin") + remove_item(command, "--expt-extended-lambda") + remove_item(command, "--diag_suppress=unrecognized_gcc_pragma") + command.extend(get_clang_includes(exe)) + return command, is_cuda + + +def run_clang_tidy_command(tidy_cmd): + cmd = " ".join(tidy_cmd) + result = subprocess.run(cmd, check=False, shell=True, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + status = result.returncode == 0 + if status: + out = "" + else: + out = "CMD: " + cmd + out += result.stdout.decode("utf-8").rstrip() + return status, out + + +def run_clang_tidy(cmd, args): + command, is_cuda = get_tidy_args(cmd, args.exe) + tidy_cmd = [args.exe, + "-header-filter='.*cudf/cpp/(src|include|bench|comms).*'", + cmd["file"], "--", ] + tidy_cmd.extend(command) + status = True + out = "" + if is_cuda: + tidy_cmd.append("--cuda-device-only") + tidy_cmd.append(cmd["file"]) + ret, out1 = run_clang_tidy_command(tidy_cmd) + out += out1 + out += "%s" % SEPARATOR + if not ret: + status = ret + tidy_cmd[-2] = 
"--cuda-host-only" + ret, out1 = run_clang_tidy_command(tidy_cmd) + if not ret: + status = ret + out += out1 + else: + tidy_cmd.append(cmd["file"]) + ret, out1 = run_clang_tidy_command(tidy_cmd) + if not ret: + status = ret + out += out1 + return status, out, cmd["file"] + + +# yikes! global var :( +results = [] +def collect_result(result): + global results + results.append(result) + + +def print_result(passed, stdout, file): + status_str = "PASSED" if passed else "FAILED" + print("%s File:%s %s %s" % (SEPARATOR, file, status_str, SEPARATOR)) + if stdout: + print(stdout) + print("%s File:%s ENDS %s" % (SEPARATOR, file, SEPARATOR)) + + +def print_results(): + global results + status = True + for passed, stdout, file in results: + print_result(passed, stdout, file) + if not passed: + status = False + return status + + +def run_tidy_for_all_files(args, all_files): + pool = None if args.j == 1 else mp.Pool(args.j) + # actual tidy checker + for cmd in all_files: + # skip files that we don't want to look at + if args.ignore_compiled is not None and \ + re.search(args.ignore_compiled, cmd["file"]) is not None: + continue + if args.select_compiled is not None and \ + re.search(args.select_compiled, cmd["file"]) is None: + continue + if pool is not None: + pool.apply_async(run_clang_tidy, args=(cmd, args), + callback=collect_result) + else: + passed, stdout, file = run_clang_tidy(cmd, args) + collect_result((passed, stdout, file)) + if pool is not None: + pool.close() + pool.join() + return print_results() + + +def main(): + args = parse_args() + # Attempt to making sure that we run this script from root of repo always + if not os.path.exists(".git"): + raise Exception("This needs to always be run from the root of repo") + # Check whether clang-tidy exists + # print(args) + if "exe" not in args and shutil.which("clang-tidy") is not None: + print("clang-tidy not found. 
Exiting...") + return + all_files = get_all_commands(args.cdb) + status = run_tidy_for_all_files(args, all_files) + if not status: + raise Exception("clang-tidy failed! Refer to the errors above.") + + +if __name__ == "__main__": + main() diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py index bac6697da82..33c369b254f 100755 --- a/cpp/scripts/sort_ninja_log.py +++ b/cpp/scripts/sort_ninja_log.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # import argparse import os @@ -34,49 +34,63 @@ # build a map of the log entries entries = {} with open(log_file, "r") as log: + last = 0 + files = {} for line in log: entry = line.split() if len(entry) > 4: - elapsed = int(entry[1]) - int(entry[0]) obj_file = entry[3] file_size = ( os.path.getsize(os.path.join(log_path, obj_file)) if os.path.exists(obj_file) else 0 ) - entries[entry[3]] = (elapsed, file_size) + start = int(entry[0]) + end = int(entry[1]) + # logic based on ninjatracing + if end < last: + files = {} + last = end + files.setdefault(entry[4], (entry[3], start, end, file_size)) -# check file could be loaded + # build entries from files dict + for entry in files.values(): + entries[entry[0]] = (entry[1], entry[2], entry[3]) + +# check file could be loaded and we have entries to report if len(entries) == 0: print("Could not parse", log_file) exit() -# sort the keys by build time (descending order) -keys = list(entries.keys()) -sl = sorted(keys, key=lambda k: entries[k][0], reverse=True) +# sort the entries by build-time (descending order) +sorted_list = sorted( + list(entries.keys()), + key=lambda k: entries[k][1] - entries[k][0], + reverse=True, +) -if output_fmt == "xml": - # output results in XML format +# output results in XML format +def output_xml(entries, sorted_list, args): root = ET.Element("testsuites") testsuite = ET.Element( "testsuite", attrib={ "name": "build-time", - "tests": str(len(keys)), + "tests": 
str(len(sorted_list)), "failures": str(0), "errors": str(0), }, ) root.append(testsuite) - for key in sl: - entry = entries[key] - elapsed = float(entry[0]) / 1000 + for name in sorted_list: + entry = entries[name] + build_time = float(entry[1] - entry[0]) / 1000 item = ET.Element( "testcase", attrib={ "classname": "BuildTime", - "name": key, - "time": str(elapsed), + "name": name, + "time": str(build_time), }, ) testsuite.append(item) @@ -85,62 +99,219 @@ xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ") print(xmlstr) -elif output_fmt == "html": - # output results in HTML format - print("Sorted Ninja Build Times") - # Note: Jenkins does not support style defined in the html + +# utility converts a millisecond value to a colum width in pixels +def time_to_width(value, end): + # map a value from (0,end) to (0,1000) + r = (float(value) / float(end)) * 1000.0 + return int(r) + + +# assign each entry to a thread by analyzing the start/end times and +# slotting them into thread buckets where they fit +def assign_entries_to_threads(entries): + # first sort the entries' keys by end timestamp + sorted_keys = sorted( + list(entries.keys()), key=lambda k: entries[k][1], reverse=True + ) + + # build the chart data by assigning entries to threads + results = {} + threads = [] + for name in sorted_keys: + entry = entries[name] + + # assign this entry by finding the first available thread identified + # by the thread's current start time greater than the entry's end time + tid = -1 + for t in range(len(threads)): + if threads[t] >= entry[1]: + threads[t] = entry[0] + tid = t + break + + # if no current thread found, create a new one with this entry + if tid < 0: + threads.append(entry[0]) + tid = len(threads) - 1 + + # add entry name to the array associated with this tid + if tid not in results.keys(): + results[tid] = [] + results[tid].append(name) + + # first entry has the last end time + end_time = entries[sorted_keys[0]][1] + + # return the threaded 
entries and the last end time + return (results, end_time) + + +# output chart results in HTML format +def output_html(entries, sorted_list, args): + print("Build Metrics Report") + # Note: Jenkins does not support javascript nor style defined in the html # https://www.jenkins.io/doc/book/security/configuring-content-security-policy/ print("") if args.msg is not None: print("

", args.msg, "

") - print("
") - print( - "", - "", - "", - sep="", - ) - summary = {"red": 0, "yellow": 0, "green": 0} + + # map entries to threads + # the end_time is used to scale all the entries to a fixed output width + threads, end_time = assign_entries_to_threads(entries) + + # color ranges for build times + summary = {"red": 0, "yellow": 0, "green": 0, "white": 0} red = "bgcolor='#FFBBD0'" yellow = "bgcolor='#FFFF80'" green = "bgcolor='#AAFFBD'" - for key in sl: - result = entries[key] - elapsed = result[0] - color = green - if elapsed > 300000: # 5 minutes - color = red - summary["red"] += 1 - elif elapsed > 120000: # 2 minutes - color = yellow - summary["yellow"] += 1 - else: - summary["green"] += 1 + white = "bgcolor='#FFFFFF'" + + # create the build-time chart + print("
FileCompile time
(ms)
Size
(bytes)
") + for tid in range(len(threads)): + names = threads[tid] + # sort the names for this thread by start time + names = sorted(names, key=lambda k: entries[k][0]) + + # use the last entry's end time as the total row size + # (this is an estimate and does not have to be exact) + last_entry = entries[names[len(names) - 1]] + last_time = time_to_width(last_entry[1], end_time) print( - "", + "") + + # done with the chart + print("
", - key, - "", - result[0], - "", - result[1], - "
", sep="", ) - print("

") + + prev_end = 0 # used for spacing between entries + + # write out each entry for this thread as a column for a single row + for name in names: + entry = entries[name] + start = entry[0] + end = entry[1] + + # this handles minor gaps between end of the + # previous entry and the start of the next + if prev_end > 0 and start > prev_end: + size = time_to_width(start - prev_end, end_time) + print("") + # adjust for the cellspacing + prev_end = end + int(end_time / 500) + + # format the build-time + build_time = end - start + build_time_str = str(build_time) + " ms" + if build_time > 120000: # 2 minutes + minutes = int(build_time / 60000) + seconds = int(((build_time / 60000) - minutes) * 60) + build_time_str = "{:d}:{:02d} min".format(minutes, seconds) + elif build_time > 1000: + build_time_str = "{:.3f} s".format(build_time / 1000) + + # assign color and accumulate legend values + color = white + if build_time > 300000: # 5 minutes + color = red + summary["red"] += 1 + elif build_time > 120000: # 2 minutes + color = yellow + summary["yellow"] += 1 + elif build_time > 1000: # 1 second + color = green + summary["green"] += 1 + else: + summary["white"] += 1 + + # compute the pixel width based on build-time + size = max(time_to_width(build_time, end_time), 2) + # output the column for this entry + print("") + # update the entry with just the computed output info + entries[name] = (build_time_str, color, entry[2]) + + # add a filler column at the end of each row + print("
", end="") + # use a slightly smaller, fixed-width font + print("", end="") + + # add the file-name if it fits, otherwise, truncate the name + file_name = os.path.basename(name) + if len(file_name) + 3 > size / 7: + abbr_size = int(size / 7) - 3 + if abbr_size > 1: + print(file_name[:abbr_size], "...", sep="", end="") + else: + print(file_name, end="") + # done with this entry + print("

") + + # output detail table in build-time descending order + print("") + print( + "", + "", + "", + sep="", + ) + for name in sorted_list: + entry = entries[name] + build_time_str = entry[0] + color = entry[1] + file_size = entry[2] + + # format file size + file_size_str = "" + if file_size > 1000000: + file_size_str = "{:.3f} MB".format(file_size / 1000000) + elif file_size > 1000: + file_size_str = "{:.3f} KB".format(file_size / 1000) + elif file_size > 0: + file_size_str = str(file_size) + " bytes" + + # output entry row + print("", sep="", end="") + print("", sep="", end="") + print("", sep="") + + print("
FileCompile timeSize
", name, "", build_time_str, "", file_size_str, "

") + # include summary table with color legend + print("") print("time > 5 minutes") print("") print("2 minutes < time < 5 minutes") print("") - print("time < 2 minutes") + print("1 second < time < 2 minutes") print("") + print("time < 1 second") + print("") print("
", summary["red"], "
", summary["yellow"], "
", summary["green"], "
", summary["white"], "
") -else: - # output results in CSV format + +# output results in CSV format +def output_csv(entries, sorted_list, args): print("time,size,file") - for key in sl: - result = entries[key] - print(result[0], result[1], key, sep=",") + for name in sorted_list: + entry = entries[name] + build_time = entry[1] - entry[0] + file_size = entry[2] + print(build_time, file_size, name, sep=",") + + +if output_fmt == "xml": + output_xml(entries, sorted_list, args) +elif output_fmt == "html": + output_html(entries, sorted_list, args) +else: + output_csv(entries, sorted_list, args) diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 7087b71a84e..5f9ff2574e3 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -74,7 +74,8 @@ rmm::device_buffer scalar_col_valid_mask_and(column_view const& col, inline bool is_null_dependent(binary_operator op) { return op == binary_operator::NULL_EQUALS || op == binary_operator::NULL_MIN || - op == binary_operator::NULL_MAX; + op == binary_operator::NULL_MAX || op == binary_operator::NULL_LOGICAL_AND || + op == binary_operator::NULL_LOGICAL_OR; } /** diff --git a/cpp/src/binaryop/compiled/NullLogicalAnd.cu b/cpp/src/binaryop/compiled/NullLogicalAnd.cu new file mode 100644 index 00000000000..48ae125bc93 --- /dev/null +++ b/cpp/src/binaryop/compiled/NullLogicalAnd.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} // namespace cudf::binops::compiled diff --git a/cpp/src/binaryop/compiled/NullLogicalOr.cu b/cpp/src/binaryop/compiled/NullLogicalOr.cu new file mode 100644 index 00000000000..e0ea95ac3ee --- /dev/null +++ b/cpp/src/binaryop/compiled/NullLogicalOr.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} // namespace cudf::binops::compiled diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index efa8cdca2cc..995c6702cf8 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -164,13 +164,13 @@ struct compare_functor { // This functor performs null aware binop between two columns or a column and a scalar by // iterating over them on the device struct null_considering_binop { - auto get_device_view(cudf::scalar const& scalar_item) const + [[nodiscard]] auto get_device_view(cudf::scalar const& scalar_item) const { return get_scalar_device_view( static_cast&>(const_cast(scalar_item))); } - auto get_device_view(column_device_view const& col_item) const { return col_item; } + [[nodiscard]] auto get_device_view(column_device_view const& col_item) const { return col_item; } template void populate_out_col(LhsViewT const& lhsv, @@ -339,6 +339,8 @@ case binary_operator::PMOD: apply_binary_op(out, lhs, case binary_operator::NULL_EQUALS: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; case binary_operator::NULL_MAX: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; case binary_operator::NULL_MIN: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::NULL_LOGICAL_AND: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::NULL_LOGICAL_OR: 
apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; default:; } // clang-format on diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index dc1cae82796..9b3e33f491e 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -103,6 +103,8 @@ struct ops_wrapper { type_dispatcher(rhs.type(), type_casted_accessor{}, i, rhs, is_rhs_scalar); auto result = [&]() { if constexpr (std::is_same_v or + std::is_same_v or + std::is_same_v or std::is_same_v or std::is_same_v) { bool output_valid = false; @@ -150,6 +152,8 @@ struct ops2_wrapper { TypeRhs y = rhs.element(is_rhs_scalar ? 0 : i); auto result = [&]() { if constexpr (std::is_same_v or + std::is_same_v or + std::is_same_v or std::is_same_v or std::is_same_v) { bool output_valid = false; diff --git a/cpp/src/binaryop/compiled/operation.cuh b/cpp/src/binaryop/compiled/operation.cuh index 75507d055e0..4b5f78dc400 100644 --- a/cpp/src/binaryop/compiled/operation.cuh +++ b/cpp/src/binaryop/compiled/operation.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -179,8 +179,8 @@ struct PyMod { std::enable_if_t<(std::is_floating_point_v>)>* = nullptr> __device__ inline auto operator()(TypeLhs x, TypeRhs y) -> double { - double x1 = static_cast(x); - double y1 = static_cast(y); + auto x1 = static_cast(x); + auto y1 = static_cast(y); return fmod(fmod(x1, y1) + y1, y1); } @@ -415,6 +415,38 @@ struct NullMin { -> decltype(static_cast(static_cast(x) < static_cast(y) ? 
x : y)); }; +struct NullLogicalAnd { + template + __device__ inline auto operator()( + TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) -> decltype(x && y) + { + bool lhs_false = lhs_valid && !x; + bool rhs_false = rhs_valid && !y; + bool both_valid = lhs_valid && rhs_valid; + output_valid = lhs_false || rhs_false || both_valid; + return both_valid && !lhs_false && !rhs_false; + } + // To allow std::is_invocable_v = true + template + __device__ inline auto operator()(TypeLhs x, TypeRhs y) -> decltype(x && y); +}; + +struct NullLogicalOr { + template + __device__ inline auto operator()( + TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) -> decltype(x || y) + { + bool lhs_true = lhs_valid && x; + bool rhs_true = rhs_valid && y; + bool both_valid = lhs_valid && rhs_valid; + output_valid = lhs_true || rhs_true || both_valid; + return lhs_true || rhs_true; + } + // To allow std::is_invocable_v = true + template + __device__ inline auto operator()(TypeLhs x, TypeRhs y) -> decltype(x || y); +}; + } // namespace ops } // namespace compiled } // namespace binops diff --git a/cpp/src/binaryop/compiled/util.cpp b/cpp/src/binaryop/compiled/util.cpp index f89941a3d68..9481c236142 100644 --- a/cpp/src/binaryop/compiled/util.cpp +++ b/cpp/src/binaryop/compiled/util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -64,22 +64,23 @@ template struct is_binary_operation_supported { // For types where Out type is fixed. (eg. 
comparison types) template - inline constexpr bool operator()(void) + inline constexpr bool operator()() { if constexpr (column_device_view::has_element_accessor() and column_device_view::has_element_accessor()) { if constexpr (has_common_type_v) { using common_t = std::common_type_t; return std::is_invocable_v; - } else + } else { return std::is_invocable_v; + } } else { return false; } } template - inline constexpr bool operator()(void) + inline constexpr bool operator()() { if constexpr (column_device_view::has_element_accessor() and column_device_view::has_element_accessor() and @@ -166,6 +167,10 @@ struct is_supported_operation_functor { case binary_operator::LESS_EQUAL: return bool_op(out); case binary_operator::GREATER_EQUAL: return bool_op(out); case binary_operator::NULL_EQUALS: return bool_op(out); + case binary_operator::NULL_LOGICAL_AND: + return bool_op(out); + case binary_operator::NULL_LOGICAL_OR: + return bool_op(out); default: return type_dispatcher(out, nested_support_functor{}, op); } return false; diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 34c0cea683e..3412733f0b2 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -113,7 +113,7 @@ __global__ void concatenate_masks_kernel(column_device_view const* views, thrust::upper_bound( thrust::seq, output_offsets, output_offsets + number_of_views, mask_index) - output_offsets - 1; - bool bit_is_set = 1; + bool bit_is_set = true; if (source_view_index < number_of_views) { size_type const column_element_index = mask_index - output_offsets[source_view_index]; bit_is_set = views[source_view_index].is_valid(column_element_index); diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index f8c0006ed45..a74b97da5a1 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1017,7 +1017,7 @@ std::vector contiguous_split(cudf::table_view const& input, rmm::device_buffer 
d_indices_and_source_info(indices_size + src_buf_info_size + offset_stack_size, stream, rmm::mr::get_current_device_resource()); - size_type* d_indices = reinterpret_cast(d_indices_and_source_info.data()); + auto* d_indices = reinterpret_cast(d_indices_and_source_info.data()); src_buf_info* d_src_buf_info = reinterpret_cast( reinterpret_cast(d_indices_and_source_info.data()) + indices_size); size_type* d_offset_stack = @@ -1198,8 +1198,8 @@ std::vector contiguous_split(cudf::table_view const& input, rmm::device_buffer d_src_and_dst_buffers(src_bufs_size + dst_bufs_size + offset_stack_size, stream, rmm::mr::get_current_device_resource()); - uint8_t const** d_src_bufs = reinterpret_cast(d_src_and_dst_buffers.data()); - uint8_t** d_dst_bufs = reinterpret_cast( + auto const** d_src_bufs = reinterpret_cast(d_src_and_dst_buffers.data()); + uint8_t** d_dst_bufs = reinterpret_cast( reinterpret_cast(d_src_and_dst_buffers.data()) + src_bufs_size); // setup src buffers diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp index 05330a7c492..fa3d19bdcfd 100644 --- a/cpp/src/groupby/sort/functors.hpp +++ b/cpp/src/groupby/sort/functors.hpp @@ -55,7 +55,7 @@ struct store_result_functor { /** * @brief Check if the groupby keys are presorted */ - bool is_presorted() const { return keys_are_sorted == sorted::YES; } + [[nodiscard]] bool is_presorted() const { return keys_are_sorted == sorted::YES; } /** * @brief Get the grouped values diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 9ebb516ee14..50e3b812b62 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -49,7 +49,7 @@ struct var_transform { { if (d_values.is_null(i)) return 0.0; - ResultType x = static_cast(values_iter[i]); + auto x = static_cast(values_iter[i]); size_type group_idx = d_group_labels[i]; size_type group_size = d_group_sizes[group_idx]; diff --git a/cpp/src/groupby/sort/group_tdigest.cu 
b/cpp/src/groupby/sort/group_tdigest.cu index b7b45341ad2..f48ab852f24 100644 --- a/cpp/src/groupby/sort/group_tdigest.cu +++ b/cpp/src/groupby/sort/group_tdigest.cu @@ -327,7 +327,7 @@ __global__ void generate_cluster_limits_kernel(int delta, // compute the first cluster limit double nearest_w; int nearest_w_index; // group-relative index into the input stream - while (1) { + while (true) { cur_weight = next_limit < 0 ? 0 : max(cur_weight + 1, nearest_w); if (cur_weight >= total_weight) { break; } diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index a3f954920c8..64ab69cd377 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -242,7 +242,7 @@ class concurrent_unordered_map { __host__ __device__ mapped_type get_unused_element() const { return m_unused_element; } - __host__ __device__ size_type capacity() const { return m_capacity; } + [[nodiscard]] __host__ __device__ size_type capacity() const { return m_capacity; } private: /** diff --git a/cpp/src/hash/concurrent_unordered_multimap.cuh b/cpp/src/hash/concurrent_unordered_multimap.cuh index 2b92c9142ca..cdf5b6a8649 100644 --- a/cpp/src/hash/concurrent_unordered_multimap.cuh +++ b/cpp/src/hash/concurrent_unordered_multimap.cuh @@ -503,7 +503,7 @@ class concurrent_unordered_multimap { if (count_collisions) m_collisions = 0; } - unsigned long long get_num_collisions() const { return m_collisions; } + [[nodiscard]] unsigned long long get_num_collisions() const { return m_collisions; } void print() { diff --git a/cpp/src/hash/hash_allocator.cuh b/cpp/src/hash/hash_allocator.cuh index 0c4acccf33d..db836917808 100644 --- a/cpp/src/hash/hash_allocator.cuh +++ b/cpp/src/hash/hash_allocator.cuh @@ -26,7 +26,7 @@ template struct managed_allocator { - typedef T value_type; + using value_type = T; rmm::mr::device_memory_resource* mr = new rmm::mr::managed_memory_resource; managed_allocator() = default; @@ -62,7 +62,7 @@ 
bool operator!=(const managed_allocator&, const managed_allocator&) template struct default_allocator { - typedef T value_type; + using value_type = T; rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); default_allocator() = default; diff --git a/cpp/src/hash/managed.cuh b/cpp/src/hash/managed.cuh index c6cc60a6917..c5aab78589e 100644 --- a/cpp/src/hash/managed.cuh +++ b/cpp/src/hash/managed.cuh @@ -22,7 +22,7 @@ struct managed { static void* operator new(size_t n) { - void* ptr = 0; + void* ptr = nullptr; cudaError_t result = cudaMallocManaged(&ptr, n); if (cudaSuccess != result || 0 == ptr) throw std::bad_alloc(); return ptr; diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index 0e0ce8c4335..f368ae9fab5 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -168,7 +168,7 @@ std::unique_ptr from_dlpack(DLManagedTensor const* managed_tensor, data_type const dtype = DLDataType_to_data_type(tensor.dtype); size_t const byte_width = size_of(dtype); - size_t const num_rows = static_cast(tensor.shape[0]); + auto const num_rows = static_cast(tensor.shape[0]); size_t const bytes = num_rows * byte_width; // For 2D tensors, if the strides pointer is not null, then strides[1] is the diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index 9d3db35fea6..c1fa10d19b7 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -15,7 +15,8 @@ */ #include "avro.h" -#include + +#include #include namespace cudf { @@ -75,7 +76,7 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) sig4 |= get_raw() << 24; if (sig4 != avro_magic) { return false; } for (;;) { - uint32_t num_md_items = static_cast(get_encoded()); + auto num_md_items = static_cast(get_encoded()); if (num_md_items == 0) { break; } for (uint32_t i = 0; i < num_md_items; i++) { auto const key = get_encoded(); @@ -103,8 +104,8 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t 
first_row) auto const block_size = static_cast(get_encoded()); if (block_size <= 0 || object_count <= 0 || m_cur + block_size + 16 > m_end) { break; } if (object_count > first_row) { - uint32_t block_row = static_cast(total_object_count); - max_block_size = std::max(max_block_size, block_size); + auto block_row = static_cast(total_object_count); + max_block_size = std::max(max_block_size, block_size); total_object_count += object_count; if (!md->block_list.size()) { md->skip_rows = static_cast(first_row); diff --git a/cpp/src/io/avro/avro.h b/cpp/src/io/avro/avro.h index f84693fdba3..3dd989ffa79 100644 --- a/cpp/src/io/avro/avro.h +++ b/cpp/src/io/avro/avro.h @@ -19,11 +19,11 @@ #include "avro_common.h" #include +#include +#include +#include +#include #include -#include -#include -#include -#include #include #include @@ -85,7 +85,7 @@ class schema_parser { bool parse(std::vector& schema, const std::string& str); protected: - bool more_data() const { return (m_cur < m_end); } + [[nodiscard]] bool more_data() const { return (m_cur < m_end); } std::string get_str(); protected: @@ -103,7 +103,7 @@ class container { { } - auto bytecount() const { return m_cur - m_base; } + [[nodiscard]] auto bytecount() const { return m_cur - m_base; } template T get_raw() diff --git a/cpp/src/io/avro/avro_common.h b/cpp/src/io/avro/avro_common.h index 17f12da3165..1df6d176e95 100644 --- a/cpp/src/io/avro/avro_common.h +++ b/cpp/src/io/avro/avro_common.h @@ -17,8 +17,9 @@ #pragma once #include -#include -#include + +#include +#include namespace cudf { namespace io { diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index cb1c32458a3..7985d5df345 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -120,7 +120,7 @@ avro_decode_row(schemadesc_s const* schema, if (dataptr != nullptr && row < max_rows) { static_cast(dataptr)[row] = v; } } else { // string or enum size_t count = 0; - const char* ptr = 0; + const char* ptr = nullptr; if (kind == 
type_enum) { // dictionary size_t idx = schema[i].count + v; if (idx < global_dictionary.size()) { diff --git a/cpp/src/io/comp/brotli_dict.cpp b/cpp/src/io/comp/brotli_dict.cpp index 3e6939bb816..ef0fab51be6 100644 --- a/cpp/src/io/comp/brotli_dict.cpp +++ b/cpp/src/io/comp/brotli_dict.cpp @@ -49,7 +49,8 @@ THE SOFTWARE. */ #include "brotli_dict.h" -#include + +#include namespace cudf { namespace io { @@ -6528,7 +6529,7 @@ static const brotli_dictionary_s g_dictionary = { 136, 224, 164, 184, 224, 164, 149, 224, 165, 141, 224, 164, 176, 224, 164, 191, 224, 164, 175, 224, 164, 164, 224, 164, 190}}; -const brotli_dictionary_s* get_brotli_dictionary(void) { return &g_dictionary; } +const brotli_dictionary_s* get_brotli_dictionary() { return &g_dictionary; } } // namespace io } // namespace cudf diff --git a/cpp/src/io/comp/brotli_dict.h b/cpp/src/io/comp/brotli_dict.h index 4c1fec1492c..315fbd9712b 100644 --- a/cpp/src/io/comp/brotli_dict.h +++ b/cpp/src/io/comp/brotli_dict.h @@ -79,7 +79,7 @@ struct brotli_dictionary_s { constexpr int brotli_min_dictionary_word_length = 4; constexpr int brotli_max_dictionary_word_length = 24; -const brotli_dictionary_s* get_brotli_dictionary(void); +const brotli_dictionary_s* get_brotli_dictionary(); } // namespace io } // namespace cudf diff --git a/cpp/src/io/comp/brotli_tables.h b/cpp/src/io/comp/brotli_tables.h index 6e869999329..72a9b40bf95 100644 --- a/cpp/src/io/comp/brotli_tables.h +++ b/cpp/src/io/comp/brotli_tables.h @@ -2149,14 +2149,14 @@ CONSTANT uint8_t kContextLookup[2048] = { 7, }; -typedef struct CmdLutElement { +using CmdLutElement = struct CmdLutElement { uint8_t insert_len_extra_bits; uint8_t copy_len_extra_bits; int8_t distance_code; uint8_t context; uint16_t insert_len_offset; uint16_t copy_len_offset; -} CmdLutElement; +}; CONSTANT CmdLutElement kCmdLut[brotli_num_command_symbols] = { {0x00, 0x00, 0, 0x00, 0x0000, 0x0002}, {0x00, 0x00, 0, 0x01, 0x0000, 0x0003}, diff --git a/cpp/src/io/comp/cpu_unbz2.cpp 
b/cpp/src/io/comp/cpu_unbz2.cpp index 7f37b62e9c2..113623a2e67 100644 --- a/cpp/src/io/comp/cpu_unbz2.cpp +++ b/cpp/src/io/comp/cpu_unbz2.cpp @@ -81,8 +81,9 @@ For more information on these sources, see the manual. #include "io_uncomp.h" #include "unbz2.h" -#include -#include + +#include +#include #include namespace cudf { @@ -111,15 +112,15 @@ namespace io { #define BZ_MAX_SELECTORS (2 + (900000 / BZ_G_SIZE)) -typedef struct { +using huff_s = struct { int32_t minLen; int32_t limit[BZ_MAX_CODE_LEN]; int32_t base[BZ_MAX_CODE_LEN]; uint16_t perm[BZ_MAX_ALPHA_SIZE]; -} huff_s; +}; // Decoder state -typedef struct { +using unbz_state_s = struct { // Input const uint8_t* cur; const uint8_t* end; @@ -153,7 +154,7 @@ typedef struct { uint8_t len[BZ_MAX_ALPHA_SIZE]; huff_s ht[BZ_N_GROUPS]; -} unbz_state_s; +}; // return next 32 bits static inline uint32_t next32bits(const unbz_state_s* s) @@ -530,7 +531,8 @@ int32_t cpu_bz2_uncompress( int ret; size_t last_valid_block_in, last_valid_block_out; - if (dest == NULL || destLen == NULL || source == NULL || sourceLen < 12) return BZ_PARAM_ERROR; + if (dest == nullptr || destLen == nullptr || source == nullptr || sourceLen < 12) + return BZ_PARAM_ERROR; s.currBlockNo = 0; s.cur = source; diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 8229245276b..b4a42a66133 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -201,8 +201,8 @@ inline __device__ uint32_t Log2Floor(uint32_t value) { return 32 - __clz(value); /// @brief initializes the bit reader __device__ void initbits(debrotli_state_s* s, const uint8_t* base, size_t len, size_t pos = 0) { - const uint8_t* p = base + pos; - uint32_t prefix_bytes = (uint32_t)(((size_t)p) & 3); + const uint8_t* p = base + pos; + auto prefix_bytes = (uint32_t)(((size_t)p) & 3); p -= prefix_bytes; s->base = base; s->end = base + len; @@ -248,7 +248,7 @@ inline __device__ uint32_t getbits(debrotli_state_s* s, uint32_t n) inline __device__ uint32_t 
getbits_bytealign(debrotli_state_s* s) { - uint32_t n = (uint32_t)((-(int32_t)s->bitpos) & 7); + auto n = (uint32_t)((-(int32_t)s->bitpos) & 7); uint32_t bits = showbits(s, n); skipbits(s, n); return bits; @@ -315,7 +315,7 @@ static __device__ uint8_t* local_alloc(debrotli_state_s* s, uint32_t bytes) int heap_used = s->heap_used; auto const len = allocation_size(bytes); if (heap_used + len <= s->heap_limit) { - uint8_t* ptr = reinterpret_cast(&s->heap[heap_used]); + auto* ptr = reinterpret_cast(&s->heap[heap_used]); s->heap_used = (uint16_t)(heap_used + len); return ptr; } else { @@ -351,9 +351,9 @@ static __device__ uint8_t* ext_heap_alloc(uint32_t bytes, uint8_t* ext_heap_base, uint32_t ext_heap_size) { - uint32_t len = (bytes + 0xf) & ~0xf; - volatile uint32_t* heap_ptr = reinterpret_cast(ext_heap_base); - uint32_t first_free_block = ~0; + uint32_t len = (bytes + 0xf) & ~0xf; + volatile auto* heap_ptr = reinterpret_cast(ext_heap_base); + uint32_t first_free_block = ~0; for (;;) { uint32_t blk_next, blk_prev; first_free_block = atomicExch((unsigned int*)heap_ptr, first_free_block); @@ -421,10 +421,10 @@ static __device__ void ext_heap_free(void* ptr, uint8_t* ext_heap_base, uint32_t ext_heap_size) { - uint32_t len = (bytes + 0xf) & ~0xf; - volatile uint32_t* heap_ptr = (volatile uint32_t*)ext_heap_base; - uint32_t first_free_block = ~0; - uint32_t cur_blk = static_cast(static_cast(ptr) - ext_heap_base); + uint32_t len = (bytes + 0xf) & ~0xf; + volatile auto* heap_ptr = (volatile uint32_t*)ext_heap_base; + uint32_t first_free_block = ~0; + auto cur_blk = static_cast(static_cast(ptr) - ext_heap_base); for (;;) { first_free_block = atomicExch((unsigned int*)heap_ptr, first_free_block); if (first_free_block != ~0) { break; } @@ -1299,7 +1299,7 @@ static __device__ void InverseMoveToFrontTransform(debrotli_state_s* s, uint8_t* uint32_t i = 1; uint32_t upper_bound = s->mtf_upper_bound; uint32_t* mtf = &s->mtf[1]; // Make mtf[-1] addressable. 
- uint8_t* mtf_u8 = reinterpret_cast(mtf); + auto* mtf_u8 = reinterpret_cast(mtf); uint32_t pattern = 0x03020100; // Little-endian // Initialize list using 4 consequent values pattern. @@ -1419,12 +1419,12 @@ static __device__ debrotli_huff_tree_group_s* HuffmanTreeGroupInit(debrotli_stat uint32_t max_symbol, uint32_t ntrees) { - debrotli_huff_tree_group_s* group = reinterpret_cast(local_alloc( + auto* group = reinterpret_cast(local_alloc( s, sizeof(debrotli_huff_tree_group_s) + ntrees * sizeof(uint16_t*) - sizeof(uint16_t*))); - group->alphabet_size = (uint16_t)alphabet_size; - group->max_symbol = (uint16_t)max_symbol; - group->num_htrees = (uint16_t)ntrees; - group->htrees[0] = nullptr; + group->alphabet_size = (uint16_t)alphabet_size; + group->max_symbol = (uint16_t)max_symbol; + group->num_htrees = (uint16_t)ntrees; + group->htrees[0] = nullptr; return group; } @@ -1640,7 +1640,7 @@ static __device__ void ProcessCommands(debrotli_state_s* s, const brotli_diction const uint8_t *context_map_slice, *dist_context_map_slice; int dist_rb_idx; uint32_t blen_L, blen_I, blen_D; - uint8_t* const dict_scratch = reinterpret_cast( + auto* const dict_scratch = reinterpret_cast( &s->hs); // 24+13 bytes (max length of a dictionary word including prefix & suffix) int context_mode; @@ -1808,7 +1808,7 @@ static __device__ void ProcessCommands(debrotli_state_s* s, const brotli_diction pos = meta_block_len; copy_length = 0; } else { - int32_t offset = (int32_t)words->offsets_by_length[copy_length]; + auto offset = (int32_t)words->offsets_by_length[copy_length]; uint32_t shift = words->size_bits_by_length[copy_length]; uint32_t address = distance_code - max_distance - 1; int32_t word_idx = address & ((1 << shift) - 1); @@ -1927,8 +1927,8 @@ extern "C" __global__ void __launch_bounds__(block_size, 2) if (z >= count) { return; } // Thread0: initializes shared state and decode stream header if (!t) { - uint8_t const* src = static_cast(inputs[z].srcDevice); - size_t src_size = 
inputs[z].srcSize; + auto const* src = static_cast(inputs[z].srcDevice); + size_t src_size = inputs[z].srcSize; if (src && src_size >= 8) { s->error = 0; s->out = s->outbase = static_cast(inputs[z].dstDevice); @@ -2084,7 +2084,7 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs, { uint32_t count32 = (count > 0) ? count : 0; uint32_t fb_heap_size; - uint8_t* scratch_u8 = static_cast(scratch); + auto* scratch_u8 = static_cast(scratch); dim3 dim_block(block_size, 1); dim3 dim_grid(count32, 1); // TODO: Check max grid dimensions vs max expected count diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index dab8ce1afa5..508e960430d 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -926,8 +926,8 @@ __device__ void copy_stored(inflate_state_s* s, int t) __syncthreads(); if (t == 0) { // Reset bitstream to end of block - uint8_t* p = cur + len; - uint32_t prefix_bytes = (uint32_t)(((size_t)p) & 3); + uint8_t* p = cur + len; + auto prefix_bytes = (uint32_t)(((size_t)p) & 3); p -= prefix_bytes; s->cur = p; s->bitbuf.x = (p < s->end) ? *reinterpret_cast(p) : 0; @@ -952,7 +952,7 @@ __device__ void prefetch_warp(volatile inflate_state_s* s, int t) const uint8_t* cur_p = s->pref.cur_p; const uint8_t* end = s->end; while (shuffle((t == 0) ? s->pref.run : 0)) { - int32_t cur_lo = (int32_t)(size_t)cur_p; + auto cur_lo = (int32_t)(size_t)cur_p; int do_pref = shuffle((t == 0) ? 
(cur_lo - *(volatile int32_t*)&s->cur < prefetch_size - 32 * 4 - 4) : 0); if (do_pref) { @@ -1035,7 +1035,7 @@ __global__ void __launch_bounds__(block_size) inflate_state_s* state = &state_g; if (!t) { - uint8_t* p = const_cast(static_cast(inputs[z].srcDevice)); + auto* p = const_cast(static_cast(inputs[z].srcDevice)); size_t src_size = inputs[z].srcSize; uint32_t prefix_bytes; // Parse header if needed @@ -1181,8 +1181,8 @@ __global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_inp src_align_bytes = (uint32_t)(3 & reinterpret_cast(src)); src_align_bits = src_align_bytes << 3; while (len >= 32) { - const uint32_t* src32 = reinterpret_cast(src - src_align_bytes); - uint32_t copy_cnt = min(len >> 2, 1024); + const auto* src32 = reinterpret_cast(src - src_align_bytes); + uint32_t copy_cnt = min(len >> 2, 1024); if (t < copy_cnt) { uint32_t v = src32[t]; if (src_align_bits != 0) { v = __funnelshift_r(v, src32[t + 1], src_align_bits); } diff --git a/cpp/src/io/comp/gpuinflate.h b/cpp/src/io/comp/gpuinflate.h index 3ca9c9eee10..29856bcd3f3 100644 --- a/cpp/src/io/comp/gpuinflate.h +++ b/cpp/src/io/comp/gpuinflate.h @@ -16,7 +16,7 @@ #pragma once -#include +#include #include diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index d55c06a7d96..9f0a610f8f7 100644 --- a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -55,9 +55,9 @@ static inline __device__ uint32_t snap_hash(uint32_t v) */ static inline __device__ uint32_t fetch4(const uint8_t* src) { - uint32_t src_align = 3 & reinterpret_cast(src); - const uint32_t* src32 = reinterpret_cast(src - src_align); - uint32_t v = src32[0]; + uint32_t src_align = 3 & reinterpret_cast(src); + const auto* src32 = reinterpret_cast(src - src_align); + uint32_t v = src32[0]; return (src_align) ? 
__funnelshift_r(v, src32[1], src_align * 8) : v; } @@ -268,15 +268,15 @@ __global__ void __launch_bounds__(128) const uint8_t* src; if (!t) { - const uint8_t* src = static_cast(inputs[blockIdx.x].srcDevice); - uint32_t src_len = static_cast(inputs[blockIdx.x].srcSize); - uint8_t* dst = static_cast(inputs[blockIdx.x].dstDevice); - uint32_t dst_len = static_cast(inputs[blockIdx.x].dstSize); - uint8_t* end = dst + dst_len; - s->src = src; - s->src_len = src_len; - s->dst_base = dst; - s->end = end; + const auto* src = static_cast(inputs[blockIdx.x].srcDevice); + auto src_len = static_cast(inputs[blockIdx.x].srcSize); + auto* dst = static_cast(inputs[blockIdx.x].dstDevice); + auto dst_len = static_cast(inputs[blockIdx.x].dstSize); + uint8_t* end = dst + dst_len; + s->src = src; + s->src_len = src_len; + s->dst_base = dst; + s->end = end; while (src_len > 0x7f) { if (dst < end) { dst[0] = src_len | 0x80; } dst++; diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 94721fb9ce1..66d73074af0 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -22,7 +22,7 @@ #include -#include // memset +#include // memset #include // uncompress @@ -196,17 +196,16 @@ bool OpenZipArchive(zip_archive_s* dst, const uint8_t* raw, size_t len) for (ptrdiff_t i = len - sizeof(zip_eocd_s) - 2; i + sizeof(zip_eocd_s) + 2 + 0xffff >= len && i >= 0; i--) { - const zip_eocd_s* eocd = reinterpret_cast(raw + i); + const auto* eocd = reinterpret_cast(raw + i); if (eocd->sig == 0x06054b50 && eocd->disk_id == eocd->start_disk // multi-file archives not supported && eocd->num_entries == eocd->total_entries && eocd->cdir_size >= sizeof(zip_cdfh_s) * eocd->num_entries && eocd->cdir_offset < len && i + *reinterpret_cast(eocd + 1) <= static_cast(len)) { - const zip_cdfh_s* cdfh = reinterpret_cast(raw + eocd->cdir_offset); - dst->eocd = eocd; + const auto* cdfh = reinterpret_cast(raw + eocd->cdir_offset); + dst->eocd = eocd; if (i >= static_cast(sizeof(zip64_eocdl))) 
{ - const zip64_eocdl* eocdl = - reinterpret_cast(raw + i - sizeof(zip64_eocdl)); + const auto* eocdl = reinterpret_cast(raw + i - sizeof(zip64_eocdl)); if (eocdl->sig == 0x07064b50) { dst->eocdl = eocdl; } } // Start of central directory diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index bdd9ddaf1ea..791a16bc912 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -87,10 +87,10 @@ inline __device__ volatile uint8_t& byte_access(unsnap_state_s* s, uint32_t pos) */ __device__ void snappy_prefetch_bytestream(unsnap_state_s* s, int t) { - const uint8_t* base = s->base; - uint32_t end = (uint32_t)(s->end - base); - uint32_t align_bytes = (uint32_t)(0x20 - (0x1f & reinterpret_cast(base))); - int32_t pos = min(align_bytes, end); + const uint8_t* base = s->base; + auto end = (uint32_t)(s->end - base); + auto align_bytes = (uint32_t)(0x20 - (0x1f & reinterpret_cast(base))); + int32_t pos = min(align_bytes, end); int32_t blen; // Start by prefetching up to the next a 32B-aligned location if (t < pos) { s->q.buf[t] = base[t]; } @@ -278,7 +278,7 @@ inline __device__ uint32_t get_len5_mask(uint32_t v0, uint32_t v1) __device__ void snappy_decode_symbols(unsnap_state_s* s, uint32_t t) { uint32_t cur = 0; - uint32_t end = static_cast(s->end - s->base); + auto end = static_cast(s->end - s->base); uint32_t bytes_left = s->uncompressed_size; uint32_t dst_pos = 0; int32_t batch = 0; @@ -498,7 +498,7 @@ template __device__ void snappy_process_symbols(unsnap_state_s* s, int t, Storage& temp_storage) { const uint8_t* literal_base = s->base; - uint8_t* out = static_cast(s->in.dstDevice); + auto* out = static_cast(s->in.dstDevice); int batch = 0; do { @@ -610,7 +610,7 @@ __device__ void snappy_process_symbols(unsnap_state_s* s, int t, Storage& temp_s __syncwarp(); if (t == 0) { s->q.batch_len[batch] = 0; } batch = (batch + 1) & (batch_count - 1); - } while (1); + } while (true); } /** @@ -639,7 +639,7 @@ __global__ void 
__launch_bounds__(block_size) if (t < batch_count) { s->q.batch_len[t] = 0; } __syncthreads(); if (!t) { - const uint8_t* cur = static_cast(s->in.srcDevice); + const auto* cur = static_cast(s->in.srcDevice); const uint8_t* end = cur + s->in.srcSize; s->error = 0; if (log_cyclecount) { s->tstart = clock(); } diff --git a/cpp/src/io/csv/csv_gpu.h b/cpp/src/io/csv/csv_gpu.h index 9b83028fa92..ec45dea3072 100644 --- a/cpp/src/io/csv/csv_gpu.h +++ b/cpp/src/io/csv/csv_gpu.h @@ -48,8 +48,8 @@ constexpr uint32_t rowofs_block_bytes = rowofs_block_dim * 32; // 16KB/threadbl * Format: row_count * 4 + id, where `row_count` is the number of rows * in a character block, and `id` is the row parser state at the end of the block. */ -typedef uint32_t rowctx32_t; -typedef uint64_t rowctx64_t; +using rowctx32_t = uint32_t; +using rowctx64_t = uint64_t; /** * Packed row context format @@ -61,7 +61,7 @@ typedef uint64_t rowctx64_t; * always zero (EOF input state implies a zero row count) and therefore * stored as 64-bit. 
*/ -typedef uint64_t packed_rowctx_t; +using packed_rowctx_t = uint64_t; /** * @brief return a row context from a {count, id} pair @@ -116,7 +116,7 @@ inline __host__ __device__ rowctx32_t get_row_context(packed_rowctx_t packed_ctx inline __host__ __device__ rowctx64_t select_row_context(rowctx64_t sel_ctx, packed_rowctx_t packed_ctx) { - uint32_t ctxid = static_cast(sel_ctx & 3); + auto ctxid = static_cast(sel_ctx & 3); rowctx32_t ctx = get_row_context(packed_ctx, ctxid); return (sel_ctx & ~3) + ctx; } diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 1b66df860a3..1517226952a 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -126,7 +126,7 @@ struct column_to_strings_fn { // fails to compile var-templs); // template - constexpr static bool is_not_handled(void) + constexpr static bool is_not_handled() { // Note: the case (not std::is_same_v) // is already covered by is_integral) diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp index 01418fd3bd6..416beaebe5d 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.hpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -47,17 +47,17 @@ class aggregate_orc_metadata { /** * @brief Sums up the number of rows of each source */ - size_type calc_num_rows() const; + [[nodiscard]] size_type calc_num_rows() const; /** * @brief Number of columns in a ORC file. 
*/ - size_type calc_num_cols() const; + [[nodiscard]] size_type calc_num_cols() const; /** * @brief Sums up the number of stripes of each source */ - size_type calc_num_stripes() const; + [[nodiscard]] size_type calc_num_stripes() const; public: std::vector per_file_metadata; @@ -67,26 +67,29 @@ class aggregate_orc_metadata { aggregate_orc_metadata(std::vector> const& sources); - auto const& get_schema(int schema_idx) const { return per_file_metadata[0].ff.types[schema_idx]; } + [[nodiscard]] auto const& get_schema(int schema_idx) const + { + return per_file_metadata[0].ff.types[schema_idx]; + } auto get_col_type(int col_idx) const { return per_file_metadata[0].ff.types[col_idx]; } - auto get_num_rows() const { return num_rows; } + [[nodiscard]] auto get_num_rows() const { return num_rows; } auto get_num_cols() const { return per_file_metadata[0].get_num_columns(); } - auto get_num_stripes() const { return num_stripes; } + [[nodiscard]] auto get_num_stripes() const { return num_stripes; } - auto const& get_types() const { return per_file_metadata[0].ff.types; } + [[nodiscard]] auto const& get_types() const { return per_file_metadata[0].ff.types; } - int get_row_index_stride() const { return per_file_metadata[0].ff.rowIndexStride; } + [[nodiscard]] int get_row_index_stride() const { return per_file_metadata[0].ff.rowIndexStride; } - auto is_row_grp_idx_present() const { return row_grp_idx_present; } + [[nodiscard]] auto is_row_grp_idx_present() const { return row_grp_idx_present; } /** * @brief Returns the name of the given column from the given source. */ - std::string const& column_name(const int source_idx, const int column_id) const + [[nodiscard]] std::string const& column_name(const int source_idx, const int column_id) const { CUDF_EXPECTS(source_idx <= static_cast(per_file_metadata.size()), "Out of range source_idx provided"); @@ -98,7 +101,7 @@ class aggregate_orc_metadata { * * Full name includes ancestor columns' names. 
*/ - std::string const& column_path(const int source_idx, const int column_id) const + [[nodiscard]] std::string const& column_path(const int source_idx, const int column_id) const { CUDF_EXPECTS(source_idx <= static_cast(per_file_metadata.size()), "Out of range source_idx provided"); diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h index 4fa3480c90a..311f18bf72e 100644 --- a/cpp/src/io/orc/orc.h +++ b/cpp/src/io/orc/orc.h @@ -25,10 +25,10 @@ #include #include +#include +#include #include #include -#include -#include #include #include @@ -87,7 +87,7 @@ struct Stream { // Returns index of the column in the table, if any // Stream of the 'column 0' does not have a corresponding column in the table - std::optional column_index() const noexcept + [[nodiscard]] std::optional column_index() const noexcept { return column_id.value_or(0) > 0 ? std::optional{*column_id - 1} : std::optional{}; @@ -540,14 +540,14 @@ class OrcDecompressor { public: OrcDecompressor(CompressionKind kind, uint32_t blockSize); const uint8_t* Decompress(const uint8_t* srcBytes, size_t srcLen, size_t* dstLen); - uint32_t GetLog2MaxCompressionRatio() const { return m_log2MaxRatio; } - uint32_t GetMaxUncompressedBlockSize(uint32_t block_len) const + [[nodiscard]] uint32_t GetLog2MaxCompressionRatio() const { return m_log2MaxRatio; } + [[nodiscard]] uint32_t GetMaxUncompressedBlockSize(uint32_t block_len) const { return (block_len < (m_blockSize >> m_log2MaxRatio)) ? 
block_len << m_log2MaxRatio : m_blockSize; } - CompressionKind GetKind() const { return m_kind; } - uint32_t GetBlockSize() const { return m_blockSize; } + [[nodiscard]] CompressionKind GetKind() const { return m_kind; } + [[nodiscard]] uint32_t GetBlockSize() const { return m_blockSize; } protected: CompressionKind const m_kind; @@ -603,16 +603,16 @@ class metadata { public: explicit metadata(datasource* const src); - size_t get_total_rows() const { return ff.numberOfRows; } - int get_num_stripes() const { return ff.stripes.size(); } - int get_num_columns() const { return ff.types.size(); } + [[nodiscard]] size_t get_total_rows() const { return ff.numberOfRows; } + [[nodiscard]] int get_num_stripes() const { return ff.stripes.size(); } + [[nodiscard]] int get_num_columns() const { return ff.types.size(); } /** * @brief Returns the name of the column with the given ID. * * Name might not be unique in the ORC file, since columns with different parents are allowed to * have the same names. */ - std::string const& column_name(size_type column_id) const + [[nodiscard]] std::string const& column_name(size_type column_id) const { CUDF_EXPECTS(column_id < get_num_columns(), "Out of range column id provided"); return column_names[column_id]; @@ -623,22 +623,25 @@ class metadata { * * Each column in the ORC file has a unique path. */ - std::string const& column_path(size_type column_id) const + [[nodiscard]] std::string const& column_path(size_type column_id) const { CUDF_EXPECTS(column_id < get_num_columns(), "Out of range column id provided"); return column_paths[column_id]; } - int get_row_index_stride() const { return ff.rowIndexStride; } + [[nodiscard]] int get_row_index_stride() const { return ff.rowIndexStride; } /** * @brief Returns the ID of the parent column of the given column. 
*/ - size_type parent_id(size_type column_id) const { return parents.at(column_id).value().id; } + [[nodiscard]] size_type parent_id(size_type column_id) const + { + return parents.at(column_id).value().id; + } /** * @brief Returns the index the given column has in its parent's children list. */ - size_type field_index(size_type column_id) const + [[nodiscard]] size_type field_index(size_type column_id) const { return parents.at(column_id).value().field_idx; } @@ -646,7 +649,7 @@ class metadata { /** * @brief Returns whether the given column has a parent. */ - size_type column_has_parent(size_type column_id) const + [[nodiscard]] size_type column_has_parent(size_type column_id) const { return parents.at(column_id).has_value(); } @@ -693,7 +696,7 @@ struct orc_column_device_view : public column_device_view { struct rowgroup_rows { size_type begin; size_type end; - constexpr auto size() const noexcept { return end - begin; } + [[nodiscard]] constexpr auto size() const noexcept { return end - begin; } }; } // namespace orc diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 21c52f9295b..f133b79a27e 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -232,7 +232,6 @@ size_t gather_stream_info(const size_t stripe_index, */ auto decimal_column_type(std::vector const& float64_columns, std::vector const& decimal128_columns, - bool is_decimal128_enabled, cudf::io::orc::detail::aggregate_orc_metadata const& metadata, int column_index) { @@ -244,7 +243,7 @@ auto decimal_column_type(std::vector const& float64_columns, }; auto const user_selected_float64 = is_column_in(float64_columns); - auto const user_selected_decimal128 = is_decimal128_enabled and is_column_in(decimal128_columns); + auto const user_selected_decimal128 = is_column_in(decimal128_columns); CUDF_EXPECTS(not user_selected_float64 or not user_selected_decimal128, "Both decimal128 and float64 types selected for column " + column_path); @@ -255,9 +254,6 @@ 
auto decimal_column_type(std::vector const& float64_columns, .precision.value_or(cuda::std::numeric_limits::digits10); if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL32; if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL64; - CUDF_EXPECTS(is_decimal128_enabled, - "Decimal precision too high for decimal64, use `decimal_cols_as_float` or enable " - "decimal128 use"); return type_id::DECIMAL128; } @@ -371,7 +367,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( size_t decomp_offset = 0; uint32_t max_uncomp_block_size = 0; uint32_t start_pos = 0; - uint32_t start_pos_uncomp = (uint32_t)num_compressed_blocks; + auto start_pos_uncomp = (uint32_t)num_compressed_blocks; for (size_t i = 0; i < compinfo.size(); ++i) { auto dst_base = static_cast(decomp_data.data()); compinfo[i].uncompressed_data = dst_base + decomp_offset; @@ -754,8 +750,7 @@ std::unique_ptr reader::impl::create_empty_column(const size_type orc_co _metadata.get_schema(orc_col_id), _use_np_dtypes, _timestamp_type.id(), - decimal_column_type( - _decimal_cols_as_float, decimal128_columns, is_decimal128_enabled, _metadata, orc_col_id)); + decimal_column_type(_decimal_cols_as_float, decimal128_columns, _metadata, orc_col_id)); int32_t scale = 0; std::vector> child_columns; std::unique_ptr out_col = nullptr; @@ -900,7 +895,6 @@ reader::impl::impl(std::vector>&& sources, // Control decimals conversion _decimal_cols_as_float = options.get_decimal_cols_as_float(); decimal128_columns = options.get_decimal128_columns(); - is_decimal128_enabled = options.is_enabled_decimal128(); } timezone_table reader::impl::compute_timezone_table( @@ -964,8 +958,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, _metadata.get_col_type(col.id), _use_np_dtypes, _timestamp_type.id(), - decimal_column_type( - _decimal_cols_as_float, decimal128_columns, is_decimal128_enabled, _metadata, col.id)); + decimal_column_type(_decimal_cols_as_float, 
decimal128_columns, _metadata, col.id)); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or col_type == type_id::DECIMAL128) { diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index e8aa298012b..1e586bcde00 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -223,7 +223,6 @@ class reader::impl { bool _use_np_dtypes{true}; std::vector _decimal_cols_as_float; std::vector decimal128_columns; - bool is_decimal128_enabled{true}; data_type _timestamp_type{type_id::EMPTY}; reader_column_meta _col_meta{}; }; diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 05bc25597c2..dc09b3e7dd8 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1179,7 +1179,7 @@ __global__ void __launch_bounds__(block_size) row_in = s->chunk.start_row + s->top.nulls_desc_row - prev_parent_null_count; if (row_in + nrows > first_row && row_in < first_row + max_num_rows && - s->chunk.valid_map_base != NULL) { + s->chunk.valid_map_base != nullptr) { int64_t dst_row = row_in - first_row; int64_t dst_pos = max(dst_row, (int64_t)0); uint32_t startbit = -static_cast(min(dst_row, (int64_t)0)); @@ -1325,14 +1325,14 @@ static __device__ void DecodeRowPositions(orcdec_state_s* s, s->top.data.cur_row + s->top.data.nrows < s->top.data.end_row) { uint32_t nrows = min(s->top.data.end_row - (s->top.data.cur_row + s->top.data.nrows), min((row_decoder_buffer_size - s->u.rowdec.nz_count) * 2, blockDim.x)); - if (s->chunk.valid_map_base != NULL) { + if (s->chunk.valid_map_base != nullptr) { // We have a present stream uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); - uint32_t r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); + auto r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); uint32_t valid = (t < nrows && r < rmax) ? 
(((const uint8_t*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 : 0; - volatile uint16_t* row_ofs_plus1 = (volatile uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; + volatile auto* row_ofs_plus1 = (volatile uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; uint32_t nz_pos, row_plus1, nz_count = s->u.rowdec.nz_count, last_row; if (t < nrows) { row_ofs_plus1[t] = valid; } lengths_to_positions(row_ofs_plus1, nrows, t); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 660ec025d00..02ae191d55a 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1040,7 +1040,7 @@ __global__ void __launch_bounds__(block_size) uint32_t string_idx = (t < numvals) ? dict_data[s->cur_row + t] : 0; if (cid == CI_DICTIONARY) { // Encoding string contents - const char* ptr = 0; + const char* ptr = nullptr; uint32_t count = 0; if (t < numvals) { auto string_val = string_column->element(string_idx); diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index b197751d925..276a1f49abf 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -428,7 +428,7 @@ extern "C" __global__ void __launch_bounds__(128, 8) uint32_t rowgroups_in_chunk = s->chunk.num_rowgroups; s->rowgroup_start = s->chunk.rowgroup_id; s->rowgroup_end = s->rowgroup_start + rowgroups_in_chunk; - s->is_compressed = (strm_info != NULL); + s->is_compressed = (strm_info != nullptr); } __syncthreads(); while (s->rowgroup_start < s->rowgroup_end) { @@ -480,7 +480,7 @@ __global__ void __launch_bounds__(block_size) device_2dspan rowgroup_bounds, device_2dspan set_counts) { - typedef cub::BlockReduce BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; auto const column_id = blockIdx.x; diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index 3a1e8bf898a..810dfe87320 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -70,8 +70,8 
@@ struct timezone_file { std::vector ttype; std::vector posix_tz_string; - auto timecnt() const { return header.timecnt; } - auto typecnt() const { return header.typecnt; } + [[nodiscard]] auto timecnt() const { return header.timecnt; } + [[nodiscard]] auto typecnt() const { return header.typecnt; } // Based on https://tools.ietf.org/id/draft-murchison-tzdist-tzif-00.html static constexpr auto leap_second_rec_size(bool is_64bit) noexcept @@ -222,7 +222,7 @@ class posix_parser { /** * @brief Returns the next character in the input. */ - char next_character() const { return *cur; } + [[nodiscard]] char next_character() const { return *cur; } private: typename Container::const_iterator cur; diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index b25dfd0a621..a14d94df540 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include @@ -121,7 +121,7 @@ class timezone_table { : gmt_offset{gmt_offset}, ttimes{std::move(ttimes)}, offsets{std::move(offsets)} { } - timezone_table_view view() const { return {gmt_offset, ttimes, offsets}; } + [[nodiscard]] timezone_table_view view() const { return {gmt_offset, ttimes, offsets}; } }; /** diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index b7264cb81ac..a917dbf93a5 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -181,7 +181,7 @@ class orc_column_view { auto is_string() const noexcept { return cudf_column.type().id() == type_id::STRING; } void set_dict_stride(size_t stride) noexcept { _dict_stride = stride; } - auto dict_stride() const noexcept { return _dict_stride; } + [[nodiscard]] auto dict_stride() const noexcept { return _dict_stride; } /** * @brief Function that associates an existing dictionary chunk allocation @@ -192,14 +192,14 @@ class orc_column_view { dict = host_dict; d_dict = dev_dict; } - auto host_dict_chunk(size_t rowgroup) const + 
[[nodiscard]] auto host_dict_chunk(size_t rowgroup) const { CUDF_EXPECTS(is_string(), "Dictionary chunks are only present in string columns."); return &dict[rowgroup * _dict_stride + _str_idx]; } - auto device_dict_chunk() const { return d_dict; } + [[nodiscard]] auto device_dict_chunk() const { return d_dict; } - auto const& decimal_offsets() const { return d_decimal_offsets; } + [[nodiscard]] auto const& decimal_offsets() const { return d_decimal_offsets; } void attach_decimal_offsets(uint32_t* sizes_ptr) { d_decimal_offsets = sizes_ptr; } /** @@ -211,39 +211,39 @@ class orc_column_view { stripe_dict = host_stripe_dict; d_stripe_dict = dev_stripe_dict; } - auto host_stripe_dict(size_t stripe) const + [[nodiscard]] auto host_stripe_dict(size_t stripe) const { CUDF_EXPECTS(is_string(), "Stripe dictionary is only present in string columns."); return &stripe_dict[stripe * _dict_stride + _str_idx]; } - auto device_stripe_dict() const noexcept { return d_stripe_dict; } + [[nodiscard]] auto device_stripe_dict() const noexcept { return d_stripe_dict; } // Index in the table - uint32_t index() const noexcept { return _index; } + [[nodiscard]] uint32_t index() const noexcept { return _index; } // Id in the ORC file - auto id() const noexcept { return _index + 1; } + [[nodiscard]] auto id() const noexcept { return _index + 1; } - auto is_child() const noexcept { return _is_child; } + [[nodiscard]] auto is_child() const noexcept { return _is_child; } auto parent_index() const noexcept { return _parent_index.value(); } auto child_begin() const noexcept { return children.cbegin(); } auto child_end() const noexcept { return children.cend(); } auto num_children() const noexcept { return children.size(); } - auto type_width() const noexcept { return _type_width; } + [[nodiscard]] auto type_width() const noexcept { return _type_width; } auto size() const noexcept { return cudf_column.size(); } auto null_count() const noexcept { return cudf_column.null_count(); } auto null_mask() 
const noexcept { return cudf_column.null_mask(); } - bool nullable() const noexcept { return null_mask() != nullptr; } + [[nodiscard]] bool nullable() const noexcept { return null_mask() != nullptr; } auto user_defined_nullable() const noexcept { return nullable_from_metadata; } - auto scale() const noexcept { return _scale; } - auto precision() const noexcept { return _precision; } + [[nodiscard]] auto scale() const noexcept { return _scale; } + [[nodiscard]] auto precision() const noexcept { return _precision; } void set_orc_encoding(ColumnEncodingKind e) noexcept { _encoding_kind = e; } - auto orc_kind() const noexcept { return _type_kind; } - auto orc_encoding() const noexcept { return _encoding_kind; } - std::string_view orc_name() const noexcept { return name; } + [[nodiscard]] auto orc_kind() const noexcept { return _type_kind; } + [[nodiscard]] auto orc_encoding() const noexcept { return _encoding_kind; } + [[nodiscard]] std::string_view orc_name() const noexcept { return name; } private: column_view cudf_column; @@ -1063,15 +1063,15 @@ void set_stat_desc_leaf_cols(device_span columns, } writer::impl::encoded_statistics writer::impl::gather_statistic_blobs( - bool are_statistics_enabled, + statistics_freq stats_freq, orc_table_view const& orc_table, file_segmentation const& segmentation) { - auto const num_rowgroup_blobs = segmentation.rowgroups.count(); - auto const num_stripe_blobs = segmentation.num_stripes() * orc_table.num_columns(); - auto const num_file_blobs = orc_table.num_columns(); - auto const num_stat_blobs = num_rowgroup_blobs + num_stripe_blobs + num_file_blobs; - + auto const num_rowgroup_blobs = segmentation.rowgroups.count(); + auto const num_stripe_blobs = segmentation.num_stripes() * orc_table.num_columns(); + auto const num_file_blobs = orc_table.num_columns(); + auto const num_stat_blobs = num_rowgroup_blobs + num_stripe_blobs + num_file_blobs; + auto const are_statistics_enabled = stats_freq != statistics_freq::STATISTICS_NONE; if 
(not are_statistics_enabled or num_stat_blobs == 0) { return {}; } hostdevice_vector stat_desc(orc_table.num_columns(), stream); @@ -1164,17 +1164,27 @@ writer::impl::encoded_statistics writer::impl::gather_statistic_blobs( hostdevice_vector blobs( stat_merge[num_stat_blobs - 1].start_chunk + stat_merge[num_stat_blobs - 1].num_chunks, stream); - gpu::orc_encode_statistics( - blobs.device_ptr(), stat_merge.device_ptr(), stat_chunks.data(), num_stat_blobs, stream); + // Skip rowgroup blobs when encoding, if chosen granularity is coarser than "ROW_GROUP". + auto const is_granularity_rowgroup = stats_freq == ORC_STATISTICS_ROW_GROUP; + auto const num_skip = is_granularity_rowgroup ? 0 : num_rowgroup_blobs; + gpu::orc_encode_statistics(blobs.device_ptr(), + stat_merge.device_ptr(num_skip), + stat_chunks.data() + num_skip, + num_stat_blobs - num_skip, + stream); stat_merge.device_to_host(stream); blobs.device_to_host(stream, true); - std::vector rowgroup_blobs(num_rowgroup_blobs); - for (size_t i = 0; i < num_rowgroup_blobs; i++) { - auto const stat_begin = blobs.host_ptr(rowgroup_stat_merge[i].start_chunk); - auto const stat_end = stat_begin + rowgroup_stat_merge[i].num_chunks; - rowgroup_blobs[i].assign(stat_begin, stat_end); - } + auto rowgroup_blobs = [&]() -> std::vector { + if (not is_granularity_rowgroup) { return {}; } + std::vector rowgroup_blobs(num_rowgroup_blobs); + for (size_t i = 0; i < num_rowgroup_blobs; i++) { + auto const stat_begin = blobs.host_ptr(rowgroup_stat_merge[i].start_chunk); + auto const stat_end = stat_begin + rowgroup_stat_merge[i].num_chunks; + rowgroup_blobs[i].assign(stat_begin, stat_end); + } + return rowgroup_blobs; + }(); std::vector stripe_blobs(num_stripe_blobs); for (size_t i = 0; i < num_stripe_blobs; i++) { @@ -1351,7 +1361,7 @@ writer::impl::impl(std::unique_ptr sink, max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, row_index_stride{options.get_row_index_stride()}, 
compression_kind_(to_orc_compression(options.get_compression())), - enable_statistics_(options.is_enabled_statistics()), + stats_freq_(options.get_statistics_freq()), single_write_mode(mode == SingleWriteMode::YES), kv_meta(options.get_key_value_metadata()), out_sink_(std::move(sink)) @@ -1372,7 +1382,7 @@ writer::impl::impl(std::unique_ptr sink, max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, row_index_stride{options.get_row_index_stride()}, compression_kind_(to_orc_compression(options.get_compression())), - enable_statistics_(options.is_enabled_statistics()), + stats_freq_(options.get_statistics_freq()), single_write_mode(mode == SingleWriteMode::YES), kv_meta(options.get_key_value_metadata()), out_sink_(std::move(sink)) @@ -1954,7 +1964,7 @@ void writer::impl::write(table_view const& table) ProtobufWriter pbw_(&buffer_); - auto const statistics = gather_statistic_blobs(enable_statistics_, orc_table, segmentation); + auto const statistics = gather_statistic_blobs(stats_freq_, orc_table, segmentation); // Write stripes std::vector> write_tasks; diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 2738a77e50a..69bb6029ee0 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -62,14 +62,14 @@ struct orc_table_view { rmm::device_uvector d_string_column_indices; auto num_columns() const noexcept { return columns.size(); } - size_type num_rows() const noexcept; + [[nodiscard]] size_type num_rows() const noexcept; auto num_string_columns() const noexcept { return string_column_indices.size(); } auto& column(uint32_t idx) { return columns.at(idx); } - auto const& column(uint32_t idx) const { return columns.at(idx); } + [[nodiscard]] auto const& column(uint32_t idx) const { return columns.at(idx); } auto& string_column(uint32_t idx) { return columns.at(string_column_indices.at(idx)); } - auto const& string_column(uint32_t idx) const + [[nodiscard]] auto const& 
string_column(uint32_t idx) const { return columns.at(string_column_indices.at(idx)); } @@ -85,8 +85,8 @@ struct stripe_rowgroups { uint32_t first; // first rowgroup in the stripe uint32_t size; // number of rowgroups in the stripe stripe_rowgroups(uint32_t id, uint32_t first, uint32_t size) : id{id}, first{first}, size{size} {} - auto cbegin() const { return thrust::make_counting_iterator(first); } - auto cend() const { return thrust::make_counting_iterator(first + size); } + [[nodiscard]] auto cbegin() const { return thrust::make_counting_iterator(first); } + [[nodiscard]] auto cend() const { return thrust::make_counting_iterator(first + size); } }; /** @@ -123,10 +123,10 @@ class orc_streams { std::vector offsets; size_t non_rle_data_size = 0; size_t rle_data_size = 0; - auto data_size() const { return non_rle_data_size + rle_data_size; } + [[nodiscard]] auto data_size() const { return non_rle_data_size + rle_data_size; } }; - orc_stream_offsets compute_offsets(host_span columns, - size_t num_rowgroups) const; + [[nodiscard]] orc_stream_offsets compute_offsets(host_span columns, + size_t num_rowgroups) const; operator std::vector const &() const { return streams; } @@ -293,13 +293,13 @@ class writer::impl { /** * @brief Returns column statistics encoded in ORC protobuf format. 
* - * @param are_statistics_enabled True if statistics are to be included in the output file + * @param statistics_freq Frequency of statistics to be included in the output file * @param orc_table Table information to be written * @param columns List of columns * @param segmentation stripe and rowgroup ranges * @return The statistic blobs */ - encoded_statistics gather_statistic_blobs(bool are_statistics_enabled, + encoded_statistics gather_statistic_blobs(statistics_freq statistics_freq, orc_table_view const& orc_table, file_segmentation const& segmentation); @@ -365,8 +365,8 @@ class writer::impl { size_t compression_blocksize_ = DEFAULT_COMPRESSION_BLOCKSIZE; CompressionKind compression_kind_ = CompressionKind::NONE; - bool enable_dictionary_ = true; - bool enable_statistics_ = true; + bool enable_dictionary_ = true; + statistics_freq stats_freq_ = ORC_STATISTICS_ROW_GROUP; // Overall file metadata. Filled in during the process and written during write_chunked_end() cudf::io::orc::FileFooter ff; diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 71452bd7809..53739a26beb 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -20,8 +20,8 @@ #include "parquet_common.hpp" #include -#include -#include +#include +#include #include #include diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 751d6b62319..df4310fcd63 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -102,7 +102,7 @@ struct page_state_s { */ __device__ uint32_t device_str2hash32(const char* key, size_t len, uint32_t seed = 33) { - const uint8_t* p = reinterpret_cast(key); + const auto* p = reinterpret_cast(key); uint32_t h1 = seed, k1; const uint32_t c1 = 0xcc9e2d51; const uint32_t c2 = 0x1b873593; @@ -513,7 +513,7 @@ __device__ void gpuInitStringDescriptors(volatile page_state_s* s, int target_po */ inline 
__device__ void gpuOutputString(volatile page_state_s* s, int src_pos, void* dstv) { - const char* ptr = NULL; + const char* ptr = nullptr; size_t len = 0; if (s->dict_base) { @@ -522,10 +522,9 @@ inline __device__ void gpuOutputString(volatile page_state_s* s, int src_pos, vo sizeof(string_index_pair) : 0; if (dict_pos < (uint32_t)s->dict_size) { - const string_index_pair* src = - reinterpret_cast(s->dict_base + dict_pos); - ptr = src->first; - len = src->second; + const auto* src = reinterpret_cast(s->dict_base + dict_pos); + ptr = src->first; + len = src->second; } } else { // Plain encoding @@ -540,9 +539,9 @@ inline __device__ void gpuOutputString(volatile page_state_s* s, int src_pos, vo *static_cast(dstv) = device_str2hash32(ptr, len); } else { // Output string descriptor - string_index_pair* dst = static_cast(dstv); - dst->first = ptr; - dst->second = len; + auto* dst = static_cast(dstv); + dst->first = ptr; + dst->second = len; } } @@ -1016,7 +1015,7 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, cur += InitLevelSection(s, cur, end, level_type::DEFINITION); s->dict_bits = 0; - s->dict_base = 0; + s->dict_base = nullptr; s->dict_size = 0; switch (s->page.encoding) { case Encoding::PLAIN_DICTIONARY: @@ -1133,7 +1132,7 @@ static __device__ void store_validity(PageNestingInfo* pni, int bit_offset = pni->valid_map_offset % 32; // if we fit entirely in the output word if (bit_offset + value_count <= 32) { - uint32_t relevant_mask = static_cast((static_cast(1) << value_count) - 1); + auto relevant_mask = static_cast((static_cast(1) << value_count) - 1); if (relevant_mask == ~0) { pni->valid_map[word_offset] = valid_mask; diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index ec6b24b3b4e..2074304251f 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -1068,7 +1068,7 @@ __global__ void __launch_bounds__(128, 8) } if (t == 0) { uint8_t* base = s->page.page_data + s->page.max_hdr_size; 
- uint32_t actual_data_size = static_cast(s->cur - base); + auto actual_data_size = static_cast(s->cur - base); uint32_t compressed_bfr_size = GetMaxCompressedBfrSize(actual_data_size); s->page.max_data_size = actual_data_size; s->comp_in.srcDevice = base; @@ -1244,7 +1244,7 @@ class header_encoder { *header_end = current_header_ptr; } - inline __device__ uint8_t* get_ptr(void) { return current_header_ptr; } + inline __device__ uint8_t* get_ptr() { return current_header_ptr; } inline __device__ void set_ptr(uint8_t* ptr) { current_header_ptr = ptr; } }; diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 21610638843..b4fa9b4ae82 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -19,8 +19,8 @@ #include "parquet_common.hpp" #include -#include -#include +#include +#include #include #include #include @@ -65,11 +65,11 @@ struct MilliSeconds { }; struct MicroSeconds { }; -typedef struct TimeUnit_isset { - TimeUnit_isset() : MILLIS(false), MICROS(false) {} - bool MILLIS; - bool MICROS; -} TimeUnit_isset; +using TimeUnit_isset = struct TimeUnit_isset { + TimeUnit_isset() {} + bool MILLIS{false}; + bool MICROS{false}; +}; struct TimeUnit { TimeUnit_isset isset; @@ -97,35 +97,21 @@ struct BsonType { }; // thrift generated code simplified. 
-typedef struct LogicalType_isset { - LogicalType_isset() - : STRING(false), - MAP(false), - LIST(false), - ENUM(false), - DECIMAL(false), - DATE(false), - TIME(false), - TIMESTAMP(false), - INTEGER(false), - UNKNOWN(false), - JSON(false), - BSON(false) - { - } - bool STRING; - bool MAP; - bool LIST; - bool ENUM; - bool DECIMAL; - bool DATE; - bool TIME; - bool TIMESTAMP; - bool INTEGER; - bool UNKNOWN; - bool JSON; - bool BSON; -} LogicalType_isset; +using LogicalType_isset = struct LogicalType_isset { + LogicalType_isset() {} + bool STRING{false}; + bool MAP{false}; + bool LIST{false}; + bool ENUM{false}; + bool DECIMAL{false}; + bool DATE{false}; + bool TIME{false}; + bool TIMESTAMP{false}; + bool INTEGER{false}; + bool UNKNOWN{false}; + bool JSON{false}; + bool BSON{false}; +}; struct LogicalType { LogicalType_isset isset; @@ -197,16 +183,19 @@ struct SchemaElement { // required int32 num; // }; // } - bool is_stub() const { return repetition_type == REPEATED && num_children == 1; } + [[nodiscard]] bool is_stub() const { return repetition_type == REPEATED && num_children == 1; } // https://github.com/apache/parquet-cpp/blob/642da05/src/parquet/schema.h#L49-L50 // One-level LIST encoding: Only allows required lists with required cells: // repeated value_type name - bool is_one_level_list() const { return repetition_type == REPEATED and num_children == 0; } + [[nodiscard]] bool is_one_level_list() const + { + return repetition_type == REPEATED and num_children == 0; + } // in parquet terms, a group is a level of nesting in the schema. a group // can be a struct or a list - bool is_struct() const + [[nodiscard]] bool is_struct() const { return type == UNDEFINED_TYPE && // this assumption might be a little weak. 
@@ -369,7 +358,7 @@ class CompactProtocolReader { m_base = m_cur = base; m_end = base + len; } - ptrdiff_t bytecount() const noexcept { return m_cur - m_base; } + [[nodiscard]] ptrdiff_t bytecount() const noexcept { return m_cur - m_base; } unsigned int getb() noexcept { return (m_cur < m_end) ? *m_cur++ : 0; } void skip_bytes(size_t bytecnt) noexcept { diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index fc4afe951db..885f36aeca4 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -337,7 +337,7 @@ class aggregate_reader_metadata { /** * @brief Sums up the number of rows of each source */ - size_type calc_num_rows() const + [[nodiscard]] size_type calc_num_rows() const { return std::accumulate( per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { @@ -348,7 +348,7 @@ class aggregate_reader_metadata { /** * @brief Sums up the number of row groups of each source */ - size_type calc_num_row_groups() const + [[nodiscard]] size_type calc_num_row_groups() const { return std::accumulate( per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { @@ -381,16 +381,16 @@ class aggregate_reader_metadata { } } - auto const& get_row_group(size_type row_group_index, size_type src_idx) const + [[nodiscard]] auto const& get_row_group(size_type row_group_index, size_type src_idx) const { CUDF_EXPECTS(src_idx >= 0 && src_idx < static_cast(per_file_metadata.size()), "invalid source index"); return per_file_metadata[src_idx].row_groups[row_group_index]; } - auto const& get_column_metadata(size_type row_group_index, - size_type src_idx, - int schema_idx) const + [[nodiscard]] auto const& get_column_metadata(size_type row_group_index, + size_type src_idx, + int schema_idx) const { auto col = std::find_if( per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), @@ -401,13 +401,16 @@ class aggregate_reader_metadata { return col->meta_data; } - auto 
get_num_rows() const { return num_rows; } + [[nodiscard]] auto get_num_rows() const { return num_rows; } - auto get_num_row_groups() const { return num_row_groups; } + [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; } - auto const& get_schema(int schema_idx) const { return per_file_metadata[0].schema[schema_idx]; } + [[nodiscard]] auto const& get_schema(int schema_idx) const + { + return per_file_metadata[0].schema[schema_idx]; + } - auto const& get_key_value_metadata() const { return agg_keyval_map; } + [[nodiscard]] auto const& get_key_value_metadata() const { return agg_keyval_map; } /** * @brief Gets the concrete nesting depth of output cudf columns @@ -416,7 +419,7 @@ class aggregate_reader_metadata { * * @return comma-separated index column names in quotes */ - inline int get_output_nesting_depth(int schema_index) const + [[nodiscard]] inline int get_output_nesting_depth(int schema_index) const { auto& pfm = per_file_metadata[0]; int depth = 0; @@ -441,7 +444,7 @@ class aggregate_reader_metadata { * * @return comma-separated index column names in quotes */ - std::string get_pandas_index() const + [[nodiscard]] std::string get_pandas_index() const { auto it = agg_keyval_map.find("pandas"); if (it != agg_keyval_map.end()) { @@ -472,7 +475,7 @@ class aggregate_reader_metadata { * * @param names List of column names to load, where index column name(s) will be added */ - std::vector get_pandas_index_names() const + [[nodiscard]] std::vector get_pandas_index_names() const { std::vector names; auto str = get_pandas_index(); @@ -511,9 +514,9 @@ class aggregate_reader_metadata { * * @return List of row group indexes and its starting row */ - auto select_row_groups(std::vector> const& row_groups, - size_type& row_start, - size_type& row_count) const + [[nodiscard]] auto select_row_groups(std::vector> const& row_groups, + size_type& row_start, + size_type& row_count) const { if (!row_groups.empty()) { std::vector selection; @@ -570,10 +573,10 @@ 
class aggregate_reader_metadata { * @return input column information, output column information, list of output column schema * indices */ - auto select_columns(std::vector const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const + [[nodiscard]] auto select_columns(std::vector const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const { auto find_schema_child = [&](SchemaElement const& schema_elem, std::string const& name) { auto const& col_schema_idx = std::find_if( diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index b302516ba39..a9306275b26 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -166,12 +166,12 @@ struct aggregate_writer_metadata { return global_rowgroup_base; } - bool schema_matches(std::vector const& schema) const + [[nodiscard]] bool schema_matches(std::vector const& schema) const { return this->schema == schema; } auto& file(size_t p) { return files[p]; } - size_t num_files() const { return files.size(); } + [[nodiscard]] size_t num_files() const { return files.size(); } private: int32_t version = 0; @@ -678,18 +678,18 @@ struct parquet_column_view { std::vector const& schema_tree, rmm::cuda_stream_view stream); - column_view leaf_column_view() const; - gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; + [[nodiscard]] column_view leaf_column_view() const; + [[nodiscard]] gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; - column_view cudf_column_view() const { return cudf_col; } - parquet::Type physical_type() const { return schema_node.type; } + [[nodiscard]] column_view cudf_column_view() const { return cudf_col; } + [[nodiscard]] parquet::Type physical_type() const { return schema_node.type; } std::vector const& get_path_in_schema() { return path_in_schema; } // LIST related member functions - uint8_t 
max_def_level() const noexcept { return _max_def_level; } - uint8_t max_rep_level() const noexcept { return _max_rep_level; } - bool is_list() const noexcept { return _is_list; } + [[nodiscard]] uint8_t max_def_level() const noexcept { return _max_def_level; } + [[nodiscard]] uint8_t max_rep_level() const noexcept { return _max_rep_level; } + [[nodiscard]] bool is_list() const noexcept { return _is_list; } private: // Schema related members diff --git a/cpp/src/io/statistics/statistics.cuh b/cpp/src/io/statistics/statistics.cuh index 755f3416b1d..15fe2544930 100644 --- a/cpp/src/io/statistics/statistics.cuh +++ b/cpp/src/io/statistics/statistics.cuh @@ -20,13 +20,15 @@ */ #pragma once -#include #include #include #include + #include +#include + namespace cudf { namespace io { diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh index 0992a557491..8e35fcf3c44 100644 --- a/cpp/src/io/statistics/typed_statistics_chunk.cuh +++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh @@ -92,24 +92,20 @@ struct typed_statistics_chunk { using E = typename detail::extrema_type::type; using A = typename detail::aggregation_type::type; - uint32_t non_nulls; //!< number of non-null values in chunk - uint32_t null_count; //!< number of null values in chunk + uint32_t non_nulls{0}; //!< number of non-null values in chunk + uint32_t null_count{0}; //!< number of null values in chunk E minimum_value; E maximum_value; A aggregate; - uint8_t has_minmax; //!< Nonzero if min_value and max_values are valid - uint8_t has_sum; //!< Nonzero if sum is valid + uint8_t has_minmax{false}; //!< Nonzero if min_value and max_values are valid + uint8_t has_sum{false}; //!< Nonzero if sum is valid __device__ typed_statistics_chunk() - : non_nulls(0), - null_count(0), - minimum_value(detail::minimum_identity()), + : minimum_value(detail::minimum_identity()), maximum_value(detail::maximum_identity()), - aggregate(0), - has_minmax(false), - 
has_sum(false) // Set to true when storing + aggregate(0) { } @@ -140,22 +136,17 @@ template struct typed_statistics_chunk { using E = typename detail::extrema_type::type; - uint32_t non_nulls; //!< number of non-null values in chunk - uint32_t null_count; //!< number of null values in chunk + uint32_t non_nulls{0}; //!< number of non-null values in chunk + uint32_t null_count{0}; //!< number of null values in chunk E minimum_value; E maximum_value; - uint8_t has_minmax; //!< Nonzero if min_value and max_values are valid - uint8_t has_sum; //!< Nonzero if sum is valid + uint8_t has_minmax{false}; //!< Nonzero if min_value and max_values are valid + uint8_t has_sum{false}; //!< Nonzero if sum is valid __device__ typed_statistics_chunk() - : non_nulls(0), - null_count(0), - minimum_value(detail::minimum_identity()), - maximum_value(detail::maximum_identity()), - has_minmax(false), - has_sum(false) // Set to true when storing + : minimum_value(detail::minimum_identity()), maximum_value(detail::maximum_identity()) { } diff --git a/cpp/src/io/utilities/block_utils.cuh b/cpp/src/io/utilities/block_utils.cuh index 2b4f69df10f..d73f0ebc9b7 100644 --- a/cpp/src/io/utilities/block_utils.cuh +++ b/cpp/src/io/utilities/block_utils.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include namespace cudf { namespace io { @@ -32,7 +32,7 @@ inline __device__ T shuffle_xor(T var, uint32_t delta) return __shfl_xor_sync(~0, var, delta); } -inline __device__ void syncwarp(void) { __syncwarp(); } +inline __device__ void syncwarp() { __syncwarp(); } inline __device__ uint32_t ballot(int pred) { return __ballot_sync(~0, pred); } @@ -126,18 +126,18 @@ inline __device__ double Int128ToDouble_rn(uint64_t lo, int64_t hi) inline __device__ uint32_t unaligned_load32(const uint8_t* p) { - uint32_t ofs = 3 & reinterpret_cast(p); - const uint32_t* p32 = reinterpret_cast(p - ofs); - uint32_t v = p32[0]; + uint32_t ofs = 3 & reinterpret_cast(p); + const auto* p32 = reinterpret_cast(p - ofs); + 
uint32_t v = p32[0]; return (ofs) ? __funnelshift_r(v, p32[1], ofs * 8) : v; } inline __device__ uint64_t unaligned_load64(const uint8_t* p) { - uint32_t ofs = 3 & reinterpret_cast(p); - const uint32_t* p32 = reinterpret_cast(p - ofs); - uint32_t v0 = p32[0]; - uint32_t v1 = p32[1]; + uint32_t ofs = 3 & reinterpret_cast(p); + const auto* p32 = reinterpret_cast(p - ofs); + uint32_t v0 = p32[0]; + uint32_t v1 = p32[1]; if (ofs) { v0 = __funnelshift_r(v0, v1, ofs * 8); v1 = __funnelshift_r(v1, p32[2], ofs * 8); @@ -148,8 +148,8 @@ inline __device__ uint64_t unaligned_load64(const uint8_t* p) template inline __device__ void memcpy_block(void* dstv, const void* srcv, uint32_t len, uint32_t t) { - uint8_t* dst = static_cast(dstv); - const uint8_t* src = static_cast(srcv); + auto* dst = static_cast(dstv); + const auto* src = static_cast(srcv); uint32_t dst_align_bytes, src_align_bytes, src_align_bits; // Align output to 32-bit dst_align_bytes = 3 & -reinterpret_cast(dst); @@ -166,8 +166,8 @@ inline __device__ void memcpy_block(void* dstv, const void* srcv, uint32_t len, src_align_bytes = (uint32_t)(3 & reinterpret_cast(src)); src_align_bits = src_align_bytes * 8; while (len >= 4) { - const uint32_t* src32 = reinterpret_cast(src - src_align_bytes); - uint32_t copy_cnt = min(len >> 2, nthreads); + const auto* src32 = reinterpret_cast(src - src_align_bytes); + uint32_t copy_cnt = min(len >> 2, nthreads); uint32_t v; if (t < copy_cnt) { v = src32[t]; diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 642f3518edd..63d0103ddec 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -49,9 +49,9 @@ class file_sink : public data_sink { size_t bytes_written() override { return _bytes_written; } - bool supports_device_write() const override { return _cufile_out != nullptr; } + [[nodiscard]] bool supports_device_write() const override { return _cufile_out != nullptr; } - bool is_device_write_preferred(size_t 
size) const override + [[nodiscard]] bool is_device_write_preferred(size_t size) const override { return _cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size); } @@ -109,13 +109,13 @@ class host_buffer_sink : public data_sink { */ class void_sink : public data_sink { public: - explicit void_sink() : _bytes_written(0) {} + explicit void_sink() {} virtual ~void_sink() {} void host_write(void const* data, size_t size) override { _bytes_written += size; } - bool supports_device_write() const override { return true; } + [[nodiscard]] bool supports_device_write() const override { return true; } void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { @@ -146,7 +146,10 @@ class user_sink_wrapper : public data_sink { void host_write(void const* data, size_t size) override { user_sink->host_write(data, size); } - bool supports_device_write() const override { return user_sink->supports_device_write(); } + [[nodiscard]] bool supports_device_write() const override + { + return user_sink->supports_device_write(); + } void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 3de6f35cb0d..6f864ab509f 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -40,9 +40,9 @@ class file_source : public datasource { virtual ~file_source() = default; - bool supports_device_read() const override { return _cufile_in != nullptr; } + [[nodiscard]] bool supports_device_read() const override { return _cufile_in != nullptr; } - bool is_device_read_preferred(size_t size) const override + [[nodiscard]] bool is_device_read_preferred(size_t size) const override { return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size); } @@ -79,7 +79,7 @@ class file_source : public datasource { return _cufile_in->read_async(offset, read_size, dst, stream); } - size_t size() const 
override { return _file.size(); } + [[nodiscard]] size_t size() const override { return _file.size(); } protected: detail::file_wrapper _file; @@ -102,7 +102,7 @@ class memory_mapped_source : public file_source { if (_file.size() != 0) map(_file.desc(), offset, size); } - virtual ~memory_mapped_source() + ~memory_mapped_source() override { if (_map_addr != nullptr) { munmap(_map_addr, _map_size); } } @@ -210,7 +210,10 @@ class user_datasource_wrapper : public datasource { return source->host_read(offset, size); } - bool supports_device_read() const override { return source->supports_device_read(); } + [[nodiscard]] bool supports_device_read() const override + { + return source->supports_device_read(); + } size_t device_read(size_t offset, size_t size, @@ -227,7 +230,7 @@ class user_datasource_wrapper : public datasource { return source->device_read(offset, size, stream); } - size_t size() const override { return source->size(); } + [[nodiscard]] size_t size() const override { return source->size(); } private: datasource* const source; ///< A non-owning pointer to the user-implemented datasource diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 7178418bbbf..fcee4e43a20 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -45,8 +45,8 @@ class file_wrapper { explicit file_wrapper(std::string const& filepath, int flags); explicit file_wrapper(std::string const& filepath, int flags, mode_t mode); ~file_wrapper(); - auto size() const { return _size; } - auto desc() const { return fd; } + [[nodiscard]] auto size() const { return _size; } + [[nodiscard]] auto desc() const { return fd; } }; /** @@ -184,7 +184,7 @@ struct cufile_registered_file { register_handle(); } - auto const& handle() const noexcept { return cf_handle; } + [[nodiscard]] auto const& handle() const noexcept { return cf_handle; } ~cufile_registered_file(); diff --git 
a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index cbf914b8da6..367bbfcbdfa 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -77,9 +77,9 @@ class hostdevice_vector { return false; } - size_t max_size() const noexcept { return max_elements; } - size_t size() const noexcept { return num_elements; } - size_t memory_size() const noexcept { return sizeof(T) * num_elements; } + [[nodiscard]] size_t max_size() const noexcept { return max_elements; } + [[nodiscard]] size_t size() const noexcept { return num_elements; } + [[nodiscard]] size_t memory_size() const noexcept { return sizeof(T) * num_elements; } T& operator[](size_t i) const { return h_data[i]; } T* host_ptr(size_t offset = 0) const { return h_data + offset; } diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 6da3296055c..878b36191ac 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -68,7 +68,7 @@ struct parse_options { cudf::detail::optional_trie trie_na; bool multi_delimiter; - parse_options_view view() const + [[nodiscard]] parse_options_view view() const { return {delimiter, terminator, diff --git a/cpp/src/io/utilities/thread_pool.hpp b/cpp/src/io/utilities/thread_pool.hpp index c57082034db..952ab58813a 100644 --- a/cpp/src/io/utilities/thread_pool.hpp +++ b/cpp/src/io/utilities/thread_pool.hpp @@ -44,7 +44,7 @@ namespace detail { * and/or obtain its eventual return value. */ class thread_pool { - typedef std::uint_fast32_t ui32; + using ui32 = int; public: /** @@ -79,7 +79,7 @@ class thread_pool { * * @return The number of queued tasks. */ - size_t get_tasks_queued() const + [[nodiscard]] size_t get_tasks_queued() const { const std::scoped_lock lock(queue_mutex); return tasks.size(); @@ -90,7 +90,7 @@ class thread_pool { * * @return The number of running tasks. 
*/ - ui32 get_tasks_running() const { return tasks_total - (ui32)get_tasks_queued(); } + [[nodiscard]] ui32 get_tasks_running() const { return tasks_total - (ui32)get_tasks_queued(); } /** * @brief Get the total number of unfinished tasks - either still in the queue, or running in a @@ -98,14 +98,14 @@ class thread_pool { * * @return The total number of tasks. */ - ui32 get_tasks_total() const { return tasks_total; } + [[nodiscard]] ui32 get_tasks_total() const { return tasks_total; } /** * @brief Get the number of threads in the pool. * * @return The number of threads. */ - ui32 get_thread_count() const { return thread_count; } + [[nodiscard]] ui32 get_thread_count() const { return thread_count; } /** * @brief Parallelize a loop by splitting it into blocks, submitting each block separately to the diff --git a/cpp/src/io/utilities/trie.cuh b/cpp/src/io/utilities/trie.cuh index 1140a08b76b..85834ad2f0e 100644 --- a/cpp/src/io/utilities/trie.cuh +++ b/cpp/src/io/utilities/trie.cuh @@ -23,6 +23,8 @@ #include +#include + namespace cudf { namespace detail { static constexpr char trie_terminating_character = '\n'; diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index c3dc343dd2d..dc62eeec539 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -110,7 +110,6 @@ conditional_join(table_view const& left, } else { // Allocate storage for the counter used to get the size of the join output rmm::device_scalar size(0, stream, mr); - CHECK_CUDA(stream.value()); if (has_nulls) { compute_conditional_join_output_size <<>>( @@ -130,7 +129,6 @@ conditional_join(table_view const& left, swap_tables, size.data()); } - CHECK_CUDA(stream.value()); join_size = size.value(stream); } @@ -178,8 +176,6 @@ conditional_join(table_view const& left, swap_tables); } - CHECK_CUDA(stream.value()); - auto join_indices = std::make_pair(std::move(left_indices), std::move(right_indices)); // For full joins, get the indices in the right 
table that were not joined to @@ -260,7 +256,6 @@ std::size_t compute_conditional_join_output_size(table_view const& left, // Allocate storage for the counter used to get the size of the join output rmm::device_scalar size(0, stream, mr); - CHECK_CUDA(stream.value()); // Determine number of output rows without actually building the output to simply // find what the size of the output will be. @@ -283,8 +278,6 @@ std::size_t compute_conditional_join_output_size(table_view const& left, swap_tables, size.data()); } - CHECK_CUDA(stream.value()); - return size.value(stream); } diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 4005d6101bd..c2115c3caa4 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -242,13 +242,13 @@ struct hash_join::hash_join_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; - std::size_t inner_join_size(cudf::table_view const& probe, - null_equality compare_nulls, - rmm::cuda_stream_view stream) const; + [[nodiscard]] std::size_t inner_join_size(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream) const; - std::size_t left_join_size(cudf::table_view const& probe, - null_equality compare_nulls, - rmm::cuda_stream_view stream) const; + [[nodiscard]] std::size_t left_join_size(cudf::table_view const& probe, + null_equality compare_nulls, + rmm::cuda_stream_view stream) const; std::size_t full_join_size(cudf::table_view const& probe, null_equality compare_nulls, diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 938a85247f8..526c22d1d5c 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,6 +25,7 @@ #include +#include #include #include @@ -60,6 +61,9 @@ using mixed_multimap_type = cuco::static_multimap>; +using semi_map_type = cuco:: + static_map; + using row_hash = cudf::row_hasher; using row_equality = cudf::row_equality_comparator; diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index c609b58132c..0eb0a8de352 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,8 +33,6 @@ #include #include -#include - namespace cudf { namespace detail { @@ -57,6 +55,9 @@ mixed_join( CUDF_EXPECTS(right_conditional.num_rows() == right_equality.num_rows(), "The right conditional and equality tables must have the same number of rows."); + CUDF_EXPECTS((join_type != join_kind::LEFT_SEMI_JOIN) && (join_type != join_kind::LEFT_ANTI_JOIN), + "Left semi and anti joins should use mixed_join_semi."); + auto const right_num_rows{right_conditional.num_rows()}; auto const left_num_rows{left_conditional.num_rows()}; auto const swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows); @@ -71,25 +72,21 @@ mixed_join( // null index for the right table; in others, we return an empty output. if (right_num_rows == 0) { switch (join_type) { - // Left, left anti, and full all return all the row indices from left - // with a corresponding NULL from the right. + // Left and full joins all return all the row indices from + // left with a corresponding NULL from the right. case join_kind::LEFT_JOIN: - case join_kind::LEFT_ANTI_JOIN: case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left_conditional, stream); - // Inner and left semi joins return empty output because no matches can exist. + // Inner joins return empty output because no matches can exist. 
case join_kind::INNER_JOIN: - case join_kind::LEFT_SEMI_JOIN: return std::make_pair(std::make_unique>(0, stream, mr), std::make_unique>(0, stream, mr)); default: CUDF_FAIL("Invalid join kind."); break; } } else if (left_num_rows == 0) { switch (join_type) { - // Left, left anti, left semi, and inner joins all return empty sets. + // Left and inner joins all return empty sets. case join_kind::LEFT_JOIN: - case join_kind::LEFT_ANTI_JOIN: case join_kind::INNER_JOIN: - case join_kind::LEFT_SEMI_JOIN: return std::make_pair(std::make_unique>(0, stream, mr), std::make_unique>(0, stream, mr)); // Full joins need to return the trivial complement. @@ -160,7 +157,6 @@ mixed_join( } else { // Allocate storage for the counter used to get the size of the join output rmm::device_scalar size(0, stream, mr); - CHECK_CUDA(stream.value()); matches_per_row = rmm::device_uvector{static_cast(outer_num_rows), stream, mr}; @@ -199,7 +195,6 @@ mixed_join( size.data(), mutable_matches_per_row_span); } - CHECK_CUDA(stream.value()); join_size = size.value(stream); } @@ -229,7 +224,7 @@ mixed_join( auto const& join_output_r = right_indices->data(); if (has_nulls) { - mixed_join + mixed_join <<>>( *left_conditional_view, *right_conditional_view, @@ -244,7 +239,7 @@ mixed_join( join_result_offsets.data(), swap_tables); } else { - mixed_join + mixed_join <<>>( *left_conditional_view, *right_conditional_view, @@ -260,8 +255,6 @@ mixed_join( swap_tables); } - CHECK_CUDA(stream.value()); - auto join_indices = std::make_pair(std::move(left_indices), std::move(right_indices)); // For full joins, get the indices in the right table that were not joined to @@ -292,6 +285,10 @@ compute_mixed_join_output_size(table_view const& left_equality, CUDF_EXPECTS(join_type != join_kind::FULL_JOIN, "Size estimation is not available for full joins."); + CUDF_EXPECTS( + (join_type != join_kind::LEFT_SEMI_JOIN) && (join_type != join_kind::LEFT_ANTI_JOIN), + "Left semi and anti join size estimation should use 
compute_mixed_join_output_size_semi."); + CUDF_EXPECTS(left_conditional.num_rows() == left_equality.num_rows(), "The left conditional and equality tables must have the same number of rows."); CUDF_EXPECTS(right_conditional.num_rows() == right_equality.num_rows(), @@ -319,14 +316,12 @@ compute_mixed_join_output_size(table_view const& left_equality, // Left, left anti, and full all return all the row indices from left // with a corresponding NULL from the right. case join_kind::LEFT_JOIN: - case join_kind::LEFT_ANTI_JOIN: case join_kind::FULL_JOIN: { thrust::fill(matches_per_row->begin(), matches_per_row->end(), 1); return {left_num_rows, std::move(matches_per_row)}; } // Inner and left semi joins return empty output because no matches can exist. - case join_kind::INNER_JOIN: - case join_kind::LEFT_SEMI_JOIN: { + case join_kind::INNER_JOIN: { thrust::fill(matches_per_row->begin(), matches_per_row->end(), 0); return {0, std::move(matches_per_row)}; } @@ -336,9 +331,7 @@ compute_mixed_join_output_size(table_view const& left_equality, switch (join_type) { // Left, left anti, left semi, and inner joins all return empty sets. case join_kind::LEFT_JOIN: - case join_kind::LEFT_ANTI_JOIN: - case join_kind::INNER_JOIN: - case join_kind::LEFT_SEMI_JOIN: { + case join_kind::INNER_JOIN: { thrust::fill(matches_per_row->begin(), matches_per_row->end(), 0); return {0, std::move(matches_per_row)}; } @@ -397,7 +390,6 @@ compute_mixed_join_output_size(table_view const& left_equality, // Allocate storage for the counter used to get the size of the join output rmm::device_scalar size(0, stream, mr); - CHECK_CUDA(stream.value()); // Determine number of output rows without actually building the output to simply // find what the size of the output will be. 
@@ -430,7 +422,6 @@ compute_mixed_join_output_size(table_view const& left_equality, size.data(), matches_per_row_span); } - CHECK_CUDA(stream.value()); return {size.value(stream), std::move(matches_per_row)}; } diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh new file mode 100644 index 00000000000..60c909702ab --- /dev/null +++ b/cpp/src/join/mixed_join_common_utils.cuh @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace detail { + +/** + * @brief Equality comparator for use with cuco map methods that require expression evaluation. + * + * This class just defines the construction of the class and the necessary + * attributes, specifically the equality operator for the non-conditional parts + * of the operator and the evaluator used for the conditional. 
+ */ +template +struct expression_equality { + __device__ expression_equality( + cudf::ast::detail::expression_evaluator const& evaluator, + cudf::ast::detail::IntermediateDataType* thread_intermediate_storage, + bool const swap_tables, + row_equality const& equality_probe) + : evaluator{evaluator}, + thread_intermediate_storage{thread_intermediate_storage}, + swap_tables{swap_tables}, + equality_probe{equality_probe} + { + } + + cudf::ast::detail::IntermediateDataType* thread_intermediate_storage; + cudf::ast::detail::expression_evaluator const& evaluator; + bool const swap_tables; + row_equality const& equality_probe; +}; + +/** + * @brief Equality comparator for cuco::static_map queries. + * + * This equality comparator is designed for use with cuco::static_map's APIs. A + * probe hit indicates that the hashes of the keys are equal, at which point + * this comparator checks whether the keys themselves are equal (using the + * provided equality_probe) and then evaluates the conditional expression + */ +template +struct single_expression_equality : expression_equality { + using expression_equality::expression_equality; + + // The parameters are build/probe rather than left/right because the operator + // is called by cuco's kernels with parameters in this order (note that this + // is an implementation detail that we should eventually stop relying on by + // defining operators with suitable heterogeneous typing). Rather than + // converting to left/right semantics, we can operate directly on build/probe + // until we get to the expression evaluator, which needs to convert back to + // left/right semantics because the conditional expression need not be + // commutative. + // TODO: The input types should really be size_type. + __device__ __forceinline__ bool operator()(hash_value_type const build_row_index, + hash_value_type const probe_row_index) const noexcept + { + auto output_dest = cudf::ast::detail::value_expression_result(); + // Two levels of checks: + // 1. 
The contents of the columns involved in the equality condition are equal. + // 2. The predicate evaluated on the relevant columns (already encoded in the evaluator) + // evaluates to true. + if (this->equality_probe(probe_row_index, build_row_index)) { + auto const lrow_idx = this->swap_tables ? build_row_index : probe_row_index; + auto const rrow_idx = this->swap_tables ? probe_row_index : build_row_index; + this->evaluator.evaluate(output_dest, + static_cast(lrow_idx), + static_cast(rrow_idx), + 0, + this->thread_intermediate_storage); + return (output_dest.is_valid() && output_dest.value()); + } + return false; + } +}; + +/** + * @brief Equality comparator for cuco::static_multimap queries. + * + * This equality comparator is designed for use with cuco::static_multimap's + * pair* APIs, which will compare equality based on comparing (key, value) + * pairs. In the context of joins, these pairs are of the form + * (row_hash, row_id). A hash probe hit indicates that the hash of a probe row is + * equal to the hash of some row in the multimap, at which point we need an + * equality comparator that will check whether the contents of the rows are + * identical. This comparator does so by verifying key equality (i.e. that + * probe_row_hash == build_row_hash) and then using a row_equality_comparator + * to compare the contents of the row indices that are stored as the payload in + * the hash map. + */ +template +struct pair_expression_equality : public expression_equality { + using expression_equality::expression_equality; + + // The parameters are build/probe rather than left/right because the operator + // is called by cuco's kernels with parameters in this order (note that this + // is an implementation detail that we should eventually stop relying on by + // defining operators with suitable heterogeneous typing). 
Rather than + // converting to left/right semantics, we can operate directly on build/probe + // until we get to the expression evaluator, which needs to convert back to + // left/right semantics because the conditional expression need not be + // commutative. + __device__ __forceinline__ bool operator()(pair_type const& build_row, + pair_type const& probe_row) const noexcept + { + auto output_dest = cudf::ast::detail::value_expression_result(); + // Three levels of checks: + // 1. Row hashes of the columns involved in the equality condition are equal. + // 2. The contents of the columns involved in the equality condition are equal. + // 3. The predicate evaluated on the relevant columns (already encoded in the evaluator) + // evaluates to true. + if ((probe_row.first == build_row.first) && + this->equality_probe(probe_row.second, build_row.second)) { + auto const lrow_idx = this->swap_tables ? build_row.second : probe_row.second; + auto const rrow_idx = this->swap_tables ? probe_row.second : build_row.second; + this->evaluator.evaluate( + output_dest, lrow_idx, rrow_idx, 0, this->thread_intermediate_storage); + return (output_dest.is_valid() && output_dest.value()); + } + return false; + } +}; + +} // namespace detail + +} // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels.cu b/cpp/src/join/mixed_join_kernels.cu new file mode 100644 index 00000000000..5638f0ddd38 --- /dev/null +++ b/cpp/src/join/mixed_join_kernels.cu @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cudf { +namespace detail { +namespace cg = cooperative_groups; + +template +__global__ void mixed_join(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables) +{ + // Normally the casting of a shared memory array is used to create multiple + // arrays of different types from the shared memory buffer, but here it is + // used to circumvent conflicts between arrays of different types between + // different template instantiations due to the extern specifier. + extern __shared__ char raw_intermediate_storage[]; + cudf::ast::detail::IntermediateDataType* intermediate_storage = + reinterpret_cast*>(raw_intermediate_storage); + auto thread_intermediate_storage = + &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; + + cudf::size_type const left_num_rows = left_table.num_rows(); + cudf::size_type const right_num_rows = right_table.num_rows(); + auto const outer_num_rows = (swap_tables ? 
right_num_rows : left_num_rows); + + cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; + + auto evaluator = cudf::ast::detail::expression_evaluator( + left_table, right_table, device_expression_data); + + row_hash hash_probe{nullate::DYNAMIC{has_nulls}, probe}; + auto const empty_key_sentinel = hash_table_view.get_empty_key_sentinel(); + make_pair_function pair_func{hash_probe, empty_key_sentinel}; + + if (outer_row_index < outer_num_rows) { + // Figure out the number of elements for this key. + cg::thread_block_tile<1> this_thread = cg::this_thread(); + // Figure out the number of elements for this key. + auto query_pair = pair_func(outer_row_index); + auto equality = pair_expression_equality{ + evaluator, thread_intermediate_storage, swap_tables, equality_probe}; + + auto probe_key_begin = thrust::make_discard_iterator(); + auto probe_value_begin = swap_tables ? join_output_r + join_result_offsets[outer_row_index] + : join_output_l + join_result_offsets[outer_row_index]; + auto contained_key_begin = thrust::make_discard_iterator(); + auto contained_value_begin = swap_tables ? 
join_output_l + join_result_offsets[outer_row_index] + : join_output_r + join_result_offsets[outer_row_index]; + + if (join_type == join_kind::LEFT_JOIN || join_type == join_kind::FULL_JOIN) { + hash_table_view.pair_retrieve_outer(this_thread, + query_pair, + probe_key_begin, + probe_value_begin, + contained_key_begin, + contained_value_begin, + equality); + } else { + hash_table_view.pair_retrieve(this_thread, + query_pair, + probe_key_begin, + probe_value_begin, + contained_key_begin, + contained_value_begin, + equality); + } + } +} + +template __global__ void mixed_join( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables); + +template __global__ void mixed_join( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables); + +} // namespace detail + +} // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels.cuh b/cpp/src/join/mixed_join_kernels.cuh index 9812d4c4b7d..18d5e22fd1c 100644 --- a/cpp/src/join/mixed_join_kernels.cuh +++ b/cpp/src/join/mixed_join_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,93 +16,15 @@ #pragma once -#include #include +#include -#include #include -#include #include #include -#include - -#include - -#include -#include -#include -#include -#include namespace cudf { namespace detail { -namespace cg = cooperative_groups; - -/** - * @brief Device functor to determine if two pairs are identical. - * - * This equality comparator is designed for use with cuco::static_multimap's - * pair* APIs, which will compare equality based on comparing (key, value) - * pairs. In the context of joins, these pairs are of the form - * (row_hash, row_id). A hash probe hit indicates that hash of a probe row's hash is - * equal to the hash of the hash of some row in the multimap, at which point we need an - * equality comparator that will check whether the contents of the rows are - * identical. This comparator does so by verifying key equality (i.e. that - * probe_row_hash == build_row_hash) and then using a row_equality_comparator - * to compare the contents of the row indices that are stored as the payload in - * the hash map. - * - * This particular comparator is a specialized version of the pair_equality used in hash joins. This - * version also checks the expression_evaluator. 
- */ -template -class pair_expression_equality { - public: - __device__ pair_expression_equality( - cudf::ast::detail::expression_evaluator const& evaluator, - cudf::ast::detail::IntermediateDataType* thread_intermediate_storage, - bool const swap_tables, - row_equality const& equality_probe) - : evaluator{evaluator}, - thread_intermediate_storage{thread_intermediate_storage}, - swap_tables{swap_tables}, - equality_probe{equality_probe} - { - } - - // The parameters are build/probe rather than left/right because the operator - // is called by cuco's kernels with parameters in this order (note that this - // is an implementation detail that we should eventually stop relying on by - // defining operators with suitable heterogeneous typing). Rather than - // converting to left/right semantics, we can operate directly on build/probe - // until we get to the expression evaluator, which needs to convert back to - // left/right semantics because the conditional expression need not be - // commutative. - __device__ __forceinline__ bool operator()(const pair_type& build_row, - const pair_type& probe_row) const noexcept - { - auto output_dest = cudf::ast::detail::value_expression_result(); - // Three levels of checks: - // 1. Row hashes of the columns involved in the equality condition are equal. - // 2. The contents of the columns involved in the equality condition are equal. - // 3. The predicate evaluated on the relevant columns (already encoded in the evaluator) - // evaluates to true. - if ((probe_row.first == build_row.first) && - equality_probe(probe_row.second, build_row.second)) { - auto const lrow_idx = swap_tables ? build_row.second : probe_row.second; - auto const rrow_idx = swap_tables ? 
probe_row.second : build_row.second; - evaluator.evaluate(output_dest, lrow_idx, rrow_idx, 0, thread_intermediate_storage); - return (output_dest.is_valid() && output_dest.value()); - } - return false; - } - - private: - cudf::ast::detail::IntermediateDataType* thread_intermediate_storage; - cudf::ast::detail::expression_evaluator const& evaluator; - bool const swap_tables; - row_equality const& equality_probe; -}; /** * @brief Computes the output size of joining the left table to the right table. @@ -146,63 +68,7 @@ __global__ void compute_mixed_join_output_size( ast::detail::expression_device_view device_expression_data, bool const swap_tables, std::size_t* output_size, - cudf::device_span matches_per_row) -{ - // The (required) extern storage of the shared memory array leads to - // conflicting declarations between different templates. The easiest - // workaround is to declare an arbitrary (here char) array type then cast it - // after the fact to the appropriate type. - extern __shared__ char raw_intermediate_storage[]; - cudf::ast::detail::IntermediateDataType* intermediate_storage = - reinterpret_cast*>(raw_intermediate_storage); - auto thread_intermediate_storage = - intermediate_storage + (threadIdx.x * device_expression_data.num_intermediates); - - std::size_t thread_counter{0}; - cudf::size_type const start_idx = threadIdx.x + blockIdx.x * block_size; - cudf::size_type const stride = block_size * gridDim.x; - cudf::size_type const left_num_rows = left_table.num_rows(); - cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); - - auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, device_expression_data); - - // TODO: The hash join code assumes that nulls exist here, so I'm doing the - // same but at some point we may want to benchmark that. 
- row_hash hash_probe{nullate::DYNAMIC{has_nulls}, probe}; - auto const empty_key_sentinel = hash_table_view.get_empty_key_sentinel(); - make_pair_function pair_func{hash_probe, empty_key_sentinel}; - - for (cudf::size_type outer_row_index = start_idx; outer_row_index < outer_num_rows; - outer_row_index += stride) { - // Figure out the number of elements for this key. - cg::thread_block_tile<1> this_thread = cg::this_thread(); - auto query_pair = pair_func(outer_row_index); - // TODO: Address asymmetry in operator. - auto count_equality = pair_expression_equality{ - evaluator, thread_intermediate_storage, swap_tables, equality_probe}; - // TODO: This entire kernel probably won't work for left anti joins since I - // need to use a normal map (not a multimap), so this condition is probably - // overspecified at the moment. - if (join_type == join_kind::LEFT_JOIN || join_type == join_kind::LEFT_ANTI_JOIN || - join_type == join_kind::FULL_JOIN) { - matches_per_row[outer_row_index] = - hash_table_view.pair_count_outer(this_thread, query_pair, count_equality); - } else { - matches_per_row[outer_row_index] = - hash_table_view.pair_count(this_thread, query_pair, count_equality); - } - thread_counter += matches_per_row[outer_row_index]; - } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); - - // Add block counter to global counter - if (threadIdx.x == 0) atomicAdd(output_size, block_counter); -} + cudf::device_span matches_per_row); /** * @brief Performs a join using the combination of a hash lookup to identify @@ -215,7 +81,6 @@ __global__ void compute_mixed_join_output_size( * between probe and build rows. * * @tparam block_size The number of threads per block for this kernel - * @tparam output_cache_size The side of the shared memory buffer to cache join * @tparam has_nulls Whether or not the inputs may contain nulls. 
* * @param[in] left_table The left table @@ -235,11 +100,7 @@ __global__ void compute_mixed_join_output_size( * @param[in] swap_tables If true, the kernel was launched with one thread per right row and * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. */ -template +template __global__ void mixed_join(table_device_view left_table, table_device_view right_table, table_device_view probe, @@ -247,75 +108,11 @@ __global__ void mixed_join(table_device_view left_table, row_equality const equality_probe, join_kind const join_type, cudf::detail::mixed_multimap_type::device_view hash_table_view, - OutputIt1 join_output_l, - OutputIt2 join_output_r, + size_type* join_output_l, + size_type* join_output_r, cudf::ast::detail::expression_device_view device_expression_data, cudf::size_type const* join_result_offsets, - bool const swap_tables) -{ - // Normally the casting of a shared memory array is used to create multiple - // arrays of different types from the shared memory buffer, but here it is - // used to circumvent conflicts between arrays of different types between - // different template instantiations due to the extern specifier. - extern __shared__ char raw_intermediate_storage[]; - cudf::ast::detail::IntermediateDataType* intermediate_storage = - reinterpret_cast*>(raw_intermediate_storage); - auto thread_intermediate_storage = - &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; - - cudf::size_type const left_num_rows = left_table.num_rows(); - cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = (swap_tables ? 
right_num_rows : left_num_rows); - - cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; - - auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, device_expression_data); - - // TODO: The hash join code assumes that nulls exist here, so I'm doing the - // same but at some point we may want to benchmark that. - row_hash hash_probe{nullate::DYNAMIC{has_nulls}, probe}; - auto const empty_key_sentinel = hash_table_view.get_empty_key_sentinel(); - make_pair_function pair_func{hash_probe, empty_key_sentinel}; - - if (outer_row_index < outer_num_rows) { - // Figure out the number of elements for this key. - cg::thread_block_tile<1> this_thread = cg::this_thread(); - // Figure out the number of elements for this key. - auto query_pair = pair_func(outer_row_index); - auto equality = pair_expression_equality{ - evaluator, thread_intermediate_storage, swap_tables, equality_probe}; - - auto probe_key_begin = thrust::make_discard_iterator(); - auto probe_value_begin = swap_tables ? join_output_r + join_result_offsets[outer_row_index] - : join_output_l + join_result_offsets[outer_row_index]; - auto contained_key_begin = thrust::make_discard_iterator(); - auto contained_value_begin = swap_tables ? join_output_l + join_result_offsets[outer_row_index] - : join_output_r + join_result_offsets[outer_row_index]; - - // TODO: This entire kernel probably won't work for left anti joins since I - // need to use a normal map (not a multimap), so this condition is probably - // overspecified at the moment. 
- if (join_type == join_kind::LEFT_JOIN || join_type == join_kind::LEFT_ANTI_JOIN || - join_type == join_kind::FULL_JOIN) { - hash_table_view.pair_retrieve_outer(this_thread, - query_pair, - probe_key_begin, - probe_value_begin, - contained_key_begin, - contained_value_begin, - equality); - } else { - hash_table_view.pair_retrieve(this_thread, - query_pair, - probe_key_begin, - probe_value_begin, - contained_key_begin, - contained_value_begin, - equality); - } - } -} + bool const swap_tables); } // namespace detail diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu new file mode 100644 index 00000000000..c8cfc9998f0 --- /dev/null +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace detail { + +namespace cg = cooperative_groups; + +template +__global__ void mixed_join_semi(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::semi_map_type::device_view hash_table_view, + size_type* join_output_l, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables) +{ + // Normally the casting of a shared memory array is used to create multiple + // arrays of different types from the shared memory buffer, but here it is + // used to circumvent conflicts between arrays of different types between + // different template instantiations due to the extern specifier. + extern __shared__ char raw_intermediate_storage[]; + cudf::ast::detail::IntermediateDataType* intermediate_storage = + reinterpret_cast*>(raw_intermediate_storage); + auto thread_intermediate_storage = + &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; + + cudf::size_type const left_num_rows = left_table.num_rows(); + cudf::size_type const right_num_rows = right_table.num_rows(); + auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); + + cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; + + auto evaluator = cudf::ast::detail::expression_evaluator( + left_table, right_table, device_expression_data); + + row_hash hash_probe{nullate::DYNAMIC{has_nulls}, probe}; + + if (outer_row_index < outer_num_rows) { + // Figure out the number of elements for this key. 
+ auto equality = single_expression_equality{ + evaluator, thread_intermediate_storage, swap_tables, equality_probe}; + + if ((join_type == join_kind::LEFT_ANTI_JOIN) != + (hash_table_view.contains(outer_row_index, hash_probe, equality))) { + *(join_output_l + join_result_offsets[outer_row_index]) = outer_row_index; + } + } +} + +template __global__ void mixed_join_semi( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::semi_map_type::device_view hash_table_view, + size_type* join_output_l, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables); + +template __global__ void mixed_join_semi( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::semi_map_type::device_view hash_table_view, + size_type* join_output_l, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables); + +} // namespace detail + +} // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh new file mode 100644 index 00000000000..0a590f5b09a --- /dev/null +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace cudf { +namespace detail { + +/** + * @brief Computes the output size of joining the left table to the right table for semi/anti joins. + * + * This method probes the hash table with each row in the probe table using a + * custom equality comparator that also checks that the conditional expression + * evaluates to true between the left/right tables when a match is found + * between probe and build rows. + * + * @tparam block_size The number of threads per block for this kernel + * @tparam has_nulls Whether or not the inputs may contain nulls. + * + * @param[in] left_table The left table + * @param[in] right_table The right table + * @param[in] probe The table with which to probe the hash table for matches. + * @param[in] build The table with which the hash table was built. + * @param[in] equality_probe The equality comparator used when probing the hash table. + * @param[in] join_type The type of join to be performed + * @param[in] hash_table_view The hash table built from `build`. + * @param[in] device_expression_data Container of device data required to evaluate the desired + * expression. + * @param[in] swap_tables If true, the kernel was launched with one thread per right row and + * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. 
+ * @param[out] output_size The resulting output size + * @param[out] matches_per_row The number of matches in one pair of + * equality/conditional tables for each row in the other pair of tables. If + * swap_tables is true, matches_per_row corresponds to the right_table, + * otherwise it corresponds to the left_table. Note that corresponding swap of + * left/right tables to determine which is the build table and which is the + * probe table has already happened on the host. + */ +template +__global__ void compute_mixed_join_output_size_semi( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::semi_map_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row); + +/** + * @brief Performs a semi/anti join using the combination of a hash lookup to + * identify equal rows between one pair of tables and the evaluation of an + * expression containing an arbitrary expression. + * + * This method probes the hash table with each row in the probe table using a + * custom equality comparator that also checks that the conditional expression + * evaluates to true between the left/right tables when a match is found + * between probe and build rows. + * + * @tparam block_size The number of threads per block for this kernel + * @tparam has_nulls Whether or not the inputs may contain nulls. + * + * @param[in] left_table The left table + * @param[in] right_table The right table + * @param[in] probe The table with which to probe the hash table for matches. + * @param[in] build The table with which the hash table was built. + * @param[in] equality_probe The equality comparator used when probing the hash table. 
+ * @param[in] join_type The type of join to be performed + * @param[in] hash_table_view The hash table built from `build`. + * @param[out] join_output_l The left result of the join operation + * @param[in] device_expression_data Container of device data required to evaluate the desired + * expression. + * @param[in] join_result_offsets The starting indices in join_output[l|r] + * where the matches for each row begin. Equivalent to a prefix sum of + * matches_per_row. + * @param[in] swap_tables If true, the kernel was launched with one thread per right row and + * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. + */ +template +__global__ void mixed_join_semi(table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::semi_map_type::device_view hash_table_view, + size_type* join_output_l, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const* join_result_offsets, + bool const swap_tables); + +} // namespace detail + +} // namespace cudf diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu new file mode 100644 index 00000000000..f38e653c4a6 --- /dev/null +++ b/cpp/src/join/mixed_join_semi.cu @@ -0,0 +1,569 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cudf { +namespace detail { + +namespace { +/** + * @brief Device functor to create a pair of hash value and index for a given row. + */ +struct make_pair_function_semi { + __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept + { + // The value is irrelevant since we only ever use the hash map to check for + // membership of a particular row index. + return cuco::make_pair(i, 0); + } +}; + +/** + * @brief Equality comparator that composes two row_equality comparators. + */ +class double_row_equality { + public: + double_row_equality(row_equality equality_comparator, row_equality conditional_comparator) + : _equality_comparator{equality_comparator}, _conditional_comparator{conditional_comparator} + { + } + + __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept + { + return _equality_comparator(lhs_row_index, rhs_row_index) && + _conditional_comparator(lhs_row_index, rhs_row_index); + } + + private: + row_equality _equality_comparator; + row_equality _conditional_comparator; +}; + +} // namespace + +std::unique_ptr> mixed_join_semi( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls, + join_kind join_type, + std::optional>> output_size_data, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) && + (join_type != join_kind::FULL_JOIN), + "Inner, left, and full joins should use mixed_join."); + + CUDF_EXPECTS(left_conditional.num_rows() == left_equality.num_rows(), + "The left conditional and equality tables must have the same number of 
rows."); + CUDF_EXPECTS(right_conditional.num_rows() == right_equality.num_rows(), + "The right conditional and equality tables must have the same number of rows."); + + auto const right_num_rows{right_conditional.num_rows()}; + auto const left_num_rows{left_conditional.num_rows()}; + auto const swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows); + + // The "outer" table is the larger of the two tables. The kernels are + // launched with one thread per row of the outer table, which also means that + // it is the probe table for the hash + auto const outer_num_rows{swap_tables ? right_num_rows : left_num_rows}; + + // We can immediately filter out cases where the right table is empty. In + // some cases, we return all the rows of the left table with a corresponding + // null index for the right table; in others, we return an empty output. + if (right_num_rows == 0) { + switch (join_type) { + // Anti and semi return all the row indices from left + // with a corresponding NULL from the right. + case join_kind::LEFT_ANTI_JOIN: + return get_trivial_left_join_indices(left_conditional, stream).first; + // Inner and left semi joins return empty output because no matches can exist. + case join_kind::LEFT_SEMI_JOIN: + return std::make_unique>(0, stream, mr); + default: CUDF_FAIL("Invalid join kind."); break; + } + } else if (left_num_rows == 0) { + switch (join_type) { + // Anti and semi joins both return empty sets. + case join_kind::LEFT_ANTI_JOIN: + case join_kind::LEFT_SEMI_JOIN: + return std::make_unique>(0, stream, mr); + default: CUDF_FAIL("Invalid join kind."); break; + } + } + + // If evaluating the expression may produce null outputs we create a nullable + // output column and follow the null-supporting expression evaluation code + // path. 
+ auto const has_nulls = + cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) || + binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream); + + auto const parser = ast::detail::expression_parser{ + binary_predicate, left_conditional, right_conditional, has_nulls, stream, mr}; + CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, + "The expression must produce a boolean output."); + + // TODO: The non-conditional join impls start with a dictionary matching, + // figure out what that is and what it's needed for (and if conditional joins + // need to do the same). + auto& probe = swap_tables ? right_equality : left_equality; + auto& build = swap_tables ? left_equality : right_equality; + auto probe_view = table_device_view::create(probe, stream); + auto build_view = table_device_view::create(build, stream); + auto left_conditional_view = table_device_view::create(left_conditional, stream); + auto right_conditional_view = table_device_view::create(right_conditional, stream); + auto& build_conditional_view = swap_tables ? left_conditional_view : right_conditional_view; + row_equality equality_probe{ + cudf::nullate::DYNAMIC{has_nulls}, *probe_view, *build_view, compare_nulls}; + + semi_map_type hash_table{compute_hash_table_size(build.num_rows()), + std::numeric_limits::max(), + cudf::detail::JoinNoneValue, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + // Create hash table containing all keys found in right table + // TODO: To add support for nested columns we will need to flatten in many + // places. However, this probably isn't worth adding any time soon since we + // won't be able to support AST conditions for those types anyway. 
+ auto const build_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(build)}; + row_hash const hash_build{build_nulls, *build_view}; + // Since we may see multiple rows that are identical in the equality tables + // but differ in the conditional tables, the equality comparator used for + // insertion must account for both sets of tables. An alternative solution + // would be to use a multimap, but that solution would store duplicates where + // equality and conditional rows are equal, so this approach is preferable. + // One way to make this solution even more efficient would be to only include + // the columns of the conditional table that are used by the expression, but + // that requires additional plumbing through the AST machinery and is out of + // scope for now. + row_equality equality_build_equality{build_nulls, *build_view, *build_view, compare_nulls}; + row_equality equality_build_conditional{ + build_nulls, *build_conditional_view, *build_conditional_view, compare_nulls}; + double_row_equality equality_build{equality_build_equality, equality_build_conditional}; + make_pair_function_semi pair_func_build{}; + + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); + + // skip rows that are null here. + if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { + hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); + } else { + thrust::counting_iterator stencil(0); + auto const [row_bitmask, _] = cudf::detail::bitmask_and(build, stream); + row_is_valid pred{static_cast(row_bitmask.data())}; + + // insert valid rows + hash_table.insert_if( + iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); + } + + auto hash_table_view = hash_table.get_device_view(); + + // For inner joins we support optimizing the join by launching one thread for + // whichever table is larger rather than always using the left table. 
+ detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); + auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; + join_kind const kernel_join_type = + join_type == join_kind::FULL_JOIN ? join_kind::LEFT_JOIN : join_type; + + // If the join size data was not provided as an input, compute it here. + std::size_t join_size; + // Using an optional because we only need to allocate a new vector if one was + // not passed as input, and rmm::device_uvector is not default constructible + std::optional> matches_per_row{}; + device_span matches_per_row_span{}; + + if (output_size_data.has_value()) { + join_size = output_size_data->first; + matches_per_row_span = output_size_data->second; + } else { + // Allocate storage for the counter used to get the size of the join output + rmm::device_scalar size(0, stream, mr); + + matches_per_row = + rmm::device_uvector{static_cast(outer_num_rows), stream, mr}; + // Note that the view goes out of scope after this else statement, but the + // data owned by matches_per_row stays alive so the data pointer is valid. 
+ auto mutable_matches_per_row_span = cudf::device_span{ + matches_per_row->begin(), static_cast(outer_num_rows)}; + matches_per_row_span = cudf::device_span{ + matches_per_row->begin(), static_cast(outer_num_rows)}; + if (has_nulls) { + compute_mixed_join_output_size_semi + <<>>( + *left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + equality_probe, + kernel_join_type, + hash_table_view, + parser.device_expression_data, + swap_tables, + size.data(), + mutable_matches_per_row_span); + } else { + compute_mixed_join_output_size_semi + <<>>( + *left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + equality_probe, + kernel_join_type, + hash_table_view, + parser.device_expression_data, + swap_tables, + size.data(), + mutable_matches_per_row_span); + } + join_size = size.value(stream); + } + + if (join_size == 0) { return std::make_unique>(0, stream, mr); } + + // Given the number of matches per row, we need to compute the offsets for insertion. 
+ auto join_result_offsets = + rmm::device_uvector{static_cast(outer_num_rows), stream, mr}; + thrust::exclusive_scan(rmm::exec_policy{stream}, + matches_per_row_span.begin(), + matches_per_row_span.end(), + join_result_offsets.begin()); + + auto left_indices = std::make_unique>(join_size, stream, mr); + auto const& join_output_l = left_indices->data(); + + if (has_nulls) { + mixed_join_semi + <<>>( + *left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + equality_probe, + kernel_join_type, + hash_table_view, + join_output_l, + parser.device_expression_data, + join_result_offsets.data(), + swap_tables); + } else { + mixed_join_semi + <<>>( + *left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + equality_probe, + kernel_join_type, + hash_table_view, + join_output_l, + parser.device_expression_data, + join_result_offsets.data(), + swap_tables); + } + + return left_indices; +} + +std::pair>> +compute_mixed_join_output_size_semi(table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls, + join_kind join_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS( + (join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) && + (join_type != join_kind::FULL_JOIN), + "Inner, left, and full join size estimation should use compute_mixed_join_output_size."); + + CUDF_EXPECTS(left_conditional.num_rows() == left_equality.num_rows(), + "The left conditional and equality tables must have the same number of rows."); + CUDF_EXPECTS(right_conditional.num_rows() == right_equality.num_rows(), + "The right conditional and equality tables must have the same number of rows."); + + auto const right_num_rows{right_conditional.num_rows()}; + auto const left_num_rows{left_conditional.num_rows()}; + auto const swap_tables = 
(join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows); + + // The "outer" table is the larger of the two tables. The kernels are + // launched with one thread per row of the outer table, which also means that + // it is the probe table for the hash + auto const outer_num_rows{swap_tables ? right_num_rows : left_num_rows}; + + auto matches_per_row = std::make_unique>( + static_cast(outer_num_rows), stream, mr); + auto matches_per_row_span = cudf::device_span{ + matches_per_row->begin(), static_cast(outer_num_rows)}; + + // We can immediately filter out cases where one table is empty. In + // some cases, we return all the rows of the other table with a corresponding + // null index for the empty table; in others, we return an empty output. + if (right_num_rows == 0) { + switch (join_type) { + // Left, left anti, and full all return all the row indices from left + // with a corresponding NULL from the right. + case join_kind::LEFT_ANTI_JOIN: { + thrust::fill(matches_per_row->begin(), matches_per_row->end(), 1); + return {left_num_rows, std::move(matches_per_row)}; + } + // Inner and left semi joins return empty output because no matches can exist. + case join_kind::LEFT_SEMI_JOIN: return {0, std::move(matches_per_row)}; + default: CUDF_FAIL("Invalid join kind."); break; + } + } else if (left_num_rows == 0) { + switch (join_type) { + // Left, left anti, left semi, and inner joins all return empty sets. + case join_kind::LEFT_ANTI_JOIN: + case join_kind::LEFT_SEMI_JOIN: { + thrust::fill(matches_per_row->begin(), matches_per_row->end(), 0); + return {0, std::move(matches_per_row)}; + } + default: CUDF_FAIL("Invalid join kind."); break; + } + } + + // If evaluating the expression may produce null outputs we create a nullable + // output column and follow the null-supporting expression evaluation code + // path. 
+ auto const has_nulls = + cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) || + binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream); + + auto const parser = ast::detail::expression_parser{ + binary_predicate, left_conditional, right_conditional, has_nulls, stream, mr}; + CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, + "The expression must produce a boolean output."); + + // TODO: The non-conditional join impls start with a dictionary matching, + // figure out what that is and what it's needed for (and if conditional joins + // need to do the same). + auto& probe = swap_tables ? right_equality : left_equality; + auto& build = swap_tables ? left_equality : right_equality; + auto probe_view = table_device_view::create(probe, stream); + auto build_view = table_device_view::create(build, stream); + auto left_conditional_view = table_device_view::create(left_conditional, stream); + auto right_conditional_view = table_device_view::create(right_conditional, stream); + auto& build_conditional_view = swap_tables ? left_conditional_view : right_conditional_view; + row_equality equality_probe{ + cudf::nullate::DYNAMIC{has_nulls}, *probe_view, *build_view, compare_nulls}; + + semi_map_type hash_table{compute_hash_table_size(build.num_rows()), + std::numeric_limits::max(), + cudf::detail::JoinNoneValue, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + // Create hash table containing all keys found in right table + // TODO: To add support for nested columns we will need to flatten in many + // places. However, this probably isn't worth adding any time soon since we + // won't be able to support AST conditions for those types anyway. 
+ auto const build_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(build)}; + row_hash const hash_build{build_nulls, *build_view}; + // Since we may see multiple rows that are identical in the equality tables + // but differ in the conditional tables, the equality comparator used for + // insertion must account for both sets of tables. An alternative solution + // would be to use a multimap, but that solution would store duplicates where + // equality and conditional rows are equal, so this approach is preferable. + // One way to make this solution even more efficient would be to only include + // the columns of the conditional table that are used by the expression, but + // that requires additional plumbing through the AST machinery and is out of + // scope for now. + row_equality equality_build_equality{build_nulls, *build_view, *build_view, compare_nulls}; + row_equality equality_build_conditional{ + build_nulls, *build_conditional_view, *build_conditional_view, compare_nulls}; + double_row_equality equality_build{equality_build_equality, equality_build_conditional}; + make_pair_function_semi pair_func_build{}; + + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); + + // skip rows that are null here. + if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { + hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); + } else { + thrust::counting_iterator stencil(0); + auto const [row_bitmask, _] = cudf::detail::bitmask_and(build, stream); + row_is_valid pred{static_cast(row_bitmask.data())}; + + // insert valid rows + hash_table.insert_if( + iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); + } + + auto hash_table_view = hash_table.get_device_view(); + + // For inner joins we support optimizing the join by launching one thread for + // whichever table is larger rather than always using the left table. 
+ detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); + auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; + + // Allocate storage for the counter used to get the size of the join output + rmm::device_scalar size(0, stream, mr); + + // Determine number of output rows without actually building the output to simply + // find what the size of the output will be. + if (has_nulls) { + compute_mixed_join_output_size_semi + <<>>( + *left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + equality_probe, + join_type, + hash_table_view, + parser.device_expression_data, + swap_tables, + size.data(), + matches_per_row_span); + } else { + compute_mixed_join_output_size_semi + <<>>( + *left_conditional_view, + *right_conditional_view, + *probe_view, + *build_view, + equality_probe, + join_type, + hash_table_view, + parser.device_expression_data, + swap_tables, + size.data(), + matches_per_row_span); + } + + return {size.value(stream), std::move(matches_per_row)}; +} + +} // namespace detail + +std::pair>> mixed_left_semi_join_size( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::compute_mixed_join_output_size_semi(left_equality, + right_equality, + left_conditional, + right_conditional, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_SEMI_JOIN, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr> mixed_left_semi_join( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls, + std::optional>> output_size_data, + rmm::mr::device_memory_resource* mr) +{ + 
CUDF_FUNC_RANGE(); + return detail::mixed_join_semi(left_equality, + right_equality, + left_conditional, + right_conditional, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_SEMI_JOIN, + output_size_data, + rmm::cuda_stream_default, + mr); +} + +std::pair>> mixed_left_anti_join_size( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::compute_mixed_join_output_size_semi(left_equality, + right_equality, + left_conditional, + right_conditional, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_ANTI_JOIN, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr> mixed_left_anti_join( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls, + std::optional>> output_size_data, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::mixed_join_semi(left_equality, + right_equality, + left_conditional, + right_conditional, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_ANTI_JOIN, + output_size_data, + rmm::cuda_stream_default, + mr); +} + +} // namespace cudf diff --git a/cpp/src/join/mixed_join_size_kernels.cu b/cpp/src/join/mixed_join_size_kernels.cu new file mode 100644 index 00000000000..1a08b8792c2 --- /dev/null +++ b/cpp/src/join/mixed_join_size_kernels.cu @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cudf { +namespace detail { +namespace cg = cooperative_groups; + +template +__global__ void compute_mixed_join_output_size( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row) +{ + // The (required) extern storage of the shared memory array leads to + // conflicting declarations between different templates. The easiest + // workaround is to declare an arbitrary (here char) array type then cast it + // after the fact to the appropriate type. + extern __shared__ char raw_intermediate_storage[]; + cudf::ast::detail::IntermediateDataType* intermediate_storage = + reinterpret_cast*>(raw_intermediate_storage); + auto thread_intermediate_storage = + intermediate_storage + (threadIdx.x * device_expression_data.num_intermediates); + + std::size_t thread_counter{0}; + cudf::size_type const start_idx = threadIdx.x + blockIdx.x * block_size; + cudf::size_type const stride = block_size * gridDim.x; + cudf::size_type const left_num_rows = left_table.num_rows(); + cudf::size_type const right_num_rows = right_table.num_rows(); + auto const outer_num_rows = (swap_tables ? 
right_num_rows : left_num_rows); + + auto evaluator = cudf::ast::detail::expression_evaluator( + left_table, right_table, device_expression_data); + + row_hash hash_probe{nullate::DYNAMIC{has_nulls}, probe}; + auto const empty_key_sentinel = hash_table_view.get_empty_key_sentinel(); + make_pair_function pair_func{hash_probe, empty_key_sentinel}; + + // Figure out the number of elements for this key. + cg::thread_block_tile<1> this_thread = cg::this_thread(); + // TODO: Address asymmetry in operator. + auto count_equality = pair_expression_equality{ + evaluator, thread_intermediate_storage, swap_tables, equality_probe}; + + for (cudf::size_type outer_row_index = start_idx; outer_row_index < outer_num_rows; + outer_row_index += stride) { + auto query_pair = pair_func(outer_row_index); + if (join_type == join_kind::LEFT_JOIN || join_type == join_kind::FULL_JOIN) { + matches_per_row[outer_row_index] = + hash_table_view.pair_count_outer(this_thread, query_pair, count_equality); + } else { + matches_per_row[outer_row_index] = + hash_table_view.pair_count(this_thread, query_pair, count_equality); + } + thread_counter += matches_per_row[outer_row_index]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); + + // Add block counter to global counter + if (threadIdx.x == 0) atomicAdd(output_size, block_counter); +} + +template __global__ void compute_mixed_join_output_size( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row); + +template __global__ void compute_mixed_join_output_size( + 
table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row); + +} // namespace detail + +} // namespace cudf diff --git a/cpp/src/join/mixed_join_size_kernels_semi.cu b/cpp/src/join/mixed_join_size_kernels_semi.cu new file mode 100644 index 00000000000..2c077a698f8 --- /dev/null +++ b/cpp/src/join/mixed_join_size_kernels_semi.cu @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace detail { + +namespace cg = cooperative_groups; + +template +__global__ void compute_mixed_join_output_size_semi( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::semi_map_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row) +{ + // The (required) extern storage of the shared memory array leads to + // conflicting declarations between different templates. The easiest + // workaround is to declare an arbitrary (here char) array type then cast it + // after the fact to the appropriate type. + extern __shared__ char raw_intermediate_storage[]; + cudf::ast::detail::IntermediateDataType* intermediate_storage = + reinterpret_cast*>(raw_intermediate_storage); + auto thread_intermediate_storage = + intermediate_storage + (threadIdx.x * device_expression_data.num_intermediates); + + std::size_t thread_counter{0}; + cudf::size_type const start_idx = threadIdx.x + blockIdx.x * block_size; + cudf::size_type const stride = block_size * gridDim.x; + cudf::size_type const left_num_rows = left_table.num_rows(); + cudf::size_type const right_num_rows = right_table.num_rows(); + auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); + + auto evaluator = cudf::ast::detail::expression_evaluator( + left_table, right_table, device_expression_data); + row_hash hash_probe{nullate::DYNAMIC{has_nulls}, probe}; + // TODO: Address asymmetry in operator. 
+ auto equality = single_expression_equality{ + evaluator, thread_intermediate_storage, swap_tables, equality_probe}; + + for (cudf::size_type outer_row_index = start_idx; outer_row_index < outer_num_rows; + outer_row_index += stride) { + matches_per_row[outer_row_index] = + ((join_type == join_kind::LEFT_ANTI_JOIN) != + (hash_table_view.contains(outer_row_index, hash_probe, equality))); + thread_counter += matches_per_row[outer_row_index]; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); + + // Add block counter to global counter + if (threadIdx.x == 0) atomicAdd(output_size, block_counter); +} + +template __global__ void compute_mixed_join_output_size_semi( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::semi_map_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row); + +template __global__ void compute_mixed_join_output_size_semi( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::semi_map_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row); + +} // namespace detail + +} // namespace cudf diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 5eb8ca2452e..8563a2a3bd3 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -37,8 +37,6 @@ #include #include -#include - namespace cudf { namespace detail { @@ -91,13 +89,11 @@ 
std::unique_ptr> left_semi_anti_join( auto left_flattened_keys = left_flattened_tables.flattened_columns(); // Create hash table. - auto hash_table = cuco:: - static_map{ - compute_hash_table_size(right_num_rows), - std::numeric_limits::max(), - cudf::detail::JoinNoneValue, - hash_table_allocator_type{default_allocator{}, stream}, - stream.value()}; + semi_map_type hash_table{compute_hash_table_size(right_num_rows), + std::numeric_limits::max(), + cudf::detail::JoinNoneValue, + hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; // Create hash table containing all keys found in right table auto right_rows_d = table_device_view::create(right_flattened_keys, stream); diff --git a/cpp/src/lists/copying/gather.cu b/cpp/src/lists/copying/gather.cu index fe45cdfc338..8d2de8997d1 100644 --- a/cpp/src/lists/copying/gather.cu +++ b/cpp/src/lists/copying/gather.cu @@ -53,8 +53,8 @@ namespace detail { * @endcode */ struct list_gatherer { - typedef size_type argument_type; - typedef size_type result_type; + using argument_type = size_type; + using result_type = size_type; size_t offset_count; size_type const* base_offsets; diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 7b3b7b0f3fd..66b26148ede 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -257,7 +257,7 @@ __global__ void copy_block_partitions(InputIter input_iter, reinterpret_cast(block_output + OPTIMIZED_BLOCK_SIZE * OPTIMIZED_ROWS_PER_THREAD); auto partition_offset_global = partition_offset_shared + num_partitions + 1; - typedef cub::BlockScan BlockScan; + using BlockScan = cub::BlockScan; __shared__ typename BlockScan::TempStorage temp_storage; // use ELEMENTS_PER_THREAD=2 to support upto 1024 partitions diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp index 7af1e47087b..a5dc643a688 100644 --- a/cpp/src/quantiles/quantiles_util.hpp +++ 
b/cpp/src/quantiles/quantiles_util.hpp @@ -45,8 +45,8 @@ CUDF_HOST_DEVICE inline Result linear(T lhs, T rhs, double frac) // Underflow may occur when converting int64 to double // detail: https://github.com/rapidsai/cudf/issues/1417 - double dlhs = static_cast(lhs); - double drhs = static_cast(rhs); + auto dlhs = static_cast(lhs); + auto drhs = static_cast(rhs); double one_minus_frac = 1.0 - frac; return static_cast(one_minus_frac * dlhs + frac * drhs); } @@ -55,8 +55,8 @@ template CUDF_HOST_DEVICE inline Result midpoint(T lhs, T rhs) { // TODO: try std::midpoint (C++20) if available - double dlhs = static_cast(lhs); - double drhs = static_cast(rhs); + auto dlhs = static_cast(lhs); + auto drhs = static_cast(rhs); return static_cast(dlhs / 2 + drhs / 2); } diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index bc1947dfeed..7c52856b147 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -950,9 +950,9 @@ __launch_bounds__(block_size) __global__ int64_t following_window = following_window_begin[i]; // compute bounds - size_type start = static_cast( + auto start = static_cast( min(static_cast(input.size()), max(0L, i - preceding_window + 1))); - size_type end = static_cast( + auto end = static_cast( min(static_cast(input.size()), max(0L, i + following_window + 1))); size_type start_index = min(start, end); size_type end_index = max(start, end); diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu index 9618f325fce..84ae2b73bba 100644 --- a/cpp/src/strings/capitalize.cu +++ b/cpp/src/strings/capitalize.cu @@ -108,7 +108,7 @@ struct base_fn { if (!d_chars) d_offsets[idx] = 0; } - Derived& derived = static_cast(*this); + auto& derived = static_cast(*this); auto const d_str = d_column.element(idx); offset_type bytes = 0; auto d_buffer = d_chars ? 
d_chars + d_offsets[idx] : nullptr; diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu index 9482d4db9b8..8f364f5c9bc 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -61,9 +61,9 @@ struct compute_size_and_concatenate_fn { // If d_chars != nullptr: only concatenate strings. char* d_chars{nullptr}; - __device__ bool output_is_null(size_type const idx, - size_type const start_idx, - size_type const end_idx) const noexcept + [[nodiscard]] __device__ bool output_is_null(size_type const idx, + size_type const start_idx, + size_type const end_idx) const noexcept { if (func.is_null_list(lists_dv, idx)) { return true; } return empty_list_policy == output_if_empty_list::NULL_ELEMENT && start_idx == end_idx; @@ -127,13 +127,16 @@ struct compute_size_and_concatenate_fn { struct scalar_separator_fn { string_scalar_device_view const d_separator; - __device__ bool is_null_list(column_device_view const& lists_dv, - size_type const idx) const noexcept + [[nodiscard]] __device__ bool is_null_list(column_device_view const& lists_dv, + size_type const idx) const noexcept { return lists_dv.is_null(idx); } - __device__ string_view separator(size_type const) const noexcept { return d_separator.value(); } + [[nodiscard]] __device__ string_view separator(size_type const) const noexcept + { + return d_separator.value(); + } }; template @@ -222,13 +225,13 @@ struct column_separators_fn { column_device_view const separators_dv; string_scalar_device_view const sep_narep_dv; - __device__ bool is_null_list(column_device_view const& lists_dv, - size_type const idx) const noexcept + [[nodiscard]] __device__ bool is_null_list(column_device_view const& lists_dv, + size_type const idx) const noexcept { return lists_dv.is_null(idx) || (separators_dv.is_null(idx) && !sep_narep_dv.is_valid()); } - __device__ string_view separator(size_type const idx) const noexcept + [[nodiscard]] 
__device__ string_view separator(size_type const idx) const noexcept { return separators_dv.is_valid(idx) ? separators_dv.element(idx) : sep_narep_dv.value(); diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 9376a0082a8..efdee65c1f6 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -53,7 +53,7 @@ struct contains_fn { __device__ bool operator()(size_type idx) { - if (d_strings.is_null(idx)) return 0; + if (d_strings.is_null(idx)) return false; string_view d_str = d_strings.element(idx); int32_t begin = 0; int32_t end = bmatch ? 1 // match only the beginning of the string; diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 8d0c5704a7b..cd3dc3b46f3 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -156,7 +156,7 @@ struct format_compiler { device_span format_items() { return device_span(d_items); } - int8_t subsecond_precision() const { return specifiers.at('f'); } + [[nodiscard]] int8_t subsecond_precision() const { return specifiers.at('f'); } }; /** @@ -194,7 +194,7 @@ struct parse_datetime { * * @return `1x10^exponent` for `0 <= exponent <= 9` */ - __device__ constexpr int64_t power_of_ten(int32_t const exponent) const + [[nodiscard]] __device__ constexpr int64_t power_of_ten(int32_t const exponent) const { constexpr int64_t powers_of_ten[] = { 1L, 10L, 100L, 1000L, 10000L, 100000L, 1000000L, 10000000L, 100000000L, 1000000000L}; @@ -202,7 +202,7 @@ struct parse_datetime { } // Walk the format_items to parse the string into date/time components - __device__ timestamp_components parse_into_parts(string_view const& d_string) const + [[nodiscard]] __device__ timestamp_components parse_into_parts(string_view const& d_string) const { timestamp_components timeparts = {1970, 1, 1, 0}; // init to epoch time @@ -310,7 +310,7 @@ struct parse_datetime { return timeparts; } - __device__ int64_t 
timestamp_from_parts(timestamp_components const& timeparts) const + [[nodiscard]] __device__ int64_t timestamp_from_parts(timestamp_components const& timeparts) const { auto const ymd = // convenient chrono class handles the leap year calculations for us cuda::std::chrono::year_month_day( @@ -689,7 +689,7 @@ struct from_timestamp_base { * modulo(-1,60) -> 59 * @endcode */ - __device__ int32_t modulo_time(int64_t time, int64_t base) const + [[nodiscard]] __device__ int32_t modulo_time(int64_t time, int64_t base) const { return static_cast(((time % base) + base) % base); }; @@ -707,12 +707,12 @@ struct from_timestamp_base { * scale( 61,60) -> 1 * @endcode */ - __device__ int64_t scale_time(int64_t time, int64_t base) const + [[nodiscard]] __device__ int64_t scale_time(int64_t time, int64_t base) const { return (time - ((time < 0) * (base - 1L))) / base; }; - __device__ time_components get_time_components(int64_t tstamp) const + [[nodiscard]] __device__ time_components get_time_components(int64_t tstamp) const { time_components result = {0}; if constexpr (std::is_same_v) { return result; } @@ -855,7 +855,7 @@ struct datetime_formatter : public from_timestamp_base { } // from https://howardhinnant.github.io/date/date.html - __device__ thrust::pair get_iso_week_year( + [[nodiscard]] __device__ thrust::pair get_iso_week_year( cuda::std::chrono::year_month_day const& ymd) const { auto const days = cuda::std::chrono::sys_days(ymd); @@ -885,8 +885,8 @@ struct datetime_formatter : public from_timestamp_base { static_cast(year)); } - __device__ int8_t get_week_of_year(cuda::std::chrono::sys_days const days, - cuda::std::chrono::sys_days const start) const + [[nodiscard]] __device__ int8_t get_week_of_year(cuda::std::chrono::sys_days const days, + cuda::std::chrono::sys_days const start) const { return days < start ? 
0 diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index f286149ea46..66e6f31cca2 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -156,7 +156,7 @@ struct format_compiler { format_item const* compiled_format_items() { return d_items.data(); } - size_type items_count() const { return static_cast(d_items.size()); } + [[nodiscard]] size_type items_count() const { return static_cast(d_items.size()); } }; template diff --git a/cpp/src/strings/count_matches.cuh b/cpp/src/strings/count_matches.cu similarity index 87% rename from cpp/src/strings/count_matches.cuh rename to cpp/src/strings/count_matches.cu index c14142f4779..d0a6825666b 100644 --- a/cpp/src/strings/count_matches.cuh +++ b/cpp/src/strings/count_matches.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,16 +14,13 @@ * limitations under the License. */ -#pragma once - +#include #include -#include #include #include #include -#include #include #include @@ -32,6 +29,7 @@ namespace cudf { namespace strings { namespace detail { +namespace { /** * @brief Functor counts the total matches to the given regex in each string. */ @@ -50,12 +48,13 @@ struct count_matches_fn { int32_t end = d_str.length(); while ((begin < end) && (prog.find(idx, d_str, begin, end) > 0)) { ++count; - begin = end; + begin = end + (begin == end); end = d_str.length(); } return count; } }; +} // namespace /** * @brief Returns a column of regex match counts for each string in the given column. @@ -67,11 +66,10 @@ struct count_matches_fn { * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. 
*/ -std::unique_ptr count_matches( - column_device_view const& d_strings, - reprog_device const& d_prog, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr count_matches(column_device_view const& d_strings, + reprog_device const& d_prog, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Create output column auto counts = make_numeric_column( diff --git a/cpp/src/strings/count_matches.hpp b/cpp/src/strings/count_matches.hpp new file mode 100644 index 00000000000..1339f2b1ebd --- /dev/null +++ b/cpp/src/strings/count_matches.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { + +class column_device_view; + +namespace strings { +namespace detail { + +class reprog_device; + +/** + * @brief Returns a column of regex match counts for each string in the given column. + * + * A null entry will result in a zero count for that output row. + * + * @param d_strings Device view of the input strings column. + * @param d_prog Regex instance to evaluate on each string. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */ +std::unique_ptr count_matches( + column_device_view const& d_strings, + reprog_device const& d_prog, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index 584741298c2..c4749eae003 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index c61fb8905f5..ae807db10e6 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -72,7 +72,7 @@ enum class parse_result { */ class parser { protected: - CUDF_HOST_DEVICE inline parser() : input(nullptr), input_len(0), pos(nullptr) {} + CUDF_HOST_DEVICE inline parser() {} CUDF_HOST_DEVICE inline parser(const char* _input, int64_t _input_len) : input(_input), input_len(_input_len), pos(_input) { @@ -177,9 +177,9 @@ class parser { } protected: - char const* input; - int64_t input_len; - char const* pos; + char const* input{nullptr}; + int64_t input_len{0}; + char const* pos{nullptr}; CUDF_HOST_DEVICE inline bool is_whitespace(char c) { return c <= ' '; } }; @@ -220,18 +220,10 @@ enum json_element_type { NONE, OBJECT, ARRAY, VALUE }; */ class json_state : private parser { public: - __device__ json_state() - : parser(), - cur_el_start(nullptr), - cur_el_type(json_element_type::NONE), - parent_el_type(json_element_type::NONE) - { - } + __device__ json_state() : parser() {} __device__ json_state(const char* _input, 
int64_t _input_len, get_json_object_options _options) : parser(_input, _input_len), - cur_el_start(nullptr), - cur_el_type(json_element_type::NONE), - parent_el_type(json_element_type::NONE), + options(_options) { } @@ -340,7 +332,7 @@ class json_state : private parser { // next parse_result result = next_element_internal(false); if (result != parse_result::SUCCESS) { return result; } - } while (1); + } while (true); return parse_result::ERROR; } @@ -486,12 +478,12 @@ class json_state : private parser { return (c == '\"') || (options.get_allow_single_quotes() && (c == '\'')); } - const char* cur_el_start; // pointer to the first character of the -value- of the current - // element - not the name - string_view cur_el_name; // name of the current element (if applicable) - json_element_type cur_el_type; // type of the current element - json_element_type parent_el_type; // parent element type - get_json_object_options options; // behavior options + const char* cur_el_start{nullptr}; // pointer to the first character of the -value- of the + // current element - not the name + string_view cur_el_name; // name of the current element (if applicable) + json_element_type cur_el_type{json_element_type::NONE}; // type of the current element + json_element_type parent_el_type{json_element_type::NONE}; // parent element type + get_json_object_options options; // behavior options }; enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END }; @@ -501,26 +493,23 @@ enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, * an array of these operators applied to the incoming json string, */ struct path_operator { - CUDF_HOST_DEVICE inline path_operator() - : type(path_operator_type::ERROR), index(-1), expected_type{NONE} - { - } + CUDF_HOST_DEVICE inline path_operator() {} CUDF_HOST_DEVICE inline path_operator(path_operator_type _type, json_element_type _expected_type = NONE) - : type(_type), index(-1), 
expected_type{_expected_type} + : type(_type), expected_type{_expected_type} { } - path_operator_type type; // operator type + path_operator_type type{path_operator_type::ERROR}; // operator type // the expected element type we're applying this operation to. // for example: // - you cannot retrieve a subscripted field (eg [5]) from an object. // - you cannot retrieve a field by name (eg .book) from an array. // - you -can- use .* for both arrays and objects // a value of NONE imples any type accepted - json_element_type expected_type; // the expected type of the element we're working with - string_view name; // name to match against (if applicable) - int index; // index for subscript operator + json_element_type expected_type{NONE}; // the expected type of the element we're working with + string_view name; // name to match against (if applicable) + int index{-1}; // index for subscript operator }; /** diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 20868077cf4..f2a27d1b11d 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -122,7 +122,7 @@ std::unique_ptr pad( if (d_strings.is_null(idx)) return; string_view d_str = d_strings.element(idx); char* ptr = d_chars + d_offsets[idx]; - int32_t pad = static_cast(width - d_str.length()); + auto pad = static_cast(width - d_str.length()); auto right_pad = (width & 1) ? pad / 2 : (pad - pad / 2); // odd width = right-justify auto left_pad = pad - right_pad; // e.g. 
width=7 gives "++foxx+" while width=6 gives "+fox++" diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 8fbd82b8dc7..244cec1d780 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -20,7 +20,7 @@ #include #include -#include +#include namespace cudf { namespace strings { @@ -830,6 +830,7 @@ class regex_compiler { m_prog.set_start_inst(andstack[andstack.size() - 1].id_first); m_prog.optimize1(); m_prog.optimize2(); + m_prog.check_for_errors(); m_prog.set_groups_count(cursubid); } }; @@ -926,6 +927,68 @@ void reprog::optimize2() _startinst_ids.push_back(-1); // terminator mark } +/** + * @brief Check a specific instruction for errors. + * + * Currently this is checking for an infinite-loop condition as documented in this issue: + * https://github.com/rapidsai/cudf/issues/10006 + * + * Example instructions list created from pattern `(A?)+` + * ``` + * 0: CHAR c='A', next=2 + * 1: OR right=0, left=2, next=2 + * 2: RBRA id=1, next=4 + * 3: LBRA id=1, next=1 + * 4: OR right=3, left=5, next=5 + * 5: END + * ``` + * + * Following the example above, the instruction at `id==1` (OR) + * is being checked. If the instruction path returns to `id==1` + * without including the `0==CHAR` or `5==END` as in this example, + * then this would cause the runtime to go into an infinite-loop. + * + * It appears this example pattern is not valid. But Python interprets + * its behavior similarly to pattern `(A*)`. Handling this in the same + * way does not look feasible with the current implementation. + * + * @throw cudf::logic_error if instruction logic error is found + * + * @param id Instruction to check if repeated. + * @param next_id Next instruction to process. 
+ */ +void reprog::check_for_errors(int32_t id, int32_t next_id) +{ + auto inst = inst_at(next_id); + while (inst.type == LBRA || inst.type == RBRA) { + next_id = inst.u2.next_id; + inst = inst_at(next_id); + } + if (inst.type == OR) { + CUDF_EXPECTS(next_id != id, "Unsupported regex pattern"); + check_for_errors(id, inst.u2.left_id); + check_for_errors(id, inst.u1.right_id); + } +} + +/** + * @brief Check regex instruction set for any errors. + * + * Currently, this checks for OR instructions that eventually point back to themselves with only + * intervening capture group instructions between causing an infinite-loop during runtime + * evaluation. + */ +void reprog::check_for_errors() +{ + for (auto id = 0; id < insts_count(); ++id) { + auto const inst = inst_at(id); + if (inst.type == OR) { + check_for_errors(id, inst.u2.left_id); + check_for_errors(id, inst.u1.right_id); + } + } +} + #ifndef NDEBUG void reprog::print(regex_flags const flags) { @@ -933,83 +996,81 @@ void reprog::print(regex_flags const flags) printf("Instructions:\n"); for (std::size_t i = 0; i < _insts.size(); i++) { const reinst& inst = _insts[i]; - printf("%zu :", i); + printf("%3zu: ", i); switch (inst.type) { - default: printf("Unknown instruction: %d, nextid= %d", inst.type, inst.u2.next_id); break; + default: printf("Unknown instruction: %d, next=%d", inst.type, inst.u2.next_id); break; case CHAR: - if (inst.u1.c <= 32 || inst.u1.c >= 127) - printf( - "CHAR, c = '0x%02x', nextid= %d", static_cast(inst.u1.c), inst.u2.next_id); - else - printf("CHAR, c = '%c', nextid= %d", inst.u1.c, inst.u2.next_id); + if (inst.u1.c <= 32 || inst.u1.c >= 127) { + printf(" CHAR c='0x%02x', next=%d", static_cast(inst.u1.c), inst.u2.next_id); + } else { + printf(" CHAR c='%c', next=%d", inst.u1.c, inst.u2.next_id); + } break; - case RBRA: printf("RBRA, subid= %d, nextid= %d", inst.u1.subid, inst.u2.next_id); break; - case LBRA: printf("LBRA, subid= %d, nextid= %d", inst.u1.subid, inst.u2.next_id); break; + 
case RBRA: printf(" RBRA id=%d, next=%d", inst.u1.subid, inst.u2.next_id); break; + case LBRA: printf(" LBRA id=%d, next=%d", inst.u1.subid, inst.u2.next_id); break; case OR: - printf("OR, rightid=%d, leftid=%d, nextid=%d", - inst.u1.right_id, - inst.u2.left_id, - inst.u2.next_id); + printf( + " OR right=%d, left=%d, next=%d", inst.u1.right_id, inst.u2.left_id, inst.u2.next_id); break; - case STAR: printf("STAR, nextid= %d", inst.u2.next_id); break; - case PLUS: printf("PLUS, nextid= %d", inst.u2.next_id); break; - case QUEST: printf("QUEST, nextid= %d", inst.u2.next_id); break; - case ANY: printf("ANY, nextid= %d", inst.u2.next_id); break; - case ANYNL: printf("ANYNL, nextid= %d", inst.u2.next_id); break; - case NOP: printf("NOP, nextid= %d", inst.u2.next_id); break; + case STAR: printf(" STAR next=%d", inst.u2.next_id); break; + case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break; + case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break; + case ANY: printf(" ANY next=%d", inst.u2.next_id); break; + case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break; + case NOP: printf(" NOP next=%d", inst.u2.next_id); break; case BOL: { - printf("BOL, c = "); + printf(" BOL c="); if (inst.u1.c == '\n') { printf("'\\n'"); } else { printf("'%c'", inst.u1.c); } - printf(", nextid= %d", inst.u2.next_id); + printf(", next=%d", inst.u2.next_id); break; } case EOL: { - printf("EOL, c = "); + printf(" EOL c="); if (inst.u1.c == '\n') { printf("'\\n'"); } else { printf("'%c'", inst.u1.c); } - printf(", nextid= %d", inst.u2.next_id); + printf(", next=%d", inst.u2.next_id); break; } - case CCLASS: printf("CCLASS, cls_id=%d , nextid= %d", inst.u1.cls_id, inst.u2.next_id); break; - case NCCLASS: - printf("NCCLASS, cls_id=%d , nextid= %d", inst.u1.cls_id, inst.u2.next_id); - break; - case BOW: printf("BOW, nextid= %d", inst.u2.next_id); break; - case NBOW: printf("NBOW, nextid= %d", inst.u2.next_id); break; - case END: printf("END"); break; + case CCLASS: printf(" CCLASS 
cls=%d , next=%d", inst.u1.cls_id, inst.u2.next_id); break; + case NCCLASS: printf("NCCLASS cls=%d, next=%d", inst.u1.cls_id, inst.u2.next_id); break; + case BOW: printf(" BOW next=%d", inst.u2.next_id); break; + case NBOW: printf(" NBOW next=%d", inst.u2.next_id); break; + case END: printf(" END"); break; } printf("\n"); } printf("startinst_id=%d\n", _startinst_id); if (_startinst_ids.size() > 0) { - printf("startinst_ids:"); - for (size_t i = 0; i < _startinst_ids.size(); i++) + printf("startinst_ids: ["); + for (size_t i = 0; i < _startinst_ids.size(); i++) { printf(" %d", _startinst_ids[i]); - printf("\n"); + } + printf("]\n"); } int count = static_cast(_classes.size()); printf("\nClasses %d\n", count); for (int i = 0; i < count; i++) { const reclass& cls = _classes[i]; - int len = static_cast(cls.literals.size()); + auto const size = static_cast(cls.literals.size()); printf("%2d: ", i); - for (int j = 0; j < len; j += 2) { + for (int j = 0; j < size; j += 2) { char32_t c1 = cls.literals[j]; char32_t c2 = cls.literals[j + 1]; - if (c1 <= 32 || c1 >= 127 || c2 <= 32 || c2 >= 127) + if (c1 <= 32 || c1 >= 127 || c2 <= 32 || c2 >= 127) { printf("0x%02x-0x%02x", static_cast(c1), static_cast(c2)); - else + } else { printf("%c-%c", static_cast(c1), static_cast(c2)); - if ((j + 2) < len) printf(", "); + } + if ((j + 2) < size) { printf(", "); } } printf("\n"); if (cls.builtins) { @@ -1024,7 +1085,7 @@ void reprog::print(regex_flags const flags) } printf("\n"); } - if (_num_capturing_groups) printf("Number of capturing groups: %d\n", _num_capturing_groups); + if (_num_capturing_groups) { printf("Number of capturing groups: %d\n", _num_capturing_groups); } } #endif diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index 63d7933eebe..18735d0f980 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,9 +51,9 @@ enum InstType { * @brief Class type for regex compiler instruction. */ struct reclass { - int32_t builtins; // bit mask identifying builtin classes + int32_t builtins{0}; // bit mask identifying builtin classes std::u32string literals; // ranges as pairs of utf-8 characters - reclass() : builtins(0) {} + reclass() {} reclass(int m) : builtins(m) {} }; @@ -99,31 +99,36 @@ class reprog { int32_t add_class(reclass cls); void set_groups_count(int32_t groups); - int32_t groups_count() const; + [[nodiscard]] int32_t groups_count() const; - const reinst* insts_data() const; - int32_t insts_count() const; + [[nodiscard]] const reinst* insts_data() const; + [[nodiscard]] int32_t insts_count() const; reinst& inst_at(int32_t id); reclass& class_at(int32_t id); - int32_t classes_count() const; + [[nodiscard]] int32_t classes_count() const; - const int32_t* starts_data() const; - int32_t starts_count() const; + [[nodiscard]] const int32_t* starts_data() const; + [[nodiscard]] int32_t starts_count() const; void set_start_inst(int32_t id); - int32_t get_start_inst() const; + [[nodiscard]] int32_t get_start_inst() const; void optimize1(); void optimize2(); + void check_for_errors(); +#ifndef NDEBUG void print(regex_flags const flags); +#endif private: std::vector _insts; std::vector _classes; int32_t _startinst_id; std::vector _startinst_ids; // short-cut to speed-up ORs - int32_t _num_capturing_groups; + int32_t _num_capturing_groups{}; + + void check_for_errors(int32_t id, int32_t next_id); }; } // namespace detail diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index d6b8307c3fb..a9928a6bd49 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -132,32 +132,38 @@ class reprog_device { /** * @brief 
Returns the number of regex instructions. */ - __host__ __device__ int32_t insts_counts() const { return _insts_count; } + [[nodiscard]] __host__ __device__ int32_t insts_counts() const { return _insts_count; } /** * @brief Returns true if this is an empty program. */ - __device__ bool is_empty() const { return insts_counts() == 0 || get_inst(0)->type == END; } + [[nodiscard]] __device__ bool is_empty() const + { + return insts_counts() == 0 || get_inst(0)->type == END; + } /** * @brief Returns the number of regex groups found in the expression. */ - CUDF_HOST_DEVICE inline int32_t group_counts() const { return _num_capturing_groups; } + [[nodiscard]] CUDF_HOST_DEVICE inline int32_t group_counts() const + { + return _num_capturing_groups; + } /** * @brief Returns the regex instruction object for a given index. */ - __device__ inline reinst* get_inst(int32_t idx) const; + [[nodiscard]] __device__ inline reinst* get_inst(int32_t idx) const; /** * @brief Returns the regex class object for a given index. */ - __device__ inline reclass_device get_class(int32_t idx) const; + [[nodiscard]] __device__ inline reclass_device get_class(int32_t idx) const; /** * @brief Returns the start-instruction-ids vector. */ - __device__ inline int32_t* startinst_ids() const; + [[nodiscard]] __device__ inline int32_t* startinst_ids() const; /** * @brief Does a find evaluation using the compiled expression on the given string. 
diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 0e11e9c1bbd..50aab8c3ac4 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -246,7 +246,7 @@ __device__ inline int32_t reprog_device::regexec( expanded = false; for (int16_t i = 0; i < jnk.list1->size; i++) { - int32_t inst_id = static_cast(jnk.list1->inst_ids[i]); + auto inst_id = static_cast(jnk.list1->inst_ids[i]); int2& range = jnk.list1->ranges[i]; const reinst* inst = get_inst(inst_id); int32_t id_activate = -1; @@ -283,7 +283,7 @@ __device__ inline int32_t reprog_device::regexec( break; case BOW: { auto codept = utf8_to_codepoint(c); - char32_t last_c = static_cast(pos ? dstr[pos - 1] : 0); + auto last_c = static_cast(pos ? dstr[pos - 1] : 0); auto last_codept = utf8_to_codepoint(last_c); bool cur_alphaNumeric = (codept < 0x010000) && IS_ALPHANUM(_codepoint_flags[codept]); bool last_alphaNumeric = @@ -296,7 +296,7 @@ __device__ inline int32_t reprog_device::regexec( } case NBOW: { auto codept = utf8_to_codepoint(c); - char32_t last_c = static_cast(pos ? dstr[pos - 1] : 0); + auto last_c = static_cast(pos ? 
dstr[pos - 1] : 0); auto last_codept = utf8_to_codepoint(last_c); bool cur_alphaNumeric = (codept < 0x010000) && IS_ALPHANUM(_codepoint_flags[codept]); bool last_alphaNumeric = @@ -323,7 +323,7 @@ __device__ inline int32_t reprog_device::regexec( bool continue_execute = true; jnk.list2->reset(); for (int16_t i = 0; continue_execute && i < jnk.list1->size; i++) { - int32_t inst_id = static_cast(jnk.list1->inst_ids[i]); + auto inst_id = static_cast(jnk.list1->inst_ids[i]); int2& range = jnk.list1->ranges[i]; const reinst* inst = get_inst(inst_id); int32_t id_activate = -1; @@ -415,11 +415,11 @@ __device__ inline int32_t reprog_device::call_regexec( auto const schar = get_inst(_startinst_id)->u1.c; auto const relists_size = relist::alloc_size(_insts_count); - u_char* listmem = reinterpret_cast(_relists_mem); // beginning of relist buffer; + auto* listmem = reinterpret_cast(_relists_mem); // beginning of relist buffer; listmem += (idx * relists_size * 2); // two relist ptrs in reljunk: - relist* list1 = new (listmem) relist(static_cast(_insts_count)); - relist* list2 = new (listmem + relists_size) relist(static_cast(_insts_count)); + auto* list1 = new (listmem) relist(static_cast(_insts_count)); + auto* list2 = new (listmem + relists_size) relist(static_cast(_insts_count)); reljunk jnk(list1, list2, stype, schar); return regexec(dstr, jnk, begin, end, group_id); diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu index 4f93bbd6e7b..b286812226b 100644 --- a/cpp/src/strings/regex/regexec.cu +++ b/cpp/src/strings/regex/regexec.cu @@ -67,8 +67,7 @@ reprog_device::reprog_device(reprog& prog) _num_capturing_groups{prog.groups_count()}, _insts_count{prog.insts_count()}, _starts_count{prog.starts_count()}, - _classes_count{prog.classes_count()}, - _relists_mem{nullptr} + _classes_count{prog.classes_count()} { } diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 9fd1768453a..2c594bb86a8 100644 --- 
a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,37 +62,49 @@ struct replace_regex_fn { if (!d_chars) d_offsets[idx] = 0; return; } - auto const d_str = d_strings.element(idx); - auto const nchars = d_str.length(); // number of characters in input string - auto nbytes = d_str.size_bytes(); // number of bytes in input string - auto mxn = maxrepl < 0 ? nchars : maxrepl; // max possible replaces for this string - auto in_ptr = d_str.data(); // input pointer (i) - auto out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; // output pointer (o) - size_type lpos = 0; - int32_t begin = 0; - int32_t end = static_cast(nchars); + + auto const d_str = d_strings.element(idx); + auto nbytes = d_str.size_bytes(); // number of bytes in input string + auto mxn = maxrepl < 0 ? d_str.length() + 1 : maxrepl; // max possible replaces for this string + auto in_ptr = d_str.data(); // input pointer (i) + auto out_ptr = d_chars ? 
d_chars + d_offsets[idx] // output pointer (o) + : nullptr; + size_type last_pos = 0; + int32_t begin = 0; // these are for calling prog.find + int32_t end = -1; // matches final word-boundary if at the end of the string + // copy input to output replacing strings as we go - while (mxn-- > 0) // maximum number of replaces - { - if (prog.is_empty() || prog.find(idx, d_str, begin, end) <= 0) - break; // no more matches - auto spos = d_str.byte_offset(begin); // get offset for these - auto epos = d_str.byte_offset(end); // character position values - nbytes += d_repl.size_bytes() - (epos - spos); // compute new size - if (out_ptr) // replace - { // i:bbbbsssseeee - out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos); // o:bbbb - out_ptr = copy_string(out_ptr, d_repl); // o:bbbbrrrrrr - // out_ptr ---^ - lpos = epos; // i:bbbbsssseeee - } // in_ptr --^ - begin = end; - end = static_cast(nchars); + while (mxn-- > 0) { // maximum number of replaces + + if (prog.is_empty() || prog.find(idx, d_str, begin, end) <= 0) { + break; // no more matches + } + + auto const start_pos = d_str.byte_offset(begin); // get offset for these + auto const end_pos = d_str.byte_offset(end); // character position values + nbytes += d_repl.size_bytes() - (end_pos - start_pos); // and compute new size + + if (out_ptr) { // replace: + // i:bbbbsssseeee + out_ptr = copy_and_increment(out_ptr, // ^ + in_ptr + last_pos, // o:bbbb + start_pos - last_pos); // ^ + out_ptr = copy_string(out_ptr, d_repl); // o:bbbbrrrrrr + // out_ptr ---^ + last_pos = end_pos; // i:bbbbsssseeee + } // in_ptr --^ + + begin = end + (begin == end); + end = -1; } - if (out_ptr) // copy the remainder - memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); // o:bbbbrrrrrreeee - else + + if (out_ptr) { + memcpy(out_ptr, // copy the remainder + in_ptr + last_pos, // o:bbbbrrrrrreeee + d_str.size_bytes() - last_pos); // ^ ^ + } else { d_offsets[idx] = static_cast(nbytes); + } } }; diff --git 
a/cpp/src/strings/find.cu b/cpp/src/strings/search/find.cu similarity index 100% rename from cpp/src/strings/find.cu rename to cpp/src/strings/search/find.cu diff --git a/cpp/src/strings/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu similarity index 67% rename from cpp/src/strings/find_multiple.cu rename to cpp/src/strings/search/find_multiple.cu index 72e7081cb7a..5756c239f1c 100644 --- a/cpp/src/strings/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -31,37 +33,32 @@ namespace cudf { namespace strings { namespace detail { std::unique_ptr find_multiple( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(type_id::INT32); - auto targets_count = targets.size(); + auto const strings_count = input.size(); + auto const targets_count = targets.size(); CUDF_EXPECTS(targets_count > 0, "Must include at least one search target"); CUDF_EXPECTS(!targets.has_nulls(), "Search targets cannot contain null strings"); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; auto targets_column = column_device_view::create(targets.parent(), stream); auto d_targets = *targets_column; + auto const total_count = strings_count * targets_count; + // create output column - auto total_count = strings_count * targets_count; - auto results = 
make_numeric_column(data_type{type_id::INT32}, - total_count, - rmm::device_buffer{0, stream, mr}, - 0, - stream, - mr); // no nulls - auto results_view = results->mutable_view(); - auto d_results = results_view.data(); + auto results = make_numeric_column( + data_type{type_id::INT32}, total_count, rmm::device_buffer{0, stream, mr}, 0, stream, mr); + // fill output column with position values thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(total_count), - d_results, + results->mutable_view().begin(), [d_strings, d_targets, targets_count] __device__(size_type idx) { size_type str_idx = idx / targets_count; if (d_strings.is_null(str_idx)) return -1; @@ -70,18 +67,30 @@ std::unique_ptr find_multiple( return d_str.find(d_tgt); }); results->set_null_count(0); - return results; + + auto offsets = cudf::detail::sequence(strings_count + 1, + numeric_scalar(0), + numeric_scalar(targets_count), + stream, + mr); + return make_lists_column(strings_count, + std::move(offsets), + std::move(results), + 0, + rmm::device_buffer{0, stream, mr}, + stream, + mr); } } // namespace detail // external API -std::unique_ptr find_multiple(strings_column_view const& strings, +std::unique_ptr find_multiple(strings_column_view const& input, strings_column_view const& targets, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::find_multiple(strings, targets, rmm::cuda_stream_default, mr); + return detail::find_multiple(input, targets, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/search/findall.cu similarity index 94% rename from cpp/src/strings/findall.cu rename to cpp/src/strings/search/findall.cu index 8d96f0de415..8fb754848d4 100644 --- a/cpp/src/strings/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -67,7 +67,7 @@ struct findall_fn { string_view d_str = d_strings.element(idx); auto const nchars = d_str.length(); int32_t spos = 0; - 
int32_t epos = static_cast(nchars); + auto epos = static_cast(nchars); size_type column_count = 0; while (spos <= nchars) { if (prog.find(idx, d_str, spos, epos) <= 0) break; // no more matches found @@ -109,11 +109,11 @@ struct findall_count_fn : public findall_fn { } // namespace // -std::unique_ptr
findall_re( +std::unique_ptr
findall( strings_column_view const& strings, std::string const& pattern, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - rmm::cuda_stream_view stream = rmm::cuda_stream_default) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto const strings_count = strings.size(); auto const d_strings = column_device_view::create(strings.parent(), stream); @@ -205,12 +205,12 @@ std::unique_ptr
findall_re( // external API -std::unique_ptr
findall_re(strings_column_view const& strings, - std::string const& pattern, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
findall(strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::findall_re(strings, pattern, mr); + return detail::findall(strings, pattern, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/search/findall_record.cu b/cpp/src/strings/search/findall_record.cu new file mode 100644 index 00000000000..9ffdb33f5f2 --- /dev/null +++ b/cpp/src/strings/search/findall_record.cu @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +using string_index_pair = thrust::pair; + +namespace { + +/** + * @brief This functor handles extracting matched strings by applying the compiled regex pattern + * and creating string_index_pairs for all the substrings. 
+ */ +template +struct findall_fn { + column_device_view const d_strings; + reprog_device prog; + offset_type const* d_offsets; + string_index_pair* d_indices; + + __device__ void operator()(size_type const idx) + { + if (d_strings.is_null(idx)) { return; } + auto const d_str = d_strings.element(idx); + + auto d_output = d_indices + d_offsets[idx]; + size_type output_idx = 0; + + int32_t begin = 0; + int32_t end = d_str.length(); + while ((begin < end) && (prog.find(idx, d_str, begin, end) > 0)) { + auto const spos = d_str.byte_offset(begin); // convert + auto const epos = d_str.byte_offset(end); // to bytes + + d_output[output_idx++] = string_index_pair{d_str.data() + spos, (epos - spos)}; + + begin = end + (begin == end); + end = d_str.length(); + } + } +}; + +} // namespace + +// +std::unique_ptr findall_record( + strings_column_view const& strings, + std::string const& pattern, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto const strings_count = strings.size(); + auto const d_strings = column_device_view::create(strings.parent(), stream); + + // compile regex into device object + auto const d_prog = + reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + + // Create lists offsets column + auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + + // Compute null output rows + auto [null_mask, null_count] = cudf::detail::valid_if( + d_offsets, + d_offsets + strings_count, + [] __device__(auto const v) { return v > 0; }, + stream, + mr); + + auto const valid_count = strings_count - null_count; + // Return an empty lists column if there are no valid rows + if (valid_count == 0) { + return make_lists_column(0, + make_empty_column(type_to_id()), + make_empty_column(type_id::STRING), + 0, + rmm::device_buffer{}, + stream, + mr); + } + + // Convert counts into offsets + thrust::exclusive_scan( + 
rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + + // Create indices vector with the total number of groups that will be extracted + auto total_matches = cudf::detail::get_value(offsets->view(), strings_count, stream); + + rmm::device_uvector indices(total_matches, stream); + auto d_indices = indices.data(); + auto begin = thrust::make_counting_iterator(0); + + // Build the string indices + auto const regex_insts = d_prog->insts_counts(); + if (regex_insts <= RX_SMALL_INSTS) { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else if (regex_insts <= RX_LARGE_INSTS) { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } + + // Build the child strings column from the resulting indices + auto strings_output = make_strings_column(indices.begin(), indices.end(), stream, mr); + + // Build the lists column from the offsets and the strings + return make_lists_column(strings_count, + std::move(offsets), + std::move(strings_output), + null_count, + std::move(null_mask), + stream, + mr); +} + +} // namespace detail + +// external API + +std::unique_ptr findall_record(strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::findall_record(strings, pattern, rmm::cuda_stream_default, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index c6e52a79059..aae911e8ed6 100644 --- 
a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -547,7 +547,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, */ struct base_whitespace_split_tokenizer { // count the tokens only between non-whitespace characters - __device__ size_type count_tokens(size_type idx) const + [[nodiscard]] __device__ size_type count_tokens(size_type idx) const { if (d_strings.is_null(idx)) return 0; const string_view d_str = d_strings.element(idx); diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index c6dd11c1d82..62fd98d2027 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -199,12 +200,14 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); // create the normalizer and call it - data_normalizer normalizer(stream, do_lower_case); - auto result = [&strings, &normalizer, stream] { - auto const offsets = strings.offsets(); - auto const d_offsets = offsets.data() + strings.offset(); - auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); - auto const d_chars = strings.chars().data() + offset; + auto result = [&] { + auto const cp_metadata = get_codepoint_metadata(stream); + auto const aux_table = get_aux_codepoint_data(stream); + auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case); + auto const offsets = strings.offsets(); + auto const d_offsets = offsets.data() + strings.offset(); + auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); + auto const d_chars = strings.chars().data() + offset; return normalizer.normalize(d_chars, d_offsets, strings.size(), stream); }(); diff --git 
a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 2e6dbe62cf1..5af87f4de0e 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -250,9 +250,8 @@ __global__ void kernel_data_normalizer(unsigned char const* strings, chars_per_thread[char_for_thread] = num_new_chars; - typedef cub:: - BlockStore - BlockStore; + using BlockStore = + cub::BlockStore; __shared__ typename BlockStore::TempStorage temp_storage; // Now we perform coalesced writes back to global memory using cub. @@ -262,17 +261,17 @@ __global__ void kernel_data_normalizer(unsigned char const* strings, } // namespace -data_normalizer::data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case) - : do_lower_case(do_lower_case) +data_normalizer::data_normalizer(codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case) + : d_cp_metadata{cp_metadata}, d_aux_table{aux_table}, do_lower_case{do_lower_case} { - d_cp_metadata = detail::get_codepoint_metadata(stream); - d_aux_table = detail::get_aux_codepoint_data(stream); } uvector_pair data_normalizer::normalize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream) const { if (num_strings == 0) return std::make_pair(std::make_unique>(0, stream), diff --git a/cpp/src/text/subword/detail/data_normalizer.hpp b/cpp/src/text/subword/detail/data_normalizer.hpp index 1a9eb5ba997..927de5a74f9 100644 --- a/cpp/src/text/subword/detail/data_normalizer.hpp +++ b/cpp/src/text/subword/detail/data_normalizer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,14 +48,17 @@ namespace detail { class data_normalizer { public: /** - * @brief Transfer to the GPU the metadata needed to normalize characters. + * @brief Create instance of the normalizer. * - * @param stream CUDA stream used for device memory operations and kernel launches. + * @param cp_metadata The code point metadata table to use for normalization. + * @param aux_table The auxiliary code point table. * @param do_lower_case If true, the normalizer will convert uppercase characters in the * input stream to lower case and strip accents from those characters. * If false, accented and uppercase characters are not transformed. */ - data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case = true); + data_normalizer(codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case = true); /** * @brief Normalize a vector of strings. @@ -84,7 +87,7 @@ class data_normalizer { uvector_pair normalize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - rmm::cuda_stream_view stream); + rmm::cuda_stream_view stream) const; private: bool const do_lower_case; diff --git a/cpp/src/text/subword/detail/tokenizer_utils.cuh b/cpp/src/text/subword/detail/tokenizer_utils.cuh index 48ee0fc2b51..5e8de1ba244 100644 --- a/cpp/src/text/subword/detail/tokenizer_utils.cuh +++ b/cpp/src/text/subword/detail/tokenizer_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,8 +19,9 @@ #include #include +#include -#include +#include namespace nvtext { namespace detail { @@ -57,22 +58,16 @@ struct update_strings_lengths_fn { /** * @brief Retrieve the code point metadata table. * - * This is a singleton instance that copies a large table of integers into - * device memory on the very first call. - * * @param stream CUDA stream used for device memory operations and kernel launches. */ -codepoint_metadata_type const* get_codepoint_metadata(rmm::cuda_stream_view stream); +rmm::device_uvector get_codepoint_metadata(rmm::cuda_stream_view stream); /** - * @brief Retrieve the aux code point metadata table. - * - * This is a singleton instance that copies a large table of integers into - * device memory on the very first call. + * @brief Retrieve the auxiliary code point metadata table. * * @param stream CUDA stream used for device memory operations and kernel launches. */ -aux_codepoint_data_type const* get_aux_codepoint_data(rmm::cuda_stream_view stream); +rmm::device_uvector get_aux_codepoint_data(rmm::cuda_stream_view stream); } // namespace detail } // namespace nvtext diff --git a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp index 0259e8ce4f4..b5ad9724d72 100644 --- a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp +++ b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,7 +61,6 @@ class wordpiece_tokenizer { * @param do_lower_case If true, the tokenizer will convert uppercase characters in the * input stream to lowercase and strip accents from those characters. * If false, accented and uppercase characters are not transformed. - * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param max_word_length The length of the longest word that will be tokenized. Words * longer than this will simply be replaced by the unknown token * specified in the `vocab_file`. @@ -72,7 +71,6 @@ class wordpiece_tokenizer { uint32_t stride, bool do_truncate, bool do_lower_case, - rmm::cuda_stream_view stream, uint32_t max_word_length = 200); /** diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu index 0af34eb8092..7cfdb4dea96 100644 --- a/cpp/src/text/subword/load_hash_file.cu +++ b/cpp/src/text/subword/load_hash_file.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -29,95 +28,40 @@ #include #include +#include #include #include -#include #include namespace nvtext { namespace detail { -namespace { -struct get_codepoint_metadata_init { - rmm::cuda_stream_view stream; - - rmm::device_uvector* operator()() const - { - auto table_vector = - new rmm::device_uvector(codepoint_metadata_size, stream); - auto table = table_vector->data(); - thrust::fill(rmm::exec_policy(stream), - table + cp_section1_end, - table + codepoint_metadata_size, - codepoint_metadata_default_value); - CUDA_TRY(cudaMemcpyAsync(table, - codepoint_metadata, - cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + cp_section2_begin, - cp_metadata_917505_917999, - (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section - cudaMemcpyHostToDevice, - stream.value())); - return table_vector; - }; -}; - -struct get_aux_codepoint_data_init { - rmm::cuda_stream_view stream; - - rmm::device_uvector* operator()() const - { - auto table_vector = - new 
rmm::device_uvector(aux_codepoint_data_size, stream); - auto table = table_vector->data(); - thrust::fill(rmm::exec_policy(stream), - table + aux_section1_end, - table + aux_codepoint_data_size, - aux_codepoint_default_value); - CUDA_TRY(cudaMemcpyAsync(table, - aux_codepoint_data, - aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + aux_section2_begin, - aux_cp_data_44032_55203, - (aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + aux_section3_begin, - aux_cp_data_70475_71099, - (aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + aux_section4_begin, - aux_cp_data_119134_119232, - (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section - cudaMemcpyHostToDevice, - stream.value())); - return table_vector; - } -}; -} // namespace - /** * @brief Retrieve the code point metadata table. 
* * Build the code point metadata table in device memory * using the vector pieces from codepoint_metadata.ah */ -const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stream) +rmm::device_uvector get_codepoint_metadata(rmm::cuda_stream_view stream) { - static cudf::strings::detail::thread_safe_per_context_cache< - rmm::device_uvector> - g_codepoint_metadata; - - return g_codepoint_metadata.find_or_initialize(get_codepoint_metadata_init{stream})->data(); + auto table_vector = rmm::device_uvector(codepoint_metadata_size, stream); + auto table = table_vector.data(); + thrust::fill(rmm::exec_policy(stream), + table + cp_section1_end, + table + codepoint_metadata_size, + codepoint_metadata_default_value); + CUDA_TRY(cudaMemcpyAsync(table, + codepoint_metadata, + cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + cp_section2_begin, + cp_metadata_917505_917999, + (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section + cudaMemcpyHostToDevice, + stream.value())); + return table_vector; } /** @@ -126,13 +70,38 @@ const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stre * Build the aux code point data table in device memory * using the vector pieces from codepoint_metadata.ah */ -const aux_codepoint_data_type* get_aux_codepoint_data(rmm::cuda_stream_view stream) +rmm::device_uvector get_aux_codepoint_data(rmm::cuda_stream_view stream) { - static cudf::strings::detail::thread_safe_per_context_cache< - rmm::device_uvector> - g_aux_codepoint_data; - - return g_aux_codepoint_data.find_or_initialize(get_aux_codepoint_data_init{stream})->data(); + auto table_vector = rmm::device_uvector(aux_codepoint_data_size, stream); + auto table = table_vector.data(); + thrust::fill(rmm::exec_policy(stream), + table + aux_section1_end, + table + aux_codepoint_data_size, + aux_codepoint_default_value); + 
CUDA_TRY(cudaMemcpyAsync(table, + aux_codepoint_data, + aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + aux_section2_begin, + aux_cp_data_44032_55203, + (aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + aux_section3_begin, + aux_cp_data_70475_71099, + (aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + aux_section4_begin, + aux_cp_data_119134_119232, + (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section + cudaMemcpyHostToDevice, + stream.value())); + return table_vector; } namespace { @@ -293,10 +262,15 @@ std::unique_ptr load_vocabulary_file( cudaMemcpyHostToDevice, stream.value())); - // this just initializes some constant tables into device memory - // to help speed up the runtime - detail::get_codepoint_metadata(stream); - detail::get_aux_codepoint_data(stream); + auto cp_metadata = detail::get_codepoint_metadata(stream); + auto const cp_metadata_size = static_cast(cp_metadata.size()); + result.cp_metadata = std::make_unique( + cudf::data_type{cudf::type_id::UINT32}, cp_metadata_size, cp_metadata.release()); + + auto aux_cp_table = detail::get_aux_codepoint_data(stream); + auto const aux_cp_table_size = static_cast(aux_cp_table.size()); + result.aux_cp_table = std::make_unique( + cudf::data_type{cudf::type_id::UINT64}, aux_cp_table_size, aux_cp_table.release()); return std::make_unique(std::move(result)); } diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 193cd80d9a6..1ac7dd0d8a1 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 
2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -153,7 +153,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, // Create tokenizer wordpiece_tokenizer tokenizer( - vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case, stream); + vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case); // Run tokenizer auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream); // assign output components diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index 00798e7e4e2..afd82f0bb5d 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -394,10 +394,11 @@ wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table, uint32_t stride, bool do_truncate, bool do_lower_case, - rmm::cuda_stream_view stream, uint32_t max_word_length) : vocab_table(vocab_table), - normalizer(stream, do_lower_case), + normalizer(vocab_table.cp_metadata->view().data(), + vocab_table.aux_cp_table->view().data(), + do_lower_case), max_sequence_length{max_sequence_length}, stride(stride), do_truncate(do_truncate), diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index ff720daa5cb..f6b10cfc583 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -116,17 +116,17 @@ struct column_info { * */ struct hierarchy_info { - hierarchy_info() : simple_per_row_size(0), complex_type_count(0), max_branch_depth(0) {} + hierarchy_info() {} // These two fields act as an optimization. If we find that the entire table // is just fixed-width types, we do not need to do the more expensive kernel call that // traverses the individual columns. So if complex_type_count is 0, we can just // return a column where every row contains the value simple_per_row_size - size_type simple_per_row_size; // in bits - size_type complex_type_count; + size_type simple_per_row_size{0}; // in bits + size_type complex_type_count{0}; // max depth of span branches present in the hierarchy. - size_type max_branch_depth; + size_type max_branch_depth{0}; }; /** diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index 0339d52dda9..00408741653 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -475,6 +475,64 @@ TYPED_TEST(BinaryOperationCompiledTest_Logical, LogicalOr_Vector_Vector) this->template test(cudf::binary_operator::LOGICAL_OR); } +template +using column_wrapper = std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; + +template +auto NullOp_Result(column_view lhs, column_view rhs) +{ + auto [lhs_data, lhs_mask] = cudf::test::to_host(lhs); + auto [rhs_data, rhs_mask] = cudf::test::to_host(rhs); + std::vector result(lhs.size()); + std::vector result_mask; + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lhs.size()), + result.begin(), + [&lhs_data, &lhs_mask, &rhs_data, &rhs_mask, &result_mask](auto i) -> TypeOut { + auto lhs_valid = lhs_mask.data() and cudf::bit_is_set(lhs_mask.data(), i); + auto rhs_valid = rhs_mask.data() and cudf::bit_is_set(rhs_mask.data(), i); + bool output_valid = lhs_valid or rhs_valid; + auto result = OP{}(lhs_data[i], rhs_data[i], lhs_valid, rhs_valid, output_valid); + result_mask.push_back(output_valid); + return result; + }); + return column_wrapper(result.cbegin(), result.cend(), result_mask.cbegin()); +} + +TYPED_TEST(BinaryOperationCompiledTest_Logical, NullLogicalAnd_Vector_Vector) +{ + using TypeOut = bool; + using TypeLhs = typename TestFixture::TypeLhs; + using TypeRhs = typename TestFixture::TypeRhs; + using NULL_AND = cudf::library::operation::NullLogicalAnd; + + auto lhs = lhs_random_column(col_size); + auto rhs = rhs_random_column(col_size); + auto const expected = NullOp_Result(lhs, rhs); + + auto const result = cudf::binary_operation( + lhs, rhs, cudf::binary_operator::NULL_LOGICAL_AND, data_type(type_to_id())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(BinaryOperationCompiledTest_Logical, NullLogicalOr_Vector_Vector) +{ + using TypeOut = bool; + using TypeLhs = typename TestFixture::TypeLhs; + using TypeRhs = typename TestFixture::TypeRhs; + using NULL_OR = 
cudf::library::operation::NullLogicalOr; + + auto lhs = lhs_random_column(col_size); + auto rhs = rhs_random_column(col_size); + auto const expected = NullOp_Result(lhs, rhs); + + auto const result = cudf::binary_operation( + lhs, rhs, cudf::binary_operator::NULL_LOGICAL_OR, data_type(type_to_id())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + // Comparison Operations ==, !=, <, >, <=, >= // nn, tt, dd, ss, dcdc using Comparison_types = cudf::test::Types, @@ -554,32 +612,6 @@ struct BinaryOperationCompiledTest_NullOps : public BinaryOperationCompiledTest< }; TYPED_TEST_SUITE(BinaryOperationCompiledTest_NullOps, Null_types); -template -using column_wrapper = std::conditional_t, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>; - -template -auto NullOp_Result(column_view lhs, column_view rhs) -{ - auto [lhs_data, lhs_mask] = cudf::test::to_host(lhs); - auto [rhs_data, rhs_mask] = cudf::test::to_host(rhs); - std::vector result(lhs.size()); - std::vector result_mask; - std::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(lhs.size()), - result.begin(), - [&lhs_data, &lhs_mask, &rhs_data, &rhs_mask, &result_mask](auto i) -> TypeOut { - auto lhs_valid = lhs_mask.data() and cudf::bit_is_set(lhs_mask.data(), i); - auto rhs_valid = rhs_mask.data() and cudf::bit_is_set(rhs_mask.data(), i); - bool output_valid = lhs_valid or rhs_valid; - auto result = OP{}(lhs_data[i], rhs_data[i], lhs_valid, rhs_valid, output_valid); - result_mask.push_back(output_valid); - return result; - }); - return column_wrapper(result.cbegin(), result.cend(), result_mask.cbegin()); -} - TYPED_TEST(BinaryOperationCompiledTest_NullOps, NullEquals_Vector_Vector) { using TypeOut = bool; diff --git a/cpp/tests/binaryop/util/operation.h b/cpp/tests/binaryop/util/operation.h index 481e5cfd4a9..22802580cd0 100644 --- a/cpp/tests/binaryop/util/operation.h +++ b/cpp/tests/binaryop/util/operation.h @@ -1,5 +1,5 @@ /* - * Copyright 
(c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -323,6 +323,48 @@ struct PyMod { } }; +template +struct NullLogicalAnd { + TypeOut operator()(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) const + { + if (lhs_valid && !x) { + output_valid = true; + return false; + } + if (rhs_valid && !y) { + output_valid = true; + return false; + } + if (lhs_valid && rhs_valid) { + output_valid = true; + return true; + } + output_valid = false; + return false; + } +}; + +template +struct NullLogicalOr { + TypeOut operator()(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) const + { + if (lhs_valid && x) { + output_valid = true; + return true; + } + if (rhs_valid && y) { + output_valid = true; + return true; + } + if (lhs_valid && rhs_valid) { + output_valid = true; + return false; + } + output_valid = false; + return false; + } +}; + template struct NullEquals { TypeOut operator()(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) const diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp index 4afa96f08d7..8a742b50baa 100644 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -163,7 +163,7 @@ TYPED_TEST(ColumnViewShallowTests, shallow_hash_update_data) col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); auto col_view_new = cudf::column_view{*col}; EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_new)); - col_view_new.null_count(); + [[maybe_unused]] auto const nulls = col_view_new.null_count(); EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_new)); auto col_view_new2 = cudf::column_view{*col}; EXPECT_EQ(shallow_hash(col_view_new), shallow_hash(col_view_new2)); @@ -332,7 +332,7 @@ TYPED_TEST(ColumnViewShallowTests, 
is_shallow_equivalent_update_data) col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); auto col_view_new = cudf::column_view{*col}; EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_new)); - col_view_new.null_count(); + [[maybe_unused]] auto const nulls = col_view_new.null_count(); EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_new)); auto col_view_new2 = cudf::column_view{*col}; EXPECT_TRUE(is_shallow_equivalent(col_view_new, col_view_new2)); diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu index 306037e6473..a306736d131 100644 --- a/cpp/tests/copying/concatenate_tests.cu +++ b/cpp/tests/copying/concatenate_tests.cu @@ -343,7 +343,7 @@ TEST_F(OverflowTest, OverflowTest) // primitive column { - constexpr size_type size = static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto size = static_cast(static_cast(1024) * 1024 * 1024); // try and concatenate 6 char columns of size 1 billion each auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, size); @@ -355,7 +355,7 @@ TEST_F(OverflowTest, OverflowTest) // string column, overflow on chars { - constexpr size_type size = static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto size = static_cast(static_cast(1024) * 1024 * 1024); // try and concatenate 6 string columns of with 1 billion chars in each auto offsets = cudf::test::fixed_width_column_wrapper{0, size}; @@ -370,7 +370,7 @@ TEST_F(OverflowTest, OverflowTest) // string column, overflow on offsets (rows) { - constexpr size_type size = static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto size = static_cast(static_cast(1024) * 1024 * 1024); // try and concatenate 6 string columns 1 billion rows each auto many_offsets = cudf::make_fixed_width_column(data_type{type_id::INT32}, size + 1); @@ -385,8 +385,7 @@ TEST_F(OverflowTest, OverflowTest) // list, structs too long { - constexpr size_type inner_size = - static_cast(static_cast(512) * 
1024 * 1024); + constexpr auto inner_size = static_cast(static_cast(512) * 1024 * 1024); // struct std::vector> children; @@ -408,9 +407,8 @@ TEST_F(OverflowTest, OverflowTest) // struct, list child too long { - constexpr size_type inner_size = - static_cast(static_cast(512) * 1024 * 1024); - constexpr size_type size = 3; + constexpr auto inner_size = static_cast(static_cast(512) * 1024 * 1024); + constexpr size_type size = 3; // list auto offsets = cudf::test::fixed_width_column_wrapper{0, 0, 0, inner_size}; @@ -437,7 +435,7 @@ TEST_F(OverflowTest, Presliced) // primitive column { - constexpr size_type size = static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto size = static_cast(static_cast(1024) * 1024 * 1024); // try and concatenate 4 char columns of size ~1/2 billion each auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, size); @@ -454,7 +452,7 @@ TEST_F(OverflowTest, Presliced) // struct column { - constexpr size_type size = static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto size = static_cast(static_cast(1024) * 1024 * 1024); // try and concatenate 4 char columns of size ~1/2 billion each std::vector> children; @@ -542,8 +540,7 @@ TEST_F(OverflowTest, Presliced) // list, structs too long { - constexpr size_type inner_size = - static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto inner_size = static_cast(static_cast(1024) * 1024 * 1024); // struct std::vector> children; @@ -616,8 +613,7 @@ TEST_F(OverflowTest, Presliced) // struct, list child elements too long { - constexpr size_type inner_size = - static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto inner_size = static_cast(static_cast(1024) * 1024 * 1024); constexpr size_type num_rows = 4; constexpr size_type list_size = inner_size / num_rows; @@ -656,7 +652,7 @@ TEST_F(OverflowTest, BigColumnsSmallSlices) // primitive column { - constexpr size_type size = static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto size = 
static_cast(static_cast(1024) * 1024 * 1024); auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, size); auto sliced = cudf::slice(*many_chars, {16, 32}); @@ -668,8 +664,7 @@ TEST_F(OverflowTest, BigColumnsSmallSlices) // strings column { - constexpr size_type inner_size = - static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto inner_size = static_cast(static_cast(1024) * 1024 * 1024); constexpr size_type num_rows = 1024; constexpr size_type string_size = inner_size / num_rows; @@ -696,8 +691,7 @@ TEST_F(OverflowTest, BigColumnsSmallSlices) // list column { - constexpr size_type inner_size = - static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto inner_size = static_cast(static_cast(1024) * 1024 * 1024); constexpr size_type num_rows = 1024; constexpr size_type list_size = inner_size / num_rows; @@ -724,8 +718,7 @@ TEST_F(OverflowTest, BigColumnsSmallSlices) // struct { - constexpr size_type inner_size = - static_cast(static_cast(1024) * 1024 * 1024); + constexpr auto inner_size = static_cast(static_cast(1024) * 1024 * 1024); constexpr size_type num_rows = 1024; constexpr size_type list_size = inner_size / num_rows; diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp index 4468bc69640..2f02f4cba02 100644 --- a/cpp/tests/copying/copy_tests.cpp +++ b/cpp/tests/copying/copy_tests.cpp @@ -70,30 +70,38 @@ TYPED_TEST(CopyTest, CopyIfElseTestLong) // make sure we span at least 2 warps int num_els = 64; - bool mask[] = {1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + bool mask[] = {true, false, true, false, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, false, false, false, false, true, true, true, + true, true, true, true, true, true, false, false, false, false, true, true, true, + true, true, true, true, true, 
true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); - bool lhs_v[] = {1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + bool lhs_v[] = {true, true, true, true, false, false, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}; wrapper lhs_w({5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, lhs_v); - bool rhs_v[] = {1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + bool rhs_v[] = {true, true, true, true, true, true, false, false, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}; wrapper rhs_w({6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}, rhs_v); - bool exp_v[] = {1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + bool exp_v[] = {true, true, true, true, false, false, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, true}; wrapper expected_w({5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}, @@ -309,13 +317,13 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarColumn) int num_els = 4; - bool mask[] = {1, 0, 0, 1}; + bool mask[] = {true, false, false, true}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); cudf::numeric_scalar lhs_w(5); const auto rhs = cudf::test::make_type_param_vector({6, 6, 6, 6}); - bool rhs_v[] = {1, 0, 1, 1}; + bool rhs_v[] = {true, false, true, true}; wrapper rhs_w(rhs.begin(), rhs.end(), rhs_v); const auto expected = cudf::test::make_type_param_vector({5, 6, 6, 5}); @@ -331,12 +339,12 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestColumnScalar) int num_els = 4; - bool mask[] = {1, 0, 0, 1}; - bool mask_v[] = {1, 1, 1, 0}; + bool mask[] = {true, false, false, true}; + bool mask_v[] = {true, true, true, false}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els, mask_v); const auto lhs = cudf::test::make_type_param_vector({5, 5, 5, 5}); - bool lhs_v[] = {0, 1, 1, 1}; + bool lhs_v[] = {false, true, true, true}; wrapper lhs_w(lhs.begin(), lhs.end(), lhs_v); cudf::numeric_scalar rhs_w(6); @@ -354,7 +362,7 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarScalar) int num_els = 4; - bool mask[] = {1, 0, 0, 1}; + bool mask[] = {true, false, false, true}; cudf::test::fixed_width_column_wrapper 
mask_w(mask, mask + num_els); cudf::numeric_scalar lhs_w(5); @@ -399,12 +407,12 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestScalarColumn) int num_els = 4; - bool mask[] = {1, 0, 0, 1}; + bool mask[] = {true, false, false, true}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); auto lhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(5), true); - bool rhs_v[] = {1, 0, 1, 1}; + bool rhs_v[] = {true, false, true, true}; wrapper rhs_w({6, 6, 6, 6}, rhs_v); wrapper expected_w({5, 6, 6, 5}, rhs_v); @@ -419,10 +427,10 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestColumnScalar) int num_els = 4; - bool mask[] = {1, 0, 0, 1}; + bool mask[] = {true, false, false, true}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); - bool lhs_v[] = {0, 1, 1, 1}; + bool lhs_v[] = {false, true, true, true}; wrapper lhs_w({5, 5, 5, 5}, lhs_v); auto rhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(6), true); @@ -439,7 +447,7 @@ TYPED_TEST(CopyTestChrono, CopyIfElseTestScalarScalar) int num_els = 4; - bool mask[] = {1, 0, 0, 1}; + bool mask[] = {true, false, false, true}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); auto lhs_w = create_chrono_scalar{}(cudf::test::make_type_param_scalar(5), true); @@ -477,8 +485,8 @@ TEST_F(StringsCopyIfElseTest, CopyIfElse) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {1, 1, 0, 1, 0, 1}; - bool mask_v[] = {1, 1, 1, 1, 1, 0}; + bool mask[] = {true, true, false, true, false, true}; + bool mask_v[] = {true, true, true, true, true, false}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); auto results = cudf::copy_if_else(strings1, strings2, mask_w); @@ -504,8 +512,8 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseScalarColumn) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper 
strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {1, 0, 1, 0, 1, 0}; - bool mask_v[] = {1, 1, 1, 1, 1, 0}; + bool mask[] = {true, false, true, false, true, false}; + bool mask_v[] = {true, true, true, true, true, false}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); auto results = cudf::copy_if_else(strings1, strings2, mask_w); @@ -532,7 +540,7 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseColumnScalar) std::vector h_strings2{"zz", "", "yyy", "w", "ééé", "ooo"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {0, 1, 1, 1, 0, 1}; + bool mask[] = {false, true, true, true, false, true}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6); auto results = cudf::copy_if_else(strings2, strings1, mask_w); @@ -560,14 +568,14 @@ TEST_F(StringsCopyIfElseTest, CopyIfElseScalarScalar) cudf::string_scalar string2{h_string2[0], false}; constexpr cudf::size_type mask_size = 6; - bool mask[] = {1, 0, 1, 0, 1, 0}; + bool mask[] = {true, false, true, false, true, false}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + mask_size); auto results = cudf::copy_if_else(string1, string2, mask_w); std::vector h_expected; - for (cudf::size_type idx = 0; idx < static_cast(mask_size); ++idx) { - if (mask[idx]) { + for (bool idx : mask) { + if (idx) { h_expected.push_back(h_string1[0]); } else { h_expected.push_back(h_string2[0]); @@ -649,8 +657,8 @@ TEST_F(DictionaryCopyIfElseTest, ColumnColumn) cudf::test::dictionary_column_wrapper input2( h_strings2.begin(), h_strings2.end(), valids); - bool mask[] = {1, 1, 0, 1, 0, 1}; - bool mask_v[] = {1, 1, 1, 1, 1, 0}; + bool mask[] = {true, true, false, true, false, true}; + bool mask_v[] = {true, true, true, true, true, false}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); auto results = cudf::copy_if_else(input1, input2, mask_w); @@ -676,7 +684,7 @@ TEST_F(DictionaryCopyIfElseTest, ColumnScalar) 
cudf::test::dictionary_column_wrapper input2( h_strings.begin(), h_strings.end(), valids); - bool mask[] = {0, 1, 1, 1, 0, 1}; + bool mask[] = {false, true, true, true, false, true}; cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6); auto results = cudf::copy_if_else(input2, input1, mask_w); diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 1199dfb44f2..2591f395914 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -36,8 +36,7 @@ namespace cudf { namespace test { using namespace cudf; - -typedef thrust::tuple expected_value; +using expected_value = thrust::tuple; template struct TDigestAllTypes : public cudf::test::BaseFixture { diff --git a/cpp/tests/hash_map/multimap_test.cu b/cpp/tests/hash_map/multimap_test.cu index 4a0e3807a4c..456ba951a45 100644 --- a/cpp/tests/hash_map/multimap_test.cu +++ b/cpp/tests/hash_map/multimap_test.cu @@ -66,20 +66,19 @@ class MultimapTest : public cudf::test::BaseFixture { rmm::cuda_stream_default.synchronize(); } - ~MultimapTest() {} + ~MultimapTest() override {} }; // Google Test can only do a parameterized typed-test over a single type, so we // have to nest multiple types inside of the KeyValueTypes struct above // KeyValueTypes implies key_type = type1, value_type = type2 // This list is the types across which Google Test will run our tests -typedef ::testing::Types, - KeyValueTypes, - KeyValueTypes, - KeyValueTypes, - KeyValueTypes, - KeyValueTypes> - Implementations; +using Implementations = ::testing::Types, + KeyValueTypes, + KeyValueTypes, + KeyValueTypes, + KeyValueTypes, + KeyValueTypes>; TYPED_TEST_SUITE(MultimapTest, Implementations); diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp index bd6deae9dc4..da933b44b8d 100644 --- a/cpp/tests/hashing/hash_test.cpp +++ b/cpp/tests/hashing/hash_test.cpp @@ -38,7 +38,7 @@ TEST_F(HashTest, MultiValue) "The quick brown fox", "jumps over the lazy dog.", "All 
work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); + R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}); using limits = std::numeric_limits; fixed_width_column_wrapper const ints_col({0, 100, -100, limits::min(), limits::max()}); @@ -71,13 +71,13 @@ TEST_F(HashTest, MultiValueNulls) "The quick brown fox", "jumps over the lazy dog.", "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, + R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}, {0, 1, 1, 0, 1}); strings_column_wrapper const strings_col2({"different but null", "The quick brown fox", "jumps over the lazy dog.", "I am Jack's complete lack of null value", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, + R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}, {0, 1, 1, 0, 1}); // Nulls with different values should be equal @@ -298,32 +298,36 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) // The hash values were determined by running the following Scala code in Apache Spark: // import org.apache.spark.sql.catalyst.util.DateTimeUtils // val schema = new StructType().add("structs", new StructType().add("a",IntegerType) - // .add("b",StringType).add("c",new StructType().add("x",FloatType).add("y",LongType))) + // .add("b",StringType).add("c",new StructType().add("x",FloatType).add("y",LongType))) // .add("strings",StringType).add("doubles",DoubleType).add("timestamps",TimestampType) // .add("decimal64", DecimalType(18,7)).add("longs",LongType).add("floats",FloatType) // .add("dates",DateType).add("decimal32", DecimalType(9,3)).add("ints",IntegerType) // .add("shorts",ShortType).add("bytes",ByteType).add("bools",BooleanType) + // .add("decimal128", DecimalType(38,11)) // val data = Seq( // Row(Row(0, "a", Row(0f, 0L)), "", 0.toDouble, DateTimeUtils.toJavaTimestamp(0), BigDecimal(0), // 0.toLong, 0.toFloat, DateTimeUtils.toJavaDate(0), BigDecimal(0), 0, 0.toShort, 0.toByte, - // false), + // false, BigDecimal(0)), // 
Row(Row(100, "bc", Row(100f, 100L)), "The quick brown fox", -(0.toDouble), // DateTimeUtils.toJavaTimestamp(100), BigDecimal("0.00001"), 100.toLong, -(0.toFloat), - // DateTimeUtils.toJavaDate(100), BigDecimal("0.1"), 100, 100.toShort, 100.toByte, true), + // DateTimeUtils.toJavaDate(100), BigDecimal("0.1"), 100, 100.toShort, 100.toByte, true, + // BigDecimal("0.000000001")), // Row(Row(-100, "def", Row(-100f, -100L)), "jumps over the lazy dog.", -Double.NaN, // DateTimeUtils.toJavaTimestamp(-100), BigDecimal("-0.00001"), -100.toLong, -Float.NaN, // DateTimeUtils.toJavaDate(-100), BigDecimal("-0.1"), -100, -100.toShort, -100.toByte, - // true), + // true, BigDecimal("-0.00000000001")), // Row(Row(0x12345678, "ghij", Row(Float.PositiveInfinity, 0x123456789abcdefL)), // "All work and no play makes Jack a dull boy", Double.MinValue, // DateTimeUtils.toJavaTimestamp(Long.MinValue/1000000), BigDecimal("-99999999999.9999999"), // Long.MinValue, Float.MinValue, DateTimeUtils.toJavaDate(Int.MinValue/100), - // BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true), + // BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true, + // BigDecimal("-9999999999999999.99999999999")), // Row(Row(-0x76543210, "klmno", Row(Float.NegativeInfinity, -0x123456789abcdefL)), // "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721", Double.MaxValue, // DateTimeUtils.toJavaTimestamp(Long.MaxValue/1000000), BigDecimal("99999999999.9999999"), // Long.MaxValue, Float.MaxValue, DateTimeUtils.toJavaDate(Int.MaxValue/100), - // BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false)) + // BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false, + // BigDecimal("99999999999999999999999999.99999999999"))) // val df = spark.createDataFrame(sc.parallelize(data), schema) // df.columns.foreach(c => println(s"$c => ${df.select(hash(col(c))).collect.mkString(",")}")) // df.select(hash(col("*"))).collect @@ -353,8 +357,10 @@ 
TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) {933211791, 751823303, -1080202046, 1110053733, 1135925485}); fixed_width_column_wrapper const hash_bools_expected( {933211791, -559580957, -559580957, -559580957, 933211791}); + fixed_width_column_wrapper const hash_decimal128_expected( + {-783713497, -295670906, 1398487324, -52622807, -1359749815}); fixed_width_column_wrapper const hash_combined_expected( - {-1172364561, -442972638, 1213234395, 796626751, 214075225}); + {401603227, 588162166, 552160517, 1132537411, -326043017}); using double_limits = std::numeric_limits; using long_limits = std::numeric_limits; @@ -394,6 +400,13 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) fixed_width_column_wrapper const bytes_col({0, 100, -100, -128, 127}); fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0}); fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); + fixed_point_column_wrapper<__int128_t> const decimal128_col( + {static_cast<__int128>(0), + static_cast<__int128>(100), + static_cast<__int128>(-1), + (static_cast<__int128>(0xFFFFFFFFFCC4D1C3u) << 64 | 0x602F7FC318000001u), + (static_cast<__int128>(0x0785EE10D5DA46D9u) << 64 | 0x00F4369FFFFFFFFFu)}, + numeric::scale_type{-11}); constexpr auto hasher = cudf::hash_id::HASH_SPARK_MURMUR3; auto const hash_structs = cudf::hash(cudf::table_view({structs_col}), hasher, 42); @@ -410,6 +423,7 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) auto const hash_bytes = cudf::hash(cudf::table_view({bytes_col}), hasher, 42); auto const hash_bools1 = cudf::hash(cudf::table_view({bools_col1}), hasher, 42); auto const hash_bools2 = cudf::hash(cudf::table_view({bools_col2}), hasher, 42); + auto const hash_decimal128 = cudf::hash(cudf::table_view({decimal128_col}), hasher, 42); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_structs, hash_structs_expected, verbosity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected, verbosity); @@ -425,6 +439,7 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bytes, hash_bytes_expected, verbosity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools1, hash_bools_expected, verbosity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools2, hash_bools_expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_decimal128, hash_decimal128_expected, verbosity); auto const combined_table = cudf::table_view({structs_col, strings_col, @@ -438,7 +453,8 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) ints_col, shorts_col, bytes_col, - bools_col2}); + bools_col2, + decimal128_col}); auto const hash_combined = cudf::hash(combined_table, hasher, 42); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_combined, hash_combined_expected, verbosity); } @@ -462,7 +478,7 @@ TEST_F(MD5HashTest, MultiValue) "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the " "MD5 hash function. This string needed to be longer.", "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); + R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}); strings_column_wrapper const md5_string_results1({"d41d8cd98f00b204e9800998ecf8427e", "682240021651ae166d08fe2a014d5c09", @@ -509,7 +525,7 @@ TEST_F(MD5HashTest, MultiValueNulls) "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the " "MD5 hash function. This string needed to be longer.", "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, + R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}, {1, 0, 0, 1, 0}); strings_column_wrapper const strings_col2( {"", @@ -551,7 +567,7 @@ TEST_F(MD5HashTest, StringListsNulls) "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the " "MD5 hash function. This string needed to be longer. 
It needed to be even longer.", "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); + R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}); lists_column_wrapper strings_list_col( {{""}, diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index b7835b4d4d1..868b19254ca 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -1901,7 +1901,7 @@ class TestSource : public cudf::io::datasource { return read_size; } - size_t size() const override { return str.size(); } + [[nodiscard]] size_t size() const override { return str.size(); } }; TEST_F(CsvReaderTest, UserImplementedSource) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 837ac96ef21..a31cd22ee3e 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -85,8 +85,8 @@ std::unique_ptr create_random_fixed_table(cudf::size_type num_colum } std::vector> columns(num_columns); std::transform(src_cols.begin(), src_cols.end(), columns.begin(), [](column_wrapper& in) { - auto ret = in.release(); - ret->has_nulls(); + auto ret = in.release(); + [[maybe_unused]] auto nulls = ret->has_nulls(); // pre-cache the null count return ret; }); return std::make_unique(std::move(columns)); @@ -162,8 +162,8 @@ inline auto random_values(size_t size) } struct SkipRowTest { - int test_calls; - SkipRowTest(void) : test_calls(0) {} + int test_calls{0}; + SkipRowTest() {} std::unique_ptr
get_expected_result(const std::string& filepath, int skip_rows, @@ -773,12 +773,12 @@ TEST_F(OrcChunkedWriterTest, Metadata) TEST_F(OrcChunkedWriterTest, Strings) { - bool mask1[] = {1, 1, 0, 1, 1, 1, 1}; + bool mask1[] = {true, true, false, true, true, true, true}; std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; str_col strings1(h_strings1.begin(), h_strings1.end(), mask1); table_view tbl1({strings1}); - bool mask2[] = {0, 1, 1, 1, 1, 1, 1}; + bool mask2[] = {false, true, true, true, true, true, true}; std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; str_col strings2(h_strings2.begin(), h_strings2.end(), mask2); table_view tbl2({strings2}); @@ -885,8 +885,9 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) int num_els = 31; - bool mask[] = {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true}; T c1a[num_els]; std::fill(c1a, c1a + num_els, static_cast(5)); @@ -927,8 +928,9 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) int num_els = 33; - bool mask[] = {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true}; T c1a[num_els]; std::fill(c1a, c1a + num_els, static_cast(5)); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 9c656abb666..b45670fd265 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -76,7 +76,8 @@ std::unique_ptr create_fixed_table(cudf::size_type num_columns, columns.begin(), 
[](cudf::test::fixed_width_column_wrapper& in) { auto ret = in.release(); - ret->has_nulls(); + // pre-cache the null count + [[maybe_unused]] auto const nulls = ret->has_nulls(); return ret; }); return std::make_unique(std::move(columns)); @@ -1086,7 +1087,7 @@ class custom_test_data_sink : public cudf::io::data_sink { outfile_.write(static_cast(data), size); } - bool supports_device_write() const override { return true; } + [[nodiscard]] bool supports_device_write() const override { return true; } void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { @@ -1413,13 +1414,13 @@ TEST_F(ParquetChunkedWriterTest, Strings) { std::vector> cols; - bool mask1[] = {1, 1, 0, 1, 1, 1, 1}; + bool mask1[] = {true, true, false, true, true, true, true}; std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), mask1); cols.push_back(strings1.release()); cudf::table tbl1(std::move(cols)); - bool mask2[] = {0, 1, 1, 1, 1, 1, 1}; + bool mask2[] = {false, true, true, true, true, true, true}; std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2); cols.push_back(strings2.release()); @@ -2052,8 +2053,9 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize) int num_els = 31; std::vector> cols; - bool mask[] = {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true}; T c1a[num_els]; std::fill(c1a, c1a + num_els, static_cast(5)); @@ -2099,8 +2101,9 @@ TYPED_TEST(ParquetChunkedWriterNumericTypeTest, UnalignedSize2) int num_els = 33; std::vector> cols; - bool mask[] = {0, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + bool mask[] = {false, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true}; T c1a[num_els]; std::fill(c1a, c1a + num_els, static_cast(5)); @@ -2149,7 +2152,7 @@ class custom_test_memmap_sink : public cudf::io::data_sink { void host_write(void const* data, size_t size) override { mm_writer->host_write(data, size); } - bool supports_device_write() const override { return supports_device_writes; } + [[nodiscard]] bool supports_device_write() const override { return supports_device_writes; } void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index d6a348698b5..f9ed22150b7 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -641,3 +641,302 @@ TYPED_TEST(MixedFullJoinTest, Basic2) {JoinNoneValue, 1}, {JoinNoneValue, 2}}); } + +template +struct MixedJoinSingleReturnTest : public MixedJoinTest { + /* + * Perform a join of tables constructed from two input data sets according to + * verify that the outputs match the expected outputs (up to order). 
+ */ + virtual void _test(cudf::table_view left_equality, + cudf::table_view right_equality, + cudf::table_view left_conditional, + cudf::table_view right_conditional, + cudf::ast::operation predicate, + std::vector expected_outputs, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) + { + auto [result_size, actual_counts] = this->join_size( + left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); + EXPECT_TRUE(result_size == expected_outputs.size()); + + auto result = this->join( + left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); + std::vector resulting_indices; + for (size_t i = 0; i < result->size(); ++i) { + // Note: Not trying to be terribly efficient here since these tests are + // small, otherwise a batch copy to host before constructing the tuples + // would be important. + resulting_indices.push_back(result->element(i, rmm::cuda_stream_default)); + } + std::sort(resulting_indices.begin(), resulting_indices.end()); + std::sort(expected_outputs.begin(), expected_outputs.end()); + EXPECT_TRUE( + std::equal(resulting_indices.begin(), resulting_indices.end(), expected_outputs.begin())); + } + + /* + * Perform a join of tables constructed from two input data sets according to + * the provided predicate and verify that the outputs match the expected + * outputs (up to order). + */ + void test(ColumnVector left_data, + ColumnVector right_data, + std::vector equality_columns, + std::vector conditional_columns, + cudf::ast::operation predicate, + std::vector expected_outputs) + { + // Note that we need to maintain the column wrappers otherwise the + // resulting column views will be referencing potentially invalid memory. 
+ auto [left_wrappers, + right_wrappers, + left_columns, + right_columns, + left_equality, + right_equality, + left_conditional, + right_conditional] = + this->parse_input(left_data, right_data, equality_columns, conditional_columns); + this->_test(left_equality, + right_equality, + left_conditional, + right_conditional, + predicate, + expected_outputs); + } + + /* + * Perform a join of tables constructed from two input data sets according to + * the provided predicate and verify that the outputs match the expected + * outputs (up to order). + */ + void test_nulls(NullableColumnVector left_data, + NullableColumnVector right_data, + std::vector equality_columns, + std::vector conditional_columns, + cudf::ast::operation predicate, + std::vector expected_outputs, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) + { + // Note that we need to maintain the column wrappers otherwise the + // resulting column views will be referencing potentially invalid memory. + auto [left_wrappers, + right_wrappers, + left_columns, + right_columns, + left_equality, + right_equality, + left_conditional, + right_conditional] = + this->parse_input(left_data, right_data, equality_columns, conditional_columns); + this->_test(left_equality, + right_equality, + left_conditional, + right_conditional, + predicate, + expected_outputs, + compare_nulls); + } + + /** + * This method must be implemented by subclasses for specific types of joins. + * It should be a simply forwarding of arguments to the appropriate cudf + * mixed join API. + */ + virtual SingleJoinReturn join(cudf::table_view left_equality, + cudf::table_view right_equality, + cudf::table_view left_conditional, + cudf::table_view right_conditional, + cudf::ast::operation predicate, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) = 0; + + /** + * This method must be implemented by subclasses for specific types of joins. 
+ * It should be a simply forwarding of arguments to the appropriate cudf + * mixed join size computation API. + */ + virtual std::pair>> join_size( + cudf::table_view left_equality, + cudf::table_view right_equality, + cudf::table_view left_conditional, + cudf::table_view right_conditional, + cudf::ast::operation predicate, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) = 0; +}; + +/** + * Tests of mixed left semi joins. + */ +template +struct MixedLeftSemiJoinTest : public MixedJoinSingleReturnTest { + SingleJoinReturn join(cudf::table_view left_equality, + cudf::table_view right_equality, + cudf::table_view left_conditional, + cudf::table_view right_conditional, + cudf::ast::operation predicate, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override + { + return cudf::mixed_left_semi_join( + left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); + } + + std::pair>> join_size( + cudf::table_view left_equality, + cudf::table_view right_equality, + cudf::table_view left_conditional, + cudf::table_view right_conditional, + cudf::ast::operation predicate, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override + { + return cudf::mixed_left_semi_join_size( + left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); + } +}; + +TYPED_TEST_SUITE(MixedLeftSemiJoinTest, cudf::test::IntegralTypesNotBool); + +TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality) +{ + this->test({{0, 1, 2}, {3, 4, 5}, {10, 20, 30}}, + {{0, 1, 3}, {5, 4, 5}, {30, 40, 50}}, + {0}, + {1, 2}, + left_zero_eq_right_zero, + {1}); +} + +TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates) +{ + this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}}, + {{0, 1, 3, 1}, {5, 4, 5, 6}, {30, 40, 50, 40}}, + {0}, + {1, 2}, + left_zero_eq_right_zero, + {1, 3}); +} + +TYPED_TEST(MixedLeftSemiJoinTest, BasicNullEqualityEqual) +{ + this->test_nulls({{{0, 1, 2}, {1, 1, 0}}, 
{{3, 4, 5}, {1, 1, 1}}, {{10, 20, 30}, {1, 1, 1}}}, + {{{0, 1, 3}, {1, 1, 0}}, {{5, 4, 5}, {1, 1, 1}}, {{30, 40, 30}, {1, 1, 1}}}, + {0}, + {1, 2}, + left_zero_eq_right_zero, + {1, 2}, + cudf::null_equality::EQUAL); +}; + +TYPED_TEST(MixedLeftSemiJoinTest, BasicNullEqualityUnequal) +{ + this->test_nulls({{{0, 1, 2}, {1, 1, 0}}, {{3, 4, 5}, {1, 1, 1}}, {{10, 20, 30}, {1, 1, 1}}}, + {{{0, 1, 3}, {1, 1, 0}}, {{5, 4, 5}, {1, 1, 1}}, {{30, 40, 30}, {1, 1, 1}}}, + {0}, + {1, 2}, + left_zero_eq_right_zero, + {1}, + cudf::null_equality::UNEQUAL); +}; + +TYPED_TEST(MixedLeftSemiJoinTest, AsymmetricEquality) +{ + this->test({{0, 2, 1}, {3, 5, 4}, {10, 30, 20}}, + {{0, 1, 3}, {5, 4, 5}, {30, 40, 50}}, + {0}, + {1, 2}, + left_zero_eq_right_zero, + {2}); +} + +TYPED_TEST(MixedLeftSemiJoinTest, AsymmetricLeftLargerEquality) +{ + this->test({{0, 2, 1, 4}, {3, 5, 4, 10}, {10, 30, 20, 100}}, + {{0, 1, 3}, {5, 4, 5}, {30, 40, 50}}, + {0}, + {1, 2}, + left_zero_eq_right_zero, + {2}); +} + +/** + * Tests of mixed left semi joins. 
+ */ +template +struct MixedLeftAntiJoinTest : public MixedJoinSingleReturnTest { + SingleJoinReturn join(cudf::table_view left_equality, + cudf::table_view right_equality, + cudf::table_view left_conditional, + cudf::table_view right_conditional, + cudf::ast::operation predicate, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override + { + return cudf::mixed_left_anti_join( + left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); + } + + std::pair>> join_size( + cudf::table_view left_equality, + cudf::table_view right_equality, + cudf::table_view left_conditional, + cudf::table_view right_conditional, + cudf::ast::operation predicate, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override + { + return cudf::mixed_left_anti_join_size( + left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); + } +}; + +TYPED_TEST_SUITE(MixedLeftAntiJoinTest, cudf::test::IntegralTypesNotBool); + +TYPED_TEST(MixedLeftAntiJoinTest, BasicEquality) +{ + this->test({{0, 1, 2}, {3, 4, 5}, {10, 20, 30}}, + {{0, 1, 3}, {5, 4, 5}, {30, 40, 50}}, + {0}, + {1, 2}, + left_zero_eq_right_zero, + {0, 2}); +} + +TYPED_TEST(MixedLeftAntiJoinTest, BasicNullEqualityEqual) +{ + this->test_nulls({{{0, 1, 2}, {1, 1, 0}}, {{3, 4, 5}, {1, 1, 1}}, {{10, 20, 30}, {1, 1, 1}}}, + {{{0, 1, 3}, {1, 1, 0}}, {{5, 4, 5}, {1, 1, 1}}, {{30, 40, 30}, {1, 1, 1}}}, + {0}, + {1, 2}, + left_zero_eq_right_zero, + {0}, + cudf::null_equality::EQUAL); +}; + +TYPED_TEST(MixedLeftAntiJoinTest, BasicNullEqualityUnequal) +{ + this->test_nulls({{{0, 1, 2}, {1, 1, 0}}, {{3, 4, 5}, {1, 1, 1}}, {{10, 20, 30}, {1, 1, 1}}}, + {{{0, 1, 3}, {1, 1, 0}}, {{5, 4, 5}, {1, 1, 1}}, {{30, 40, 30}, {1, 1, 1}}}, + {0}, + {1, 2}, + left_zero_eq_right_zero, + {0, 2}, + cudf::null_equality::UNEQUAL); +}; + +TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricEquality) +{ + this->test({{0, 2, 1}, {3, 5, 4}, {10, 30, 20}}, + {{0, 1, 3}, {5, 
4, 5}, {30, 40, 50}}, + {0}, + {1, 2}, + left_zero_eq_right_zero, + {0, 1}); +} + +TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricLeftLargerEquality) +{ + this->test({{0, 2, 1, 4}, {3, 5, 4, 10}, {10, 30, 20, 100}}, + {{0, 1, 3}, {5, 4, 5}, {30, 40, 50}}, + {0}, + {1, 2}, + left_zero_eq_right_zero, + {0, 1, 3}); +} diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp index fe65fe0474a..7540dfd94c5 100644 --- a/cpp/tests/replace/replace_tests.cpp +++ b/cpp/tests/replace/replace_tests.cpp @@ -298,7 +298,7 @@ struct ReplaceTest : cudf::test::BaseFixture { std::srand(number_of_instantiations++); } - ~ReplaceTest() {} + ~ReplaceTest() override {} }; /** diff --git a/cpp/tests/scalar/factories_test.cpp b/cpp/tests/scalar/factories_test.cpp index 3e89e435bc0..7cd8b655231 100644 --- a/cpp/tests/scalar/factories_test.cpp +++ b/cpp/tests/scalar/factories_test.cpp @@ -114,7 +114,7 @@ TYPED_TEST(DefaultScalarFactory, TypeCast) auto numeric_s = static_cast*>(s.get()); - EXPECT_NO_THROW(numeric_s->value()); + EXPECT_NO_THROW((void)numeric_s->value()); EXPECT_FALSE(numeric_s->is_valid()); EXPECT_FALSE(s->is_valid()); } diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index ff9f79ea87f..c1552ab3f57 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -51,13 +51,20 @@ TEST_P(CharsTypes, AllTypes) "de", "\t\r\n\f "}; - bool expecteds[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, // decimal - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, // numeric - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, // digit - 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, // alpha - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // space - 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // upper - 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}; // lower + bool expecteds[] = {false, false, false, false, false, false, false, false, + false, false, false, false, false, 
true, false, false, // decimal + false, false, false, false, false, false, false, false, + false, true, false, true, false, true, false, false, // numeric + false, false, false, false, false, false, false, false, + false, false, false, true, false, true, false, false, // digit + true, true, false, true, false, false, false, false, + false, false, false, false, false, false, true, false, // alpha + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, true, // space + false, false, false, true, false, false, false, false, + false, false, false, false, false, false, false, false, // upper + false, true, false, false, false, false, false, false, + false, false, false, false, false, false, true, false}; // lower auto is_parm = GetParam(); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 48c4aac9e8a..12a00aa35ab 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -274,6 +274,15 @@ TEST_F(StringsContainsTests, EmbeddedNullCharacter) CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } +TEST_F(StringsContainsTests, Errors) +{ + cudf::test::strings_column_wrapper input({"3", "33"}); + auto strings_view = cudf::strings_column_view(input); + + EXPECT_THROW(cudf::strings::contains_re(strings_view, "(3?)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::contains_re(strings_view, "3?+"), cudf::logic_error); +} + TEST_F(StringsContainsTests, CountTest) { std::vector h_strings{ diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 2bb1c6dac8e..516882bd8ad 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -183,7 +183,7 @@ TEST_F(StringsExtractTests, ExtractAllTest) auto results = cudf::strings::extract_all(sv, "(\\d+) (\\w+)"); - bool valids[] = {1, 1, 1, 0, 0, 0, 1}; + bool valids[] = {true, true, true, false, false, false, true}; using LCW = 
cudf::test::lists_column_wrapper; LCW expected({LCW{"123", "banana", "7", "eleven"}, LCW{"41", "apple"}, diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index 38f905078a7..d35cb5c3b9d 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -68,7 +68,7 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) strings[idx] = thrust::pair{nullptr, 0}; nulls++; } else { - cudf::size_type length = (cudf::size_type)strlen(str); + auto length = (cudf::size_type)strlen(str); memcpy(h_buffer.data() + offset, str, length); strings[idx] = thrust::pair{d_buffer.data() + offset, length}; offset += length; @@ -130,7 +130,7 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets) h_null_mask = (h_null_mask << 1); const char* str = h_test_strings[idx]; if (str) { - cudf::size_type length = (cudf::size_type)strlen(str); + auto length = (cudf::size_type)strlen(str); memcpy(h_buffer.data() + offset, str, length); offset += length; h_null_mask |= 1; diff --git a/cpp/tests/strings/find_multiple_tests.cpp b/cpp/tests/strings/find_multiple_tests.cpp index a4cb27b7a9f..7b9f639f965 100644 --- a/cpp/tests/strings/find_multiple_tests.cpp +++ b/cpp/tests/strings/find_multiple_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -41,13 +41,16 @@ TEST_F(StringsFindMultipleTest, FindMultiple) cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); auto targets_view = cudf::strings_column_view(targets); - auto results = cudf::strings::find_multiple(strings_view, targets_view); - cudf::size_type total_count = static_cast(h_strings.size() * h_targets.size()); - EXPECT_EQ(total_count, results->size()); + auto results = cudf::strings::find_multiple(strings_view, targets_view); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{1, -1, -1, -1, 4, -1, -1}, + LCW{4, -1, 2, -1, -1, -1, 2}, + LCW{-1, -1, -1, -1, -1, -1, -1}, + LCW{-1, 2, 1, -1, -1, -1, -1}, + LCW{-1, -1, 1, 8, -1, -1, 1}, + LCW{-1, -1, -1, -1, -1, -1, -1}}); - cudf::test::fixed_width_column_wrapper expected( - {1, -1, -1, -1, 4, -1, -1, 4, -1, 2, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, - -1, 2, 1, -1, -1, -1, -1, -1, -1, 1, 8, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index d7bf162d36f..4b1305a870a 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -56,7 +56,7 @@ TEST_F(StringsFindallTests, FindallTest) nullptr}; std::string pattern = "(\\w+)"; - auto results = cudf::strings::findall_re(strings_view, pattern); + auto results = cudf::strings::findall(strings_view, pattern); EXPECT_TRUE(results->num_columns() == 2); cudf::test::strings_column_wrapper expected1( @@ -75,6 +75,28 @@ TEST_F(StringsFindallTests, FindallTest) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } +TEST_F(StringsFindallTests, FindallRecord) +{ + cudf::test::strings_column_wrapper input( + {"3-A", "4-May 5-Day 6-Hay", "12-Dec-2021-Jan", "Feb-March", "4 ABC", "", "", "25-9000-Hal"}, + {1, 1, 1, 1, 1, 0, 1, 1}); + + auto results = cudf::strings::findall_record(cudf::strings_column_view(input), "(\\d+)-(\\w+)"); + + bool valids[] = {1, 1, 1, 0, 0, 0, 0, 1}; + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"3-A"}, + LCW{"4-May", "5-Day", "6-Hay"}, + LCW{"12-Dec", "2021-Jan"}, + LCW{}, + LCW{}, + LCW{}, + LCW{}, + LCW{"25-9000"}}, + valids); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsFindallTests, MediumRegex) { // This results in 15 regex instructions and falls in the 'medium' range. 
@@ -87,7 +109,7 @@ TEST_F(StringsFindallTests, MediumRegex) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::findall_re(strings_view, medium_regex); + auto results = cudf::strings::findall(strings_view, medium_regex); EXPECT_TRUE(results->num_columns() == 2); std::vector h_expected1{"first words 1234", nullptr}; @@ -115,9 +137,11 @@ TEST_F(StringsFindallTests, LargeRegex) std::vector h_strings{ "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello " "http://www.world.com I'm here @home zzzz", - "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234" + "12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012" + "34" "5678901234567890", - "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnop" + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmn" + "op" "qrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), @@ -125,7 +149,7 @@ TEST_F(StringsFindallTests, LargeRegex) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::findall_re(strings_view, large_regex); + auto results = cudf::strings::findall(strings_view, large_regex); EXPECT_TRUE(results->num_columns() == 1); std::vector h_expected{large_regex.c_str(), nullptr, nullptr}; diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index dfcc646a8f6..2dfe50d2ef5 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -389,7 +389,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) auto result_raw = 
cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); - cudf::test::strings_column_wrapper expected_raw{"[\"0-553-21311-3\",\"0-395-19395-8\"]"}; + cudf::test::strings_column_wrapper expected_raw{R"(["0-553-21311-3","0-395-19395-8"])"}; auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -402,7 +402,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{ - "[\"reference\",\"fiction\",\"fiction\",\"fiction\"]"}; + R"(["reference","fiction","fiction","fiction"])"}; auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -415,7 +415,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{ - "[\"Sayings of the Century\",\"Sword of Honour\",\"Moby Dick\",\"The Lord of the Rings\"]"}; + R"(["Sayings of the Century","Sword of Honour","Moby Dick","The Lord of the Rings"])"}; auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -470,7 +470,7 @@ TEST_F(JsonPathTests, GetJsonObjectEmptyQuery) { // empty query -> null { - cudf::test::strings_column_wrapper input{"{\"a\" : \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a" : "b"})"}; std::string json_path(""); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -497,7 +497,7 @@ TEST_F(JsonPathTests, GetJsonObjectEmptyInputsAndOutputs) // returning something, but it happens to be empty. 
so we expect // a valid, but empty row { - cudf::test::strings_column_wrapper input{"{\"store\": { \"bicycle\" : \"\" } }"}; + cudf::test::strings_column_wrapper input{R"({"store": { "bicycle" : "" } })"}; std::string json_path("$.store.bicycle"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -513,7 +513,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) // can't have more than one root operator, or a root operator anywhere other // than the beginning { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$$"); auto query = [&]() { auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -523,7 +523,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) // invalid index { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[auh46h-]"); auto query = [&]() { auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -533,7 +533,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) // invalid index { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[[]]"); auto query = [&]() { auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -543,7 +543,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) // negative index { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[-1]"); auto query = [&]() { auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -553,7 +553,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) // child operator with no name specified { - cudf::test::strings_column_wrapper 
input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("."); auto query = [&]() { auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -562,7 +562,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) } { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("]["); auto query = [&]() { auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -571,7 +571,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) } { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("6hw6,56i3"); auto query = [&]() { auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -585,7 +585,7 @@ TEST_F(JsonPathTests, GetJsonObjectInvalidQuery) { // non-existent field { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[*].c"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -596,7 +596,7 @@ TEST_F(JsonPathTests, GetJsonObjectInvalidQuery) // non-existent field { - cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[*].c[2]"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -667,8 +667,8 @@ TEST_F(JsonPathTests, MixedOutput) // clang-format off cudf::test::strings_column_wrapper expected({ - "{\"b\" : \"c\"}", - "{\"b\" : \"c\"}", + R"({"b" : "c"})", + R"({"b" : "c"})", "", "[\"y\",500]", "", @@ -786,7 +786,7 @@ TEST_F(JsonPathTests, StripQuotes) // a valid, but empty row { - cudf::test::strings_column_wrapper input{"{\"store\": { \"bicycle\" : \"\" } }"}; + 
cudf::test::strings_column_wrapper input{R"({"store": { "bicycle" : "" } })"}; std::string json_path("$.store.bicycle"); cudf::strings::get_json_object_options options; @@ -858,8 +858,8 @@ TEST_F(JsonPathTests, AllowSingleQuotes) // clang-format off cudf::test::strings_column_wrapper expected({ - "{\'b\' : \'c\'}", - "{\'b\' : \"c\"}", + R"({'b' : 'c'})", + R"({'b' : "c"})", "", "[\'y\',500]", "", @@ -902,7 +902,7 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) // clang-format off cudf::test::strings_column_wrapper expected({ - "[{\"key\" : \"value[\"}]", + R"([{"key" : "value["}])", }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); @@ -928,7 +928,7 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) // clang-format off cudf::test::strings_column_wrapper expected({ - "[}{}][][{[\\\"}}[\\\"]", + R"([}{}][][{[\"}}[\"])", }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); @@ -961,8 +961,8 @@ TEST_F(JsonPathTests, EscapeSequences) // clang-format off cudf::test::strings_column_wrapper expected({ - "\\\" \\\\ \\/ \\b \\f \\n \\r \\t", - "\\u1248 \\uacdf \\uACDF \\u10EF" + R"(\" \\ \/ \b \f \n \r \t)", + R"(\u1248 \uacdf \uACDF \u10EF)" }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index eac06fa4588..ddbd9f5b3d6 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -145,6 +145,16 @@ TEST_F(StringsReplaceRegexTest, MultiReplacement) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, input); } +TEST_F(StringsReplaceRegexTest, WordBoundary) +{ + cudf::test::strings_column_wrapper input({"aba bcd\naba", "zéz", "A1B2-é3", "e é"}); + auto results = + cudf::strings::replace_re(cudf::strings_column_view(input), "\\b", cudf::string_scalar("X")); + cudf::test::strings_column_wrapper expected( + {"XabaX XbcdX\nXabaX", "XzézX", "XA1B2X-Xé3X", "XeX XéX"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); +} + TEST_F(StringsReplaceRegexTest, Multiline) { auto const multiline = cudf::strings::regex_flags::MULTILINE; diff --git a/cpp/tests/table/table_view_tests.cu b/cpp/tests/table/table_view_tests.cu index c94963525a0..a1c0c49a881 100644 --- a/cpp/tests/table/table_view_tests.cu +++ b/cpp/tests/table/table_view_tests.cu @@ -123,7 +123,7 @@ TEST_F(TableViewTest, SelectOutOfBounds) fixed_width_column_wrapper col4{{4, 5, 6, 7}}; cudf::table_view t{{col1, col2}}; - EXPECT_THROW(t.select({2, 3, 4}), std::out_of_range); + EXPECT_THROW((void)t.select({2, 3, 4}), std::out_of_range); } TEST_F(TableViewTest, SelectNoColumns) diff --git a/docker/package_versions.sh b/docker/package_versions.sh deleted file mode 100755 index c558f66d511..00000000000 --- a/docker/package_versions.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env bash -# Usage: -# "./package_versions.sh /cudf/conda/environments/cudf_dev.yml" - Updates package versions in file based on Docker build-args - -FILENAME=$1 - -set_version() { - sed -i "s/\- $1\([<>=][^a-zA-Z]*\)\?$/\- $1=$2/" $FILENAME -} - -replace_text() { - sed -i "s/$1/$2/" $FILENAME -} - -add_package() { - sed -i "s/\- $1\([<>=][^a-zA-Z]*\)\?$/a \- $2=$3/" $FILENAME -} - -if [ "$PYTHON_VERSION" ]; then - PACKAGE_NAME="python" - set_version "$PACKAGE_NAME" "$PYTHON_VERSION" -fi - -if [ "$NUMBA_VERSION" ]; then - PACKAGE_NAME="numba" - set_version "$PACKAGE_NAME" "$NUMBA_VERSION" -fi - -if [ "$PANDAS_VERSION" ]; 
then - PACKAGE_NAME="pandas" - set_version "$PACKAGE_NAME" "$PANDAS_VERSION" -fi - -if [ "$PYARROW_VERSION" ]; then - PACKAGE_NAME="pyarrow" - set_version "$PACKAGE_NAME" "$PYARROW_VERSION" -fi - -if [ "$CYTHON_VERSION" ]; then - PACKAGE_NAME="cython" - set_version "$PACKAGE_NAME" "$CYTHON_VERSION" -fi - -if [ "$CMAKE_VERSION" ]; then - PACKAGE_NAME="cmake" - set_version "$PACKAGE_NAME" "$CMAKE_VERSION" -fi - -if [ "$NUMPY_VERSION" ]; then - ABOVE_PACKAGE="pandas" - PACKAGE_NAME="numpy" - add_package "$ABOVE_PACKAGE" "$PACKAGE_NAME" "$NUMPY_VERSION" -fi diff --git a/docker_build/Dockerfile b/docker_build/Dockerfile deleted file mode 100644 index 696a6969778..00000000000 --- a/docker_build/Dockerfile +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -ARG CUDA_VERSION=11.2.2 -FROM nvidia/cuda:${CUDA_VERSION}-devel -ENV CUDA_SHORT_VERSION=11.2 - -SHELL ["/bin/bash", "-c"] -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/repos/dist/lib - -ENV DEBIAN_FRONTEND=noninteractive - -ENV CUDA_HOME=/usr/local/cuda -ENV CUDA_PATH=$CUDA_HOME -ENV PATH=${CUDA_HOME}/lib64/:${PATH}:${CUDA_HOME}/bin - -# Build env variables for arrow -ENV CMAKE_BUILD_TYPE=release -ENV PYARROW_WITH_PARQUET=1 -ENV PYARROW_WITH_CUDA=1 -ENV PYARROW_WITH_ORC=1 -ENV PYARROW_WITH_DATASET=1 - -ENV ARROW_HOME=/repos/dist - -# Build env variables for rmm -ENV INSTALL_PREFIX=/usr - - -RUN apt update -y --fix-missing && \ - apt upgrade -y && \ - apt install -y --no-install-recommends software-properties-common && \ - add-apt-repository ppa:deadsnakes/ppa && \ - apt update -y --fix-missing - -RUN apt install -y --no-install-recommends \ - git \ - python3.8-dev \ - build-essential \ - autoconf \ - bison \ - flex \ - libjemalloc-dev \ - wget \ - libssl-dev \ - protobuf-compiler && \ - apt-get autoremove -y && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - update-alternatives --install /usr/bin/python python 
/usr/bin/python3.8 1 && \ - wget https://bootstrap.pypa.io/get-pip.py && \ - python get-pip.py - -# Install cmake -RUN version=3.18 && build=5 && mkdir ~/temp && cd ~/temp && wget https://cmake.org/files/v$version/cmake-$version.$build.tar.gz && \ - tar -xzvf cmake-$version.$build.tar.gz && cd cmake-$version.$build/ && ./bootstrap && make -j$(nproc) && make install - -# Install arrow from source -RUN git clone https://github.com/apache/arrow.git /repos/arrow && mkdir /repos/dist/ && cd /repos/arrow && git checkout apache-arrow-1.0.1 && git submodule init && \ - git submodule update && export PARQUET_TEST_DATA="${PWD}/cpp/submodules/parquet-testing/data" && export ARROW_TEST_DATA="${PWD}/testing/data" && \ - cd /repos/arrow/cpp && mkdir release && cd /repos/arrow/cpp/release && pip install -r /repos/arrow/python/requirements-build.txt && \ - cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME -DCMAKE_INSTALL_LIBDIR=lib -DARROW_FLIGHT=ON -DARROW_GANDIVA=OFF -DARROW_ORC=ON -DARROW_WITH_BZ2=ON -DARROW_WITH_ZLIB=ON -DARROW_WITH_ZSTD=ON -DARROW_WITH_LZ4=ON -DARROW_WITH_SNAPPY=ON -DARROW_WITH_BROTLI=ON -DARROW_PARQUET=ON -DARROW_PYTHON=ON -DARROW_PLASMA=ON -DARROW_BUILD_TESTS=ON -DARROW_CUDA=ON -DARROW_DATASET=ON .. && \ - make -j$(nproc) && make install && cd /repos/arrow/python/ && python setup.py build_ext --build-type=release bdist_wheel && pip install /repos/arrow/python/dist/*.whl - - -# Install rmm from source -RUN cd /repos/ && git clone https://github.com/rapidsai/rmm.git && cd /repos/rmm/ && ./build.sh librmm && pip install /repos/rmm/python/. - -ADD . /repos/cudf/ - -# Build env for CUDF build -ENV CUDF_HOME=/repos/cudf/ -ENV CUDF_ROOT=/repos/cudf/cpp/build/ - -# Install cudf from source -RUN cd /repos/cudf/ && git submodule update --init --recursive && ./build.sh libcudf && \ - pip install /repos/cudf/python/cudf/. 
- diff --git a/docs/cudf/source/api_docs/groupby.rst b/docs/cudf/source/api_docs/groupby.rst index 575d7442cdf..190978a7581 100644 --- a/docs/cudf/source/api_docs/groupby.rst +++ b/docs/cudf/source/api_docs/groupby.rst @@ -34,6 +34,7 @@ Function application SeriesGroupBy.aggregate DataFrameGroupBy.aggregate GroupBy.pipe + GroupBy.transform Computations / descriptive stats -------------------------------- diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index e0dc4bd4f46..891bb3a1e61 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -174,7 +174,6 @@ Reindexing / selection / label manipulation Series.reset_index Series.reverse Series.sample - Series.set_index Series.set_mask Series.take Series.tail diff --git a/docs/cudf/source/basics/groupby.rst b/docs/cudf/source/basics/groupby.rst index f3269768025..cbc8f7e712f 100644 --- a/docs/cudf/source/basics/groupby.rst +++ b/docs/cudf/source/basics/groupby.rst @@ -1,3 +1,5 @@ +.. _basics.groupby: + GroupBy ======= @@ -220,6 +222,27 @@ Limitations .. |describe| replace:: ``describe`` .. _describe: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#flexible-apply + +Transform +--------- + +The ``.transform()`` method aggregates per group, and broadcasts the +result to the group size, resulting in a Series/DataFrame that is of +the same size as the input Series/DataFrame. + +.. code:: python + + >>> import cudf + >>> df = cudf.DataFrame({'a': [2, 1, 1, 2, 2], 'b': [1, 2, 3, 4, 5]}) + >>> df.groupby('a').transform('max') + b + 0 5 + 1 3 + 2 3 + 3 5 + 4 5 + + Rolling window calculations --------------------------- diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 2c184252192..3d6d3ceb399 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -80,9 +80,9 @@ # built documents. # # The short X.Y version. -version = '22.02' +version = '22.04' # The full version, including alpha/beta/rc tags. 
-release = '22.02.00' +release = '22.04.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index e1e0d5ef2da..4c7a8d4e449 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -11,7 +11,7 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. # ============================================================================= -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.02/RAPIDS.cmake +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.04/RAPIDS.cmake ${CMAKE_BINARY_DIR}/RAPIDS.cmake ) include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) diff --git a/java/ci/README.md b/java/ci/README.md index 0e947b62511..f022bec04e3 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.5.0-devel-centos7 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-22.02 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-22.04 ``` ### Build cuDF jar with devtoolset @@ -47,5 +47,5 @@ scl enable devtoolset-9 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-22.02.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-22.04.0-SNAPSHOT-cuda11.jar. 
diff --git a/java/pom.xml b/java/pom.xml index ec6968ca761..8f0fb1000d8 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 22.02.0-SNAPSHOT + 22.04.0-SNAPSHOT cudfjni diff --git a/java/src/main/java/ai/rapids/cudf/BinaryOp.java b/java/src/main/java/ai/rapids/cudf/BinaryOp.java index 8b58d8383b4..15b8d32d6da 100644 --- a/java/src/main/java/ai/rapids/cudf/BinaryOp.java +++ b/java/src/main/java/ai/rapids/cudf/BinaryOp.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2020,2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,8 +49,10 @@ public enum BinaryOp { GREATER_EQUAL(25), // >= NULL_EQUALS(26), // like EQUAL but NULL == NULL is TRUE and NULL == not NULL is FALSE NULL_MAX(27), // MAX but NULL < not NULL - NULL_MIN(28); // MIN but NULL > not NULL + NULL_MIN(28), // MIN but NULL > not NULL //NOT IMPLEMENTED YET GENERIC_BINARY(29); + NULL_LOGICAL_AND(30), + NULL_LOGICAL_OR(31); static final EnumSet COMPARISON = EnumSet.of( diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index a2e080e02f6..cc1fa46becb 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -2331,13 +2331,27 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) { * Null string entries return corresponding null output columns. * @param delimiter UTF-8 encoded string identifying the split points in each string. * An empty string indicates split on whitespace. 
+ * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. * @return New table of strings columns. */ - public final Table stringSplit(Scalar delimiter) { + public final Table stringSplit(Scalar delimiter, int maxSplit) { assert type.equals(DType.STRING) : "column type must be a String"; assert delimiter != null : "delimiter may not be null"; assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; - return new Table(stringSplit(this.getNativeView(), delimiter.getScalarHandle())); + return new Table(stringSplit(this.getNativeView(), delimiter.getScalarHandle(), maxSplit)); + } + + /** + * Returns a list of columns by splitting each string using the specified delimiter. + * The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. + * Null string entries return corresponding null output columns. + * @param delimiter UTF-8 encoded string identifying the split points in each string. + * An empty string indicates split on whitespace. + * @return New table of strings columns. + */ + public final Table stringSplit(Scalar delimiter) { + return stringSplit(delimiter, -1); } /** @@ -2349,7 +2363,7 @@ public final Table stringSplit(Scalar delimiter) { */ public final Table stringSplit() { try (Scalar emptyString = Scalar.fromString("")) { - return stringSplit(emptyString); + return stringSplit(emptyString, -1); } } @@ -2362,7 +2376,7 @@ public final ColumnVector stringSplitRecord() { /** * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. - * @param maxSplit the maximum number of records to split, or -1 for all of them. + * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. 
*/ public final ColumnVector stringSplitRecord(int maxSplit) { try (Scalar emptyString = Scalar.fromString("")) { @@ -2384,7 +2398,7 @@ public final ColumnVector stringSplitRecord(Scalar delimiter) { * string using the specified delimiter. * @param delimiter UTF-8 encoded string identifying the split points in each string. * An empty string indicates split on whitespace. - * @param maxSplit the maximum number of records to split, or -1 for all of them. + * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. * @return New table of strings columns. */ public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit) { @@ -3234,7 +3248,7 @@ public final ColumnVector listIndexOf(Scalar key, FindOptions findOption) { * The index is set to null if one of the following is true: * 1. The search key row is null. * 2. The list row is null. - * @param key ColumnView of search keys. + * @param keys ColumnView of search keys. * @param findOption Whether to find the first index of the key, or the last. * @return The resultant column of int32 indices */ @@ -3270,6 +3284,17 @@ public final Scalar getScalarElement(int index) { return new Scalar(getType(), getElement(getNativeView(), index)); } + /** + * Get the number of bytes needed to allocate a validity buffer for the given number of rows. + * According to cudf::bitmask_allocation_size_bytes, the padding boundary for null mask is 64 bytes. 
+ */ + static long getValidityBufferSize(int numRows) { + // number of bytes required = Math.ceil(number of bits / 8) + long actualBytes = ((long) numRows + 7) >> 3; + // padding to the multiplies of the padding boundary(64 bytes) + return ((actualBytes + 63) >> 6) << 6; + } + ///////////////////////////////////////////////////////////////////////////// // INTERNAL/NATIVE ACCESS ///////////////////////////////////////////////////////////////////////////// @@ -3490,8 +3515,9 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle * delimiter. * @param columnView native handle of the cudf::column_view being operated on. * @param delimiter UTF-8 encoded string identifying the split points in each string. + * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. */ - private static native long[] stringSplit(long columnView, long delimiter); + private static native long[] stringSplit(long columnView, long delimiter, int maxSplit); private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit); @@ -3686,7 +3712,7 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat * Native method to find the first (or last) index of each search key in the specified column, * in each row of a list column. * @param nativeView the column view handle of the list - * @param scalarColumnHandle handle to the search key column + * @param keyColumnHandle handle to the search key column * @param isFindFirst Whether to find the first index of the key, or the last. * @return column handle of the resultant column of int32 indices */ @@ -3866,11 +3892,6 @@ private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] vi private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, long boolColumnViewHandle) throws CudfException; - /** - * Get the number of bytes needed to allocate a validity buffer for the given number of rows. 
- */ - static native long getNativeValidPointerSize(int size); - //////// // Native cudf::column_view life cycle and metadata access methods. Life cycle methods // should typically only be called from the OffHeap inner class. @@ -3960,7 +3981,7 @@ static ColumnVector createColumnVector(DType type, int rows, HostMemoryBuffer da DeviceMemoryBuffer mainValidDevBuff = null; DeviceMemoryBuffer mainOffsetsDevBuff = null; if (mainColValid != null) { - long validLen = getNativeValidPointerSize(mainColRows); + long validLen = getValidityBufferSize(mainColRows); mainValidDevBuff = DeviceMemoryBuffer.allocate(validLen); mainValidDevBuff.copyFromHostBuffer(mainColValid, 0, validLen); } @@ -4069,7 +4090,7 @@ private static NestedColumnVector createNestedColumnVector(DType type, long rows data.copyFromHostBuffer(dataBuffer, 0, dataLen); } if (validityBuffer != null) { - long validLen = getNativeValidPointerSize((int)rows); + long validLen = getValidityBufferSize((int)rows); valid = DeviceMemoryBuffer.allocate(validLen); valid.copyFromHostBuffer(validityBuffer, 0, validLen); } diff --git a/java/src/main/java/ai/rapids/cudf/DType.java b/java/src/main/java/ai/rapids/cudf/DType.java index 742501be375..2e5b0202dc5 100644 --- a/java/src/main/java/ai/rapids/cudf/DType.java +++ b/java/src/main/java/ai/rapids/cudf/DType.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -307,7 +307,7 @@ public static DType fromJavaBigDecimal(BigDecimal dec) { return new DType(DTypeEnum.DECIMAL128, -dec.scale()); } throw new IllegalArgumentException("Precision " + dec.precision() + - " exceeds max precision cuDF can support " + DECIMAL64_MAX_PRECISION); + " exceeds max precision cuDF can support " + DECIMAL128_MAX_PRECISION); } /** diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java index e21a4ac81c6..3abc6db385d 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -199,7 +199,7 @@ public ColumnVector copyToDevice() { } HostMemoryBuffer hvalid = this.offHeap.valid; if (hvalid != null) { - long validLen = ColumnView.getNativeValidPointerSize((int) rows); + long validLen = ColumnView.getValidityBufferSize((int) rows); valid = DeviceMemoryBuffer.allocate(validLen); valid.copyFromHostBuffer(hvalid, 0, validLen); } @@ -858,7 +858,7 @@ public static HostColumnVector timestampNanoSecondsFromBoxedLongs(Long... 
values * Build */ - public static final class ColumnBuilder implements AutoCloseable { + public static final class ColumnBuilder implements AutoCloseable { private DType type; private HostMemoryBuffer data; @@ -869,28 +869,78 @@ public static final class ColumnBuilder implements AutoCloseable { private boolean nullable; private long rows; private long estimatedRows; + private long rowCapacity = 0L; + private long validCapacity = 0L; private boolean built = false; private List childBuilders = new ArrayList<>(); + private Runnable nullHandler; - private int currentIndex = 0; - private int currentByteIndex = 0; - + // The value of currentIndex can't exceed Int32.Max. Storing currentIndex as a long is to + // adapt HostMemoryBuffer.setXXX, which requires a long offset. + private long currentIndex = 0; + // Only for Strings: pointer of the byte (data) buffer + private int currentStringByteIndex = 0; + // Use bit shift instead of multiply to transform row offset to byte offset + private int bitShiftBySize = 0; + private static final int bitShiftByOffset = (int)(Math.log(OFFSET_SIZE) / Math.log(2)); public ColumnBuilder(HostColumnVector.DataType type, long estimatedRows) { this.type = type.getType(); this.nullable = type.isNullable(); this.rows = 0; - this.estimatedRows = estimatedRows; + this.estimatedRows = Math.max(estimatedRows, 1L); + this.bitShiftBySize = (int)(Math.log(this.type.getSizeInBytes()) / Math.log(2)); + + // initialize the null handler according to the data type + this.setupNullHandler(); + for (int i = 0; i < type.getNumChildren(); i++) { childBuilders.add(new ColumnBuilder(type.getChild(i), estimatedRows)); } } + private void setupNullHandler() { + if (this.type == DType.LIST) { + this.nullHandler = () -> { + this.growListBuffersAndRows(); + this.growValidBuffer(); + setNullAt(currentIndex++); + offsets.setInt(currentIndex << bitShiftByOffset, childBuilders.get(0).getCurrentIndex()); + }; + } else if (this.type == DType.STRING) { + this.nullHandler = 
() -> { + this.growStringBuffersAndRows(0); + this.growValidBuffer(); + setNullAt(currentIndex++); + offsets.setInt(currentIndex << bitShiftByOffset, currentStringByteIndex); + }; + } else if (this.type == DType.STRUCT) { + this.nullHandler = () -> { + this.growStructBuffersAndRows(); + this.growValidBuffer(); + setNullAt(currentIndex++); + for (ColumnBuilder childBuilder : childBuilders) { + childBuilder.appendNull(); + } + }; + } else { + this.nullHandler = () -> { + this.growFixedWidthBuffersAndRows(); + this.growValidBuffer(); + setNullAt(currentIndex++); + }; + } + } + public HostColumnVector build() { List hostColumnVectorCoreList = new ArrayList<>(); for (ColumnBuilder childBuilder : childBuilders) { hostColumnVectorCoreList.add(childBuilder.buildNestedInternal()); } + // Aligns the valid buffer size with other buffers in terms of row size, because it grows lazily. + if (valid != null) { + growValidBuffer(); + } HostColumnVector hostColumnVector = new HostColumnVector(type, rows, Optional.of(nullCount), data, valid, offsets, hostColumnVectorCoreList); built = true; @@ -902,6 +952,10 @@ private HostColumnVectorCore buildNestedInternal() { for (ColumnBuilder childBuilder : childBuilders) { hostColumnVectorCoreList.add(childBuilder.buildNestedInternal()); } + // Aligns the valid buffer size with other buffers in terms of row size, because it grows lazily. + if (valid != null) { + growValidBuffer(); + } return new HostColumnVectorCore(type, rows, Optional.of(nullCount), data, valid, offsets, hostColumnVectorCoreList); } @@ -929,71 +983,113 @@ public ColumnBuilder appendStructValues(StructData... inputList) { } /** - * A method that is responsible for growing the buffers as needed - * and incrementing the row counts when we append values or nulls. - * @param hasNull indicates whether the validity buffer needs to be considered, as the - * nullcount may not have been fully calculated yet - * @param length used for strings + * Grows valid buffer lazily. 
The valid buffer won't be materialized until the first null + * value appended. This method reuses the rowCapacity to track the sizes of column. + * Therefore, please call specific growBuffer method to update rowCapacity before calling + * this method. */ - private void growBuffersAndRows(boolean hasNull, int length) { + private void growValidBuffer() { + if (valid == null) { + long maskBytes = ColumnView.getValidityBufferSize((int) rowCapacity); + valid = HostMemoryBuffer.allocate(maskBytes); + valid.setMemory(0, valid.length, (byte) 0xFF); + validCapacity = rowCapacity; + return; + } + if (validCapacity < rowCapacity) { + long maskBytes = ColumnView.getValidityBufferSize((int) rowCapacity); + HostMemoryBuffer newValid = HostMemoryBuffer.allocate(maskBytes); + newValid.setMemory(0, newValid.length, (byte) 0xFF); + valid = copyBuffer(newValid, valid); + validCapacity = rowCapacity; + } + } + + /** + * A method automatically grows data buffer for fixed-width columns as needed along with + * incrementing the row counts. Please call this method before appending any value or null. + */ + private void growFixedWidthBuffersAndRows() { assert rows + 1 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; rows++; - long targetDataSize = 0; - if (!type.isNestedType()) { - if (type.equals(DType.STRING)) { - targetDataSize = data == null ? length : currentByteIndex + length; - } else { - targetDataSize = data == null ? 
estimatedRows * type.getSizeInBytes() : rows * type.getSizeInBytes(); - } + if (data == null) { + data = HostMemoryBuffer.allocate(estimatedRows << bitShiftBySize); + rowCapacity = estimatedRows; + } else if (rows > rowCapacity) { + long newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 1); + data = copyBuffer(HostMemoryBuffer.allocate(newCap << bitShiftBySize), data); + rowCapacity = newCap; } + } - if (targetDataSize > 0) { - if (data == null) { - data = HostMemoryBuffer.allocate(targetDataSize); - } else { - long maxLen; - if (type.equals(DType.STRING)) { - maxLen = Integer.MAX_VALUE; - } else { - maxLen = Integer.MAX_VALUE * (long) type.getSizeInBytes(); - } - long oldLen = data.getLength(); - long newDataLen = Math.max(1, oldLen); - while (targetDataSize > newDataLen) { - newDataLen = newDataLen * 2; - } - if (newDataLen != oldLen) { - newDataLen = Math.min(newDataLen, maxLen); - if (newDataLen < targetDataSize) { - throw new IllegalStateException("A data buffer for strings is not supported over 2GB in size"); - } - HostMemoryBuffer newData = HostMemoryBuffer.allocate(newDataLen); - data = copyBuffer(newData, data); - } - } + /** + * A method automatically grows offsets buffer for list columns as needed along with + * incrementing the row counts. Please call this method before appending any value or null. 
+ */ + private void growListBuffersAndRows() { + assert rows + 2 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + + if (offsets == null) { + offsets = HostMemoryBuffer.allocate((estimatedRows + 1) << bitShiftByOffset); + offsets.setInt(0, 0); + rowCapacity = estimatedRows; + } else if (rows > rowCapacity) { + long newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 2); + offsets = copyBuffer(HostMemoryBuffer.allocate((newCap + 1) << bitShiftByOffset), offsets); + rowCapacity = newCap; } - if (type.equals(DType.LIST) || type.equals(DType.STRING)) { - if (offsets == null) { - offsets = HostMemoryBuffer.allocate((estimatedRows + 1) * OFFSET_SIZE); - offsets.setInt(0, 0); - } else if ((rows +1) * OFFSET_SIZE > offsets.length) { - long newOffsetLen = offsets.length * 2; - HostMemoryBuffer newOffsets = HostMemoryBuffer.allocate(newOffsetLen); - offsets = copyBuffer(newOffsets, offsets); - } + } + + /** + * A method automatically grows offsets and data buffer for string columns as needed along with + * incrementing the row counts. Please call this method before appending any value or null. + * + * @param stringLength number of bytes required by the next row + */ + private void growStringBuffersAndRows(int stringLength) { + assert rows + 2 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + + if (offsets == null) { + // Initialize data buffer with at least 1 byte in case the first appended value is null. 
+ data = HostMemoryBuffer.allocate(Math.max(1, stringLength)); + offsets = HostMemoryBuffer.allocate((estimatedRows + 1) << bitShiftByOffset); + offsets.setInt(0, 0); + rowCapacity = estimatedRows; + return; } - if (hasNull || nullCount > 0) { - if (valid == null) { - long targetValidSize = ColumnView.getNativeValidPointerSize((int)estimatedRows); - valid = HostMemoryBuffer.allocate(targetValidSize); - valid.setMemory(0, targetValidSize, (byte) 0xFF); - } else if (valid.length < ColumnView.getNativeValidPointerSize((int)rows)) { - long newValidLen = valid.length * 2; - HostMemoryBuffer newValid = HostMemoryBuffer.allocate(newValidLen); - newValid.setMemory(0, newValidLen, (byte) 0xFF); - valid = copyBuffer(newValid, valid); - } + + if (rows > rowCapacity) { + long newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 2); + offsets = copyBuffer(HostMemoryBuffer.allocate((newCap + 1) << bitShiftByOffset), offsets); + rowCapacity = newCap; + } + + long currentLength = currentStringByteIndex + stringLength; + if (currentLength > data.length) { + long requiredLength = data.length; + do { + requiredLength = requiredLength * 2; + } while (currentLength > requiredLength); + data = copyBuffer(HostMemoryBuffer.allocate(requiredLength), data); + } + } + + /** + * For struct columns, we only need to update rows and rowCapacity (for the growth of + * valid buffer), because struct columns hold no buffer itself. + * Please call this method before appending any value or null. 
+ */ + private void growStructBuffersAndRows() { + assert rows + 1 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + + if (rowCapacity == 0) { + rowCapacity = estimatedRows; + } else if (rows > rowCapacity) { + rowCapacity = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 1); } } @@ -1015,29 +1111,13 @@ private HostMemoryBuffer copyBuffer(HostMemoryBuffer targetBuffer, HostMemoryBuf * Method that sets the null bit in the validity vector * @param index the row index at which the null is marked */ - private void setNullAt(int index) { + private void setNullAt(long index) { assert index < rows : "Index for null value should fit the column with " + rows + " rows"; nullCount += BitVectorHelper.setNullAt(valid, index); } public final ColumnBuilder appendNull() { - growBuffersAndRows(true, 0); - setNullAt(currentIndex); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); - if (type.hasOffsets()) { - if (type.equals(DType.LIST)) { - offsets.setInt(currentIndex * OFFSET_SIZE, childBuilders.get(0).getCurrentIndex()); - } else { - // It is a String - offsets.setInt(currentIndex * OFFSET_SIZE, currentByteIndex); - } - } else if (type.equals(DType.STRUCT)) { - // structs propagate nulls to children and even further down if needed - for (ColumnBuilder childBuilder : childBuilders) { - childBuilder.appendNull(); - } - } + nullHandler.run(); return this; } @@ -1081,7 +1161,7 @@ public ColumnBuilder endStruct() { assert type.equals(DType.STRUCT) : "This only works for structs"; assert allChildrenHaveSameIndex() : "Appending structs data appears to be off " + childBuilders + " should all have the same currentIndex " + type; - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growStructBuffersAndRows(); currentIndex++; return this; } @@ -1095,9 +1175,8 @@ assert allChildrenHaveSameIndex() : "Appending structs data appears to be off " */ public ColumnBuilder endList() { assert type.equals(DType.LIST); - 
growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); - currentIndex++; - offsets.setInt(currentIndex * OFFSET_SIZE, childBuilders.get(0).getCurrentIndex()); + growListBuffersAndRows(); + offsets.setInt(++currentIndex << bitShiftByOffset, childBuilders.get(0).getCurrentIndex()); return this; } @@ -1136,6 +1215,8 @@ private void appendChildOrNull(ColumnBuilder childBuilder, Object listElement) { childBuilder.append((Short) listElement); } else if (listElement instanceof BigDecimal) { childBuilder.append((BigDecimal) listElement); + } else if (listElement instanceof BigInteger) { + childBuilder.append((BigInteger) listElement); } else if (listElement instanceof List) { childBuilder.append((List) listElement); } else if (listElement instanceof StructData) { @@ -1153,102 +1234,88 @@ public void incrCurrentIndex() { } public int getCurrentIndex() { - return currentIndex; + return (int) currentIndex; } + @Deprecated public int getCurrentByteIndex() { - return currentByteIndex; + return currentStringByteIndex; } public final ColumnBuilder append(byte value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByByte(); assert currentIndex < rows; - data.setByte(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setByte(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(short value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByShort(); assert currentIndex < rows; - data.setShort(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setShort(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(int value) { - growBuffersAndRows(false, 
currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByInt(); assert currentIndex < rows; - data.setInt(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setInt(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(long value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByLong(); assert currentIndex < rows; - data.setLong(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setLong(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(float value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.equals(DType.FLOAT32); assert currentIndex < rows; - data.setFloat(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setFloat(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(double value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.equals(DType.FLOAT64); assert currentIndex < rows; - data.setDouble(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setDouble(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(boolean value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.equals(DType.BOOL8); assert currentIndex < rows; - data.setBoolean(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += 
type.getSizeInBytes(); + data.setBoolean(currentIndex++ << bitShiftBySize, value); return this; } - public final ColumnBuilder append(BigDecimal value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + public ColumnBuilder append(BigDecimal value) { + return append(value.setScale(-type.getScale(), RoundingMode.UNNECESSARY).unscaledValue()); + } + + public ColumnBuilder append(BigInteger unscaledVal) { + growFixedWidthBuffersAndRows(); assert currentIndex < rows; - // Rescale input decimal with UNNECESSARY policy, which accepts no precision loss. - BigInteger unscaledVal = value.setScale(-type.getScale(), RoundingMode.UNNECESSARY).unscaledValue(); if (type.typeId == DType.DTypeEnum.DECIMAL32) { - data.setInt(currentIndex * type.getSizeInBytes(), unscaledVal.intValueExact()); + data.setInt(currentIndex++ << bitShiftBySize, unscaledVal.intValueExact()); } else if (type.typeId == DType.DTypeEnum.DECIMAL64) { - data.setLong(currentIndex * type.getSizeInBytes(), unscaledVal.longValueExact()); + data.setLong(currentIndex++ << bitShiftBySize, unscaledVal.longValueExact()); } else if (type.typeId == DType.DTypeEnum.DECIMAL128) { - assert currentIndex < rows; - byte[] unscaledValueBytes = value.unscaledValue().toByteArray(); + byte[] unscaledValueBytes = unscaledVal.toByteArray(); byte[] result = convertDecimal128FromJavaToCudf(unscaledValueBytes); - data.setBytes(currentIndex*DType.DTypeEnum.DECIMAL128.sizeInBytes, result, 0, result.length); - } else { + data.setBytes(currentIndex++ << bitShiftBySize, result, 0, result.length); + } else { throw new IllegalStateException(type + " is not a supported decimal type."); } - currentIndex++; - currentByteIndex += type.getSizeInBytes(); return this; } @@ -1267,14 +1334,13 @@ public ColumnBuilder appendUTF8String(byte[] value, int srcOffset, int length) { assert length >= 0; assert value.length + srcOffset <= length; assert type.equals(DType.STRING) : " type " + type + " is not String"; - 
currentIndex++; - growBuffersAndRows(false, length); - assert currentIndex < rows + 1; + growStringBuffersAndRows(length); + assert currentIndex < rows; if (length > 0) { - data.setBytes(currentByteIndex, value, srcOffset, length); + data.setBytes(currentStringByteIndex, value, srcOffset, length); } - currentByteIndex += length; - offsets.setInt(currentIndex * OFFSET_SIZE, currentByteIndex); + currentStringByteIndex += length; + offsets.setInt(++currentIndex << bitShiftByOffset, currentStringByteIndex); return this; } @@ -1818,7 +1884,7 @@ public final Builder append(HostColumnVector columnVector) { } private void allocateBitmaskAndSetDefaultValues() { - long bitmaskSize = ColumnView.getNativeValidPointerSize((int) rows); + long bitmaskSize = ColumnView.getValidityBufferSize((int) rows); valid = HostMemoryBuffer.allocate(bitmaskSize); valid.setMemory(0, bitmaskSize, (byte) 0xFF); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index a021ded4588..bb0321d0a16 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -616,10 +616,6 @@ private static native long[] conditionalInnerJoinGatherMapsWithCount(long leftTa private static native long[] conditionalFullJoinGatherMaps(long leftTable, long rightTable, long condition) throws CudfException; - private static native long[] conditionalFullJoinGatherMapsWithCount(long leftTable, long rightTable, - long condition, - long rowCount) throws CudfException; - private static native long conditionalLeftSemiJoinRowCount(long leftTable, long rightTable, long condition) throws CudfException; @@ -670,6 +666,32 @@ private static native long[] mixedFullJoinGatherMaps(long leftKeysTable, long ri long leftConditionTable, long rightConditionTable, long condition, boolean compareNullsEqual); + private static native long[] mixedLeftSemiJoinSize(long leftKeysTable, long rightKeysTable, + long leftConditionTable, long 
rightConditionTable, + long condition, boolean compareNullsEqual); + + private static native long[] mixedLeftSemiJoinGatherMap(long leftKeysTable, long rightKeysTable, + long leftConditionTable, long rightConditionTable, + long condition, boolean compareNullsEqual); + + private static native long[] mixedLeftSemiJoinGatherMapWithSize(long leftKeysTable, long rightKeysTable, + long leftConditionTable, long rightConditionTable, + long condition, boolean compareNullsEqual, + long outputRowCount, long matchesColumnView); + + private static native long[] mixedLeftAntiJoinSize(long leftKeysTable, long rightKeysTable, + long leftConditionTable, long rightConditionTable, + long condition, boolean compareNullsEqual); + + private static native long[] mixedLeftAntiJoinGatherMap(long leftKeysTable, long rightKeysTable, + long leftConditionTable, long rightConditionTable, + long condition, boolean compareNullsEqual); + + private static native long[] mixedLeftAntiJoinGatherMapWithSize(long leftKeysTable, long rightKeysTable, + long leftConditionTable, long rightConditionTable, + long condition, boolean compareNullsEqual, + long outputRowCount, long matchesColumnView); + private static native long[] crossJoin(long leftTable, long rightTable) throws CudfException; private static native long[] concatenate(long[] cudfTablePointers) throws CudfException; @@ -2853,7 +2875,7 @@ public static GatherMap[] mixedFullJoinGatherMaps(Table leftKeys, Table rightKey return buildJoinGatherMaps(gatherMapData); } - private GatherMap buildSemiJoinGatherMap(long[] gatherMapData) { + private static GatherMap buildSemiJoinGatherMap(long[] gatherMapData) { long bufferSize = gatherMapData[0]; long leftAddr = gatherMapData[1]; long leftHandle = gatherMapData[2]; @@ -2939,6 +2961,94 @@ public GatherMap conditionalLeftSemiJoinGatherMap(Table rightTable, return buildSemiJoinGatherMap(gatherMapData); } + /** + * Computes output size information for a left semi join between two tables using a mix of + * 
equality and inequality conditions. The entire join condition is assumed to be a logical AND + * of the equality condition and inequality condition. + * NOTE: It is the responsibility of the caller to close the resulting size information object + * or native resources can be leaked! + * @param leftKeys the left table's key columns for the equality condition + * @param rightKeys the right table's key columns for the equality condition + * @param leftConditional the left table's columns needed to evaluate the inequality condition + * @param rightConditional the right table's columns needed to evaluate the inequality condition + * @param condition the inequality condition of the join + * @param nullEquality whether nulls should compare as equal + * @return size information for the join + */ + public static MixedJoinSize mixedLeftSemiJoinSize(Table leftKeys, Table rightKeys, + Table leftConditional, Table rightConditional, + CompiledExpression condition, + NullEquality nullEquality) { + long[] mixedSizeInfo = mixedLeftSemiJoinSize( + leftKeys.getNativeView(), rightKeys.getNativeView(), + leftConditional.getNativeView(), rightConditional.getNativeView(), + condition.getNativeHandle(), nullEquality == NullEquality.EQUAL); + assert mixedSizeInfo.length == 2; + long outputRowCount = mixedSizeInfo[0]; + long matchesColumnHandle = mixedSizeInfo[1]; + return new MixedJoinSize(outputRowCount, new ColumnVector(matchesColumnHandle)); + } + + /** + * Computes the gather map that can be used to manifest the result of a left semi join between + * two tables using a mix of equality and inequality conditions. The entire join condition is + * assumed to be a logical AND of the equality condition and inequality condition. + * A {@link GatherMap} instance will be returned that can be used to gather + * the left table to produce the result of the left semi join. + * It is the responsibility of the caller to close the resulting gather map instances. 
+ * @param leftKeys the left table's key columns for the equality condition + * @param rightKeys the right table's key columns for the equality condition + * @param leftConditional the left table's columns needed to evaluate the inequality condition + * @param rightConditional the right table's columns needed to evaluate the inequality condition + * @param condition the inequality condition of the join + * @param nullEquality whether nulls should compare as equal + * @return left and right table gather maps + */ + public static GatherMap mixedLeftSemiJoinGatherMap(Table leftKeys, Table rightKeys, + Table leftConditional, Table rightConditional, + CompiledExpression condition, + NullEquality nullEquality) { + long[] gatherMapData = mixedLeftSemiJoinGatherMap( + leftKeys.getNativeView(), rightKeys.getNativeView(), + leftConditional.getNativeView(), rightConditional.getNativeView(), + condition.getNativeHandle(), + nullEquality == NullEquality.EQUAL); + return buildSemiJoinGatherMap(gatherMapData); + } + + /** + * Computes the gather map that can be used to manifest the result of a left semi join between + * two tables using a mix of equality and inequality conditions. The entire join condition is + * assumed to be a logical AND of the equality condition and inequality condition. + * A {@link GatherMap} instance will be returned that can be used to gather + * the left table to produce the result of the left semi join. + * It is the responsibility of the caller to close the resulting gather map instances. + * This interface allows passing the size result from + * {@link #mixedLeftSemiJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)} + * when the output size was computed previously. 
+ * @param leftKeys the left table's key columns for the equality condition + * @param rightKeys the right table's key columns for the equality condition + * @param leftConditional the left table's columns needed to evaluate the inequality condition + * @param rightConditional the right table's columns needed to evaluate the inequality condition + * @param condition the inequality condition of the join + * @param nullEquality whether nulls should compare as equal + * @param joinSize mixed join size result + * @return left and right table gather maps + */ + public static GatherMap mixedLeftSemiJoinGatherMap(Table leftKeys, Table rightKeys, + Table leftConditional, Table rightConditional, + CompiledExpression condition, + NullEquality nullEquality, + MixedJoinSize joinSize) { + long[] gatherMapData = mixedLeftSemiJoinGatherMapWithSize( + leftKeys.getNativeView(), rightKeys.getNativeView(), + leftConditional.getNativeView(), rightConditional.getNativeView(), + condition.getNativeHandle(), + nullEquality == NullEquality.EQUAL, + joinSize.getOutputRowCount(), joinSize.getMatches().getNativeView()); + return buildSemiJoinGatherMap(gatherMapData); + } + /** * Computes the gather map that can be used to manifest the result of a left anti-join between * two tables. It is assumed this table instance holds the key columns from the left table, and @@ -3018,6 +3128,94 @@ public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable, return buildSemiJoinGatherMap(gatherMapData); } + /** + * Computes output size information for a left anti join between two tables using a mix of + * equality and inequality conditions. The entire join condition is assumed to be a logical AND + * of the equality condition and inequality condition. + * NOTE: It is the responsibility of the caller to close the resulting size information object + * or native resources can be leaked! 
+ * @param leftKeys the left table's key columns for the equality condition + * @param rightKeys the right table's key columns for the equality condition + * @param leftConditional the left table's columns needed to evaluate the inequality condition + * @param rightConditional the right table's columns needed to evaluate the inequality condition + * @param condition the inequality condition of the join + * @param nullEquality whether nulls should compare as equal + * @return size information for the join + */ + public static MixedJoinSize mixedLeftAntiJoinSize(Table leftKeys, Table rightKeys, + Table leftConditional, Table rightConditional, + CompiledExpression condition, + NullEquality nullEquality) { + long[] mixedSizeInfo = mixedLeftAntiJoinSize( + leftKeys.getNativeView(), rightKeys.getNativeView(), + leftConditional.getNativeView(), rightConditional.getNativeView(), + condition.getNativeHandle(), nullEquality == NullEquality.EQUAL); + assert mixedSizeInfo.length == 2; + long outputRowCount = mixedSizeInfo[0]; + long matchesColumnHandle = mixedSizeInfo[1]; + return new MixedJoinSize(outputRowCount, new ColumnVector(matchesColumnHandle)); + } + + /** + * Computes the gather map that can be used to manifest the result of a left anti join between + * two tables using a mix of equality and inequality conditions. The entire join condition is + * assumed to be a logical AND of the equality condition and inequality condition. + * A {@link GatherMap} instance will be returned that can be used to gather + * the left table to produce the result of the left anti join. + * It is the responsibility of the caller to close the resulting gather map instances. 
+ * @param leftKeys the left table's key columns for the equality condition + * @param rightKeys the right table's key columns for the equality condition + * @param leftConditional the left table's columns needed to evaluate the inequality condition + * @param rightConditional the right table's columns needed to evaluate the inequality condition + * @param condition the inequality condition of the join + * @param nullEquality whether nulls should compare as equal + * @return left and right table gather maps + */ + public static GatherMap mixedLeftAntiJoinGatherMap(Table leftKeys, Table rightKeys, + Table leftConditional, Table rightConditional, + CompiledExpression condition, + NullEquality nullEquality) { + long[] gatherMapData = mixedLeftAntiJoinGatherMap( + leftKeys.getNativeView(), rightKeys.getNativeView(), + leftConditional.getNativeView(), rightConditional.getNativeView(), + condition.getNativeHandle(), + nullEquality == NullEquality.EQUAL); + return buildSemiJoinGatherMap(gatherMapData); + } + + /** + * Computes the gather map that can be used to manifest the result of a left anti join between + * two tables using a mix of equality and inequality conditions. The entire join condition is + * assumed to be a logical AND of the equality condition and inequality condition. + * A {@link GatherMap} instance will be returned that can be used to gather + * the left table to produce the result of the left anti join. + * It is the responsibility of the caller to close the resulting gather map instances. + * This interface allows passing the size result from + * {@link #mixedLeftAntiJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)} + * when the output size was computed previously. 
+ * @param leftKeys the left table's key columns for the equality condition + * @param rightKeys the right table's key columns for the equality condition + * @param leftConditional the left table's columns needed to evaluate the inequality condition + * @param rightConditional the right table's columns needed to evaluate the inequality condition + * @param condition the inequality condition of the join + * @param nullEquality whether nulls should compare as equal + * @param joinSize mixed join size result + * @return left and right table gather maps + */ + public static GatherMap mixedLeftAntiJoinGatherMap(Table leftKeys, Table rightKeys, + Table leftConditional, Table rightConditional, + CompiledExpression condition, + NullEquality nullEquality, + MixedJoinSize joinSize) { + long[] gatherMapData = mixedLeftAntiJoinGatherMapWithSize( + leftKeys.getNativeView(), rightKeys.getNativeView(), + leftConditional.getNativeView(), rightConditional.getNativeView(), + condition.getNativeHandle(), + nullEquality == NullEquality.EQUAL, + joinSize.getOutputRowCount(), joinSize.getMatches().getNativeView()); + return buildSemiJoinGatherMap(gatherMapData); + } + /** * For details about how this method functions refer to * {@link #convertToRowsFixedWidthOptimized()}. diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 2db37d57cbb..00747efff27 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -26,7 +26,7 @@ rapids_cuda_init_architectures(CUDF_JNI) project( CUDF_JNI - VERSION 22.02.00 + VERSION 22.04.00 LANGUAGES C CXX CUDA ) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index d7acaa679f6..a45716a89b3 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include @@ -68,6 +69,32 @@ inline void check_java_exception(JNIEnv *const env) { } } +/** + * @brief Helper to convert a pointer to a jlong. + * + * This is useful when, for instance, converting a cudf::column pointer + * to a jlong, for use in JNI. + */ +template jlong ptr_as_jlong(T *ptr) { + return reinterpret_cast(ptr); +} + +/** + * @brief Helper to release the data held by a unique_ptr, and return + * the pointer as a jlong. + */ +template jlong release_as_jlong(std::unique_ptr &&ptr) { + return ptr_as_jlong(ptr.release()); +} + +/** + * @brief Helper to release the data held by a unique_ptr, and return + * the pointer as a jlong. + */ +template jlong release_as_jlong(std::unique_ptr &ptr) { + return release_as_jlong(std::move(ptr)); +} + class native_jdoubleArray_accessor { public: jdouble *getArrayElements(JNIEnv *const env, jdoubleArray arr) const { @@ -256,6 +283,19 @@ template class nativ J_ARRAY_TYPE get_jArray() { return orig; } + /** + * @brief Conversion to std::vector + * + * @tparam target_t Target data type + * @return std::vector Vector with the copied contents + */ + template std::vector to_vector() const { + std::vector ret; + ret.reserve(size()); + std::copy(begin(), end(), std::back_inserter(ret)); + return ret; + } + /** * @brief if data has been written back into this array, don't commit * it. 
@@ -277,11 +317,34 @@ template class nativ ~native_jArray() { commit(); } }; -typedef native_jArray native_jdoubleArray; -typedef native_jArray native_jlongArray; -typedef native_jArray native_jintArray; -typedef native_jArray native_jbyteArray; -typedef native_jArray native_jbooleanArray; +using native_jdoubleArray = native_jArray; +using native_jlongArray = native_jArray; +using native_jintArray = native_jArray; +using native_jbyteArray = native_jArray; + +/** + * @brief Specialization of native_jArray for jboolean + * + * This class adds special support for conversion to std::vector, where the element + * value is chosen depending on the jboolean value. + */ +struct native_jbooleanArray + : native_jArray { + native_jbooleanArray(JNIEnv *const env, jbooleanArray orig) + : native_jArray(env, orig) {} + + native_jbooleanArray(native_jbooleanArray const &) = delete; + native_jbooleanArray &operator=(native_jbooleanArray const &) = delete; + + template + std::vector transform_if_else(target_t const &if_true, target_t const &if_false) const { + std::vector ret; + ret.reserve(size()); + std::transform(begin(), end(), std::back_inserter(ret), + [&](jboolean const &b) { return b ? if_true : if_false; }); + return ret; + } +}; /** * @brief wrapper around native_jlongArray to make it take pointers instead. @@ -332,10 +395,31 @@ template class native_jpointerArray { T **data() { return reinterpret_cast(wrapped.data()); } + T *const *begin() const { return data(); } + T *const *end() const { return data() + size(); } + const jlongArray get_jArray() const { return wrapped.get_jArray(); } jlongArray get_jArray() { return wrapped.get_jArray(); } + void assert_no_nulls() const { + if (std::any_of(data(), data() + size(), [](T *const ptr) { return ptr == nullptr; })) { + throw_java_exception(env, NPE_CLASS, "pointer is NULL"); + } + } + + /** + * @brief Convert from `T*[]` to `vector`. 
+ */ + std::vector get_dereferenced() const { + assert_no_nulls(); + auto ret = std::vector{}; + ret.reserve(size()); + std::transform(data(), data() + size(), std::back_inserter(ret), + [](T *const &p) { return *p; }); + return ret; + } + /** * @brief if data has been written back into this array, don't commit * it. diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index b0286f9ac27..0e559ad0403 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -33,6 +35,10 @@ #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" +#include "jni_utils.hpp" + +using cudf::jni::ptr_as_jlong; +using cudf::jni::release_as_jlong; extern "C" { @@ -44,13 +50,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_sequence(JNIEnv *env, j cudf::jni::auto_set_device(env); auto initial_val = reinterpret_cast(j_initial_val); auto step = reinterpret_cast(j_step); - std::unique_ptr col; - if (step) { - col = cudf::sequence(row_count, *initial_val, *step); - } else { - col = cudf::sequence(row_count, *initial_val); - } - return reinterpret_cast(col.release()); + return release_as_jlong(step ? cudf::sequence(row_count, *initial_val, *step) : + cudf::sequence(row_count, *initial_val)); } CATCH_STD(env, 0); } @@ -66,13 +67,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_sequences(JNIEnv *env, auto start = reinterpret_cast(j_start_handle); auto size = reinterpret_cast(j_size_handle); auto step = reinterpret_cast(j_step_handle); - std::unique_ptr col; - if (step) { - col = cudf::lists::sequences(*start, *step, *size); - } else { - col = cudf::lists::sequences(*start, *size); - } - return reinterpret_cast(col.release()); + auto ret = + step ? 
cudf::lists::sequences(*start, *step, *size) : cudf::lists::sequences(*start, *size); + return release_as_jlong(ret); } CATCH_STD(env, 0); } @@ -143,12 +140,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow( std::shared_ptr schema = std::make_shared(fields); auto arrow_table = arrow::Table::Make(schema, std::vector>{arrow_array}); - std::unique_ptr table_result = cudf::from_arrow(*(arrow_table)); - std::vector> retCols = table_result->release(); + auto retCols = cudf::from_arrow(*(arrow_table))->release(); if (retCols.size() != 1) { JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Must result in one column", 0); } - return reinterpret_cast(retCols[0].release()); + return release_as_jlong(retCols[0]); } CATCH_STD(env, 0); } @@ -167,14 +163,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenation( cudf::strings::separator_on_nulls::NO; cudf::jni::native_jpointerArray n_cudf_columns(env, column_handles); - std::vector column_views; - std::transform(n_cudf_columns.data(), n_cudf_columns.data() + n_cudf_columns.size(), - std::back_inserter(column_views), - [](auto const &p_column) { return *p_column; }); - - std::unique_ptr result = cudf::strings::concatenate( - cudf::table_view(column_views), separator_scalar, narep_scalar, null_policy); - return reinterpret_cast(result.release()); + auto column_views = n_cudf_columns.get_dereferenced(); + return release_as_jlong(cudf::strings::concatenate( + cudf::table_view(column_views), separator_scalar, narep_scalar, null_policy)); } CATCH_STD(env, 0); } @@ -194,17 +185,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationSepC cudf::strings::separator_on_nulls::NO; cudf::jni::native_jpointerArray n_cudf_columns(env, column_handles); - std::vector column_views; - std::transform(n_cudf_columns.data(), n_cudf_columns.data() + n_cudf_columns.size(), - std::back_inserter(column_views), - [](auto const &p_column) { return *p_column; }); - + auto 
column_views = n_cudf_columns.get_dereferenced(); cudf::column_view *column = reinterpret_cast(sep_handle); cudf::strings_column_view strings_column(*column); - std::unique_ptr result = - cudf::strings::concatenate(cudf::table_view(column_views), strings_column, - separator_narep_scalar, col_narep_scalar, null_policy); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::concatenate(cudf::table_view(column_views), + strings_column, separator_narep_scalar, + col_narep_scalar, null_policy)); } CATCH_STD(env, 0); } @@ -219,14 +205,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_concatListByRow(JNIEnv cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW; cudf::jni::native_jpointerArray n_cudf_columns(env, column_handles); - std::vector column_views; - std::transform(n_cudf_columns.data(), n_cudf_columns.data() + n_cudf_columns.size(), - std::back_inserter(column_views), - [](auto const &p_column) { return *p_column; }); - - std::unique_ptr result = - cudf::lists::concatenate_rows(cudf::table_view(column_views), null_policy); - return reinterpret_cast(result.release()); + auto column_views = n_cudf_columns.get_dereferenced(); + return release_as_jlong( + cudf::lists::concatenate_rows(cudf::table_view(column_views), null_policy)); } CATCH_STD(env, 0); } @@ -238,12 +219,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeList(JNIEnv *env, j JNI_NULL_CHECK(env, handles, "native view handles are null", 0) try { cudf::jni::auto_set_device(env); - std::unique_ptr ret; - cudf::jni::native_jpointerArray children(env, handles); - std::vector children_vector(children.size()); - for (int i = 0; i < children.size(); i++) { - children_vector[i] = *children[i]; - } + auto children = cudf::jni::native_jpointerArray(env, handles); + auto children_vector = children.get_dereferenced(); auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); zero->set_valid_async(true); 
static_cast(zero.get())->set_value(0); @@ -253,8 +230,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeList(JNIEnv *env, j auto offsets = cudf::make_column_from_scalar(*zero, row_count + 1); cudf::data_type n_data_type = cudf::jni::make_data_type(j_type, scale); auto empty_col = cudf::make_empty_column(n_data_type); - ret = cudf::make_lists_column(row_count, std::move(offsets), std::move(empty_col), 0, - rmm::device_buffer()); + return release_as_jlong(cudf::make_lists_column( + row_count, std::move(offsets), std::move(empty_col), 0, rmm::device_buffer())); } else { auto count = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); count->set_valid_async(true); @@ -262,11 +239,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeList(JNIEnv *env, j std::unique_ptr offsets = cudf::sequence(row_count + 1, *zero, *count); auto data_col = cudf::interleave_columns(cudf::table_view(children_vector)); - ret = cudf::make_lists_column(row_count, std::move(offsets), std::move(data_col), 0, - rmm::device_buffer()); + return release_as_jlong(cudf::make_lists_column( + row_count, std::move(offsets), std::move(data_col), 0, rmm::device_buffer())); } - - return reinterpret_cast(ret.release()); } CATCH_STD(env, 0); } @@ -282,10 +257,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeListFromOffsets( CUDF_EXPECTS(offsets_cv->type().id() == cudf::type_id::INT32, "Input offsets does not have type INT32."); - auto result = cudf::make_lists_column(static_cast(row_count), - std::make_unique(*offsets_cv), - std::make_unique(*child_cv), 0, {}); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::make_lists_column( + static_cast(row_count), std::make_unique(*offsets_cv), + std::make_unique(*child_cv), 0, {})); } CATCH_STD(env, 0); } @@ -297,7 +271,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromScalar(JNIEnv *env, try { cudf::jni::auto_set_device(env); auto scalar_val = 
reinterpret_cast(j_scalar); - std::unique_ptr col; if (scalar_val->type().id() == cudf::type_id::STRING) { // Tests fail when using the cudf implementation, complaining no child for string column. // So here take care of the String type itself. @@ -309,17 +282,17 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromScalar(JNIEnv *env, auto str_col = cudf::make_strings_column(row_count, std::move(offsets), std::move(data), 0, std::move(mask_buffer)); - col = cudf::fill(str_col->view(), 0, row_count, *scalar_val); + return release_as_jlong(cudf::fill(str_col->view(), 0, row_count, *scalar_val)); } else if (scalar_val->type().id() == cudf::type_id::STRUCT && row_count == 0) { // Specialize the creation of empty struct column, since libcudf doesn't support it. auto struct_scalar = reinterpret_cast(j_scalar); auto children = cudf::empty_like(struct_scalar->view())->release(); auto mask_buffer = cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED); - col = cudf::make_structs_column(0, std::move(children), 0, std::move(mask_buffer)); + return release_as_jlong( + cudf::make_structs_column(0, std::move(children), 0, std::move(mask_buffer))); } else { - col = cudf::make_column_from_scalar(*scalar_val, row_count); + return release_as_jlong(cudf::make_column_from_scalar(*scalar_val, row_count)); } - return reinterpret_cast(col.release()); } CATCH_STD(env, 0); } @@ -331,19 +304,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_concatenate(JNIEnv *env using cudf::column_view; try { cudf::jni::auto_set_device(env); - cudf::jni::native_jpointerArray columns(env, column_handles); - std::vector columns_vector(columns.size()); - for (int i = 0; i < columns.size(); ++i) { - JNI_NULL_CHECK(env, columns[i], "column to concat is null", 0); - columns_vector[i] = *columns[i]; - } - std::unique_ptr result; - if (columns_vector[0].type().id() == cudf::type_id::LIST) { - result = cudf::lists::detail::concatenate(columns_vector); - } else { - result = 
cudf::concatenate(columns_vector); - } - return reinterpret_cast(result.release()); + auto columns = + cudf::jni::native_jpointerArray{env, column_handles}.get_dereferenced(); + auto const is_lists_column = columns[0].type().id() == cudf::type_id::LIST; + return release_as_jlong(is_lists_column ? cudf::lists::detail::concatenate(columns) : + cudf::concatenate(columns)); } CATCH_STD(env, 0); } @@ -354,16 +319,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_hash(JNIEnv *env, jobje JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0); try { - cudf::jni::native_jpointerArray n_cudf_columns(env, column_handles); - std::vector column_views; - std::transform(n_cudf_columns.data(), n_cudf_columns.data() + n_cudf_columns.size(), - std::back_inserter(column_views), - [](auto const &p_column) { return *p_column; }); - cudf::table_view input_table{column_views}; - - std::unique_ptr result = - cudf::hash(input_table, static_cast(hash_function_id), seed); - return reinterpret_cast(result.release()); + auto column_views = + cudf::jni::native_jpointerArray{env, column_handles}.get_dereferenced(); + return release_as_jlong(cudf::hash(cudf::table_view{column_views}, + static_cast(hash_function_id), seed)); } CATCH_STD(env, 0); } @@ -405,9 +364,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_getNativeColumnView(JNI try { cudf::jni::auto_set_device(env); cudf::column *column = reinterpret_cast(handle); - std::unique_ptr view(new cudf::column_view()); - *view.get() = column->view(); - return reinterpret_cast(view.release()); + return ptr_as_jlong(new cudf::column_view{*column}); } CATCH_STD(env, 0); } @@ -419,9 +376,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeEmptyCudfColumn(JNI try { cudf::jni::auto_set_device(env); cudf::data_type n_data_type = cudf::jni::make_data_type(j_type, scale); - - std::unique_ptr column(cudf::make_empty_column(n_data_type)); - return reinterpret_cast(column.release()); + return 
release_as_jlong(cudf::make_empty_column(n_data_type)); } CATCH_STD(env, 0); } diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index d2cc2ab7d1c..63247eb0066 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -75,6 +75,9 @@ #include "jni_utils.hpp" #include "map_lookup.hpp" +using cudf::jni::ptr_as_jlong; +using cudf::jni::release_as_jlong; + namespace { std::size_t calc_device_memory_size(cudf::column_view const &view) { @@ -107,9 +110,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_upperStrings(JNIEnv *env, cudf::jni::auto_set_device(env); cudf::column_view *column = reinterpret_cast(handle); cudf::strings_column_view strings_column(*column); - - std::unique_ptr result = cudf::strings::to_upper(strings_column); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::to_upper(strings_column)); } CATCH_STD(env, 0); } @@ -122,9 +123,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_lowerStrings(JNIEnv *env, cudf::jni::auto_set_device(env); cudf::column_view *column = reinterpret_cast(handle); cudf::strings_column_view strings_column(*column); - - std::unique_ptr result = cudf::strings::to_lower(strings_column); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::to_lower(strings_column)); } CATCH_STD(env, 0); } @@ -138,8 +137,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNullsScalar(JNIEnv cudf::jni::auto_set_device(env); cudf::column_view col = *reinterpret_cast(j_col); auto val = reinterpret_cast(j_scalar); - std::unique_ptr result = cudf::replace_nulls(col, *val); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::replace_nulls(col, *val)); } CATCH_STD(env, 0); } @@ -153,8 +151,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNullsColumn(JNIEnv cudf::jni::auto_set_device(env); auto col = 
reinterpret_cast(j_col); auto replacements = reinterpret_cast(j_replace_col); - std::unique_ptr result = cudf::replace_nulls(*col, *replacements); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::replace_nulls(*col, *replacements)); } CATCH_STD(env, 0); } @@ -166,9 +163,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNullsPolicy(JNIEnv try { cudf::jni::auto_set_device(env); cudf::column_view col = *reinterpret_cast(j_col); - std::unique_ptr result = cudf::replace_nulls( - col, is_preceding ? cudf::replace_policy::PRECEDING : cudf::replace_policy::FOLLOWING); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::replace_nulls( + col, is_preceding ? cudf::replace_policy::PRECEDING : cudf::replace_policy::FOLLOWING)); } CATCH_STD(env, 0); } @@ -184,8 +180,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_ifElseVV(JNIEnv *env, jcl auto pred_vec = reinterpret_cast(j_pred_vec); auto true_vec = reinterpret_cast(j_true_vec); auto false_vec = reinterpret_cast(j_false_vec); - std::unique_ptr result = cudf::copy_if_else(*true_vec, *false_vec, *pred_vec); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::copy_if_else(*true_vec, *false_vec, *pred_vec)); } CATCH_STD(env, 0); } @@ -201,8 +196,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_ifElseVS(JNIEnv *env, jcl auto pred_vec = reinterpret_cast(j_pred_vec); auto true_vec = reinterpret_cast(j_true_vec); auto false_scalar = reinterpret_cast(j_false_scalar); - std::unique_ptr result = cudf::copy_if_else(*true_vec, *false_scalar, *pred_vec); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::copy_if_else(*true_vec, *false_scalar, *pred_vec)); } CATCH_STD(env, 0); } @@ -219,8 +213,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_ifElseSV(JNIEnv *env, jcl auto pred_vec = reinterpret_cast(j_pred_vec); auto true_scalar = reinterpret_cast(j_true_scalar); auto false_vec = 
reinterpret_cast(j_false_vec); - std::unique_ptr result = cudf::copy_if_else(*true_scalar, *false_vec, *pred_vec); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::copy_if_else(*true_scalar, *false_vec, *pred_vec)); } CATCH_STD(env, 0); } @@ -237,9 +230,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_ifElseSS(JNIEnv *env, jcl auto pred_vec = reinterpret_cast(j_pred_vec); auto true_scalar = reinterpret_cast(j_true_scalar); auto false_scalar = reinterpret_cast(j_false_scalar); - std::unique_ptr result = - cudf::copy_if_else(*true_scalar, *false_scalar, *pred_vec); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::copy_if_else(*true_scalar, *false_scalar, *pred_vec)); } CATCH_STD(env, 0); } @@ -250,8 +241,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getElement(JNIEnv *env, j try { cudf::jni::auto_set_device(env); auto from_vec = reinterpret_cast(from); - std::unique_ptr result = cudf::get_element(*from_vec, index); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::get_element(*from_vec, index)); } CATCH_STD(env, 0); } @@ -266,9 +256,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_reduce(JNIEnv *env, jclas auto col = reinterpret_cast(j_col_view); auto agg = reinterpret_cast(j_agg); cudf::data_type out_dtype = cudf::jni::make_data_type(j_dtype, scale); - - std::unique_ptr result = cudf::reduce(*col, agg->clone(), out_dtype); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::reduce(*col, agg->clone(), out_dtype)); } CATCH_STD(env, 0); } @@ -282,11 +270,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_scan(JNIEnv *env, jclass, cudf::jni::auto_set_device(env); auto col = reinterpret_cast(j_col_view); auto agg = reinterpret_cast(j_agg); - - std::unique_ptr result = cudf::scan( - *col, agg->clone(), is_inclusive ? cudf::scan_type::INCLUSIVE : cudf::scan_type::EXCLUSIVE, - include_nulls ? 
cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE); - return reinterpret_cast(result.release()); + auto scan_type = is_inclusive ? cudf::scan_type::INCLUSIVE : cudf::scan_type::EXCLUSIVE; + auto null_policy = include_nulls ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE; + return release_as_jlong(cudf::scan(*col, agg->clone(), scan_type, null_policy)); } CATCH_STD(env, 0); } @@ -303,8 +289,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_approxPercentile(JNIEnv * auto const tdigest_view = tdigest_column_view{structs_column_view{*reinterpret_cast(input_column)}}; auto const p_percentiles = reinterpret_cast(percentiles_column); - auto result = percentile_approx(tdigest_view, *p_percentiles); - return reinterpret_cast(result.release()); + return release_as_jlong(percentile_approx(tdigest_view, *p_percentiles)); } CATCH_STD(env, 0); } @@ -321,9 +306,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_quantile(JNIEnv *env, jcl native_quantiles.data() + native_quantiles.size()); cudf::column_view *n_input_column = reinterpret_cast(input_column); cudf::interpolation n_quantile_method = static_cast(quantile_method); - std::unique_ptr result = - cudf::quantile(*n_input_column, quantiles, n_quantile_method); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::quantile(*n_input_column, quantiles, n_quantile_method)); } CATCH_STD(env, 0); } @@ -365,7 +348,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_rollingWindow( ret = cudf::rolling_window(*n_input_col, preceding, following, min_periods, *agg); } } - return reinterpret_cast(ret.release()); + return release_as_jlong(ret); } CATCH_STD(env, 0); } @@ -380,22 +363,16 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_slice(JNIEnv *env, j cudf::jni::auto_set_device(env); cudf::column_view *n_column = reinterpret_cast(input_column); cudf::jni::native_jintArray n_slice_indices(env, slice_indices); - - std::vector indices(n_slice_indices.size()); - 
for (int i = 0; i < n_slice_indices.size(); i++) { - indices[i] = n_slice_indices[i]; - } + std::vector indices(n_slice_indices.begin(), n_slice_indices.end()); std::vector result = cudf::slice(*n_column, indices); cudf::jni::native_jlongArray n_result(env, result.size()); - std::vector> column_result(result.size()); - for (size_t i = 0; i < result.size(); i++) { - column_result[i].reset(new cudf::column(result[i])); - n_result[i] = reinterpret_cast(column_result[i].get()); - } - for (size_t i = 0; i < result.size(); i++) { - column_result[i].release(); - } + + std::transform(result.begin(), result.end(), n_result.begin(), + [](cudf::column_view const &result_col) { + return ptr_as_jlong(new cudf::column{result_col}); + }); + return n_result.get_jArray(); } CATCH_STD(env, NULL); @@ -409,9 +386,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_extractListElement(JNIEnv cudf::jni::auto_set_device(env); cudf::column_view *cv = reinterpret_cast(column_view); cudf::lists_column_view lcv(*cv); - - std::unique_ptr ret = cudf::lists::extract_list_element(lcv, index); - return reinterpret_cast(ret.release()); + return release_as_jlong(cudf::lists::extract_list_element(lcv, index)); } CATCH_STD(env, 0); } @@ -423,9 +398,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dropListDuplicates(JNIEnv cudf::jni::auto_set_device(env); cudf::column_view const *cv = reinterpret_cast(column_view); cudf::lists_column_view lcv(*cv); - - std::unique_ptr ret = cudf::lists::drop_list_duplicates(lcv); - return reinterpret_cast(ret.release()); + return release_as_jlong(cudf::lists::drop_list_duplicates(lcv)); } CATCH_STD(env, 0); } @@ -486,10 +459,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dropListDuplicatesWithKey auto out_structs = cudf::make_structs_column(out_child_size, std::move(out_structs_members), 0, {}); - auto result = cudf::make_lists_column(input_cv->size(), std::move(out_offsets), - std::move(out_structs), 0, {}); - - return 
reinterpret_cast(result.release()); + return release_as_jlong(cudf::make_lists_column(input_cv->size(), std::move(out_offsets), + std::move(out_structs), 0, {})); } CATCH_STD(env, 0); } @@ -504,9 +475,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContains(JNIEnv *env, cudf::column_view *cv = reinterpret_cast(column_view); cudf::lists_column_view lcv(*cv); cudf::scalar *lookup_scalar = reinterpret_cast(lookup_key); - - std::unique_ptr ret = cudf::lists::contains(lcv, *lookup_scalar); - return reinterpret_cast(ret.release()); + return release_as_jlong(cudf::lists::contains(lcv, *lookup_scalar)); } CATCH_STD(env, 0); } @@ -518,7 +487,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContainsNulls(JNIEnv cudf::jni::auto_set_device(env); auto cv = reinterpret_cast(column_view); auto lcv = cudf::lists_column_view{*cv}; - return reinterpret_cast(cudf::lists::contains_nulls(lcv).release()); + return release_as_jlong(cudf::lists::contains_nulls(lcv)); } CATCH_STD(env, 0); } @@ -533,9 +502,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContainsColumn(JNIEnv cudf::column_view *cv = reinterpret_cast(column_view); cudf::lists_column_view lcv(*cv); cudf::column_view *lookup_cv = reinterpret_cast(lookup_key_cv); - - std::unique_ptr ret = cudf::lists::contains(lcv, *lookup_cv); - return reinterpret_cast(ret.release()); + return release_as_jlong(cudf::lists::contains(lcv, *lookup_cv)); } CATCH_STD(env, 0); } @@ -553,8 +520,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listIndexOfScalar(JNIEnv auto const lookup_key_scalar = reinterpret_cast(lookup_key); auto const find_option = is_find_first ? 
cudf::lists::duplicate_find_option::FIND_FIRST : cudf::lists::duplicate_find_option::FIND_LAST; - auto result = cudf::lists::index_of(lcv, *lookup_key_scalar, find_option); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::lists::index_of(lcv, *lookup_key_scalar, find_option)); } CATCH_STD(env, 0); } @@ -572,8 +538,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listIndexOfColumn(JNIEnv auto const lookup_key_column = reinterpret_cast(lookup_keys); auto const find_option = is_find_first ? cudf::lists::duplicate_find_option::FIND_FIRST : cudf::lists::duplicate_find_option::FIND_LAST; - auto result = cudf::lists::index_of(lcv, *lookup_key_column, find_option); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::lists::index_of(lcv, *lookup_key_column, find_option)); } CATCH_STD(env, 0); } @@ -588,25 +553,25 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, auto sort_order = is_descending ? cudf::order::DESCENDING : cudf::order::ASCENDING; auto null_order = is_null_smallest ? 
cudf::null_order::BEFORE : cudf::null_order::AFTER; auto *cv = reinterpret_cast(column_view); - auto ret = cudf::lists::sort_lists(cudf::lists_column_view(*cv), sort_order, null_order); - return reinterpret_cast(ret.release()); + return release_as_jlong( + cudf::lists::sort_lists(cudf::lists_column_view(*cv), sort_order, null_order)); } CATCH_STD(env, 0); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, jlong column_view, - jlong delimiter) { + jlong delimiter_ptr, + jint max_split) { JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0); + JNI_NULL_CHECK(env, delimiter_ptr, "string scalar delimiter is null", 0); try { cudf::jni::auto_set_device(env); - cudf::column_view *cv = reinterpret_cast(column_view); - cudf::strings_column_view scv(*cv); - cudf::string_scalar *ss_scalar = reinterpret_cast(delimiter); + cudf::strings_column_view const scv{*reinterpret_cast(column_view)}; + auto delimiter = reinterpret_cast(delimiter_ptr); - std::unique_ptr table_result = cudf::strings::split(scv, *ss_scalar); - return cudf::jni::convert_table_for_return(env, table_result); + return cudf::jni::convert_table_for_return(env, + cudf::strings::split(scv, *delimiter, max_split)); } CATCH_STD(env, 0); } @@ -622,9 +587,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv cudf::column_view *cv = reinterpret_cast(column_view); cudf::strings_column_view scv(*cv); cudf::string_scalar *ss_scalar = reinterpret_cast(delimiter); - - std::unique_ptr ret = cudf::strings::split_record(scv, *ss_scalar, max_split); - return reinterpret_cast(ret.release()); + return release_as_jlong(cudf::strings::split_record(scv, *ss_scalar, max_split)); } CATCH_STD(env, 0); } @@ -639,19 +602,16 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_split(JNIEnv *env, j cudf::jni::auto_set_device(env); cudf::column_view *n_column = 
reinterpret_cast(input_column); cudf::jni::native_jintArray n_split_indices(env, split_indices); - - std::vector indices(n_split_indices.size()); - for (int i = 0; i < n_split_indices.size(); i++) { - indices[i] = n_split_indices[i]; - } + std::vector indices(n_split_indices.begin(), n_split_indices.end()); std::vector result = cudf::split(*n_column, indices); - cudf::jni::native_jlongArray n_result(env, result.size()); - for (size_t i = 0; i < result.size(); i++) { - cudf::column_view const *c = new cudf::column_view(result[i]); - n_result[i] = reinterpret_cast(c); - } + + std::transform(result.begin(), result.end(), n_result.begin(), + [](cudf::column_view const &result_col) { + return ptr_as_jlong(new cudf::column_view{result_col}); + }); + return n_result.get_jArray(); } CATCH_STD(env, NULL); @@ -663,9 +623,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_countElements(JNIEnv *env try { cudf::jni::auto_set_device(env); cudf::column_view *n_column = reinterpret_cast(view_handle); - std::unique_ptr result = - cudf::lists::count_elements(cudf::lists_column_view(*n_column)); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::lists::count_elements(cudf::lists_column_view(*n_column))); } CATCH_STD(env, 0); } @@ -676,9 +634,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_charLengths(JNIEnv *env, try { cudf::jni::auto_set_device(env); cudf::column_view *n_column = reinterpret_cast(view_handle); - std::unique_ptr result = - cudf::strings::count_characters(cudf::strings_column_view(*n_column)); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::count_characters(cudf::strings_column_view(*n_column))); } CATCH_STD(env, 0); } @@ -689,9 +645,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_byteCount(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); cudf::column_view *n_column = reinterpret_cast(view_handle); - std::unique_ptr result = - 
cudf::strings::count_bytes(cudf::strings_column_view(*n_column)); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::count_bytes(cudf::strings_column_view(*n_column))); } CATCH_STD(env, 0); } @@ -712,11 +666,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_findAndReplaceAll(JNIEnv column_view *input_column = reinterpret_cast(input_handle); column_view *old_values_column = reinterpret_cast(old_values_handle); column_view *new_values_column = reinterpret_cast(new_values_handle); - - std::unique_ptr result = - cudf::find_and_replace_all(*input_column, *old_values_column, *new_values_column); - - return reinterpret_cast(result.release()); + return release_as_jlong( + cudf::find_and_replace_all(*input_column, *old_values_column, *new_values_column)); } CATCH_STD(env, 0); } @@ -727,8 +678,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isNullNative(JNIEnv *env, try { cudf::jni::auto_set_device(env); const cudf::column_view *input = reinterpret_cast(handle); - std::unique_ptr ret = cudf::is_null(*input); - return reinterpret_cast(ret.release()); + return release_as_jlong(cudf::is_null(*input)); } CATCH_STD(env, 0); } @@ -739,8 +689,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isNotNullNative(JNIEnv *e try { cudf::jni::auto_set_device(env); const cudf::column_view *input = reinterpret_cast(handle); - std::unique_ptr ret = cudf::is_valid(*input); - return reinterpret_cast(ret.release()); + return release_as_jlong(cudf::is_valid(*input)); } CATCH_STD(env, 0); } @@ -751,8 +700,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isNanNative(JNIEnv *env, try { cudf::jni::auto_set_device(env); const cudf::column_view *input = reinterpret_cast(handle); - std::unique_ptr ret = cudf::is_nan(*input); - return reinterpret_cast(ret.release()); + return release_as_jlong(cudf::is_nan(*input)); } CATCH_STD(env, 0); } @@ -763,8 +711,7 @@ JNIEXPORT jlong JNICALL 
Java_ai_rapids_cudf_ColumnView_isNotNanNative(JNIEnv *en try { cudf::jni::auto_set_device(env); const cudf::column_view *input = reinterpret_cast(handle); - std::unique_ptr ret = cudf::is_not_nan(*input); - return reinterpret_cast(ret.release()); + return release_as_jlong(cudf::is_not_nan(*input)); } CATCH_STD(env, 0); } @@ -777,8 +724,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_unaryOperation(JNIEnv *en cudf::jni::auto_set_device(env); cudf::column_view *input = reinterpret_cast(input_ptr); cudf::unary_operator op = static_cast(int_op); - std::unique_ptr ret = cudf::unary_operation(*input, op); - return reinterpret_cast(ret.release()); + return release_as_jlong(cudf::unary_operation(*input, op)); } CATCH_STD(env, 0); } @@ -791,8 +737,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_round(JNIEnv *env, jclass cudf::jni::auto_set_device(env); cudf::column_view *input = reinterpret_cast(input_ptr); cudf::rounding_method method = static_cast(rounding_method); - std::unique_ptr ret = cudf::round(*input, decimal_places, method); - return reinterpret_cast(ret.release()); + return release_as_jlong(cudf::round(*input, decimal_places, method)); } CATCH_STD(env, 0); } @@ -802,8 +747,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_year(JNIEnv *env, jclass, try { cudf::jni::auto_set_device(env); const cudf::column_view *input = reinterpret_cast(input_ptr); - std::unique_ptr output = cudf::datetime::extract_year(*input); - return reinterpret_cast(output.release()); + return release_as_jlong(cudf::datetime::extract_year(*input)); } CATCH_STD(env, 0); } @@ -813,8 +757,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_month(JNIEnv *env, jclass try { cudf::jni::auto_set_device(env); const cudf::column_view *input = reinterpret_cast(input_ptr); - std::unique_ptr output = cudf::datetime::extract_month(*input); - return reinterpret_cast(output.release()); + return release_as_jlong(cudf::datetime::extract_month(*input)); } CATCH_STD(env, 
0); } @@ -824,8 +767,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_day(JNIEnv *env, jclass, try { cudf::jni::auto_set_device(env); const cudf::column_view *input = reinterpret_cast(input_ptr); - std::unique_ptr output = cudf::datetime::extract_day(*input); - return reinterpret_cast(output.release()); + return release_as_jlong(cudf::datetime::extract_day(*input)); } CATCH_STD(env, 0); } @@ -835,8 +777,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_hour(JNIEnv *env, jclass, try { cudf::jni::auto_set_device(env); const cudf::column_view *input = reinterpret_cast(input_ptr); - std::unique_ptr output = cudf::datetime::extract_hour(*input); - return reinterpret_cast(output.release()); + return release_as_jlong(cudf::datetime::extract_hour(*input)); } CATCH_STD(env, 0); } @@ -847,8 +788,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_minute(JNIEnv *env, jclas try { cudf::jni::auto_set_device(env); const cudf::column_view *input = reinterpret_cast(input_ptr); - std::unique_ptr output = cudf::datetime::extract_minute(*input); - return reinterpret_cast(output.release()); + return release_as_jlong(cudf::datetime::extract_minute(*input)); } CATCH_STD(env, 0); } @@ -859,8 +799,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_second(JNIEnv *env, jclas try { cudf::jni::auto_set_device(env); const cudf::column_view *input = reinterpret_cast(input_ptr); - std::unique_ptr output = cudf::datetime::extract_second(*input); - return reinterpret_cast(output.release()); + return release_as_jlong(cudf::datetime::extract_second(*input)); } CATCH_STD(env, 0); } @@ -871,8 +810,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_weekDay(JNIEnv *env, jcla try { cudf::jni::auto_set_device(env); const cudf::column_view *input = reinterpret_cast(input_ptr); - std::unique_ptr output = cudf::datetime::extract_weekday(*input); - return reinterpret_cast(output.release()); + return release_as_jlong(cudf::datetime::extract_weekday(*input)); } 
CATCH_STD(env, 0); } @@ -883,8 +821,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_lastDayOfMonth(JNIEnv *en try { cudf::jni::auto_set_device(env); const cudf::column_view *input = reinterpret_cast(input_ptr); - std::unique_ptr output = cudf::datetime::last_day_of_month(*input); - return reinterpret_cast(output.release()); + return release_as_jlong(cudf::datetime::last_day_of_month(*input)); } CATCH_STD(env, 0); } @@ -895,8 +832,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dayOfYear(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); const cudf::column_view *input = reinterpret_cast(input_ptr); - std::unique_ptr output = cudf::datetime::day_of_year(*input); - return reinterpret_cast(output.release()); + return release_as_jlong(cudf::datetime::day_of_year(*input)); } CATCH_STD(env, 0); } @@ -907,8 +843,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_quarterOfYear(JNIEnv *env try { cudf::jni::auto_set_device(env); const cudf::column_view *input = reinterpret_cast(input_ptr); - std::unique_ptr output = cudf::datetime::extract_quarter(*input); - return reinterpret_cast(output.release()); + return release_as_jlong(cudf::datetime::extract_quarter(*input)); } CATCH_STD(env, 0); } @@ -922,8 +857,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_addCalendricalMonths(JNIE cudf::jni::auto_set_device(env); const cudf::column_view *ts = reinterpret_cast(ts_ptr); const cudf::column_view *months = reinterpret_cast(months_ptr); - std::unique_ptr output = cudf::datetime::add_calendrical_months(*ts, *months); - return reinterpret_cast(output.release()); + return release_as_jlong(cudf::datetime::add_calendrical_months(*ts, *months)); } CATCH_STD(env, 0); } @@ -934,8 +868,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isLeapYear(JNIEnv *env, j try { cudf::jni::auto_set_device(env); const cudf::column_view *input = reinterpret_cast(input_ptr); - std::unique_ptr output = cudf::datetime::is_leap_year(*input); - return 
reinterpret_cast(output.release()); + return release_as_jlong(cudf::datetime::is_leap_year(*input)); } CATCH_STD(env, 0); } @@ -947,16 +880,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas cudf::jni::auto_set_device(env); cudf::column_view *column = reinterpret_cast(handle); cudf::data_type n_data_type = cudf::jni::make_data_type(type, scale); - std::unique_ptr result; if (n_data_type == column->type()) { - std::unique_ptr copy(new cudf::column(*column)); - return reinterpret_cast(copy.release()); + return ptr_as_jlong(new cudf::column(*column)); } if (n_data_type.id() == cudf::type_id::STRING) { switch (column->type().id()) { - case cudf::type_id::BOOL8: result = cudf::strings::from_booleans(*column); break; + case cudf::type_id::BOOL8: return release_as_jlong(cudf::strings::from_booleans(*column)); case cudf::type_id::FLOAT32: - case cudf::type_id::FLOAT64: result = cudf::strings::from_floats(*column); break; + case cudf::type_id::FLOAT64: return release_as_jlong(cudf::strings::from_floats(*column)); case cudf::type_id::INT8: case cudf::type_id::UINT8: case cudf::type_id::INT16: @@ -964,17 +895,19 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas case cudf::type_id::INT32: case cudf::type_id::UINT32: case cudf::type_id::INT64: - case cudf::type_id::UINT64: result = cudf::strings::from_integers(*column); break; + case cudf::type_id::UINT64: return release_as_jlong(cudf::strings::from_integers(*column)); case cudf::type_id::DECIMAL32: case cudf::type_id::DECIMAL64: - case cudf::type_id::DECIMAL128: result = cudf::strings::from_fixed_point(*column); break; + case cudf::type_id::DECIMAL128: + return release_as_jlong(cudf::strings::from_fixed_point(*column)); default: JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0); } } else if (column->type().id() == cudf::type_id::STRING) { switch (n_data_type.id()) { - case cudf::type_id::BOOL8: result = 
cudf::strings::to_booleans(*column); break; + case cudf::type_id::BOOL8: return release_as_jlong(cudf::strings::to_booleans(*column)); case cudf::type_id::FLOAT32: - case cudf::type_id::FLOAT64: result = cudf::strings::to_floats(*column, n_data_type); break; + case cudf::type_id::FLOAT64: + return release_as_jlong(cudf::strings::to_floats(*column, n_data_type)); case cudf::type_id::INT8: case cudf::type_id::UINT8: case cudf::type_id::INT16: @@ -983,13 +916,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas case cudf::type_id::UINT32: case cudf::type_id::INT64: case cudf::type_id::UINT64: - result = cudf::strings::to_integers(*column, n_data_type); - break; + return release_as_jlong(cudf::strings::to_integers(*column, n_data_type)); case cudf::type_id::DECIMAL32: case cudf::type_id::DECIMAL64: case cudf::type_id::DECIMAL128: - result = cudf::strings::to_fixed_point(*column, n_data_type); - break; + return release_as_jlong(cudf::strings::to_fixed_point(*column, n_data_type)); default: JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0); } } else if (cudf::is_timestamp(n_data_type) && cudf::is_numeric(column->type())) { @@ -1010,7 +941,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas cudf::data_type duration_type = cudf::jni::timestamp_to_duration(n_data_type); cudf::column_view duration_view = cudf::column_view( duration_type, column->size(), column->head(), column->null_mask(), column->null_count()); - result = cudf::cast(duration_view, n_data_type); + return release_as_jlong(cudf::cast(duration_view, n_data_type)); } else if (cudf::is_timestamp(column->type()) && cudf::is_numeric(n_data_type)) { // This is a temporary workaround to allow Java to cast from timestamp types to integral types // without forcing an intermediate duration column to be manifested. 
Ultimately this style of @@ -1018,11 +949,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas cudf::data_type duration_type = cudf::jni::timestamp_to_duration(column->type()); cudf::column_view duration_view = cudf::column_view( duration_type, column->size(), column->head(), column->null_mask(), column->null_count()); - result = cudf::cast(duration_view, n_data_type); + return release_as_jlong(cudf::cast(duration_view, n_data_type)); } else { - result = cudf::cast(*column, n_data_type); + return release_as_jlong(cudf::cast(*column, n_data_type)); } - return reinterpret_cast(result.release()); } CATCH_STD(env, 0); } @@ -1034,9 +964,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitCastTo(JNIEnv *env, jc cudf::jni::auto_set_device(env); cudf::column_view *column = reinterpret_cast(handle); cudf::data_type n_data_type = cudf::jni::make_data_type(type, scale); - std::unique_ptr result = std::make_unique(); - *result = cudf::bit_cast(*column, n_data_type); - return reinterpret_cast(result.release()); + return ptr_as_jlong(new cudf::column_view{cudf::bit_cast(*column, n_data_type)}); } CATCH_STD(env, 0); } @@ -1049,8 +977,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_byteListCast(JNIEnv *env, cudf::jni::auto_set_device(env); cudf::column_view *column = reinterpret_cast(handle); cudf::flip_endianness config(static_cast(endianness_config)); - std::unique_ptr result = byte_cast(*column, config); - return reinterpret_cast(result.release()); + return release_as_jlong(byte_cast(*column, config)); } CATCH_STD(env, 0); } @@ -1066,9 +993,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringTimestampToTimestam cudf::column_view *column = reinterpret_cast(handle); cudf::strings_column_view strings_column(*column); - std::unique_ptr result = cudf::strings::to_timestamps( - strings_column, cudf::data_type(static_cast(time_unit)), format.get()); - return reinterpret_cast(result.release()); + return 
release_as_jlong(cudf::strings::to_timestamps( + strings_column, cudf::data_type(static_cast(time_unit)), format.get())); } CATCH_STD(env, 0); } @@ -1084,10 +1010,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isTimestamp(JNIEnv *env, cudf::jni::native_jstring format(env, formatObj); cudf::column_view *column = reinterpret_cast(handle); cudf::strings_column_view strings_column(*column); - - std::unique_ptr result = - cudf::strings::is_timestamp(strings_column, format.get()); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::is_timestamp(strings_column, format.get())); } CATCH_STD(env, 0); } @@ -1101,9 +1024,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_timestampToStringTimestam cudf::jni::auto_set_device(env); cudf::jni::native_jstring format(env, j_format); cudf::column_view *column = reinterpret_cast(handle); - - std::unique_ptr result = cudf::strings::from_timestamps(*column, format.get()); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::from_timestamps(*column, format.get())); } CATCH_STD(env, 0); } @@ -1133,9 +1054,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_containsVector(JNIEnv *en cudf::jni::auto_set_device(env); cudf::column_view *haystack = reinterpret_cast(j_haystack_handle); cudf::column_view *needle = reinterpret_cast(j_needle_handle); - - std::unique_ptr result = std::move(cudf::contains(*haystack, *needle)); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::contains(*haystack, *needle)); } CATCH_STD(env, 0); } @@ -1148,9 +1067,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_transform(JNIEnv *env, jo cudf::column_view *column = reinterpret_cast(handle); cudf::jni::native_jstring n_j_udf(env, j_udf); std::string n_udf(n_j_udf.get()); - std::unique_ptr result = - cudf::transform(*column, n_udf, cudf::data_type(cudf::type_id::INT32), j_is_ptx); - return reinterpret_cast(result.release()); + return 
release_as_jlong( + cudf::transform(*column, n_udf, cudf::data_type(cudf::type_id::INT32), j_is_ptx)); } CATCH_STD(env, 0); } @@ -1167,9 +1085,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringStartWith(JNIEnv *e cudf::column_view *column_view = reinterpret_cast(j_view_handle); cudf::strings_column_view strings_column(*column_view); cudf::string_scalar *comp_scalar = reinterpret_cast(comp_string); - - std::unique_ptr result = cudf::strings::starts_with(strings_column, *comp_scalar); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::starts_with(strings_column, *comp_scalar)); } CATCH_STD(env, 0); } @@ -1185,9 +1101,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringEndWith(JNIEnv *env cudf::column_view *column_view = reinterpret_cast(j_view_handle); cudf::strings_column_view strings_column(*column_view); cudf::string_scalar *comp_scalar = reinterpret_cast(comp_string); - - std::unique_ptr result = cudf::strings::ends_with(strings_column, *comp_scalar); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::ends_with(strings_column, *comp_scalar)); } CATCH_STD(env, 0); } @@ -1203,9 +1117,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringContains(JNIEnv *en cudf::column_view *column_view = reinterpret_cast(j_view_handle); cudf::strings_column_view strings_column(*column_view); cudf::string_scalar *comp_scalar = reinterpret_cast(comp_string); - - std::unique_ptr result = cudf::strings::contains(strings_column, *comp_scalar); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::contains(strings_column, *comp_scalar)); } CATCH_STD(env, 0); } @@ -1221,9 +1133,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_matchesRe(JNIEnv *env, jo cudf::column_view *column_view = reinterpret_cast(j_view_handle); cudf::strings_column_view strings_column(*column_view); cudf::jni::native_jstring pattern(env, patternObj); - - 
std::unique_ptr result = cudf::strings::matches_re(strings_column, pattern.get()); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::matches_re(strings_column, pattern.get())); } CATCH_STD(env, 0); } @@ -1239,10 +1149,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_containsRe(JNIEnv *env, j cudf::column_view *column_view = reinterpret_cast(j_view_handle); cudf::strings_column_view strings_column(*column_view); cudf::jni::native_jstring pattern(env, patternObj); - - std::unique_ptr result = - cudf::strings::contains_re(strings_column, pattern.get()); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::contains_re(strings_column, pattern.get())); } CATCH_STD(env, 0); } @@ -1260,8 +1167,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVV(JNIEnv *env, j cudf::data_type n_data_type = cudf::jni::make_data_type(out_dtype, scale); cudf::binary_operator op = static_cast(int_op); - std::unique_ptr result = cudf::binary_operation(*lhs, *rhs, op, n_data_type); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::binary_operation(*lhs, *rhs, op, n_data_type)); } CATCH_STD(env, 0); } @@ -1291,8 +1197,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVS(JNIEnv *env, j cudf::data_type n_data_type = cudf::jni::make_data_type(out_dtype, scale); cudf::binary_operator op = static_cast(int_op); - std::unique_ptr result = cudf::binary_operation(*lhs, *rhs, op, n_data_type); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::binary_operation(*lhs, *rhs, op, n_data_type)); } CATCH_STD(env, 0); } @@ -1305,11 +1210,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_substring(JNIEnv *env, jc cudf::jni::auto_set_device(env); cudf::column_view *cv = reinterpret_cast(column_view); cudf::strings_column_view scv(*cv); - - std::unique_ptr result = - (end == -1 ? 
cudf::strings::slice_strings(scv, start) : - cudf::strings::slice_strings(scv, start, end)); - return reinterpret_cast(result.release()); + return release_as_jlong((end == -1 ? cudf::strings::slice_strings(scv, start) : + cudf::strings::slice_strings(scv, start, end))); } CATCH_STD(env, 0); } @@ -1327,9 +1229,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_substringColumn(JNIEnv *e cudf::strings_column_view scv(*cv); cudf::column_view *sc = reinterpret_cast(start_column); cudf::column_view *ec = reinterpret_cast(end_column); - - std::unique_ptr result = cudf::strings::slice_strings(scv, *sc, *ec); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::slice_strings(scv, *sc, *ec)); } CATCH_STD(env, 0); } @@ -1345,9 +1245,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_substringLocate(JNIEnv *e cudf::column_view *cv = reinterpret_cast(column_view); cudf::strings_column_view scv(*cv); cudf::string_scalar *ss_scalar = reinterpret_cast(substring); - - std::unique_ptr result = cudf::strings::find(scv, *ss_scalar, start, end); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::find(scv, *ss_scalar, start, end)); } CATCH_STD(env, 0); } @@ -1364,9 +1262,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplace(JNIEnv *env cudf::strings_column_view scv(*cv); cudf::string_scalar *ss_target = reinterpret_cast(target); cudf::string_scalar *ss_replace = reinterpret_cast(replace); - - std::unique_ptr result = cudf::strings::replace(scv, *ss_target, *ss_replace); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::replace(scv, *ss_target, *ss_replace)); } CATCH_STD(env, 0); } @@ -1380,9 +1276,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_mapLookup(JNIEnv *env, jc cudf::jni::auto_set_device(env); cudf::column_view *cv = reinterpret_cast(map_column_view); cudf::string_scalar *ss_key = reinterpret_cast(lookup_key); - - 
std::unique_ptr result = cudf::jni::map_lookup(*cv, *ss_key); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::jni::map_lookup(*cv, *ss_key)); } CATCH_STD(env, 0); } @@ -1396,9 +1290,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_mapContains(JNIEnv *env, cudf::jni::auto_set_device(env); cudf::column_view *cv = reinterpret_cast(map_column_view); cudf::string_scalar *ss_key = reinterpret_cast(lookup_key); - - std::unique_ptr result = cudf::jni::map_contains(*cv, *ss_key); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::jni::map_contains(*cv, *ss_key)); } CATCH_STD(env, 0); } @@ -1417,10 +1309,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceRegex(JNIEnv *env, cudf::strings_column_view scv(*cv); cudf::jni::native_jstring pattern(env, j_pattern); auto repl = reinterpret_cast(j_repl); - - std::unique_ptr result = - cudf::strings::replace_re(scv, pattern.get(), *repl, j_maxrepl); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::replace_re(scv, pattern.get(), *repl, j_maxrepl)); } CATCH_STD(env, 0); } @@ -1440,10 +1329,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceMultiRegex(JNIEnv cudf::jni::native_jstringArray patterns(env, j_patterns); auto repl_cv = reinterpret_cast(j_repls); cudf::strings_column_view repl_scv(*repl_cv); - - std::unique_ptr result = - cudf::strings::replace_re(scv, patterns.as_cpp_vector(), repl_scv); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::replace_re(scv, patterns.as_cpp_vector(), repl_scv)); } CATCH_STD(env, 0); } @@ -1460,10 +1346,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplaceWithBackrefs cudf::strings_column_view scv(*cv); cudf::jni::native_jstring ss_pattern(env, patternObj); cudf::jni::native_jstring ss_replace(env, replaceObj); - - std::unique_ptr result = - cudf::strings::replace_with_backrefs(scv, ss_pattern.get(), 
ss_replace.get()); - return reinterpret_cast(result.release()); + return release_as_jlong( + cudf::strings::replace_with_backrefs(scv, ss_pattern.get(), ss_replace.get())); } CATCH_STD(env, 0); } @@ -1477,9 +1361,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_zfill(JNIEnv *env, jclass cudf::column_view *cv = reinterpret_cast(column_view); cudf::strings_column_view scv(*cv); cudf::size_type width = reinterpret_cast(j_width); - - std::unique_ptr result = cudf::strings::zfill(scv, width); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::zfill(scv, width)); } CATCH_STD(env, 0); } @@ -1497,9 +1379,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_pad(JNIEnv *env, jclass, cudf::size_type width = reinterpret_cast(j_width); cudf::strings::pad_side side = static_cast(j_side); cudf::jni::native_jstring ss_fill(env, fill_char); - - std::unique_ptr result = cudf::strings::pad(scv, width, side, ss_fill.get()); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::pad(scv, width, side, ss_fill.get())); } CATCH_STD(env, 0); } @@ -1516,9 +1396,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringStrip(JNIEnv *env, cudf::strings_column_view scv(*cv); cudf::strings::strip_type s_striptype = static_cast(strip_type); cudf::string_scalar *ss_tostrip = reinterpret_cast(to_strip); - - std::unique_ptr result = cudf::strings::strip(scv, s_striptype, *ss_tostrip); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::strip(scv, s_striptype, *ss_tostrip)); } CATCH_STD(env, 0); } @@ -1531,13 +1409,12 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_extractRe(JNIEnv *en try { cudf::jni::auto_set_device(env); - cudf::column_view *column_view = reinterpret_cast(j_view_handle); - cudf::strings_column_view strings_column(*column_view); + cudf::strings_column_view const strings_column{ + *reinterpret_cast(j_view_handle)}; cudf::jni::native_jstring 
pattern(env, patternObj); - std::unique_ptr table_result = - cudf::strings::extract(strings_column, pattern.get()); - return cudf::jni::convert_table_for_return(env, table_result); + return cudf::jni::convert_table_for_return( + env, cudf::strings::extract(strings_column, pattern.get())); } CATCH_STD(env, 0); } @@ -1550,8 +1427,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_urlDecode(JNIEnv *env, jc cudf::jni::auto_set_device(env); auto view_ptr = reinterpret_cast(j_view_handle); cudf::strings_column_view strings_view(*view_ptr); - auto result = cudf::strings::url_decode(strings_view); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::url_decode(strings_view)); } CATCH_STD(env, 0); } @@ -1564,8 +1440,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_urlEncode(JNIEnv *env, jc cudf::jni::auto_set_device(env); auto view_ptr = reinterpret_cast(j_view_handle); cudf::strings_column_view strings_view(*view_ptr); - auto result = cudf::strings::url_encode(strings_view); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::url_encode(strings_view)); } CATCH_STD(env, 0); } @@ -1578,8 +1453,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_normalizeNANsAndZeros(JNI JNI_NULL_CHECK(env, input_column, "Input column is null", 0); try { cudf::jni::auto_set_device(env); - return reinterpret_cast( - cudf::normalize_nans_and_zeros(*reinterpret_cast(input_column)).release()); + return release_as_jlong( + cudf::normalize_nans_and_zeros(*reinterpret_cast(input_column))); } CATCH_STD(env, 0); } @@ -1595,17 +1470,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit cudf::jni::native_jpointerArray n_cudf_columns(env, column_handles); if (n_cudf_columns.size() == 0) { - rmm::device_buffer null_mask{}; - copy->set_null_mask(null_mask); - return reinterpret_cast(copy.release()); + copy->set_null_mask({}, 0); + return release_as_jlong(copy); } - std::vector 
column_views; - std::transform(n_cudf_columns.data(), n_cudf_columns.data() + n_cudf_columns.size(), - std::back_inserter(column_views), - [](auto const &p_column) { return *p_column; }); - cudf::table_view input_table{column_views}; - + auto input_table = cudf::table_view{n_cudf_columns.get_dereferenced()}; cudf::binary_operator op = static_cast(bin_op); switch (op) { case cudf::binary_operator::BITWISE_AND: { @@ -1621,7 +1490,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit default: JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Unsupported merge operation", 0); } - return reinterpret_cast(copy.release()); + return release_as_jlong(copy); } CATCH_STD(env, 0); } @@ -1632,11 +1501,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyWithBooleanColumnAsVa JNI_NULL_CHECK(env, validity_column_handle, "Validity column handle is null", 0); try { cudf::jni::auto_set_device(env); - auto const exemplar = *reinterpret_cast(exemplar_handle); auto const validity = *reinterpret_cast(validity_column_handle); - auto deep_copy = cudf::jni::new_column_with_boolean_column_as_validity(exemplar, validity); - return reinterpret_cast(deep_copy.release()); + return release_as_jlong( + cudf::jni::new_column_with_boolean_column_as_validity(exemplar, validity)); } CATCH_STD(env, 0); } @@ -1656,7 +1524,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_makeCudfColumnView( cudf::type_id n_type = static_cast(j_type); cudf::data_type n_data_type = cudf::jni::make_data_type(j_type, scale); - std::unique_ptr ret; void *data = reinterpret_cast(j_data); cudf::bitmask_type *valid = reinterpret_cast(j_valid); if (valid == nullptr) { @@ -1665,7 +1532,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_makeCudfColumnView( if (n_type == cudf::type_id::STRING) { if (size == 0) { - ret.reset( + return ptr_as_jlong( new cudf::column_view(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0)); } else { JNI_NULL_CHECK(env, 
j_offset, "offset is null", 0); @@ -1676,8 +1543,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_makeCudfColumnView( cudf::size_type *offsets = reinterpret_cast(j_offset); cudf::column_view offsets_column(cudf::data_type{cudf::type_id::INT32}, size + 1, offsets); cudf::column_view data_column(cudf::data_type{cudf::type_id::INT8}, j_data_size, data); - ret.reset(new cudf::column_view(cudf::data_type{cudf::type_id::STRING}, size, nullptr, - valid, j_null_count, 0, {offsets_column, data_column})); + return ptr_as_jlong(new cudf::column_view(cudf::data_type{cudf::type_id::STRING}, size, + nullptr, valid, j_null_count, 0, + {offsets_column, data_column})); } } else if (n_type == cudf::type_id::LIST) { JNI_NULL_CHECK(env, j_children, "children of a list are null", 0); @@ -1692,22 +1560,18 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_makeCudfColumnView( } cudf::column_view offsets_column(cudf::data_type{cudf::type_id::INT32}, offsets_size, offsets); - ret.reset(new cudf::column_view(cudf::data_type{cudf::type_id::LIST}, size, nullptr, valid, - j_null_count, 0, {offsets_column, *children[0]})); + return ptr_as_jlong(new cudf::column_view(cudf::data_type{cudf::type_id::LIST}, size, nullptr, + valid, j_null_count, 0, + {offsets_column, *children[0]})); } else if (n_type == cudf::type_id::STRUCT) { JNI_NULL_CHECK(env, j_children, "children of a struct are null", 0); cudf::jni::native_jpointerArray children(env, j_children); - std::vector children_vector(children.size()); - for (int i = 0; i < children.size(); i++) { - children_vector[i] = *children[i]; - } - ret.reset(new cudf::column_view(cudf::data_type{cudf::type_id::STRUCT}, size, nullptr, valid, - j_null_count, 0, children_vector)); + std::vector children_vector = children.get_dereferenced(); + return ptr_as_jlong(new cudf::column_view(cudf::data_type{cudf::type_id::STRUCT}, size, + nullptr, valid, j_null_count, 0, children_vector)); } else { - ret.reset(new cudf::column_view(n_data_type, size, 
data, valid, j_null_count)); + return ptr_as_jlong(new cudf::column_view(n_data_type, size, data, valid, j_null_count)); } - - return reinterpret_cast(ret.release()); } CATCH_STD(env, 0); } @@ -1843,20 +1707,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getChildCvPointer(JNIEnv try { cudf::jni::auto_set_device(env); cudf::column_view *column = reinterpret_cast(handle); - if (column->type().id() == cudf::type_id::LIST) { - std::unique_ptr view = - std::make_unique(*column); - // first child is always offsets which we do not want to get from this call - std::unique_ptr next_view = - std::make_unique(column->child(1 + child_index)); - return reinterpret_cast(next_view.release()); - } else { - std::unique_ptr view = - std::make_unique(*column); - std::unique_ptr next_view = - std::make_unique(column->child(child_index)); - return reinterpret_cast(next_view.release()); - } + auto const is_list = column->type().id() == cudf::type_id::LIST; + auto const child = column->child(child_index + (is_list ? 
1 : 0)); + return ptr_as_jlong(new cudf::column_view(child)); } CATCH_STD(env, 0); } @@ -1872,13 +1725,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeOffsetsAddress(J if (column->size() > 0) { cudf::strings_column_view view = cudf::strings_column_view(*column); cudf::column_view offsets_view = view.offsets(); - result = reinterpret_cast(offsets_view.data()); + result = ptr_as_jlong(offsets_view.data()); } } else if (column->type().id() == cudf::type_id::LIST) { if (column->size() > 0) { cudf::lists_column_view view = cudf::lists_column_view(*column); cudf::column_view offsets_view = view.offsets(); - result = reinterpret_cast(offsets_view.data()); + result = ptr_as_jlong(offsets_view.data()); } } return result; @@ -1917,7 +1770,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeValidityAddress( try { cudf::jni::auto_set_device(env); cudf::column_view *column = reinterpret_cast(handle); - return reinterpret_cast(column->null_mask()); + return ptr_as_jlong(column->null_mask()); } CATCH_STD(env, 0); } @@ -1937,16 +1790,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeValidityLength(J CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeValidPointerSize(JNIEnv *env, - jobject j_object, - jint size) { - try { - cudf::jni::auto_set_device(env); - return static_cast(cudf::bitmask_allocation_size_bytes(size)); - } - CATCH_STD(env, 0); -} - JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getDeviceMemorySize(JNIEnv *env, jclass, jlong handle) { JNI_NULL_CHECK(env, handle, "native handle is null", 0); @@ -1978,10 +1821,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_clamper(JNIEnv *env, jobj cudf::scalar *hi_scalar = reinterpret_cast(j_hi_scalar); cudf::scalar *hi_replace_scalar = reinterpret_cast(j_hi_replace_scalar); - std::unique_ptr result = - clamp(*column_view, *lo_scalar, *lo_replace_scalar, *hi_scalar, *hi_replace_scalar); - - return 
reinterpret_cast(result.release()); + return release_as_jlong( + clamp(*column_view, *lo_scalar, *lo_replace_scalar, *hi_scalar, *hi_replace_scalar)); } CATCH_STD(env, 0); } @@ -1994,8 +1835,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_title(JNIEnv *env, jobjec try { cudf::jni::auto_set_device(env); cudf::column_view *view = reinterpret_cast(handle); - std::unique_ptr result = cudf::strings::title(*view); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::title(*view)); } CATCH_STD(env, 0); } @@ -2011,8 +1851,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_capitalize(JNIEnv *env, j cudf::jni::auto_set_device(env); cudf::column_view *view = reinterpret_cast(strs_handle); cudf::string_scalar *deli = reinterpret_cast(delimiters_handle); - std::unique_ptr result = cudf::strings::capitalize(*view, *deli); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::capitalize(*view, *deli)); } CATCH_STD(env, 0); } @@ -2024,16 +1863,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_makeStructView(JNIEnv *en JNI_NULL_CHECK(env, handles, "native view handles are null", 0) try { cudf::jni::auto_set_device(env); - std::unique_ptr ret; - cudf::jni::native_jpointerArray children(env, handles); - std::vector children_vector(children.size()); - for (int i = 0; i < children.size(); i++) { - children_vector[i] = *children[i]; - } - ret.reset(new cudf::column_view(cudf::data_type{cudf::type_id::STRUCT}, row_count, nullptr, - nullptr, 0, 0, children_vector)); - - return reinterpret_cast(ret.release()); + auto children = cudf::jni::native_jpointerArray{env, handles}; + auto children_vector = children.get_dereferenced(); + return ptr_as_jlong(new cudf::column_view(cudf::data_type{cudf::type_id::STRUCT}, row_count, + nullptr, nullptr, 0, 0, children_vector)); } CATCH_STD(env, 0); } @@ -2045,19 +1878,18 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_nansToNulls(JNIEnv *env, try { 
cudf::jni::auto_set_device(env); - cudf::column_view *view = reinterpret_cast(handle); + auto const input = *reinterpret_cast(handle); // get a new null mask by setting all the nans to null - std::pair, cudf::size_type> pair = - cudf::nans_to_nulls(*view); + auto [new_nullmask, new_null_count] = cudf::nans_to_nulls(input); // create a column_view which is a no-copy wrapper around the original column without the null // mask - std::unique_ptr copy_view( - new cudf::column_view(view->type(), view->size(), view->data())); - // create a column by deep copying the copy_view - std::unique_ptr copy(new cudf::column(*copy_view)); - // set the null mask with nans set to null - copy->set_null_mask(std::move(*pair.first), pair.second); - return reinterpret_cast(copy.release()); + auto const input_without_nullmask = cudf::column_view( + input.type(), input.size(), input.head(), nullptr, 0, input.offset(), + std::vector{input.child_begin(), input.child_end()}); + // create a column by deep copying `input_without_nullmask`. 
+ auto deep_copy = std::make_unique(input_without_nullmask); + deep_copy->set_null_mask(std::move(*new_nullmask), new_null_count); + return release_as_jlong(deep_copy); } CATCH_STD(env, 0) } @@ -2070,8 +1902,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isFloat(JNIEnv *env, jobj try { cudf::jni::auto_set_device(env); cudf::column_view *view = reinterpret_cast(handle); - std::unique_ptr result = cudf::strings::is_float(*view); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::is_float(*view)); } CATCH_STD(env, 0) } @@ -2084,8 +1915,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isInteger(JNIEnv *env, jo try { cudf::jni::auto_set_device(env); cudf::column_view *view = reinterpret_cast(handle); - std::unique_ptr result = cudf::strings::is_integer(*view); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::is_integer(*view)); } CATCH_STD(env, 0) } @@ -2100,8 +1930,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isFixedPoint(JNIEnv *env, cudf::jni::auto_set_device(env); cudf::column_view *view = reinterpret_cast(handle); cudf::data_type fp_dtype = cudf::jni::make_data_type(j_dtype, scale); - std::unique_ptr result = cudf::strings::is_fixed_point(*view, fp_dtype); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::is_fixed_point(*view, fp_dtype)); } CATCH_STD(env, 0) } @@ -2116,8 +1945,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isIntegerWithType(JNIEnv cudf::jni::auto_set_device(env); cudf::column_view *view = reinterpret_cast(handle); cudf::data_type int_dtype = cudf::jni::make_data_type(j_dtype, scale); - std::unique_ptr result = cudf::strings::is_integer(*view, int_dtype); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::is_integer(*view, int_dtype)); } CATCH_STD(env, 0) } @@ -2130,10 +1958,8 @@ JNIEXPORT jlong JNICALL 
Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv try { cudf::jni::auto_set_device(env); - std::unique_ptr ret; cudf::column_view *view = reinterpret_cast(handle); - ret.reset(reinterpret_cast(new cudf::column(*view))); - return reinterpret_cast(ret.release()); + return ptr_as_jlong(new cudf::column(*view)); } CATCH_STD(env, 0) } @@ -2150,10 +1976,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env cudf::column_view *n_column_view = reinterpret_cast(j_view_handle); cudf::strings_column_view n_strings_col_view(*n_column_view); cudf::string_scalar *n_scalar_path = reinterpret_cast(j_scalar_handle); - - auto result = cudf::strings::get_json_object(n_strings_col_view, *n_scalar_path); - - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::get_json_object(n_strings_col_view, *n_scalar_path)); } CATCH_STD(env, 0) } @@ -2179,10 +2002,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringConcatenationListEl cudf::strings_column_view strings_column(*column); cudf::column_view *cv = reinterpret_cast(column_handle); cudf::lists_column_view lcv(*cv); - std::unique_ptr result = + return release_as_jlong( cudf::strings::join_list_elements(lcv, strings_column, separator_narep_scalar, - col_narep_scalar, null_policy, empty_list_output); - return reinterpret_cast(result.release()); + col_narep_scalar, null_policy, empty_list_output)); } CATCH_STD(env, 0); } @@ -2205,9 +2027,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringConcatenationListEl cudf::column_view *cv = reinterpret_cast(column_handle); cudf::lists_column_view lcv(*cv); - std::unique_ptr result = cudf::strings::join_list_elements( - lcv, separator_scalar, narep_scalar, null_policy, empty_list_output); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::strings::join_list_elements(lcv, separator_scalar, narep_scalar, + null_policy, empty_list_output)); } CATCH_STD(env, 0); } @@ -2220,7 +2041,7 @@ 
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_repeatStrings(JNIEnv *env cudf::jni::auto_set_device(env); auto const cv = *reinterpret_cast(strings_handle); auto const strs_col = cudf::strings_column_view(cv); - return reinterpret_cast(cudf::strings::repeat_strings(strs_col, repeat_times).release()); + return release_as_jlong(cudf::strings::repeat_strings(strs_col, repeat_times)); } CATCH_STD(env, 0); } @@ -2237,11 +2058,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_repeatStringsWithColumnRe auto const repeat_times_cv = *reinterpret_cast(repeat_times_handle); if (output_sizes_handle != 0) { auto const output_sizes_cv = *reinterpret_cast(output_sizes_handle); - return reinterpret_cast( - cudf::strings::repeat_strings(strs_col, repeat_times_cv, output_sizes_cv).release()); + return release_as_jlong( + cudf::strings::repeat_strings(strs_col, repeat_times_cv, output_sizes_cv)); } else { - return reinterpret_cast( - cudf::strings::repeat_strings(strs_col, repeat_times_cv).release()); + return release_as_jlong(cudf::strings::repeat_strings(strs_col, repeat_times_cv)); } } CATCH_STD(env, 0); @@ -2260,7 +2080,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_repeatStringsSizes( auto [output_sizes, total_bytes] = cudf::strings::repeat_strings_output_sizes(strs_col, repeat_times_cv); auto results = cudf::jni::native_jlongArray(env, 2); - results[0] = reinterpret_cast(output_sizes.release()); + results[0] = release_as_jlong(output_sizes); results[1] = static_cast(total_bytes); return results.get_jArray(); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 03faf9be021..aeac1856db0 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -43,6 +43,7 @@ #include #include #include +#include #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" @@ -487,7 +488,7 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { arrow::Result 
Read(int64_t nbytes, void *out) override { JNIEnv *env = cudf::jni::get_jni_env(jvm); - jlong ret = read_into(env, reinterpret_cast(out), nbytes); + jlong ret = read_into(env, ptr_as_jlong(out), nbytes); total_read += ret; return ret; } @@ -499,7 +500,7 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { if (!tmp_buffer.ok()) { return tmp_buffer; } - jlong amount_read = read_into(env, reinterpret_cast((*tmp_buffer)->data()), nbytes); + jlong amount_read = read_into(env, ptr_as_jlong((*tmp_buffer)->data()), nbytes); arrow::Status stat = (*tmp_buffer)->Resize(amount_read); if (!stat.ok()) { return stat; @@ -598,37 +599,27 @@ class native_arrow_ipc_reader_handle final { void close() { source->Close(); } }; -/** - * Take a table returned by some operation and turn it into an array of column* so we can track them - * ourselves in java instead of having their life tied to the table. - * @param table_result the table to convert for return - * @param extra_columns columns not in the table that will be added to the result at the end. 
- */ -static jlongArray -convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, - std::vector> &extra_columns) { +jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &&table_result, + std::vector> &&extra_columns) { std::vector> ret = table_result->release(); int table_cols = ret.size(); int num_columns = table_cols + extra_columns.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); - for (int i = 0; i < table_cols; i++) { - outcol_handles[i] = reinterpret_cast(ret[i].release()); - } - for (size_t i = 0; i < extra_columns.size(); i++) { - outcol_handles[i + table_cols] = reinterpret_cast(extra_columns[i].release()); - } + std::transform(ret.begin(), ret.end(), outcol_handles.begin(), + [](auto &col) { return release_as_jlong(col); }); + std::transform(extra_columns.begin(), extra_columns.end(), outcol_handles.begin() + table_cols, + [](auto &col) { return release_as_jlong(col); }); return outcol_handles.get_jArray(); } -jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result) { - std::vector> extra; - return convert_table_for_return(env, table_result, extra); +jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, + std::vector> &&extra_columns) { + return convert_table_for_return(env, std::move(table_result), std::move(extra_columns)); } jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &first_table, std::unique_ptr &second_table) { - std::vector> second_tmp = second_table->release(); - return convert_table_for_return(env, first_table, second_tmp); + return convert_table_for_return(env, first_table, second_table->release()); } // Convert the JNI boolean array of key column sort order to a vector of cudf::order @@ -778,10 +769,10 @@ jlongArray gather_maps_to_java(JNIEnv *env, auto right_map_buffer = std::make_unique(maps.second->release()); cudf::jni::native_jlongArray result(env, 5); result[0] = static_cast(left_map_buffer->size()); - result[1] = 
reinterpret_cast(left_map_buffer->data()); - result[2] = reinterpret_cast(left_map_buffer.release()); - result[3] = reinterpret_cast(right_map_buffer->data()); - result[4] = reinterpret_cast(right_map_buffer.release()); + result[1] = ptr_as_jlong(left_map_buffer->data()); + result[2] = release_as_jlong(left_map_buffer); + result[3] = ptr_as_jlong(right_map_buffer->data()); + result[4] = release_as_jlong(right_map_buffer); return result.get_jArray(); } @@ -796,8 +787,8 @@ jlongArray gather_map_to_java(JNIEnv *env, auto gather_map_buffer = std::make_unique(map->release()); cudf::jni::native_jlongArray result(env, 3); result[0] = static_cast(gather_map_buffer->size()); - result[1] = reinterpret_cast(gather_map_buffer->data()); - result[2] = reinterpret_cast(gather_map_buffer.release()); + result[1] = ptr_as_jlong(gather_map_buffer->data()); + result[2] = release_as_jlong(gather_map_buffer); return result.get_jArray(); } @@ -905,19 +896,18 @@ jlongArray mixed_join_size(JNIEnv *env, jlong j_left_keys, jlong j_right_keys, auto const condition = reinterpret_cast(j_condition); auto const nulls_equal = j_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - std::pair>> join_size_info = + auto [join_size, matches_per_row] = join_size_func(*left_keys, *right_keys, *left_condition, *right_condition, condition->get_top_expression(), nulls_equal); - if (join_size_info.second->size() > std::numeric_limits::max()) { + if (matches_per_row->size() > std::numeric_limits::max()) { throw std::runtime_error("Too many values in device buffer to convert into a column"); } - auto col_size = join_size_info.second->size(); - auto col_data = join_size_info.second->release(); - auto col = std::make_unique(cudf::data_type{cudf::type_id::INT32}, col_size, - std::move(col_data), rmm::device_buffer{}, 0); + auto col_size = static_cast(matches_per_row->size()); + auto col_data = matches_per_row->release(); cudf::jni::native_jlongArray result(env, 2); - result[0] = static_cast(join_size_info.first); - result[1] = reinterpret_cast(col.release()); + result[0] = static_cast(join_size); + result[1] = ptr_as_jlong(new cudf::column{cudf::data_type{cudf::type_id::INT32}, col_size, + std::move(col_data), rmm::device_buffer{}, 0}); return result.get_jArray(); } CATCH_STD(env, NULL); @@ -948,13 +938,37 @@ jlongArray mixed_join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_ CATCH_STD(env, NULL); } +template +jlongArray mixed_join_gather_single_map(JNIEnv *env, jlong j_left_keys, jlong j_right_keys, + jlong j_left_condition, jlong j_right_condition, + jlong j_condition, jboolean j_nulls_equal, T join_func) { + JNI_NULL_CHECK(env, j_left_keys, "left keys table is null", 0); + JNI_NULL_CHECK(env, j_right_keys, "right keys table is null", 0); + JNI_NULL_CHECK(env, j_left_condition, "left condition table is null", 0); + JNI_NULL_CHECK(env, j_right_condition, "right condition table is null", 0); + JNI_NULL_CHECK(env, j_condition, "condition is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const left_keys = reinterpret_cast(j_left_keys); + auto const right_keys = 
reinterpret_cast(j_right_keys); + auto const left_condition = reinterpret_cast(j_left_condition); + auto const right_condition = reinterpret_cast(j_right_condition); + auto const condition = reinterpret_cast(j_condition); + auto const nulls_equal = + j_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + return gather_map_to_java(env, + join_func(*left_keys, *right_keys, *left_condition, *right_condition, + condition->get_top_expression(), nulls_equal)); + } + CATCH_STD(env, NULL); +} + std::pair> get_mixed_size_info(JNIEnv *env, jlong j_output_row_count, jlong j_matches_view) { auto const row_count = static_cast(j_output_row_count); auto const matches = reinterpret_cast(j_matches_view); - return std::pair>( - row_count, cudf::device_span(matches->template data(), - matches->size())); + return std::make_pair(row_count, cudf::device_span( + matches->template data(), matches->size())); } // Returns a table view containing only the columns at the specified indices @@ -986,12 +1000,11 @@ cudf::table_view const get_non_keys_table(cudf::table_view const *t, jlongArray combine_join_results(JNIEnv *env, std::vector> left_cols, std::vector> right_cols) { cudf::jni::native_jlongArray outcol_handles(env, left_cols.size() + right_cols.size()); - auto iter = std::transform( - left_cols.begin(), left_cols.end(), outcol_handles.begin(), - [](std::unique_ptr &col) { return reinterpret_cast(col.release()); }); - std::transform( - right_cols.begin(), right_cols.end(), iter, - [](std::unique_ptr &col) { return reinterpret_cast(col.release()); }); + auto iter = + std::transform(left_cols.begin(), left_cols.end(), outcol_handles.begin(), + [](std::unique_ptr &col) { return release_as_jlong(col); }); + std::transform(right_cols.begin(), right_cols.end(), iter, + [](std::unique_ptr &col) { return release_as_jlong(col); }); return outcol_handles.get_jArray(); } @@ -1014,21 +1027,19 @@ cudf::column_view remove_validity_from_col(cudf::column_view column_view) { return 
cudf::column_view(column_view); } } else { - std::unique_ptr ret; std::vector children; children.reserve(column_view.num_children()); for (auto it = column_view.child_begin(); it != column_view.child_end(); it++) { children.push_back(remove_validity_from_col(*it)); } if (!column_view.nullable() || column_view.null_count() != 0) { - ret.reset(new cudf::column_view(column_view.type(), column_view.size(), nullptr, - column_view.null_mask(), column_view.null_count(), - column_view.offset(), children)); + return cudf::column_view(column_view.type(), column_view.size(), nullptr, + column_view.null_mask(), column_view.null_count(), + column_view.offset(), children); } else { - ret.reset(new cudf::column_view(column_view.type(), column_view.size(), nullptr, nullptr, 0, - column_view.offset(), children)); + return cudf::column_view(column_view.type(), column_view.size(), nullptr, nullptr, 0, + column_view.offset(), children); } - return *ret.release(); } } @@ -1047,6 +1058,10 @@ cudf::table_view remove_validity_if_needed(cudf::table_view *input_table_view) { } // namespace jni } // namespace cudf +using cudf::jni::convert_table_for_return; +using cudf::jni::ptr_as_jlong; +using cudf::jni::release_as_jlong; + extern "C" { // This is a method purely added for testing remove_validity_if_needed method @@ -1059,10 +1074,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_removeNullMasksIfNeeded(J cudf::table m_tbl(result); std::vector> cols = m_tbl.release(); auto results = cudf::jni::native_jlongArray(env, cols.size()); - int i = 0; - for (auto it = cols.begin(); it != cols.end(); it++) { - results[i++] = reinterpret_cast(it->release()); - } + std::transform(cols.begin(), cols.end(), results.begin(), + [](auto &col) { return release_as_jlong(col); }); return results.get_jArray(); } CATCH_STD(env, 0); @@ -1076,12 +1089,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_createCudfTableView(JNIEnv *en cudf::jni::auto_set_device(env); cudf::jni::native_jpointerArray 
n_cudf_columns(env, j_cudf_columns); - std::vector column_views(n_cudf_columns.size()); - for (int i = 0; i < n_cudf_columns.size(); i++) { - column_views[i] = *n_cudf_columns[i]; - } - cudf::table_view *tv = new cudf::table_view(column_views); - return reinterpret_cast(tv); + std::vector column_views = n_cudf_columns.get_dereferenced(); + return ptr_as_jlong(new cudf::table_view(column_views)); } CATCH_STD(env, 0); } @@ -1118,8 +1127,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_columnViewsFromPacked(JNI // In the ideal case we would keep the view where it is at, and pass in a pointer to it // That pointer would then be copied when Java takes ownership of it, but that adds an // extra JNI call that I would like to avoid for performance reasons. - cudf::column_view *cv = new cudf::column_view(table.column(i)); - views[i] = reinterpret_cast(cv); + views[i] = ptr_as_jlong(new cudf::column_view(table.column(i))); } views.commit(); @@ -1157,23 +1165,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_sortOrder(JNIEnv *env, jclass, JNI_ARG_CHECK(env, num_columns_null_smallest == num_columns, "columns and is_descending lengths don't match", 0); - std::vector order(n_is_descending.size()); - for (int i = 0; i < n_is_descending.size(); i++) { - order[i] = n_is_descending[i] ? cudf::order::DESCENDING : cudf::order::ASCENDING; - } - std::vector null_order(n_are_nulls_smallest.size()); - for (int i = 0; i < n_are_nulls_smallest.size(); i++) { - null_order[i] = n_are_nulls_smallest[i] ? 
cudf::null_order::BEFORE : cudf::null_order::AFTER; - } + std::vector order = + n_is_descending.transform_if_else(cudf::order::DESCENDING, cudf::order::ASCENDING); + std::vector null_order = + n_are_nulls_smallest.transform_if_else(cudf::null_order::BEFORE, cudf::null_order::AFTER); - std::vector columns(num_columns); - for (int i = 0; i < num_columns; i++) { - columns[i] = *n_sort_keys_columns[i]; - } - cudf::table_view keys(columns); - - auto sorted_col = cudf::sorted_order(keys, order, null_order); - return reinterpret_cast(sorted_col.release()); + std::vector sort_keys = n_sort_keys_columns.get_dereferenced(); + return release_as_jlong(cudf::sorted_order(cudf::table_view{sort_keys}, order, null_order)); } CATCH_STD(env, 0); } @@ -1207,26 +1205,17 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_orderBy(JNIEnv *env, jcla JNI_ARG_CHECK(env, num_columns_null_smallest == num_columns, "columns and areNullsSmallest lengths don't match", 0); - std::vector order(n_is_descending.size()); - for (int i = 0; i < n_is_descending.size(); i++) { - order[i] = n_is_descending[i] ? cudf::order::DESCENDING : cudf::order::ASCENDING; - } - std::vector null_order(n_are_nulls_smallest.size()); - for (int i = 0; i < n_are_nulls_smallest.size(); i++) { - null_order[i] = n_are_nulls_smallest[i] ? 
cudf::null_order::BEFORE : cudf::null_order::AFTER; - } + std::vector order = + n_is_descending.transform_if_else(cudf::order::DESCENDING, cudf::order::ASCENDING); - std::vector columns(num_columns); - for (int i = 0; i < num_columns; i++) { - columns[i] = *n_sort_keys_columns[i]; - } - cudf::table_view keys(columns); + std::vector null_order = + n_are_nulls_smallest.transform_if_else(cudf::null_order::BEFORE, cudf::null_order::AFTER); - auto sorted_col = cudf::sorted_order(keys, order, null_order); + std::vector sort_keys = n_sort_keys_columns.get_dereferenced(); + auto sorted_col = cudf::sorted_order(cudf::table_view{sort_keys}, order, null_order); - cudf::table_view *input_table = reinterpret_cast(j_input_table); - std::unique_ptr result = cudf::gather(*input_table, sorted_col->view()); - return cudf::jni::convert_table_for_return(env, result); + auto const input_table = reinterpret_cast(j_input_table); + return convert_table_for_return(env, cudf::gather(*input_table, sorted_col->view())); } CATCH_STD(env, NULL); } @@ -1261,28 +1250,14 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass JNI_ARG_CHECK(env, num_columns_null_smallest == num_columns, "columns and areNullsSmallest lengths don't match", NULL); - std::vector indexes(n_sort_key_indexes.size()); - for (int i = 0; i < n_sort_key_indexes.size(); i++) { - indexes[i] = n_sort_key_indexes[i]; - } - std::vector order(n_is_descending.size()); - for (int i = 0; i < n_is_descending.size(); i++) { - order[i] = n_is_descending[i] ? cudf::order::DESCENDING : cudf::order::ASCENDING; - } - std::vector null_order(n_are_nulls_smallest.size()); - for (int i = 0; i < n_are_nulls_smallest.size(); i++) { - null_order[i] = n_are_nulls_smallest[i] ? 
cudf::null_order::BEFORE : cudf::null_order::AFTER; - } - - jsize num_tables = n_table_handles.size(); - std::vector tables; - tables.reserve(num_tables); - for (int i = 0; i < num_tables; i++) { - tables.push_back(*n_table_handles[i]); - } + std::vector indexes = n_sort_key_indexes.to_vector(); + std::vector order = + n_is_descending.transform_if_else(cudf::order::DESCENDING, cudf::order::ASCENDING); + std::vector null_order = + n_are_nulls_smallest.transform_if_else(cudf::null_order::BEFORE, cudf::null_order::AFTER); + std::vector tables = n_table_handles.get_dereferenced(); - std::unique_ptr result = cudf::merge(tables, indexes, order, null_order); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::merge(tables, indexes, order, null_order)); } CATCH_STD(env, NULL); } @@ -1322,9 +1297,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( NULL); } data_types.reserve(n_types.size()); - for (int index = 0; index < n_types.size(); index++) { - data_types.emplace_back(cudf::jni::make_data_type(n_types[index], n_scales[index])); - } + std::transform(n_types.begin(), n_types.end(), n_scales.begin(), + std::back_inserter(data_types), [](auto type, auto scale) { + return cudf::data_type{static_cast(type), scale}; + }); } cudf::jni::native_jstring filename(env, inputfilepath); @@ -1338,14 +1314,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( cudf::jni::native_jstringArray n_false_values(env, false_values); cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); - std::unique_ptr source; - if (read_buffer) { - source.reset(new cudf::io::source_info(reinterpret_cast(buffer), buffer_length)); - } else { - source.reset(new cudf::io::source_info(filename.get())); - } + auto source = read_buffer ? 
cudf::io::source_info{reinterpret_cast(buffer), + static_cast(buffer_length)} : + cudf::io::source_info{filename.get()}; - cudf::io::csv_reader_options opts = cudf::io::csv_reader_options::builder(*source) + cudf::io::csv_reader_options opts = cudf::io::csv_reader_options::builder(source) .delimiter(delim) .header(header_row) .names(n_col_names.as_cpp_vector()) @@ -1360,8 +1333,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( .comment(comment) .build(); - cudf::io::table_with_metadata result = cudf::io::read_csv(opts); - return cudf::jni::convert_table_for_return(env, result.tbl); + return convert_table_for_return(env, cudf::io::read_csv(opts).tbl); } CATCH_STD(env, NULL); } @@ -1398,9 +1370,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( NULL); } data_types.reserve(n_types.size()); - for (int index = 0; index < n_types.size(); index++) { - data_types.emplace_back(cudf::jni::make_data_type(n_types[index], n_scales[index])); - } + std::transform(n_types.begin(), n_types.end(), n_scales.begin(), + std::back_inserter(data_types), [](auto const &type, auto const &scale) { + return cudf::data_type{static_cast(type), scale}; + }); } cudf::jni::native_jstring filename(env, inputfilepath); @@ -1409,14 +1382,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( NULL); } - std::unique_ptr source; - if (read_buffer) { - source.reset(new cudf::io::source_info(reinterpret_cast(buffer), buffer_length)); - } else { - source.reset(new cudf::io::source_info(filename.get())); - } + auto source = read_buffer ? 
cudf::io::source_info{reinterpret_cast(buffer), + static_cast(buffer_length)} : + cudf::io::source_info{filename.get()}; - cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(*source) + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) .lines(static_cast(lines)); @@ -1443,16 +1413,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( // there is no need to re-order columns when inferring schema if (result.metadata.column_names.empty() || n_col_names.size() <= 0) { - return cudf::jni::convert_table_for_return(env, result.tbl); + return convert_table_for_return(env, result.tbl); } else { // json reader will not return the correct column order, // so we need to re-order the column of table according to table meta. // turn name and its index in table into map std::map m; - for (size_t i = 0; i < result.metadata.column_names.size(); i++) { - m.insert(std::make_pair(result.metadata.column_names[i], i)); - } + std::transform(result.metadata.column_names.begin(), result.metadata.column_names.end(), + thrust::make_counting_iterator(0), std::inserter(m, m.end()), + [](auto const &column_name, auto const &index) { + return std::make_pair(column_name, index); + }); auto col_names_vec = n_col_names.as_cpp_vector(); std::vector indices; @@ -1469,11 +1441,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( if (!match) { // can't find some input column names in table meta, return what json reader reads. 
- return cudf::jni::convert_table_for_return(env, result.tbl); + return convert_table_for_return(env, result.tbl); } else { auto tbv = result.tbl->view().select(std::move(indices)); auto table = std::make_unique(tbv); - return cudf::jni::convert_table_for_return(env, table); + return convert_table_for_return(env, table); } } } @@ -1507,21 +1479,17 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env, cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); - std::unique_ptr source; - if (read_buffer) { - source.reset(new cudf::io::source_info(reinterpret_cast(buffer), buffer_length)); - } else { - source.reset(new cudf::io::source_info(filename.get())); - } + auto source = read_buffer ? cudf::io::source_info(reinterpret_cast(buffer), + static_cast(buffer_length)) : + cudf::io::source_info(filename.get()); cudf::io::parquet_reader_options opts = - cudf::io::parquet_reader_options::builder(*source) + cudf::io::parquet_reader_options::builder(source) .columns(n_filter_col_names.as_cpp_vector()) .convert_strings_to_categories(false) .timestamp_type(cudf::data_type(static_cast(unit))) .build(); - cudf::io::table_with_metadata result = cudf::io::read_parquet(opts); - return cudf::jni::convert_table_for_return(env, result.tbl); + return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl); } CATCH_STD(env, NULL); } @@ -1547,13 +1515,13 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96, j_precisions, j_is_map, metadata); - cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); - cudf::jni::native_jstringArray meta_values(env, j_metadata_values); + auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); + auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); std::map kv_metadata; - for (auto i = 0; i < meta_keys.size(); 
++i) { - kv_metadata[meta_keys[i].get()] = meta_values[i].get(); - } + std::transform(meta_keys.begin(), meta_keys.end(), meta_values.begin(), + std::inserter(kv_metadata, kv_metadata.end()), + [](auto const &key, auto const &value) { return std::make_pair(key, value); }); chunked_parquet_writer_options opts = chunked_parquet_writer_options::builder(sink) @@ -1565,7 +1533,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( auto writer_ptr = std::make_unique(opts); cudf::jni::native_parquet_writer_handle *ret = new cudf::jni::native_parquet_writer_handle(std::move(writer_ptr), std::move(data_sink)); - return reinterpret_cast(ret); + return ptr_as_jlong(ret); } CATCH_STD(env, 0) } @@ -1589,13 +1557,13 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96, j_precisions, j_is_map, metadata); - cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); - cudf::jni::native_jstringArray meta_values(env, j_metadata_values); + auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); + auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); std::map kv_metadata; - for (auto i = 0; i < meta_keys.size(); ++i) { - kv_metadata[meta_keys[i].get()] = meta_values[i].get(); - } + std::transform(meta_keys.begin(), meta_keys.end(), meta_values.begin(), + std::inserter(kv_metadata, kv_metadata.end()), + [](auto const &key, auto const &value) { return std::make_pair(key, value); }); sink_info sink{output_path.get()}; chunked_parquet_writer_options opts = @@ -1609,7 +1577,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( auto writer_ptr = std::make_unique(opts); cudf::jni::native_parquet_writer_handle *ret = new cudf::jni::native_parquet_writer_handle(std::move(writer_ptr)); - return reinterpret_cast(ret); + return ptr_as_jlong(ret); } CATCH_STD(env, 
0) } @@ -1679,23 +1647,19 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC( cudf::jni::native_jstringArray n_dec128_col_names(env, dec128_col_names); - std::unique_ptr source; - if (read_buffer) { - source.reset(new cudf::io::source_info(reinterpret_cast(buffer), buffer_length)); - } else { - source.reset(new cudf::io::source_info(filename.get())); - } + auto source = read_buffer ? + cudf::io::source_info(reinterpret_cast(buffer), buffer_length) : + cudf::io::source_info(filename.get()); cudf::io::orc_reader_options opts = - cudf::io::orc_reader_options::builder(*source) + cudf::io::orc_reader_options::builder(source) .columns(n_filter_col_names.as_cpp_vector()) .use_index(false) .use_np_dtypes(static_cast(usingNumPyTypes)) .timestamp_type(cudf::data_type(static_cast(unit))) .decimal128_columns(n_dec128_col_names.as_cpp_vector()) .build(); - cudf::io::table_with_metadata result = cudf::io::read_orc(opts); - return cudf::jni::convert_table_for_return(env, result.tbl); + return convert_table_for_return(env, cudf::io::read_orc(opts).tbl); } CATCH_STD(env, NULL); } @@ -1719,13 +1683,13 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96, j_precisions, j_is_map, metadata); - cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); - cudf::jni::native_jstringArray meta_values(env, j_metadata_values); + auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); + auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); std::map kv_metadata; - for (auto i = 0; i < meta_keys.size(); ++i) { - kv_metadata[meta_keys[i].get()] = meta_values[i].get(); - } + std::transform(meta_keys.begin(), meta_keys.end(), meta_values.begin(), + std::inserter(kv_metadata, kv_metadata.end()), + [](const std::string &k, const std::string &v) { return std::make_pair(k, v); }); std::unique_ptr 
data_sink( new cudf::jni::jni_writer_data_sink(env, consumer)); @@ -1733,13 +1697,13 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink) .metadata(&metadata) .compression(static_cast(j_compression)) - .enable_statistics(true) + .enable_statistics(ORC_STATISTICS_ROW_GROUP) .key_value_metadata(kv_metadata) .build(); auto writer_ptr = std::make_unique(opts); cudf::jni::native_orc_writer_handle *ret = new cudf::jni::native_orc_writer_handle(std::move(writer_ptr), std::move(data_sink)); - return reinterpret_cast(ret); + return ptr_as_jlong(ret); } CATCH_STD(env, 0) } @@ -1764,25 +1728,25 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96, j_precisions, j_is_map, metadata); - cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); - cudf::jni::native_jstringArray meta_values(env, j_metadata_values); + auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); + auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); std::map kv_metadata; - for (auto i = 0; i < meta_keys.size(); ++i) { - kv_metadata[meta_keys[i].get()] = meta_values[i].get(); - } + std::transform(meta_keys.begin(), meta_keys.end(), meta_values.begin(), + std::inserter(kv_metadata, kv_metadata.end()), + [](const std::string &k, const std::string &v) { return std::make_pair(k, v); }); sink_info sink{output_path.get()}; chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink) .metadata(&metadata) .compression(static_cast(j_compression)) - .enable_statistics(true) + .enable_statistics(ORC_STATISTICS_ROW_GROUP) .key_value_metadata(kv_metadata) .build(); auto writer_ptr = std::make_unique(opts); cudf::jni::native_orc_writer_handle *ret = new cudf::jni::native_orc_writer_handle(std::move(writer_ptr)); - return 
reinterpret_cast(ret); + return ptr_as_jlong(ret); } CATCH_STD(env, 0) } @@ -1837,7 +1801,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin(JNIEnv cudf::jni::native_arrow_ipc_writer_handle *ret = new cudf::jni::native_arrow_ipc_writer_handle(col_names.as_cpp_vector(), data_sink); - return reinterpret_cast(ret); + return ptr_as_jlong(ret); } CATCH_STD(env, 0) } @@ -1854,7 +1818,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCFileBegin(JNIEnv * cudf::jni::native_arrow_ipc_writer_handle *ret = new cudf::jni::native_arrow_ipc_writer_handle(col_names.as_cpp_vector(), output_path.get()); - return reinterpret_cast(ret); + return ptr_as_jlong(ret); } CATCH_STD(env, 0) } @@ -1871,13 +1835,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_convertCudfToArrowTable(JNIEnv try { cudf::jni::auto_set_device(env); - std::unique_ptr> result( - new std::shared_ptr(nullptr)); - *result = cudf::to_arrow(*tview, state->get_column_metadata(*tview)); - if (!result->get()) { - return 0; - } - return reinterpret_cast(result.release()); + // The semantics of this function are confusing: + // The return value is a pointer to a heap-allocated shared_ptr. + // i.e. the shared_ptr<> is on the heap. + // The pointer to the shared_ptr<> is returned as a jlong. 
+ using result_t = std::shared_ptr; + + auto result = cudf::to_arrow(*tview, state->get_column_metadata(*tview)); + return ptr_as_jlong(new result_t{result}); } CATCH_STD(env, 0) } @@ -1921,10 +1886,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_readArrowIPCFileBegin(JNIEnv *e try { cudf::jni::auto_set_device(env); cudf::jni::native_jstring input_path(env, j_input_path); - - cudf::jni::native_arrow_ipc_reader_handle *ret = - new cudf::jni::native_arrow_ipc_reader_handle(input_path.get()); - return reinterpret_cast(ret); + return ptr_as_jlong(new cudf::jni::native_arrow_ipc_reader_handle(input_path.get())); } CATCH_STD(env, 0) } @@ -1934,13 +1896,9 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_readArrowIPCBufferBegin(JNIEnv JNI_NULL_CHECK(env, provider, "null provider", 0); try { cudf::jni::auto_set_device(env); - std::shared_ptr data_source( new cudf::jni::jni_arrow_input_stream(env, provider)); - - cudf::jni::native_arrow_ipc_reader_handle *ret = - new cudf::jni::native_arrow_ipc_reader_handle(data_source); - return reinterpret_cast(ret); + return ptr_as_jlong(new cudf::jni::native_arrow_ipc_reader_handle(data_source)); } CATCH_STD(env, 0) } @@ -1957,13 +1915,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readArrowIPCChunkToArrowTable( cudf::jni::auto_set_device(env); // This is a little odd because we have to return a pointer // and arrow wants to deal with shared pointers for everything. - std::unique_ptr> result( - new std::shared_ptr(nullptr)); - *result = state->next(row_target); - if (!result->get()) { - return 0; - } - return reinterpret_cast(result.release()); + auto result = state->next(row_target); + return result ? 
ptr_as_jlong(new std::shared_ptr{result}) : 0; } CATCH_STD(env, 0) } @@ -1989,8 +1942,7 @@ Java_ai_rapids_cudf_Table_convertArrowTableToCudf(JNIEnv *env, jclass, jlong arr try { cudf::jni::auto_set_device(env); - std::unique_ptr result = cudf::from_arrow(*(handle->get())); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::from_arrow(*(handle->get()))); } CATCH_STD(env, 0) } @@ -2175,7 +2127,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftSemiJoin( static_cast(compare_nulls_equal) ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, result); } CATCH_STD(env, NULL); } @@ -2204,7 +2156,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoin( static_cast(compare_nulls_equal) ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, result); } CATCH_STD(env, NULL); } @@ -2595,6 +2547,50 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinGa }); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinSize( + JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, + jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { + return cudf::jni::mixed_join_size( + env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, + j_nulls_equal, + [](cudf::table_view const &left_keys, cudf::table_view const &right_keys, + cudf::table_view const &left_condition, cudf::table_view const &right_condition, + cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { + return cudf::mixed_left_semi_join_size(left_keys, right_keys, left_condition, + right_condition, condition, nulls_equal); + }); +} + +JNIEXPORT jlongArray JNICALL 
Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMap( + JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, + jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { + return cudf::jni::mixed_join_gather_single_map( + env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, + j_nulls_equal, + [](cudf::table_view const &left_keys, cudf::table_view const &right_keys, + cudf::table_view const &left_condition, cudf::table_view const &right_condition, + cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { + return cudf::mixed_left_semi_join(left_keys, right_keys, left_condition, right_condition, + condition, nulls_equal); + }); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMapWithSize( + JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, + jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal, jlong j_output_row_count, + jlong j_matches_view) { + auto size_info = cudf::jni::get_mixed_size_info(env, j_output_row_count, j_matches_view); + return cudf::jni::mixed_join_gather_single_map( + env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, + j_nulls_equal, + [&size_info](cudf::table_view const &left_keys, cudf::table_view const &right_keys, + cudf::table_view const &left_condition, cudf::table_view const &right_condition, + cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { + return cudf::mixed_left_semi_join(left_keys, right_keys, left_condition, right_condition, + condition, nulls_equal, size_info); + }); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) { return cudf::jni::join_gather_single_map( @@ -2643,6 +2639,50 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinGa }); } +JNIEXPORT 
jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinSize( + JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, + jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { + return cudf::jni::mixed_join_size( + env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, + j_nulls_equal, + [](cudf::table_view const &left_keys, cudf::table_view const &right_keys, + cudf::table_view const &left_condition, cudf::table_view const &right_condition, + cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { + return cudf::mixed_left_anti_join_size(left_keys, right_keys, left_condition, + right_condition, condition, nulls_equal); + }); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMap( + JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, + jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { + return cudf::jni::mixed_join_gather_single_map( + env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, + j_nulls_equal, + [](cudf::table_view const &left_keys, cudf::table_view const &right_keys, + cudf::table_view const &left_condition, cudf::table_view const &right_condition, + cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { + return cudf::mixed_left_anti_join(left_keys, right_keys, left_condition, right_condition, + condition, nulls_equal); + }); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMapWithSize( + JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, + jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal, jlong j_output_row_count, + jlong j_matches_view) { + auto size_info = cudf::jni::get_mixed_size_info(env, j_output_row_count, j_matches_view); + return cudf::jni::mixed_join_gather_single_map( + env, j_left_keys, j_right_keys, j_left_condition, 
j_right_condition, j_condition, + j_nulls_equal, + [&size_info](cudf::table_view const &left_keys, cudf::table_view const &right_keys, + cudf::table_view const &left_condition, cudf::table_view const &right_condition, + cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { + return cudf::mixed_left_anti_join(left_keys, right_keys, left_condition, right_condition, + condition, nulls_equal, size_info); + }); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv *env, jclass, jlong left_table, jlong right_table) { @@ -2651,12 +2691,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); - cudf::table_view *n_left_table = reinterpret_cast(left_table); - cudf::table_view *n_right_table = reinterpret_cast(right_table); - - std::unique_ptr result = cudf::cross_join(*n_left_table, *n_right_table); - - return cudf::jni::convert_table_for_return(env, result); + auto const left = reinterpret_cast(left_table); + auto const right = reinterpret_cast(right_table); + return convert_table_for_return(env, cudf::cross_join(*left, *right)); } CATCH_STD(env, NULL); } @@ -2668,8 +2705,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_interleaveColumns(JNIEnv *env, try { cudf::jni::auto_set_device(env); cudf::table_view *table_view = reinterpret_cast(j_cudf_table_view); - std::unique_ptr result = cudf::interleave_columns(*table_view); - return reinterpret_cast(result.release()); + return release_as_jlong(cudf::interleave_columns(*table_view)); } CATCH_STD(env, 0); } @@ -2680,18 +2716,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_concatenate(JNIEnv *env, try { cudf::jni::auto_set_device(env); cudf::jni::native_jpointerArray tables(env, table_handles); - - int num_tables = tables.size(); - // There are some issues with table_view and std::vector. We cannot give the - // vector a size or it will not compile. 
- std::vector to_concat; - to_concat.reserve(num_tables); - for (int i = 0; i < num_tables; i++) { - JNI_NULL_CHECK(env, tables[i], "input table included a null", NULL); - to_concat.push_back(*tables[i]); - } - std::unique_ptr table_result = cudf::concatenate(to_concat); - return cudf::jni::convert_table_for_return(env, table_result); + std::vector const to_concat = tables.get_dereferenced(); + return convert_table_for_return(env, cudf::concatenate(to_concat)); } CATCH_STD(env, NULL); } @@ -2709,20 +2735,19 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_partition(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); - cudf::table_view *n_input_table = reinterpret_cast(input_table); - cudf::column_view *n_part_column = reinterpret_cast(partition_column); - cudf::jni::native_jintArray n_output_offsets(env, output_offsets); + auto const n_input_table = reinterpret_cast(input_table); + auto const n_part_column = reinterpret_cast(partition_column); - auto result = cudf::partition(*n_input_table, *n_part_column, number_of_partitions); + auto [partitioned_table, partition_offsets] = + cudf::partition(*n_input_table, *n_part_column, number_of_partitions); - for (size_t i = 0; i < result.second.size() - 1; i++) { - // for what ever reason partition returns the length of the result at then - // end and hash partition/round robin do not, so skip the last entry for - // consistency - n_output_offsets[i] = result.second[i]; - } + // for what ever reason partition returns the length of the result at then + // end and hash partition/round robin do not, so skip the last entry for + // consistency + cudf::jni::native_jintArray n_output_offsets(env, output_offsets); + std::copy(partition_offsets.begin(), partition_offsets.end() - 1, n_output_offsets.begin()); - return cudf::jni::convert_table_for_return(env, result.first); + return convert_table_for_return(env, partitioned_table); } CATCH_STD(env, NULL); } @@ -2738,26 +2763,21 @@ JNIEXPORT jlongArray JNICALL 
Java_ai_rapids_cudf_Table_hashPartition( try { cudf::jni::auto_set_device(env); - cudf::hash_id hash_func = static_cast(hash_function); - cudf::table_view *n_input_table = reinterpret_cast(input_table); + auto const hash_func = static_cast(hash_function); + auto const n_input_table = reinterpret_cast(input_table); cudf::jni::native_jintArray n_columns_to_hash(env, columns_to_hash); - cudf::jni::native_jintArray n_output_offsets(env, output_offsets); - JNI_ARG_CHECK(env, n_columns_to_hash.size() > 0, "columns_to_hash is zero", NULL); - std::vector columns_to_hash_vec(n_columns_to_hash.size()); - for (int i = 0; i < n_columns_to_hash.size(); i++) { - columns_to_hash_vec[i] = n_columns_to_hash[i]; - } + std::vector columns_to_hash_vec(n_columns_to_hash.begin(), + n_columns_to_hash.end()); - std::pair, std::vector> result = + auto [partitioned_table, partition_offsets] = cudf::hash_partition(*n_input_table, columns_to_hash_vec, number_of_partitions, hash_func); - for (size_t i = 0; i < result.second.size(); i++) { - n_output_offsets[i] = result.second[i]; - } + cudf::jni::native_jintArray n_output_offsets(env, output_offsets); + std::copy(partition_offsets.begin(), partition_offsets.end(), n_output_offsets.begin()); - return cudf::jni::convert_table_for_return(env, result.first); + return convert_table_for_return(env, partitioned_table); } CATCH_STD(env, NULL); } @@ -2773,15 +2793,14 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_roundRobinPartition( try { cudf::jni::auto_set_device(env); auto n_input_table = reinterpret_cast(input_table); - cudf::jni::native_jintArray n_output_offsets(env, output_offsets); - auto result = cudf::round_robin_partition(*n_input_table, num_partitions, start_partition); + auto [partitioned_table, partition_offsets] = + cudf::round_robin_partition(*n_input_table, num_partitions, start_partition); - for (size_t i = 0; i < result.second.size(); i++) { - n_output_offsets[i] = result.second[i]; - } + cudf::jni::native_jintArray 
n_output_offsets(env, output_offsets); + std::copy(partition_offsets.begin(), partition_offsets.end(), n_output_offsets.begin()); - return cudf::jni::convert_table_for_return(env, result.first); + return convert_table_for_return(env, partitioned_table); } CATCH_STD(env, NULL); } @@ -2851,7 +2870,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByAggregate( result_columns.push_back(std::move(result.second[agg_result_index].results[col_agg_index])); } } - return cudf::jni::convert_table_for_return(env, result.first, result_columns); + return convert_table_for_return(env, result.first, std::move(result_columns)); } CATCH_STD(env, NULL); } @@ -2921,7 +2940,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByScan( result_columns.push_back(std::move(result.second[agg_result_index].results[col_agg_index])); } } - return cudf::jni::convert_table_for_return(env, result.first, result_columns); + return convert_table_for_return(env, result.first, std::move(result_columns)); } CATCH_STD(env, NULL); } @@ -2963,17 +2982,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByReplaceNulls( } cudf::table_view n_replace_table(n_replace_cols); - std::vector policies; - policies.reserve(n_is_preceding.size()); - for (int i = 0; i < n_is_preceding.size(); i++) { - policies.push_back(n_is_preceding[i] ? 
cudf::replace_policy::PRECEDING : - cudf::replace_policy::FOLLOWING); - } - - std::pair, std::unique_ptr> result = - grouper.replace_nulls(n_replace_table, policies); + std::vector policies = n_is_preceding.transform_if_else( + cudf::replace_policy::PRECEDING, cudf::replace_policy::FOLLOWING); - return cudf::jni::convert_table_for_return(env, result.first, result.second); + auto [keys, results] = grouper.replace_nulls(n_replace_table, policies); + return convert_table_for_return(env, keys, results); } CATCH_STD(env, NULL); } @@ -2984,10 +2997,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_filter(JNIEnv *env, jclas JNI_NULL_CHECK(env, mask_jcol, "mask column is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input = reinterpret_cast(input_jtable); - cudf::column_view *mask = reinterpret_cast(mask_jcol); - std::unique_ptr result = cudf::apply_boolean_mask(*input, *mask); - return cudf::jni::convert_table_for_return(env, result); + auto const input = reinterpret_cast(input_jtable); + auto const mask = reinterpret_cast(mask_jcol); + return convert_table_for_return(env, cudf::apply_boolean_mask(*input, *mask)); } CATCH_STD(env, 0); } @@ -3013,7 +3025,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates( nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL, nulls_before ? 
cudf::null_order::BEFORE : cudf::null_order::AFTER, rmm::mr::get_current_device_resource()); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, result); } CATCH_STD(env, 0); } @@ -3024,12 +3036,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas JNI_NULL_CHECK(env, j_map, "map column is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input = reinterpret_cast(j_input); - cudf::column_view *map = reinterpret_cast(j_map); + auto const input = reinterpret_cast(j_input); + auto const map = reinterpret_cast(j_map); auto bounds_policy = check_bounds ? cudf::out_of_bounds_policy::NULLIFY : cudf::out_of_bounds_policy::DONT_CHECK; - std::unique_ptr result = cudf::gather(*input, *map, bounds_policy); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::gather(*input, *map, bounds_policy)); } CATCH_STD(env, 0); } @@ -3040,14 +3051,13 @@ Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, try { cudf::jni::auto_set_device(env); - cudf::table_view *n_input_table = reinterpret_cast(input_table); + auto const n_input_table = reinterpret_cast(input_table); std::vector> cols = cudf::jni::convert_to_rows_fixed_width_optimized(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); - for (int i = 0; i < num_columns; i++) { - outcol_handles[i] = reinterpret_cast(cols[i].release()); - } + std::transform(cols.begin(), cols.end(), outcol_handles.begin(), + [](auto &col) { return release_as_jlong(col); }); return outcol_handles.get_jArray(); } CATCH_STD(env, 0); @@ -3065,8 +3075,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterTable(JNIEnv *env, auto const input = reinterpret_cast(j_input); auto const map = reinterpret_cast(j_map); auto const target = reinterpret_cast(j_target); - auto result = cudf::scatter(*input, *map, *target, 
check_bounds); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::scatter(*input, *map, *target, check_bounds)); } CATCH_STD(env, 0); } @@ -3082,13 +3091,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterScalars(JNIEnv *en cudf::jni::auto_set_device(env); auto const scalars_array = cudf::jni::native_jpointerArray(env, j_input); std::vector> input; - for (int i = 0; i < scalars_array.size(); ++i) { - input.emplace_back(*scalars_array[i]); - } + std::transform(scalars_array.begin(), scalars_array.end(), std::back_inserter(input), + [](auto &scalar) { return std::ref(*scalar); }); auto const map = reinterpret_cast(j_map); auto const target = reinterpret_cast(j_target); - auto result = cudf::scatter(input, *map, *target, check_bounds); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::scatter(input, *map, *target, check_bounds)); } CATCH_STD(env, 0); } @@ -3099,13 +3106,12 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env try { cudf::jni::auto_set_device(env); - cudf::table_view *n_input_table = reinterpret_cast(input_table); + auto const n_input_table = reinterpret_cast(input_table); std::vector> cols = cudf::jni::convert_to_rows(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); - for (int i = 0; i < num_columns; i++) { - outcol_handles[i] = reinterpret_cast(cols[i].release()); - } + std::transform(cols.begin(), cols.end(), outcol_handles.begin(), + [](auto &col) { return release_as_jlong(col); }); return outcol_handles.get_jArray(); } CATCH_STD(env, 0); @@ -3118,17 +3124,19 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidth try { cudf::jni::auto_set_device(env); - cudf::column_view *input = reinterpret_cast(input_column); - cudf::lists_column_view list_input(*input); + cudf::lists_column_view const 
list_input{*reinterpret_cast(input_column)}; cudf::jni::native_jintArray n_types(env, types); cudf::jni::native_jintArray n_scale(env, scale); - std::vector types_vec; - for (int i = 0; i < n_types.size(); i++) { - types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); + if (n_types.size() != n_scale.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", + NULL); } + std::vector types_vec; + std::transform(n_types.begin(), n_types.end(), n_scale.begin(), std::back_inserter(types_vec), + [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); std::unique_ptr result = cudf::jni::convert_from_rows_fixed_width_optimized(list_input, types_vec); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, result); } CATCH_STD(env, 0); } @@ -3142,16 +3150,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e try { cudf::jni::auto_set_device(env); - cudf::column_view *input = reinterpret_cast(input_column); - cudf::lists_column_view list_input(*input); + cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; cudf::jni::native_jintArray n_types(env, types); cudf::jni::native_jintArray n_scale(env, scale); - std::vector types_vec; - for (int i = 0; i < n_types.size(); i++) { - types_vec.emplace_back(cudf::jni::make_data_type(n_types[i], n_scale[i])); + if (n_types.size() != n_scale.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", + NULL); } + std::vector types_vec; + std::transform(n_types.begin(), n_types.end(), n_scale.begin(), std::back_inserter(types_vec), + [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); std::unique_ptr result = cudf::jni::convert_from_rows(list_input, types_vec); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, result); } CATCH_STD(env, 
0); } @@ -3162,9 +3172,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_repeatStaticCount(JNIEnv JNI_NULL_CHECK(env, input_jtable, "input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input = reinterpret_cast(input_jtable); - std::unique_ptr result = cudf::repeat(*input, count); - return cudf::jni::convert_table_for_return(env, result); + auto const input = reinterpret_cast(input_jtable); + return convert_table_for_return(env, cudf::repeat(*input, count)); } CATCH_STD(env, 0); } @@ -3177,10 +3186,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_repeatColumnCount(JNIEnv JNI_NULL_CHECK(env, count_jcol, "count column is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input = reinterpret_cast(input_jtable); - cudf::column_view *count = reinterpret_cast(count_jcol); - std::unique_ptr result = cudf::repeat(*input, *count, check_count); - return cudf::jni::convert_table_for_return(env, result); + auto const input = reinterpret_cast(input_jtable); + auto const count = reinterpret_cast(count_jcol); + return convert_table_for_return(env, cudf::repeat(*input, *count, check_count)); } CATCH_STD(env, 0); } @@ -3201,25 +3209,17 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_bound(JNIEnv *env, jclass, jlo cudf::jni::native_jbooleanArray const n_desc_flags(env, desc_flags); cudf::jni::native_jbooleanArray const n_are_nulls_smallest(env, are_nulls_smallest); - std::vector column_desc_flags(n_desc_flags.size()); - std::vector column_null_orders(n_are_nulls_smallest.size()); + std::vector column_desc_flags{ + n_desc_flags.transform_if_else(cudf::order::DESCENDING, cudf::order::ASCENDING)}; + std::vector column_null_orders{ + n_are_nulls_smallest.transform_if_else(cudf::null_order::BEFORE, cudf::null_order::AFTER)}; JNI_ARG_CHECK(env, (column_desc_flags.size() == column_null_orders.size()), "null-order and sort-order size mismatch", 0); - size_t num_columns = column_null_orders.size(); - for (size_t i = 
0; i < num_columns; i++) { - column_desc_flags[i] = n_desc_flags[i] ? cudf::order::DESCENDING : cudf::order::ASCENDING; - column_null_orders[i] = - n_are_nulls_smallest[i] ? cudf::null_order::BEFORE : cudf::null_order::AFTER; - } - std::unique_ptr result; - if (is_upper_bound) { - result = std::move(cudf::upper_bound(*input, *values, column_desc_flags, column_null_orders)); - } else { - result = std::move(cudf::lower_bound(*input, *values, column_desc_flags, column_null_orders)); - } - return reinterpret_cast(result.release()); + return release_as_jlong( + is_upper_bound ? cudf::upper_bound(*input, *values, column_desc_flags, column_null_orders) : + cudf::lower_bound(*input, *values, column_desc_flags, column_null_orders)); } CATCH_STD(env, 0); } @@ -3294,18 +3294,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rollingWindowAggregate( int agg_column_index = values[i]; if (default_output[i] != nullptr) { - result_columns.emplace_back(std::move(cudf::grouped_rolling_window( + result_columns.emplace_back(cudf::grouped_rolling_window( groupby_keys, input_table->column(agg_column_index), *default_output[i], preceding[i], - following[i], min_periods[i], *agg))); + following[i], min_periods[i], *agg)); } else { - result_columns.emplace_back(std::move( + result_columns.emplace_back( cudf::grouped_rolling_window(groupby_keys, input_table->column(agg_column_index), - preceding[i], following[i], min_periods[i], *agg))); + preceding[i], following[i], min_periods[i], *agg)); } } auto result_table = std::make_unique(std::move(result_columns)); - return cudf::jni::convert_table_for_return(env, result_table); + return convert_table_for_return(env, result_table); } CATCH_STD(env, NULL); } @@ -3386,7 +3386,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rangeRollingWindowAggrega JNI_ARG_CHECK(env, agg != nullptr, "aggregation is not an instance of rolling_aggregation", nullptr); - result_columns.emplace_back(std::move(cudf::grouped_range_rolling_window( + 
result_columns.emplace_back(cudf::grouped_range_rolling_window( groupby_keys, order_by_column, orderbys_ascending[i] ? cudf::order::ASCENDING : cudf::order::DESCENDING, input_table->column(agg_column_index), @@ -3394,11 +3394,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rangeRollingWindowAggrega cudf::range_window_bounds::get(*preceding[i]), unbounded_following[i] ? cudf::range_window_bounds::unbounded(unbounded_type) : cudf::range_window_bounds::get(*following[i]), - min_periods[i], *agg))); + min_periods[i], *agg)); } auto result_table = std::make_unique(std::move(result_columns)); - return cudf::jni::convert_table_for_return(env, result_table); + return convert_table_for_return(env, result_table); } CATCH_STD(env, NULL); } @@ -3409,10 +3409,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explode(JNIEnv *env, jcla JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input_table = reinterpret_cast(input_jtable); - cudf::size_type col_index = static_cast(column_index); - std::unique_ptr exploded = cudf::explode(*input_table, col_index); - return cudf::jni::convert_table_for_return(env, exploded); + auto const input_table = reinterpret_cast(input_jtable); + auto const col_index = static_cast(column_index); + return convert_table_for_return(env, cudf::explode(*input_table, col_index)); } CATCH_STD(env, 0); } @@ -3423,10 +3422,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodePosition(JNIEnv *e JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input_table = reinterpret_cast(input_jtable); - cudf::size_type col_index = static_cast(column_index); - std::unique_ptr exploded = cudf::explode_position(*input_table, col_index); - return cudf::jni::convert_table_for_return(env, exploded); + auto const input_table = reinterpret_cast(input_jtable); + auto const col_index = 
static_cast(column_index); + return convert_table_for_return(env, cudf::explode_position(*input_table, col_index)); } CATCH_STD(env, 0); } @@ -3437,10 +3435,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuter(JNIEnv *env, JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input_table = reinterpret_cast(input_jtable); - cudf::size_type col_index = static_cast(column_index); - std::unique_ptr exploded = cudf::explode_outer(*input_table, col_index); - return cudf::jni::convert_table_for_return(env, exploded); + auto const input_table = reinterpret_cast(input_jtable); + auto const col_index = static_cast(column_index); + return convert_table_for_return(env, cudf::explode_outer(*input_table, col_index)); } CATCH_STD(env, 0); } @@ -3451,10 +3448,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuterPosition(JNIE JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input_table = reinterpret_cast(input_jtable); - cudf::size_type col_index = static_cast(column_index); - std::unique_ptr exploded = cudf::explode_outer_position(*input_table, col_index); - return cudf::jni::convert_table_for_return(env, exploded); + auto const input_table = reinterpret_cast(input_jtable); + auto const col_index = static_cast(column_index); + return convert_table_for_return(env, cudf::explode_outer_position(*input_table, col_index)); } CATCH_STD(env, 0); } @@ -3463,9 +3459,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_rowBitCount(JNIEnv *env, jclas JNI_NULL_CHECK(env, j_table, "table is null", 0); try { cudf::jni::auto_set_device(env); - auto t = reinterpret_cast(j_table); - std::unique_ptr result = cudf::row_bit_count(*t); - return reinterpret_cast(result.release()); + auto const input_table = reinterpret_cast(j_table); + return release_as_jlong(cudf::row_bit_count(*input_table)); } 
CATCH_STD(env, 0); } @@ -3483,7 +3478,7 @@ JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups( try { cudf::jni::auto_set_device(env); cudf::jni::native_jintArray n_key_indices(env, jkey_indices); - cudf::table_view *input_table = reinterpret_cast(jinput_table); + auto const input_table = reinterpret_cast(jinput_table); // Prepares arguments for the groupby: // (keys, null_handling, keys_are_sorted, column_order, null_precedence) @@ -3577,11 +3572,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_sample(JNIEnv *env, jclas JNI_NULL_CHECK(env, j_input, "input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input = reinterpret_cast(j_input); + auto const input = reinterpret_cast(j_input); auto sample_with_replacement = replacement ? cudf::sample_with_replacement::TRUE : cudf::sample_with_replacement::FALSE; - std::unique_ptr result = cudf::sample(*input, n, sample_with_replacement, seed); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::sample(*input, n, sample_with_replacement, seed)); } CATCH_STD(env, 0); } diff --git a/java/src/main/native/src/cudf_jni_apis.hpp b/java/src/main/native/src/cudf_jni_apis.hpp index fbcca0c82ee..12fd45b831a 100644 --- a/java/src/main/native/src/cudf_jni_apis.hpp +++ b/java/src/main/native/src/cudf_jni_apis.hpp @@ -23,7 +23,28 @@ namespace cudf { namespace jni { -jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result); +/** + * @brief Detach all columns from the specified table, and pointers to them as an array. + * + * This function takes a table (presumably returned by some operation), and turns it into an + * array of column* (as jlongs). + * The lifetime of the columns is decoupled from that of the table, and is managed by the caller. 
+ * + * @param env The JNI environment + * @param table_result the table to convert for return + * @param extra_columns columns not in the table that will be appended to the result. + */ +jlongArray +convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, + std::vector> &&extra_columns = {}); + +/** + * @copydoc convert_table_for_return(JNIEnv*, std::unique_ptr&, + * std::vector>&&) + */ +jlongArray +convert_table_for_return(JNIEnv *env, std::unique_ptr &&table_result, + std::vector> &&extra_columns = {}); // // ContiguousTable APIs diff --git a/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java b/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java index 0ca997d3c80..862f3860d3d 100644 --- a/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java +++ b/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -1363,6 +1363,46 @@ public void testBitXor() { } } + @Test + public void testNullAnd() { + try (ColumnVector icv1 = ColumnVector.fromBoxedBooleans( + true, true, true, + false, false, false, + null, null, null); + ColumnVector icv2 = ColumnVector.fromBoxedBooleans( + true, false, null, + true, false, null, + true, false, null)) { + try (ColumnVector answer = icv1.binaryOp(BinaryOp.NULL_LOGICAL_AND, icv2, DType.BOOL8); + ColumnVector expected = ColumnVector.fromBoxedBooleans( + true, false, null, + false, false, false, + null, false, null)) { + assertColumnsAreEqual(expected, answer, "boolean NULL AND boolean"); + } + } + } + + @Test + public void testNullOr() { + try (ColumnVector icv1 = ColumnVector.fromBoxedBooleans( + true, true, true, + false, false, false, + null, null, null); + ColumnVector icv2 = ColumnVector.fromBoxedBooleans( + true, false, null, + true, false, null, + true, false, null)) { + try (ColumnVector answer = icv1.binaryOp(BinaryOp.NULL_LOGICAL_OR, icv2, DType.BOOL8); + ColumnVector expected = ColumnVector.fromBoxedBooleans( + true, true, true, + true, false, null, + true, null, null)) { + assertColumnsAreEqual(expected, answer, "boolean NULL OR boolean"); + } + } + } + @Test public void testAnd() { try (ColumnVector icv1 = ColumnVector.fromBoxedBooleans(BOOLEANS_1); diff --git a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java index a26dbec4907..7b476c31b95 100644 --- a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -39,21 +40,34 @@ public void testCreateColumnVectorBuilder() { @Test public void testArrayAllocation() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { - assertFalse(byteColumnVector.hasNulls()); - assertEquals(byteColumnVector.getByte(0), 2); - assertEquals(byteColumnVector.getByte(1), 3); - assertEquals(byteColumnVector.getByte(2), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(cv.getByte(0), 2); + assertEquals(cv.getByte(1), 3); + assertEquals(cv.getByte(2), 5); + }; + try (HostColumnVector bcv = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(true, new byte[]{2, 3, 5})) { + verify.accept(bcv); } } @Test public void testUnsignedArrayAllocation() { - try (HostColumnVector v = HostColumnVector.fromUnsignedBytes(new byte[]{(byte)0xff, (byte)128, 5})) { - assertFalse(v.hasNulls()); - assertEquals(0xff, Byte.toUnsignedInt(v.getByte(0)), 0xff); - assertEquals(128, Byte.toUnsignedInt(v.getByte(1)), 128); - assertEquals(5, Byte.toUnsignedInt(v.getByte(2)), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(0xff, Byte.toUnsignedInt(cv.getByte(0)), 0xff); + assertEquals(128, Byte.toUnsignedInt(cv.getByte(1)), 128); + assertEquals(5, Byte.toUnsignedInt(cv.getByte(2)), 5); + }; + try (HostColumnVector bcv = HostColumnVector.fromUnsignedBytes(new byte[]{(byte)0xff, (byte)128, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(false, + new byte[]{(byte)0xff, (byte)128, 5})) { + verify.accept(bcv); } } @@ -70,47 +84,73 @@ public void testAppendRepeatingValues() { @Test public void 
testUpperIndexOutOfBoundsException() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { - assertThrows(AssertionError.class, () -> byteColumnVector.getByte(3)); - assertFalse(byteColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getByte(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector bcv = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(true, new byte[]{2, 3, 5})) { + verify.accept(bcv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { - assertFalse(byteColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> byteColumnVector.getByte(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getByte(-1)); + }; + try (HostColumnVector bcv = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(true, new byte[]{2, 3, 5})) { + verify.accept(bcv); } } @Test public void testAddingNullValues() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBoxedBytes( - new Byte[]{2, 3, 4, 5, 6, 7, null, null})) { - assertTrue(byteColumnVector.hasNulls()); - assertEquals(2, byteColumnVector.getNullCount()); + Consumer verify = (cv) -> { + assertTrue(cv.hasNulls()); + assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { - assertFalse(byteColumnVector.isNull(i)); + assertFalse(cv.isNull(i)); } - assertTrue(byteColumnVector.isNull(6)); - assertTrue(byteColumnVector.isNull(7)); + assertTrue(cv.isNull(6)); + assertTrue(cv.isNull(7)); + }; + try (HostColumnVector bcv = HostColumnVector.fromBoxedBytes( + new Byte[]{2, 3, 4, 5, 6, 7, null, null})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = 
ColumnBuilderHelper.fromBoxedBytes(true, + new Byte[]{2, 3, 4, 5, 6, 7, null, null})) { + verify.accept(bcv); } } @Test public void testAddingUnsignedNullValues() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBoxedUnsignedBytes( - new Byte[]{2, 3, 4, 5, (byte)128, (byte)254, null, null})) { - assertTrue(byteColumnVector.hasNulls()); - assertEquals(2, byteColumnVector.getNullCount()); + Consumer verify = (cv) -> { + assertTrue(cv.hasNulls()); + assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { - assertFalse(byteColumnVector.isNull(i)); + assertFalse(cv.isNull(i)); } - assertEquals(128, Byte.toUnsignedInt(byteColumnVector.getByte(4))); - assertEquals(254, Byte.toUnsignedInt(byteColumnVector.getByte(5))); - assertTrue(byteColumnVector.isNull(6)); - assertTrue(byteColumnVector.isNull(7)); + assertEquals(128, Byte.toUnsignedInt(cv.getByte(4))); + assertEquals(254, Byte.toUnsignedInt(cv.getByte(5))); + assertTrue(cv.isNull(6)); + assertTrue(cv.isNull(7)); + }; + try (HostColumnVector bcv = HostColumnVector.fromBoxedUnsignedBytes( + new Byte[]{2, 3, 4, 5, (byte)128, (byte)254, null, null})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBoxedBytes(false, + new Byte[]{2, 3, 4, 5, (byte)128, (byte)254, null, null})) { + verify.accept(bcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnBuilderHelper.java b/java/src/test/java/ai/rapids/cudf/ColumnBuilderHelper.java new file mode 100644 index 00000000000..263244b2413 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/ColumnBuilderHelper.java @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Objects; +import java.util.function.Consumer; + +/** + * ColumnBuilderHelper helps to test ColumnBuilder with existed ColumnVector tests. + */ +public class ColumnBuilderHelper { + + public static HostColumnVector build( + HostColumnVector.DataType type, + int rows, + Consumer init) { + try (HostColumnVector.ColumnBuilder b = new HostColumnVector.ColumnBuilder(type, rows)) { + init.accept(b); + return b.build(); + } + } + + public static ColumnVector buildOnDevice( + HostColumnVector.DataType type, + int rows, + Consumer init) { + try (HostColumnVector.ColumnBuilder b = new HostColumnVector.ColumnBuilder(type, rows)) { + init.accept(b); + return b.buildAndPutOnDevice(); + } + } + + public static HostColumnVector fromBoxedBytes(boolean signed, Byte... values) { + DType dt = signed ? DType.INT8 : DType.UINT8; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, dt), + values.length, + (b) -> { + for (Byte v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBoxedDoubles(Double... values) { + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, DType.FLOAT64), + values.length, + (b) -> { + for (Double v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBoxedInts(boolean signed, Integer... 
values) { + DType dt = signed ? DType.INT32 : DType.UINT32; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, dt), + values.length, + (b) -> { + for (Integer v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBoxedLongs(boolean signed, Long... values) { + DType dt = signed ? DType.INT64 : DType.UINT64; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, dt), + values.length, + (b) -> { + for (Long v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBytes(boolean signed, byte... values) { + DType dt = signed ? DType.INT8 : DType.UINT8; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, dt), + values.length, + (b) -> { + for (byte v : values) b.append(v); + }); + } + + public static HostColumnVector fromDecimals(BigDecimal... values) { + // Simply copy from HostColumnVector.fromDecimals + BigDecimal maxDec = Arrays.stream(values).filter(Objects::nonNull) + .max(Comparator.comparingInt(BigDecimal::precision)) + .orElse(BigDecimal.ZERO); + int maxScale = Arrays.stream(values).filter(Objects::nonNull) + .map(decimal -> decimal.scale()) + .max(Comparator.naturalOrder()) + .orElse(0); + maxDec = maxDec.setScale(maxScale, RoundingMode.UNNECESSARY); + + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, DType.fromJavaBigDecimal(maxDec)), + values.length, + (b) -> { + for (BigDecimal v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromDoubles(double... values) { + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, DType.FLOAT64), + values.length, + (b) -> { + for (double v : values) b.append(v); + }); + } + + public static HostColumnVector fromInts(boolean signed, int... values) { + DType dt = signed ? 
DType.INT32 : DType.UINT32; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, dt), + values.length, + (b) -> { + for (int v : values) b.append(v); + }); + } + + public static HostColumnVector fromLongs(boolean signed, long... values) { + DType dt = signed ? DType.INT64 : DType.UINT64; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, dt), + values.length, + (b) -> { + for (long v : values) b.append(v); + }); + } +} diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 8d4bbff1542..8f39c3c51ce 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3559,7 +3559,6 @@ void testCastDecimal64ToString() { for (int scale : new int[]{-5, -2, -1, 0, 1, 2, 5}) { for (int i = 0; i < strDecimalValues.length; i++) { strDecimalValues[i] = dumpDecimal(unScaledValues[i], scale); - System.out.println(strDecimalValues[i]); } testCastFixedWidthToStringsAndBack(DType.create(DType.DTypeEnum.DECIMAL64, scale), @@ -4703,13 +4702,21 @@ void testStringSplitRecord() { @Test void testStringSplit() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", null, "", "ARé some", "test strings"); - Table expected = new Table.TestBuilder().column("Héllo", "thésé", null, "", "ARé", "test") + try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", "ARé some things", "test strings here"); + Table expectedSplitOnce = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") + .column("there all", null, null, null, "some things", "strings here") + .build(); + Table expectedSplitAll = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") .column("there", null, null, null, "some", "strings") + .column("all", null, null, null, "things", "here") .build(); Scalar pattern = Scalar.fromString(" "); - Table result = 
v.stringSplit(pattern)) { - assertTablesAreEqual(expected, result); + Table resultSplitOnce = v.stringSplit(pattern, 1); + Table resultSplitAll = v.stringSplit(pattern)) { + assertTablesAreEqual(expectedSplitOnce, resultSplitOnce); + assertTablesAreEqual(expectedSplitAll, resultSplitAll); } } diff --git a/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java index c2772520f57..994066c5df0 100644 --- a/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,10 +22,12 @@ import org.junit.jupiter.api.Test; import java.math.BigDecimal; +import java.math.BigInteger; import java.math.RoundingMode; import java.util.Arrays; import java.util.Objects; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.*; @@ -33,9 +35,11 @@ public class DecimalColumnVectorTest extends CudfTestBase { private static final Random rdSeed = new Random(1234); private static final int dec32Scale = 4; private static final int dec64Scale = 10; + private static final int dec128Scale = 30; private static final BigDecimal[] decimal32Zoo = new BigDecimal[20]; private static final BigDecimal[] decimal64Zoo = new BigDecimal[20]; + private static final BigDecimal[] decimal128Zoo = new BigDecimal[20]; private static final int[] unscaledDec32Zoo = new int[decimal32Zoo.length]; private static final long[] unscaledDec64Zoo = new long[decimal64Zoo.length]; @@ -45,6 +49,9 @@ public class DecimalColumnVectorTest extends CudfTestBase { private final BigDecimal[] boundaryDecimal64 = new BigDecimal[]{ new BigDecimal("999999999999999999"), new 
BigDecimal("-999999999999999999")}; + private final BigDecimal[] boundaryDecimal128 = new BigDecimal[]{ + new BigDecimal("99999999999999999999999999999999999999"), new BigDecimal("-99999999999999999999999999999999999999")}; + private final BigDecimal[] overflowDecimal32 = new BigDecimal[]{ BigDecimal.valueOf(Integer.MAX_VALUE), BigDecimal.valueOf(Integer.MIN_VALUE)}; @@ -72,6 +79,12 @@ public static void setup() { } else { decimal64Zoo[i] = null; } + if (rdSeed.nextBoolean()) { + BigInteger unscaledVal = BigInteger.valueOf(rdSeed.nextLong()).multiply(BigInteger.valueOf(rdSeed.nextLong())); + decimal128Zoo[i] = new BigDecimal(unscaledVal, dec128Scale); + } else { + decimal128Zoo[i] = null; + } } } @@ -190,27 +203,44 @@ public void testDecimalGeneral() { @Test public void testDecimalFromDecimals() { - DecimalColumnVectorTest.testDecimalImpl(false, dec32Scale, decimal32Zoo); - DecimalColumnVectorTest.testDecimalImpl(true, dec64Scale, decimal64Zoo); - DecimalColumnVectorTest.testDecimalImpl(false, 0, boundaryDecimal32); - DecimalColumnVectorTest.testDecimalImpl(true, 0, boundaryDecimal64); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL32, dec32Scale, decimal32Zoo); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL64, dec64Scale, decimal64Zoo); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL128, dec128Scale, decimal128Zoo); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL32, 0, boundaryDecimal32); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL64, 0, boundaryDecimal64); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL128, 0, boundaryDecimal128); } - private static void testDecimalImpl(boolean isInt64, int scale, BigDecimal[] decimalZoo) { - try (ColumnVector cv = ColumnVector.fromDecimals(decimalZoo)) { - try (HostColumnVector hcv = cv.copyToHost()) { - assertEquals(-scale, hcv.getType().getScale()); - assertEquals(isInt64, hcv.getType().typeId == 
DType.DTypeEnum.DECIMAL64); - assertEquals(decimalZoo.length, hcv.rows); - for (int i = 0; i < decimalZoo.length; i++) { - assertEquals(decimalZoo[i] == null, hcv.isNull(i)); - if (decimalZoo[i] != null) { - assertEquals(decimalZoo[i].floatValue(), hcv.getBigDecimal(i).floatValue()); - long backValue = isInt64 ? hcv.getLong(i) : hcv.getInt(i); - assertEquals(decimalZoo[i].setScale(scale, RoundingMode.UNNECESSARY), BigDecimal.valueOf(backValue, scale)); + private static void testDecimalImpl(DType.DTypeEnum decimalType, int scale, BigDecimal[] decimalZoo) { + Consumer assertions = (hcv) -> { + assertEquals(-scale, hcv.getType().getScale()); + assertEquals(hcv.getType().typeId, decimalType); + assertEquals(decimalZoo.length, hcv.rows); + for (int i = 0; i < decimalZoo.length; i++) { + assertEquals(decimalZoo[i] == null, hcv.isNull(i)); + if (decimalZoo[i] != null) { + BigDecimal actual; + switch (decimalType) { + case DECIMAL32: + actual = BigDecimal.valueOf(hcv.getInt(i), scale); + break; + case DECIMAL64: + actual = BigDecimal.valueOf(hcv.getLong(i), scale); + break; + default: + actual = hcv.getBigDecimal(i); } + assertEquals(decimalZoo[i].subtract(actual).longValueExact(), 0L); } } + }; + try (ColumnVector cv = ColumnVector.fromDecimals(decimalZoo)) { + try (HostColumnVector hcv = cv.copyToHost()) { + assertions.accept(hcv); + } + } + try (HostColumnVector hcv = ColumnBuilderHelper.fromDecimals(decimalZoo)) { + assertions.accept(hcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java index d82565e1d2d..fa34429685e 100644 --- a/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -40,34 +41,51 @@ public void testCreateColumnVectorBuilder() { @Test public void testArrayAllocation() { - try (HostColumnVector doubleColumnVector = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { - assertFalse(doubleColumnVector.hasNulls()); - assertEqualsWithinPercentage(doubleColumnVector.getDouble(0), 2.1, 0.01); - assertEqualsWithinPercentage(doubleColumnVector.getDouble(1), 3.02, 0.01); - assertEqualsWithinPercentage(doubleColumnVector.getDouble(2), 5.003, 0.001); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEqualsWithinPercentage(cv.getDouble(0), 2.1, 0.01); + assertEqualsWithinPercentage(cv.getDouble(1), 3.02, 0.01); + assertEqualsWithinPercentage(cv.getDouble(2), 5.003, 0.001); + }; + try (HostColumnVector dcv = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); } } @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector doubleColumnVector = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { - assertThrows(AssertionError.class, () -> doubleColumnVector.getDouble(3)); - assertFalse(doubleColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getDouble(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector dcv = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); } } @Test public void testLowerIndexOutOfBoundsException() { - try 
(HostColumnVector doubleColumnVector = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { - assertFalse(doubleColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> doubleColumnVector.getDouble(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getDouble(-1)); + }; + try (HostColumnVector dcv = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); } } @Test public void testAddingNullValues() { - try (HostColumnVector cv = - HostColumnVector.fromBoxedDoubles(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -75,6 +93,14 @@ public void testAddingNullValues() { } assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector dcv = + HostColumnVector.fromBoxedDoubles(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, null, null)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromBoxedDoubles( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, null, null)) { + verify.accept(dcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java index 2fb8164534b..7d6311fb24c 100644 --- a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -34,47 +35,75 @@ public void testCreateColumnVectorBuilder() { try (ColumnVector intColumnVector = ColumnVector.build(DType.INT32, 3, (b) -> b.append(1))) { assertFalse(intColumnVector.hasNulls()); } + try (ColumnVector intColumnVector = ColumnBuilderHelper.buildOnDevice( + new HostColumnVector.BasicType(true, DType.INT32), 3, (b) -> b.append(1))) { + assertFalse(intColumnVector.hasNulls()); + } } @Test public void testArrayAllocation() { - try (HostColumnVector intColumnVector = HostColumnVector.fromInts(2, 3, 5)) { - assertFalse(intColumnVector.hasNulls()); - assertEquals(intColumnVector.getInt(0), 2); - assertEquals(intColumnVector.getInt(1), 3); - assertEquals(intColumnVector.getInt(2), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(cv.getInt(0), 2); + assertEquals(cv.getInt(1), 3); + assertEquals(cv.getInt(2), 5); + }; + try (HostColumnVector cv = HostColumnVector.fromInts(2, 3, 5)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromInts(true, 2, 3, 5)) { + verify.accept(cv); } } @Test public void testUnsignedArrayAllocation() { - try (HostColumnVector v = HostColumnVector.fromUnsignedInts(0xfedcba98, 0x80000000, 5)) { - assertFalse(v.hasNulls()); - assertEquals(0xfedcba98L, Integer.toUnsignedLong(v.getInt(0))); - assertEquals(0x80000000L, Integer.toUnsignedLong(v.getInt(1))); - assertEquals(5, Integer.toUnsignedLong(v.getInt(2))); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(0xfedcba98L, Integer.toUnsignedLong(cv.getInt(0))); + assertEquals(0x80000000L, Integer.toUnsignedLong(cv.getInt(1))); + assertEquals(5, Integer.toUnsignedLong(cv.getInt(2))); + }; + try (HostColumnVector cv = HostColumnVector.fromUnsignedInts(0xfedcba98, 
0x80000000, 5)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromInts(false, 0xfedcba98, 0x80000000, 5)) { + verify.accept(cv); } } @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector intColumnVector = HostColumnVector.fromInts(2, 3, 5)) { - assertThrows(AssertionError.class, () -> intColumnVector.getInt(3)); - assertFalse(intColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getInt(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector icv = HostColumnVector.fromInts(2, 3, 5)) { + verify.accept(icv); + } + try (HostColumnVector icv = ColumnBuilderHelper.fromInts(true, 2, 3, 5)) { + verify.accept(icv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector intColumnVector = HostColumnVector.fromInts(2, 3, 5)) { - assertFalse(intColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> intColumnVector.getInt(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getInt(-1)); + }; + try (HostColumnVector icv = HostColumnVector.fromInts(2, 3, 5)) { + verify.accept(icv); + } + try (HostColumnVector icv = ColumnBuilderHelper.fromInts(true, 2, 3, 5)) { + verify.accept(icv); } } @Test public void testAddingNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedInts(2, 3, 4, 5, 6, 7, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -82,13 +111,18 @@ public void testAddingNullValues() { } assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector cv = HostColumnVector.fromBoxedInts(2, 3, 4, 5, 6, 7, null, null)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromBoxedInts(true, 2, 3, 4, 5, 6, 7, null, null)) { + verify.accept(cv); } } @Test public void testAddingUnsignedNullValues() { - try 
(HostColumnVector cv = HostColumnVector.fromBoxedUnsignedInts( - 2, 3, 4, 5, 0xfedbca98, 0x80000000, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -98,6 +132,14 @@ public void testAddingUnsignedNullValues() { assertEquals(0x80000000L, Integer.toUnsignedLong(cv.getInt(5))); assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector cv = HostColumnVector.fromBoxedUnsignedInts( + 2, 3, 4, 5, 0xfedbca98, 0x80000000, null, null)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromBoxedInts(false, + 2, 3, 4, 5, 0xfedbca98, 0x80000000, null, null)) { + verify.accept(cv); } } diff --git a/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java index 43c2b5a99c2..193992f5304 100644 --- a/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -38,46 +39,71 @@ public void testCreateColumnVectorBuilder() { @Test public void testArrayAllocation() { - try (HostColumnVector longColumnVector = HostColumnVector.fromLongs(2L, 3L, 5L)) { - assertFalse(longColumnVector.hasNulls()); - assertEquals(longColumnVector.getLong(0), 2); - assertEquals(longColumnVector.getLong(1), 3); - assertEquals(longColumnVector.getLong(2), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(cv.getLong(0), 2); + assertEquals(cv.getLong(1), 3); + assertEquals(cv.getLong(2), 5); + }; + try (HostColumnVector lcv = HostColumnVector.fromLongs(2L, 3L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(true,2L, 3L, 5L)) { + verify.accept(lcv); } } @Test public void testUnsignedArrayAllocation() { - try (HostColumnVector longColumnVector = HostColumnVector.fromUnsignedLongs( - 0xfedcba9876543210L, 0x8000000000000000L, 5L)) { - assertFalse(longColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); assertEquals(Long.toUnsignedString(0xfedcba9876543210L), - Long.toUnsignedString(longColumnVector.getLong(0))); + Long.toUnsignedString(cv.getLong(0))); assertEquals(Long.toUnsignedString(0x8000000000000000L), - Long.toUnsignedString(longColumnVector.getLong(1))); - assertEquals(5L, longColumnVector.getLong(2)); + Long.toUnsignedString(cv.getLong(1))); + assertEquals(5L, cv.getLong(2)); + }; + try (HostColumnVector lcv = HostColumnVector.fromUnsignedLongs( + 0xfedcba9876543210L, 0x8000000000000000L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(false, + 0xfedcba9876543210L, 0x8000000000000000L, 5L)) { + verify.accept(lcv); } } @Test public void 
testUpperIndexOutOfBoundsException() { - try (HostColumnVector longColumnVector = HostColumnVector.fromLongs(2L, 3L, 5L)) { - assertThrows(AssertionError.class, () -> longColumnVector.getLong(3)); - assertFalse(longColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getLong(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector lcv = HostColumnVector.fromLongs(2L, 3L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(true, 2L, 3L, 5L)) { + verify.accept(lcv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector longColumnVector = HostColumnVector.fromLongs(2L, 3L, 5L)) { - assertFalse(longColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> longColumnVector.getLong(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getLong(-1)); + }; + try (HostColumnVector lcv = HostColumnVector.fromLongs(2L, 3L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(true, 2L, 3L, 5L)) { + verify.accept(lcv); } } @Test public void testAddingNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedLongs(2L, 3L, 4L, 5L, 6L, 7L, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -85,13 +111,19 @@ public void testAddingNullValues() { } assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector lcv = HostColumnVector.fromBoxedLongs(2L, 3L, 4L, 5L, 6L, 7L, null, null)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromBoxedLongs(true, + 2L, 3L, 4L, 5L, 6L, 7L, null, null)) { + verify.accept(lcv); } } @Test public void testAddingUnsignedNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedUnsignedLongs( - 2L, 3L, 4L, 5L, 0xfedcba9876543210L, 0x8000000000000000L, null, 
null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -103,6 +135,14 @@ public void testAddingUnsignedNullValues() { Long.toUnsignedString(cv.getLong(5))); assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector lcv = HostColumnVector.fromBoxedUnsignedLongs( + 2L, 3L, 4L, 5L, 0xfedcba9876543210L, 0x8000000000000000L, null, null)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromBoxedLongs(false, + 2L, 3L, 4L, 5L, 0xfedcba9876543210L, 0x8000000000000000L, null, null)) { + verify.accept(lcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 47c468de8c8..db1327c5471 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -2380,6 +2380,222 @@ void testMixedFullJoinGatherMapsNulls() { } } + @Test + void testMixedLeftSemiJoinGatherMap() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, + new ColumnReference(1, TableReference.LEFT), + new ColumnReference(1, TableReference.RIGHT)); + try (CompiledExpression condition = expr.compile(); + Table left = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8) + .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 0) + .build(); + Table leftKeys = new Table(left.getColumn(0)); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5) + .column(7, 8, 9, 0, 1, 2).build(); + Table rightKeys = new Table(right.getColumn(0)); + Table expected = new Table.TestBuilder() + .column(2, 7, 8) + .build(); + GatherMap map = Table.mixedLeftSemiJoinGatherMap(leftKeys, rightKeys, left, right, + condition, NullEquality.UNEQUAL)) { + verifySemiJoinGatherMap(map, expected); + } + } + + @Test + void testMixedLeftSemiJoinGatherMapNulls() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, + new 
ColumnReference(1, TableReference.LEFT), + new ColumnReference(1, TableReference.RIGHT)); + try (CompiledExpression condition = expr.compile(); + Table left = new Table.TestBuilder() + .column(null, 3, 9, 0, 1, 7, 4, null, 5, 8) + .column( 1, 2, 3, 4, 5, 6, 7, 8, 9, 0) + .build(); + Table leftKeys = new Table(left.getColumn(0)); + Table right = new Table.TestBuilder() + .column(null, 5, null, 8, 10, 32) + .column( 0, 1, 2, 3, 4, 5) + .column( 7, 8, 9, 0, 1, 2).build(); + Table rightKeys = new Table(right.getColumn(0)); + Table expected = new Table.TestBuilder() + .column(0, 7, 8) + .build(); + GatherMap map = Table.mixedLeftSemiJoinGatherMap(leftKeys, rightKeys, left, right, + condition, NullEquality.EQUAL)) { + verifySemiJoinGatherMap(map, expected); + } + } + + @Test + void testMixedLeftSemiJoinGatherMapWithSize() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, + new ColumnReference(1, TableReference.LEFT), + new ColumnReference(1, TableReference.RIGHT)); + try (CompiledExpression condition = expr.compile(); + Table left = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8) + .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 0) + .build(); + Table leftKeys = new Table(left.getColumn(0)); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5) + .column(7, 8, 9, 0, 1, 2).build(); + Table rightKeys = new Table(right.getColumn(0)); + Table expected = new Table.TestBuilder() + .column(2, 7, 8) + .build(); + MixedJoinSize sizeInfo = Table.mixedLeftSemiJoinSize(leftKeys, rightKeys, left, right, + condition, NullEquality.UNEQUAL)) { + assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); + try (GatherMap map = Table.mixedLeftSemiJoinGatherMap(leftKeys, rightKeys, left, right, + condition, NullEquality.UNEQUAL, sizeInfo)) { + verifySemiJoinGatherMap(map, expected); + } + } + } + + @Test + void testMixedLeftSemiJoinGatherMapNullsWithSize() { + BinaryOperation expr = new 
BinaryOperation(BinaryOperator.GREATER, + new ColumnReference(1, TableReference.LEFT), + new ColumnReference(1, TableReference.RIGHT)); + try (CompiledExpression condition = expr.compile(); + Table left = new Table.TestBuilder() + .column(null, 3, 9, 0, 1, 7, 4, null, 5, 8) + .column( 1, 2, 3, 4, 5, 6, 7, 8, 9, 0) + .build(); + Table leftKeys = new Table(left.getColumn(0)); + Table right = new Table.TestBuilder() + .column(null, 5, null, 8, 10, 32) + .column( 0, 1, 2, 3, 4, 5) + .column( 7, 8, 9, 0, 1, 2).build(); + Table rightKeys = new Table(right.getColumn(0)); + Table expected = new Table.TestBuilder() + .column(0, 7, 8) + .build(); + MixedJoinSize sizeInfo = Table.mixedLeftSemiJoinSize(leftKeys, rightKeys, left, right, + condition, NullEquality.EQUAL)) { + assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); + try (GatherMap map = Table.mixedLeftSemiJoinGatherMap(leftKeys, rightKeys, left, right, + condition, NullEquality.EQUAL, sizeInfo)) { + verifySemiJoinGatherMap(map, expected); + } + } + } + + @Test + void testMixedLeftAntiJoinGatherMap() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, + new ColumnReference(1, TableReference.LEFT), + new ColumnReference(1, TableReference.RIGHT)); + try (CompiledExpression condition = expr.compile(); + Table left = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8) + .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 0) + .build(); + Table leftKeys = new Table(left.getColumn(0)); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5) + .column(7, 8, 9, 0, 1, 2).build(); + Table rightKeys = new Table(right.getColumn(0)); + Table expected = new Table.TestBuilder() + .column(0, 1, 3, 4, 5, 6, 9) + .build(); + GatherMap map = Table.mixedLeftAntiJoinGatherMap(leftKeys, rightKeys, left, right, + condition, NullEquality.UNEQUAL)) { + verifySemiJoinGatherMap(map, expected); + } + } + + @Test + void testMixedLeftAntiJoinGatherMapNulls() { + 
BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, + new ColumnReference(1, TableReference.LEFT), + new ColumnReference(1, TableReference.RIGHT)); + try (CompiledExpression condition = expr.compile(); + Table left = new Table.TestBuilder() + .column(null, 3, 9, 0, 1, 7, 4, null, 5, 8) + .column( 1, 2, 3, 4, 5, 6, 7, 8, 9, 0) + .build(); + Table leftKeys = new Table(left.getColumn(0)); + Table right = new Table.TestBuilder() + .column(null, 5, null, 8, 10, 32) + .column( 0, 1, 2, 3, 4, 5) + .column( 7, 8, 9, 0, 1, 2).build(); + Table rightKeys = new Table(right.getColumn(0)); + Table expected = new Table.TestBuilder() + .column(1, 2, 3, 4, 5, 6, 9) + .build(); + GatherMap map = Table.mixedLeftAntiJoinGatherMap(leftKeys, rightKeys, left, right, + condition, NullEquality.EQUAL)) { + verifySemiJoinGatherMap(map, expected); + } + } + + @Test + void testMixedLeftAntiJoinGatherMapWithSize() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, + new ColumnReference(1, TableReference.LEFT), + new ColumnReference(1, TableReference.RIGHT)); + try (CompiledExpression condition = expr.compile(); + Table left = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8) + .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 0) + .build(); + Table leftKeys = new Table(left.getColumn(0)); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5) + .column(7, 8, 9, 0, 1, 2).build(); + Table rightKeys = new Table(right.getColumn(0)); + Table expected = new Table.TestBuilder() + .column(0, 1, 3, 4, 5, 6, 9) + .build(); + MixedJoinSize sizeInfo = Table.mixedLeftAntiJoinSize(leftKeys, rightKeys, left, right, + condition, NullEquality.UNEQUAL)) { + assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); + try (GatherMap map = Table.mixedLeftAntiJoinGatherMap(leftKeys, rightKeys, left, right, + condition, NullEquality.UNEQUAL, sizeInfo)) { + verifySemiJoinGatherMap(map, expected); + } + } + } + + @Test + void 
testMixedLeftAntiJoinGatherMapNullsWithSize() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, + new ColumnReference(1, TableReference.LEFT), + new ColumnReference(1, TableReference.RIGHT)); + try (CompiledExpression condition = expr.compile(); + Table left = new Table.TestBuilder() + .column(null, 3, 9, 0, 1, 7, 4, null, 5, 8) + .column( 1, 2, 3, 4, 5, 6, 7, 8, 9, 0) + .build(); + Table leftKeys = new Table(left.getColumn(0)); + Table right = new Table.TestBuilder() + .column(null, 5, null, 8, 10, 32) + .column( 0, 1, 2, 3, 4, 5) + .column( 7, 8, 9, 0, 1, 2).build(); + Table rightKeys = new Table(right.getColumn(0)); + Table expected = new Table.TestBuilder() + .column(1, 2, 3, 4, 5, 6, 9) + .build(); + MixedJoinSize sizeInfo = Table.mixedLeftAntiJoinSize(leftKeys, rightKeys, left, right, + condition, NullEquality.EQUAL)) { + assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); + try (GatherMap map = Table.mixedLeftAntiJoinGatherMap(leftKeys, rightKeys, left, right, + condition, NullEquality.EQUAL, sizeInfo)) { + verifySemiJoinGatherMap(map, expected); + } + } + } + @Test void testLeftSemiJoinGatherMap() { try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); diff --git a/python/cudf/cudf/_fuzz_testing/io.py b/python/cudf/cudf/_fuzz_testing/io.py index 1312300f714..193fb4c7f7f 100644 --- a/python/cudf/cudf/_fuzz_testing/io.py +++ b/python/cudf/cudf/_fuzz_testing/io.py @@ -25,6 +25,9 @@ def __init__( max_string_length=None, max_lists_length=None, max_lists_nesting_depth=None, + max_structs_nesting_depth=None, + max_struct_null_frequency=None, + max_struct_types_at_each_level=None, ): dirs = [] if dirs is None else dirs self._inputs = [] @@ -33,6 +36,9 @@ def __init__( self._max_string_length = max_string_length self._max_lists_length = max_lists_length self._max_lists_nesting_depth = max_lists_nesting_depth + self._max_structs_nesting_depth = max_structs_nesting_depth + 
self._max_struct_null_frequency = max_struct_null_frequency + self._max_struct_types_at_each_level = max_struct_types_at_each_level for i, path in enumerate(dirs): if i == 0 and not os.path.exists(path): diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 2aa01eb3967..78e01fb76a4 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -83,7 +83,10 @@ def generate_input(self): self._df = df file_obj = io.BytesIO() pandas_to_orc( - df, file_io_obj=file_obj, stripe_size=self._rand(len(df)) + df, + file_io_obj=file_obj, + stripe_size=self._rand(len(df)), + arrow_table_schema=table.schema, ) file_obj.seek(0) buf = file_obj.read() diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py index 5b00f96d88d..859d09b407f 100644 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ b/python/cudf/cudf/_fuzz_testing/parquet.py @@ -59,6 +59,7 @@ def generate_input(self): - {"uint32"} | {"list", "decimal64"} ) + dtypes_meta, num_rows, num_cols = _generate_rand_meta( self, dtypes_list ) @@ -80,6 +81,7 @@ def generate_input(self): # https://issues.apache.org/jira/browse/ARROW-10123 # file = io.BytesIO() + df.to_parquet("temp_file") # file.seek(0) # self._current_buffer = copy.copy(file.read()) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py index b3fd7e8c5a7..977038d1fcb 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import io import sys @@ -74,7 +74,7 @@ def orc_reader_stripes_test(input_tuple, columns, stripes): data_handle=OrcWriter, params={ "compression": [None, "snappy"], - "enable_statistics": [True, False], + "enable_statistics": ["NONE", "STRIPE", "ROWGROUP"], }, ) def orc_writer_test(pdf, compression, enable_statistics): diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index ff5870c50be..87a8fc46374 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -6,6 +6,7 @@ import fastavro import numpy as np import pandas as pd +import pyarrow as pa import pyorc import cudf @@ -114,6 +115,26 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): meta["value_type"] = random.choice( list(cudf.utils.dtypes.ALL_TYPES - {"category"}) ) + elif dtype == "struct": + if obj._max_lists_nesting_depth is None: + meta["nesting_max_depth"] = np.random.randint(2, 10) + else: + meta["nesting_max_depth"] = obj._max_lists_nesting_depth + + if obj._max_struct_null_frequency is None: + meta["max_null_frequency"] = random.uniform(0, 1) + else: + meta["max_null_frequency"] = obj._max_struct_null_frequency + + if obj._max_struct_types_at_each_level is None: + meta["max_types_at_each_level"] = np.random.randint( + low=1, high=10 + ) + else: + meta[ + "max_types_at_each_level" + ] = obj._max_struct_types_at_each_level + elif dtype == "decimal64": meta["max_precision"] = cudf.Decimal64Dtype.MAX_PRECISION elif dtype == "decimal32": @@ -161,6 +182,8 @@ def pyarrow_to_pandas(table): df[column._name] = pd.Series( column, dtype=pyarrow_dtypes_to_pandas_dtypes[column.type] ) + elif isinstance(column.type, pa.StructType): + df[column._name] = column.to_pandas(integer_object_nulls=True) else: df[column._name] = column.to_pandas() @@ -196,6 +219,14 @@ def get_orc_dtype_info(dtype): ) +def get_arrow_dtype_info_for_pyorc(dtype): + if isinstance(dtype, pa.StructType): + return get_orc_schema(df=None, 
arrow_table_schema=dtype) + else: + pd_dtype = cudf.dtype(dtype.to_pandas_dtype()) + return get_orc_dtype_info(pd_dtype) + + def get_avro_schema(df): fields = [ {"name": col_name, "type": get_avro_dtype_info(col_dtype)} @@ -205,11 +236,17 @@ def get_avro_schema(df): return schema -def get_orc_schema(df): - ordered_dict = OrderedDict( - (col_name, get_orc_dtype_info(col_dtype)) - for col_name, col_dtype in df.dtypes.items() - ) +def get_orc_schema(df, arrow_table_schema=None): + if arrow_table_schema is None: + ordered_dict = OrderedDict( + (col_name, get_orc_dtype_info(col_dtype)) + for col_name, col_dtype in df.dtypes.items() + ) + else: + ordered_dict = OrderedDict( + (field.name, get_arrow_dtype_info_for_pyorc(field.type)) + for field in arrow_table_schema + ) schema = pyorc.Struct(**ordered_dict) return schema @@ -255,13 +292,25 @@ def pandas_to_avro(df, file_name=None, file_io_obj=None): fastavro.writer(file_io_obj, avro_schema, records) -def _preprocess_to_orc_tuple(df): +def _preprocess_to_orc_tuple(df, arrow_table_schema): def _null_to_None(value): if value is pd.NA or value is pd.NaT: return None else: return value + def sanitize(value, struct_type): + if value is None: + return None + + values_list = [] + for name, sub_type in struct_type.fields.items(): + if isinstance(sub_type, cudf.StructDtype): + values_list.append(sanitize(value[name], sub_type)) + else: + values_list.append(value[name]) + return tuple(values_list) + has_nulls_or_nullable_dtype = any( [ True @@ -271,19 +320,35 @@ def _null_to_None(value): for col in df.columns ] ) + pdf = df.copy(deep=True) + for field in arrow_table_schema: + if isinstance(field.type, pa.StructType): + pdf[field.name] = pdf[field.name].apply( + sanitize, args=(cudf.StructDtype.from_arrow(field.type),) + ) + else: + pdf[field.name] = pdf[field.name] tuple_list = [ tuple(map(_null_to_None, tup)) if has_nulls_or_nullable_dtype else tup - for tup in df.itertuples(index=False, name=None) + for tup in 
pdf.itertuples(index=False, name=None) ] - return tuple_list + return tuple_list, pdf, df -def pandas_to_orc(df, file_name=None, file_io_obj=None, stripe_size=67108864): - schema = get_orc_schema(df) +def pandas_to_orc( + df, + file_name=None, + file_io_obj=None, + stripe_size=67108864, + arrow_table_schema=None, +): + schema = get_orc_schema(df, arrow_table_schema=arrow_table_schema) - tuple_list = _preprocess_to_orc_tuple(df) + tuple_list, pdf, df = _preprocess_to_orc_tuple( + df, arrow_table_schema=arrow_table_schema + ) if file_name is not None: with open(file_name, "wb") as data: diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index e5a8bb926c1..0c2f971a26c 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -37,7 +37,6 @@ cdef extern from "cudf/io/orc.hpp" \ void enable_use_np_dtypes(bool val) except+ void set_timestamp_type(data_type type) except+ void set_decimal_cols_as_float(vector[string] val) except+ - void enable_decimal128(bool val) except+ @staticmethod orc_reader_options_builder builder( @@ -59,7 +58,6 @@ cdef extern from "cudf/io/orc.hpp" \ orc_reader_options_builder& decimal_cols_as_float( vector[string] val ) except+ - orc_reader_options_builder& decimal128(bool val) except+ orc_reader_options build() except+ diff --git a/python/cudf/cudf/_lib/cpp/strings/findall.pxd b/python/cudf/cudf/_lib/cpp/strings/findall.pxd index 189d0770b81..5533467d72a 100644 --- a/python/cudf/cudf/_lib/cpp/strings/findall.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/findall.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -10,6 +10,10 @@ from cudf._lib.cpp.table.table cimport table cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil: - cdef unique_ptr[table] findall_re( - column_view source_strings, - string pattern) except + + cdef unique_ptr[table] findall( + const column_view& source_strings, + const string& pattern) except + + + cdef unique_ptr[column] findall_record( + const column_view& source_strings, + const string& pattern) except + diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index cbba1796c26..ce4f183e795 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -9,6 +9,7 @@ from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector +cimport cudf._lib.cpp.io.types as cudf_io_types from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.io.orc cimport ( @@ -144,10 +145,27 @@ cdef compression_type _get_comp_type(object compression): raise ValueError(f"Unsupported `compression` type {compression}") +cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics): + """ + Convert ORC statistics terms to CUDF convention: + - ORC "STRIPE" == CUDF "ROWGROUP" + - ORC "ROWGROUP" == CUDF "PAGE" + """ + statistics = str(statistics).upper() + if statistics == "NONE": + return cudf_io_types.statistics_freq.STATISTICS_NONE + elif statistics == "STRIPE": + return cudf_io_types.statistics_freq.STATISTICS_ROWGROUP + elif statistics == "ROWGROUP": + return cudf_io_types.statistics_freq.STATISTICS_PAGE + else: + raise ValueError(f"Unsupported `statistics_freq` type {statistics}") + + cpdef write_orc(table, object path_or_buf, object compression=None, - bool enable_statistics=True, + object statistics="ROWGROUP", object stripe_size_bytes=None, object stripe_size_rows=None, object row_index_stride=None): @@ -189,7 +207,7 @@ cpdef 
write_orc(table, sink_info_c, table_view_from_table(table, ignore_index=True) ).metadata(tbl_meta.get()) .compression(compression_) - .enable_statistics( (True if enable_statistics else False)) + .enable_statistics(_get_orc_stat_freq(statistics)) .build() ) if stripe_size_bytes is not None: @@ -268,15 +286,15 @@ cdef class ORCWriter: cdef unique_ptr[orc_chunked_writer] writer cdef sink_info sink cdef unique_ptr[data_sink] _data_sink - cdef bool enable_stats + cdef cudf_io_types.statistics_freq stat_freq cdef compression_type comp_type cdef object index cdef unique_ptr[table_input_metadata] tbl_meta def __cinit__(self, object path, object index=None, - object compression=None, bool enable_statistics=True): + object compression=None, object statistics="ROWGROUP"): self.sink = make_sink_info(path, self._data_sink) - self.enable_stats = enable_statistics + self.stat_freq = _get_orc_stat_freq(statistics) self.comp_type = _get_comp_type(compression) self.index = index self.initialized = False @@ -350,7 +368,7 @@ cdef class ORCWriter: .metadata(self.tbl_meta.get()) .key_value_metadata(move(user_data)) .compression(self.comp_type) - .enable_statistics(self.enable_stats) + .enable_statistics(self.stat_freq) .build() ) self.writer.reset(new orc_chunked_writer(args)) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index 80af18e7c71..b17988018a6 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -8,7 +8,10 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport string_scalar -from cudf._lib.cpp.strings.findall cimport findall_re as cpp_findall_re +from cudf._lib.cpp.strings.findall cimport ( + findall as cpp_findall, + findall_record as cpp_findall_record, +) from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport data_from_unique_ptr @@ -25,7 +28,7 @@ def findall(Column source_strings, pattern): cdef string pattern_string = str(pattern).encode() with nogil: - c_result = move(cpp_findall_re( + c_result = move(cpp_findall( source_view, pattern_string )) @@ -34,3 +37,22 @@ def findall(Column source_strings, pattern): move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) + + +def findall_record(Column source_strings, pattern): + """ + Returns data with all non-overlapping matches of `pattern` + in each string of `source_strings` as a lists column. 
+ """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + cdef string pattern_string = str(pattern).encode() + + with nogil: + c_result = move(cpp_findall_record( + source_view, + pattern_string + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index be5a1e7cc93..6569184e90b 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -569,17 +569,6 @@ def to_dlpack(self): return cudf.io.dlpack.to_dlpack(self) - @property - def gpu_values(self): - """ - View the data as a numba device array object - """ - warnings.warn( - "The gpu_values property is deprecated and will be removed.", - FutureWarning, - ) - return self._values.data_array_view - def append(self, other): """ Append a collection of Index options together. @@ -1254,10 +1243,6 @@ def astype(self, dtype, copy=False): self.copy(deep=copy)._values.astype(dtype), name=self.name ) - # TODO: This method is deprecated and can be removed. - def to_array(self, fillna=None): - return self._values.to_array(fillna=fillna) - def to_series(self, index=None, name=None): """ Create a Series with both index and values equal to the index keys. @@ -1444,7 +1429,8 @@ def drop_duplicates( """ # This utilizes the fact that all `Index` is also a `Frame`. - result = self.__class__._from_columns( + # Except RangeIndex. 
+ return self._from_columns_like_self( drop_duplicates( list(self._columns), keys=range(len(self._data)), @@ -1453,8 +1439,6 @@ def drop_duplicates( ), self._column_names, ) - result._copy_type_metadata(self, include_index=False) - return result def dropna(self, how="any"): """ @@ -1476,12 +1460,10 @@ def dropna(self, how="any"): for col in self._columns ] - result = self.__class__._from_columns( + return self._from_columns_like_self( drop_nulls(data_columns, how=how, keys=range(len(data_columns)),), self._column_names, ) - result._copy_type_metadata(self, include_index=False) - return result def _gather(self, gather_map, nullify=False, check_bounds=True): """Gather rows of index specified by indices in `gather_map`. @@ -1501,14 +1483,11 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): ): raise IndexError("Gather map index is out of bounds.") - result = self.__class__._from_columns( + return self._from_columns_like_self( gather(list(self._columns), gather_map, nullify=nullify), self._column_names, ) - result._copy_type_metadata(self, include_index=False) - return result - def take(self, indices, axis=0, allow_fill=True, fill_value=None): """Return a new index containing the rows specified by *indices* @@ -1542,14 +1521,6 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None): "`allow_fill` and `fill_value` are unsupported." 
) - indices = cudf.core.column.as_column(indices) - if is_bool_dtype(indices): - warnings.warn( - "Calling take with a boolean array is deprecated and will be " - "removed in the future.", - FutureWarning, - ) - return self._apply_boolean_mask(indices) return self._gather(indices) def _apply_boolean_mask(self, boolean_mask): @@ -1561,12 +1532,10 @@ def _apply_boolean_mask(self, boolean_mask): if not is_bool_dtype(boolean_mask.dtype): raise ValueError("boolean_mask is not boolean type.") - result = self.__class__._from_columns( + return self._from_columns_like_self( apply_boolean_mask(list(self._columns), boolean_mask), column_names=self._column_names, ) - result._copy_type_metadata(self) - return result def _split_columns_by_levels(self, levels): if isinstance(levels, int) and levels > 0: diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index de06e62cbb1..24f9dc83ca9 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -46,6 +46,9 @@ ) +_DEFAULT_CATEGORICAL_VALUE = -1 + + class CategoricalAccessor(ColumnMethods): """ Accessor object for categorical properties of the Series values. @@ -946,7 +949,11 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> pd.Series: col = self signed_dtype = min_signed_type(len(col.categories)) - codes = col.codes.astype(signed_dtype).fillna(-1).to_array() + codes = ( + col.codes.astype(signed_dtype) + .fillna(_DEFAULT_CATEGORICAL_VALUE) + .values_host + ) if is_interval_dtype(col.categories.dtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. 
@@ -1015,13 +1022,10 @@ def _encode(self, value) -> ScalarLike: return self.categories.find_first_value(value) def _decode(self, value: int) -> ScalarLike: - if value == self._default_na_value(): + if value == _DEFAULT_CATEGORICAL_VALUE: return None return self.categories.element_indexing(value) - def _default_na_value(self) -> ScalarLike: - return -1 - def find_and_replace( self, to_replace: ColumnLike, @@ -1178,7 +1182,7 @@ def fillna( fill_is_scalar = np.isscalar(fill_value) if fill_is_scalar: - if fill_value == self._default_na_value(): + if fill_value == _DEFAULT_CATEGORICAL_VALUE: fill_value = self.codes.dtype.type(fill_value) else: try: @@ -1578,7 +1582,7 @@ def _create_empty_categorical_column( categories=column.as_column(dtype.categories), codes=column.as_column( cudf.utils.utils.scalar_broadcast_to( - categorical_column._default_na_value(), + _DEFAULT_CATEGORICAL_VALUE, categorical_column.size, categorical_column.codes.dtype, ) @@ -1601,7 +1605,7 @@ def pandas_categorical_as_column( codes = categorical.codes if codes is None else codes codes = column.as_column(codes) - valid_codes = codes != codes.dtype.type(-1) + valid_codes = codes != codes.dtype.type(_DEFAULT_CATEGORICAL_VALUE) mask = None if not valid_codes.all(): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1a83194489d..5d694dac255 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -314,51 +314,6 @@ def memory_usage(self) -> int: n += bitmask_allocation_size_bytes(self.size) return n - def _default_na_value(self) -> Any: - raise NotImplementedError() - - # TODO: This method is deprecated and can be removed when the associated - # Frame methods are removed. - def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": - """Get a dense numba device array for the data. - - Parameters - ---------- - fillna : scalar, 'pandas', or None - See *fillna* in ``.to_array``. 
- - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - """ - if fillna: - return self.fillna(self._default_na_value()).data_array_view - else: - return self.dropna(drop_nan=False).data_array_view - - # TODO: This method is deprecated and can be removed when the associated - # Frame methods are removed. - def to_array(self, fillna=None) -> np.ndarray: - """Get a dense numpy array for the data. - - Parameters - ---------- - fillna : scalar, 'pandas', or None - Defaults to None, which will skip null values. - If it equals "pandas", null values are filled with NaNs. - Non integral dtype is promoted to np.float64. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - """ - - return self.to_gpu_array(fillna=fillna).copy_to_host() - def _fill( self, fill_value: ScalarLike, @@ -1213,11 +1168,7 @@ def corr(self, other: ColumnBase): ) def nans_to_nulls(self: T) -> T: - # Only floats can contain nan. - if self.dtype.kind != "f": - return self - newmask = libcudf.transform.nans_to_nulls(self) - return self.set_mask(newmask) + return self def _process_for_reduction( self, skipna: bool = None, min_count: int = 0 diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b763790986a..c72fb66addc 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -199,7 +199,7 @@ def to_pandas( # Pandas supports only `datetime64[ns]`, hence the cast. 
return pd.Series( - self.astype("datetime64[ns]").to_array("NAT"), + self.astype("datetime64[ns]").fillna("NaT").values_host, copy=False, index=index, ) @@ -346,10 +346,6 @@ def as_string_column( column.column_empty(0, dtype="object", masked=False), ) - def _default_na_value(self) -> DatetimeLikeScalar: - """Returns the default NA value for this column""" - return np.datetime64("nat", self.time_unit) - def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: return pd.Timestamp( self.as_numerical.mean(skipna=skipna, dtype=dtype), @@ -488,15 +484,6 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: return False -def binop_offset(lhs, rhs, op): - if rhs._is_no_op: - return lhs - else: - rhs = rhs._generate_column(len(lhs), op) - out = libcudf.datetime.add_months(lhs, rhs) - return out - - def infer_format(element: str, **kwargs) -> str: """ Infers datetime format from a string, also takes cares for `ms` and `ns` diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 2b0d7cfea38..9b54c4d9acd 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -217,6 +217,13 @@ def binary_operator( lhs, rhs = (self, rhs) if not reflect else (rhs, self) return libcudf.binaryop.binaryop(lhs, rhs, binop, out_dtype) + def nans_to_nulls(self: NumericalColumn) -> NumericalColumn: + # Only floats can contain nan. 
+ if self.dtype.kind != "f" or self.nan_count == 0: + return self + newmask = libcudf.transform.nans_to_nulls(self) + return self.set_mask(newmask) + def normalize_binop_value( self, other: ScalarLike ) -> Union[ColumnBase, ScalarLike]: @@ -348,20 +355,6 @@ def _process_for_reduction( skipna=skipna, min_count=min_count ) - def _default_na_value(self) -> ScalarLike: - """Returns the default NA value for this column""" - dkind = self.dtype.kind - if dkind == "f": - return self.dtype.type(np.nan) - elif dkind == "i": - return np.iinfo(self.dtype).min - elif dkind == "u": - return np.iinfo(self.dtype).max - elif dkind == "b": - return self.dtype.type(False) - else: - raise TypeError(f"numeric column of {self.dtype} has no NaN value") - def find_and_replace( self, to_replace: ColumnLike, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9b44b4e6831..6467fd39ddd 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5218,26 +5218,6 @@ def values(self) -> cupy.ndarray: """ raise TypeError("String Arrays is not yet implemented in cudf") - # TODO: This method is deprecated and should be removed when the associated - # Frame methods are removed. - def to_array(self, fillna: bool = None) -> np.ndarray: - """Get a dense numpy array for the data. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. 
- - Raises - ------ - ``NotImplementedError`` if there are nulls - """ - if fillna is not None: - warnings.warn("fillna parameter not supported for string arrays") - - return self.to_arrow().to_pandas().values - def to_pandas( self, index: pd.Index = None, nullable: bool = False, **kwargs ) -> "pd.Series": @@ -5402,9 +5382,6 @@ def normalize_binop_value(self, other) -> "column.ColumnBase": else: raise TypeError(f"cannot broadcast {type(other)}") - def _default_na_value(self) -> ScalarLike: - return None - def binary_operator( self, op: builtins.str, rhs, reflect: bool = False ) -> "column.ColumnBase": diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 4b7a3bcc197..6c8c904e13c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -12,13 +12,7 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import ( - BinaryOperand, - DatetimeLikeScalar, - Dtype, - DtypeObj, - ScalarLike, -) +from cudf._typing import BinaryOperand, DatetimeLikeScalar, Dtype, DtypeObj from cudf.api.types import is_scalar from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, column, string @@ -123,7 +117,8 @@ def to_pandas( # Pandas supports only `timedelta64[ns]`, hence the cast. 
pd_series = pd.Series( - self.astype("timedelta64[ns]").to_array("NAT"), copy=False + self.astype("timedelta64[ns]").fillna("NaT").values_host, + copy=False, ) if index is not None: @@ -304,10 +299,6 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": ), ) - def _default_na_value(self) -> ScalarLike: - """Returns the default NA value for this column""" - return np.timedelta64("nat", self.time_unit) - @property def time_unit(self) -> str: return self._time_unit diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index c2ea9d756f7..67976ac27d4 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. from __future__ import annotations @@ -523,14 +523,19 @@ def rename_column(x): raise IndexError( f"Too many levels: Index has only 1 level, not {level+1}" ) + if isinstance(mapper, Mapping): - new_names = ( + new_col_names = [ mapper.get(col_name, col_name) for col_name in self.keys() - ) + ] else: - new_names = (mapper(col_name) for col_name in self.keys()) + new_col_names = [mapper(col_name) for col_name in self.keys()] + + if len(new_col_names) != len(set(new_col_names)): + raise ValueError("Duplicate column names are not allowed") + ca = ColumnAccessor( - dict(zip(new_names, self.values())), + dict(zip(new_col_names, self.values())), level_names=self.level_names, multiindex=self.multiindex, ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f9fef7dc4dc..bb9cd9b5cc7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -17,7 +17,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from numba import cuda from nvtx import annotate from pandas._config import get_option from pandas.io.formats import console @@ -63,6 +62,7 @@ from cudf.core.multiindex import MultiIndex from 
cudf.core.resample import DataFrameResampler from cudf.core.series import Series +from cudf.core.udf.row_function import _get_row_kernel from cudf.utils import applyutils, docutils, ioutils, queryutils, utils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -268,7 +268,9 @@ def _getitem_tuple_arg(self, arg): else: row_selection = as_column(arg[0]) if is_bool_dtype(row_selection.dtype): - df.index = self._frame.index.take(row_selection) + df.index = self._frame.index._apply_boolean_mask( + row_selection + ) else: df.index = as_index(row_selection) # Step 4: Downcast @@ -3021,124 +3023,6 @@ def add_suffix(self, suffix): ] return out - def as_gpu_matrix(self, columns=None, order="F"): - warnings.warn( - "The as_gpu_matrix method will be removed in a future cuDF " - "release. Consider using `to_cupy` instead.", - FutureWarning, - ) - if columns is None: - columns = self._data.names - - cols = [self._data[k] for k in columns] - ncol = len(cols) - nrow = len(self) - if ncol < 1: - # This is the case for empty dataframe - construct empty cupy array - matrix = cupy.empty( - shape=(0, 0), dtype=cudf.dtype("float64"), order=order - ) - return cuda.as_cuda_array(matrix) - - if any( - (is_categorical_dtype(c) or np.issubdtype(c, cudf.dtype("object"))) - for c in cols - ): - raise TypeError("non-numeric data not yet supported") - - dtype = find_common_type([col.dtype for col in cols]) - for k, c in self._data.items(): - if c.has_nulls(): - raise ValueError( - f"column '{k}' has null values. 
" - f"hint: use .fillna() to replace null values" - ) - cupy_dtype = dtype - if np.issubdtype(cupy_dtype, np.datetime64): - cupy_dtype = cudf.dtype("int64") - - if order not in ("F", "C"): - raise ValueError( - "order parameter should be 'C' for row major or 'F' for" - "column major GPU matrix" - ) - - matrix = cupy.empty(shape=(nrow, ncol), dtype=cupy_dtype, order=order) - for colidx, inpcol in enumerate(cols): - dense = inpcol.astype(cupy_dtype) - matrix[:, colidx] = cupy.asarray(dense) - return cuda.as_cuda_array(matrix).view(dtype) - - def as_matrix(self, columns=None): - warnings.warn( - "The as_matrix method will be removed in a future cuDF " - "release. Consider using `to_numpy` instead.", - FutureWarning, - ) - return self.as_gpu_matrix(columns=columns).copy_to_host() - - def label_encoding( - self, column, prefix, cats, prefix_sep="_", dtype=None, na_sentinel=-1 - ): - """Encode labels in a column with label encoding. - - Parameters - ---------- - column : str - the source column with binary encoding for the data. - prefix : str - the new column name prefix. - cats : sequence of ints - the sequence of categories as integers. - prefix_sep : str - the separator between the prefix and the category. - dtype : - the dtype for the outputs; see Series.label_encoding - na_sentinel : number - Value to indicate missing category. - - Returns - ------- - A new DataFrame with a new column appended for the coded values. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 10, 20]}) - >>> df - a b - 0 1 10 - 1 2 10 - 2 3 20 - >>> df.label_encoding(column="b", prefix="b_col", cats=[10, 20]) - a b b_col_labels - 0 1 10 0 - 1 2 10 0 - 2 3 20 1 - """ - - warnings.warn( - "DataFrame.label_encoding is deprecated and will be removed in " - "the future. 
Consider using cuML's LabelEncoder instead.", - FutureWarning, - ) - - return self._label_encoding( - column, prefix, cats, prefix_sep, dtype, na_sentinel - ) - - def _label_encoding( - self, column, prefix, cats, prefix_sep="_", dtype=None, na_sentinel=-1 - ): - # Private implementation of deprecated public label_encoding method - newname = prefix_sep.join([prefix, "labels"]) - newcol = self[column]._label_encoding( - cats=cats, dtype=dtype, na_sentinel=na_sentinel - ) - outdf = self.copy() - outdf.insert(len(outdf._data), newname, newcol) - return outdf - def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. @@ -3926,10 +3810,8 @@ def apply( raise ValueError("The `raw` kwarg is not yet supported.") if result_type is not None: raise ValueError("The `result_type` kwarg is not yet supported.") - if kwargs: - raise ValueError("UDFs using **kwargs are not yet supported.") - return self._apply(func, *args) + return self._apply(func, _get_row_kernel, *args, **kwargs) @applyutils.doc_apply() def apply_rows( @@ -5536,7 +5418,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): ) .fillna(np.nan) ) - arr = cupy.asarray(prepared.as_gpu_matrix()) + arr = prepared.to_cupy() if skipna is not False and method in _cupy_nan_methods_map: method = _cupy_nan_methods_map[method] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6142f0f0f40..a05986555b1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -45,7 +45,6 @@ ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import Merge, MergeSemi -from cudf.core.udf.pipeline import compile_or_get, supported_cols_from_frame from cudf.core.window import Rolling from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring @@ -163,6 +162,22 @@ def _from_columns( return cls._from_data(data, index) + def _from_columns_like_self( + self, + columns: List[ColumnBase], + column_names: 
List[str], + index_names: Optional[List[str]] = None, + ): + """Construct a `Frame` from a list of columns with metadata from self. + + If `index_names` is set, the first `len(index_names)` columns are + used to construct the index of the frame. + """ + frame = self.__class__._from_columns( + columns, column_names, index_names + ) + return frame._copy_type_metadata(self, include_index=bool(index_names)) + def _mimic_inplace( self: T, result: Frame, inplace: bool = False ) -> Optional[Frame]: @@ -1351,39 +1366,6 @@ def _quantiles( result._copy_type_metadata(self) return result - @annotate("APPLY", color="purple", domain="cudf_python") - def _apply(self, func, *args): - """ - Apply `func` across the rows of the frame. - """ - kernel, retty = compile_or_get(self, func, args) - - # Mask and data column preallocated - ans_col = cupy.empty(len(self), dtype=retty) - ans_mask = cudf.core.column.column_empty(len(self), dtype="bool") - launch_args = [(ans_col, ans_mask), len(self)] - offsets = [] - - # if compile_or_get succeeds, it is safe to create a kernel that only - # consumes the columns that are of supported dtype - for col in supported_cols_from_frame(self).values(): - data = col.data - mask = col.mask - if mask is None: - launch_args.append(data) - else: - launch_args.append((data, mask)) - offsets.append(col.offset) - launch_args += offsets - launch_args += list(args) - kernel.forall(len(self))(*launch_args) - - col = as_column(ans_col) - col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) - result = cudf.Series._from_data({None: col}, self._index) - - return result - def rank( self, axis=0, diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6da98bf980d..a393d8e9457 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -184,11 +184,25 @@ def agg(self, func): Parameters ---------- func : str, callable, list or dict + Argument specifying the aggregation(s) 
to perform on the + groups. `func` can be any of the following: + + - string: the name of a supported aggregation + - callable: a function that accepts a Series/DataFrame and + performs a supported operation on it. + - list: a list of strings/callables specifying the + aggregations to perform on every column. + - dict: a mapping of column names to string/callable + specifying the aggregations to perform on those + columns. + + See :ref:`the user guide ` for supported + aggregations. Returns ------- A Series or DataFrame containing the combined results of the - aggregation. + aggregation(s). Examples -------- @@ -655,6 +669,54 @@ def rolling_avg(val, avg): kwargs.update({"chunks": offsets}) return grouped_values.apply_chunks(function, **kwargs) + def transform(self, function): + """Apply an aggregation, then broadcast the result to the group size. + + Parameters + ---------- + function: str or callable + Aggregation to apply to each group. Note that the set of + operations currently supported by `transform` is identical + to that supported by the `agg` method. + + Returns + ------- + A Series or DataFrame of the same size as the input, with the + result of the aggregation per group broadcasted to the group + size. + + Examples + -------- + .. code-block:: python + + import cudf + df = cudf.DataFrame({'a': [2, 1, 1, 2, 2], 'b': [1, 2, 3, 4, 5]}) + df.groupby('a').transform('max') + b + 0 5 + 1 3 + 2 3 + 3 5 + 4 5 + + See also + -------- + cudf.core.groupby.GroupBy.agg + """ + try: + result = self.agg(function) + except TypeError as e: + raise NotImplementedError( + "Currently, `transform()` supports only aggregations." 
+ ) from e + + if not result.index.equals(self.grouping.keys): + result = result._align_to_index( + self.grouping.keys, how="right", allow_non_unique=True + ) + result = result.reset_index(drop=True) + return result + def rolling(self, *args, **kwargs): """ Returns a `RollingGroupby` object that enables rolling window diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 91c7a740699..fc59d15e264 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -682,6 +682,7 @@ def _intersection(self, other, sort=False): return new_index def _gather(self, gather_map, nullify=False, check_bounds=True): + gather_map = cudf.core.column.as_column(gather_map) return Int64Index._from_columns( [self._values.take(gather_map, nullify, check_bounds)], [self.name] ) @@ -771,23 +772,6 @@ def __init__(self, data, **kwargs): name = kwargs.get("name") super().__init__({name: data}) - @classmethod - def deserialize(cls, header, frames): - if "index_column" in header: - warnings.warn( - "Index objects serialized in cudf version " - "21.10 or older will no longer be deserializable " - "after version 21.12. 
Please load and resave any " - "pickles before upgrading to version 22.02.", - FutureWarning, - ) - header["columns"] = [header.pop("index_column")] - header["column_names"] = pickle.dumps( - [pickle.loads(header["name"])] - ) - - return super().deserialize(header, frames) - def _binaryop( self, other: T, @@ -2508,7 +2492,7 @@ def to_pandas(self): def __repr__(self): return ( - f"{self.__class__.__name__}({self._values.to_array()}," + f"{self.__class__.__name__}({self._values.values_host}," f" dtype='object'" + ( f", name={pd.io.formats.printing.default_pprint(self.name)}" diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9458057894a..8ecab2c7c65 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -24,11 +24,12 @@ is_integer_dtype, is_list_like, ) -from cudf.core.column import arange +from cudf.core.column import arange, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex +from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame from cudf.utils.utils import cached_property doc_reset_index_template = """ @@ -556,7 +557,7 @@ def _gather( ): raise IndexError("Gather map index is out of bounds.") - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.copying.gather( list(self._index._columns + self._columns) if keep_index @@ -568,9 +569,6 @@ def _gather( self._index.names if keep_index else None, ) - result._copy_type_metadata(self, include_index=keep_index) - return result - def _positions_from_column_names( self, column_names, offset_by_index_columns=False ): @@ -628,7 +626,7 @@ def drop_duplicates( keys = self._positions_from_column_names( subset, offset_by_index_columns=not ignore_index ) - result = self.__class__._from_columns( + return 
self._from_columns_like_self( libcudf.stream_compaction.drop_duplicates( list(self._columns) if ignore_index @@ -640,8 +638,6 @@ def drop_duplicates( self._column_names, self._index.names if not ignore_index else None, ) - result._copy_type_metadata(self) - return result def add_prefix(self, prefix): """ @@ -761,6 +757,51 @@ def add_suffix(self, suffix): Use `Series.add_suffix` or `DataFrame.add_suffix`" ) + @annotate("APPLY", color="purple", domain="cudf_python") + def _apply(self, func, kernel_getter, *args, **kwargs): + """Apply `func` across the rows of the frame.""" + if kwargs: + raise ValueError("UDFs using **kwargs are not yet supported.") + + try: + kernel, retty = _compile_or_get( + self, func, args, kernel_getter=kernel_getter + ) + except Exception as e: + raise ValueError( + "user defined function compilation failed." + ) from e + + # Mask and data column preallocated + ans_col = cp.empty(len(self), dtype=retty) + ans_mask = cudf.core.column.column_empty(len(self), dtype="bool") + launch_args = [(ans_col, ans_mask), len(self)] + offsets = [] + + # if _compile_or_get succeeds, it is safe to create a kernel that only + # consumes the columns that are of supported dtype + for col in _supported_cols_from_frame(self).values(): + data = col.data + mask = col.mask + if mask is None: + launch_args.append(data) + else: + launch_args.append((data, mask)) + offsets.append(col.offset) + launch_args += offsets + launch_args += list(args) + + try: + kernel.forall(len(self))(*launch_args) + except Exception as e: + raise RuntimeError("UDF kernel execution failed.") from e + + col = as_column(ans_col) + col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) + result = cudf.Series._from_data({None: col}, self._index) + + return result + def sort_values( self, by, @@ -1303,9 +1344,7 @@ def dropna( 0 Alfred Batmobile 1940-04-25 """ if axis == 0: - result = self._drop_na_rows( - how=how, subset=subset, thresh=thresh, drop_nan=True - ) + result = 
self._drop_na_rows(how=how, subset=subset, thresh=thresh) else: result = self._drop_na_columns( how=how, subset=subset, thresh=thresh @@ -1313,9 +1352,7 @@ def dropna( return self._mimic_inplace(result, inplace=inplace) - def _drop_na_rows( - self, how="any", subset=None, thresh=None, drop_nan=False - ): + def _drop_na_rows(self, how="any", subset=None, thresh=None): """ Drop null rows from `self`. @@ -1326,7 +1363,7 @@ def _drop_na_rows( *all* null values. subset : list, optional List of columns to consider when dropping rows. - thresh: int, optional + thresh : int, optional If specified, then drops every row containing less than `thresh` non-null values. """ @@ -1346,17 +1383,16 @@ def _drop_na_rows( if len(subset) == 0: return self.copy(deep=True) - if drop_nan: - data_columns = [ - col.nans_to_nulls() - if isinstance(col, cudf.core.column.NumericalColumn) - else col - for col in self._columns - ] + data_columns = [ + col.nans_to_nulls() + if isinstance(col, cudf.core.column.NumericalColumn) + else col + for col in self._columns + ] - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( - list(self._index._data.columns) + data_columns, + [*self._index._data.columns, *data_columns], how=how, keys=self._positions_from_column_names( subset, offset_by_index_columns=True @@ -1366,8 +1402,6 @@ def _drop_na_rows( self._column_names, self._index.names, ) - result._copy_type_metadata(self) - return result def _apply_boolean_mask(self, boolean_mask): """Apply boolean mask to each row of `self`. 
@@ -1378,15 +1412,13 @@ def _apply_boolean_mask(self, boolean_mask): if not is_bool_dtype(boolean_mask.dtype): raise ValueError("boolean_mask is not boolean type.") - result = self.__class__._from_columns( + return self._from_columns_like_self( libcudf.stream_compaction.apply_boolean_mask( list(self._index._columns + self._columns), boolean_mask ), column_names=self._column_names, index_names=self._index.names, ) - result._copy_type_metadata(self) - return result def take(self, indices, axis=0): """Return a new frame containing the rows specified by *indices*. @@ -1427,18 +1459,9 @@ def take(self, indices, axis=0): 0 1.0 a 2 3.0 c """ - axis = self._get_axis_from_axis_arg(axis) - if axis != 0: + if self._get_axis_from_axis_arg(axis) != 0: raise NotImplementedError("Only axis=0 is supported.") - indices = cudf.core.column.as_column(indices) - if is_bool_dtype(indices): - warnings.warn( - "Calling take with a boolean array is deprecated and will be " - "removed in the future.", - FutureWarning, - ) - return self._apply_boolean_mask(indices) return self._gather(indices) def _reset_index(self, level, drop, col_level=0, col_fill=""): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e8ff7838a9e..adce3c24a83 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -775,7 +775,7 @@ def _index_and_downcast(self, result, index, index_key): ) if isinstance(index_key, tuple): - result = result.set_index(index) + result.index = index return result def _get_row_major( @@ -859,28 +859,8 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - if "names" in header: - warnings.warn( - "MultiIndex objects serialized in cudf version " - "21.10 or older will no longer be deserializable " - "after version 21.12. 
Please load and resave any " - "pickles before upgrading to version 22.02.", - FutureWarning, - ) - header["column_names"] = header["names"] - column_names = pickle.loads(header["column_names"]) - if "source_data" in header: - warnings.warn( - "MultiIndex objects serialized in cudf version " - "21.08 or older will no longer be deserializable " - "after version 21.10. Please load and resave any " - "pickles before upgrading to version 21.12.", - FutureWarning, - ) - df = cudf.DataFrame.deserialize(header["source_data"], frames) - return cls.from_frame(df)._set_names(column_names) - # Spoof the column names to construct the frame, then set manually. + column_names = pickle.loads(header["column_names"]) header["column_names"] = pickle.dumps(range(0, len(column_names))) obj = super().deserialize(header, frames) return obj._set_names(column_names) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 0032dc25cee..12a2538b776 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -14,7 +14,6 @@ import cupy import numpy as np import pandas as pd -from numba import cuda from pandas._config import get_option import cudf @@ -67,6 +66,7 @@ doc_reset_index_template, ) from cudf.core.single_column_frame import SingleColumnFrame +from cudf.core.udf.scalar_function import _get_scalar_kernel from cudf.utils import cudautils, docutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -559,19 +559,6 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - if "column" in header: - warnings.warn( - "Series objects serialized in cudf version " - "21.10 or older will no longer be deserializable " - "after version 21.12. 
Please load and resave any " - "pickles before upgrading to version 22.02.", - FutureWarning, - ) - header["columns"] = [header.pop("column")] - header["column_names"] = pickle.dumps( - [pickle.loads(header["name"])] - ) - index_nframes = header["index_frame_count"] obj = super().deserialize( header, frames[header["index_frame_count"] :] @@ -923,45 +910,6 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): inplace=inplace, ) - def set_index(self, index): - """Returns a new Series with a different index. - - Parameters - ---------- - index : Index, Series-convertible - the new index or values for the new index - - Returns - ------- - Series - A new Series with assigned index. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 11, 12, 13, 14]) - >>> series - 0 10 - 1 11 - 2 12 - 3 13 - 4 14 - dtype: int64 - >>> series.set_index(['a', 'b', 'c', 'd', 'e']) - a 10 - b 11 - c 12 - d 13 - e 14 - dtype: int64 - """ - warnings.warn( - "Series.set_index is deprecated and will be removed in the future", - FutureWarning, - ) - index = index if isinstance(index, BaseIndex) else as_index(index) - return self._from_data(self._data, index, self.name) - def to_frame(self, name=None): """Convert Series into a DataFrame @@ -1004,15 +952,6 @@ def to_frame(self, name=None): return cudf.DataFrame({col: self._column}, index=self.index) - def set_mask(self, mask, null_count=None): - warnings.warn( - "Series.set_mask is deprecated and will be removed in the future.", - FutureWarning, - ) - return self._from_data( - {self.name: self._column.set_mask(mask)}, self._index - ) - def memory_usage(self, index=True, deep=False): """ Return the memory usage of the Series. 
@@ -1662,25 +1601,6 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): return self._mimic_inplace(result, inplace=inplace) - def fill(self, fill_value, begin=0, end=-1, inplace=False): - warnings.warn( - "The fill method will be removed in a future cuDF release.", - FutureWarning, - ) - fill_values = [fill_value] - col_and_fill = zip(self._columns, fill_values) - - if not inplace: - data_columns = (c._fill(v, begin, end) for (c, v) in col_and_fill) - return self.__class__._from_data( - zip(self._column_names, data_columns), self._index - ) - - for (c, v) in col_and_fill: - c.fill(v, begin, end, inplace=True) - - return self - def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1704,15 +1624,6 @@ def fillna( value=value, method=method, axis=axis, inplace=inplace, limit=limit ) - # TODO: When this method is removed we can also remove ColumnBase.to_array. - def to_array(self, fillna=None): - warnings.warn( - "The to_array method will be removed in a future cuDF " - "release. 
Consider using `to_numpy` instead.", - FutureWarning, - ) - return self._column.to_array(fillna=fillna) - def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -1821,27 +1732,6 @@ def nullmask(self): """The gpu buffer for the null-mask""" return cudf.Series(self._column.nullmask) - def as_mask(self): - """Convert booleans to bitmask - - Returns - ------- - device array - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([True, False, True]) - >>> s.as_mask() - - """ - if not is_bool_dtype(self.dtype): - raise TypeError( - f"Series must of boolean dtype, found: {self.dtype}" - ) - - return self._column.as_mask() - def astype(self, dtype, copy=False, errors="raise"): """ Cast the Series to the given dtype @@ -2282,76 +2172,6 @@ def update(self, other): self.mask(mask, other, inplace=True) - def reverse(self): - warnings.warn( - "Series.reverse is deprecated and will be removed in the future.", - FutureWarning, - ) - rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32) - return self._from_data( - {self.name: self._column[rinds]}, self.index._values[rinds] - ) - - def label_encoding(self, cats, dtype=None, na_sentinel=-1): - """Perform label encoding. - - Parameters - ---------- - values : sequence of input values - dtype : numpy.dtype; optional - Specifies the output dtype. If `None` is given, the - smallest possible integer dtype (starting with np.int8) - is used. - na_sentinel : number, default -1 - Value to indicate missing category. - - Returns - ------- - A sequence of encoded labels with value between 0 and n-1 classes(cats) - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([1, 2, 3, 4, 10]) - >>> s.label_encoding([2, 3]) - 0 -1 - 1 0 - 2 1 - 3 -1 - 4 -1 - dtype: int8 - - `na_sentinel` parameter can be used to - control the value when there is no encoding. 
- - >>> s.label_encoding([2, 3], na_sentinel=10) - 0 10 - 1 0 - 2 1 - 3 10 - 4 10 - dtype: int8 - - When none of `cats` values exist in s, entire - Series will be `na_sentinel`. - - >>> s.label_encoding(['a', 'b', 'c']) - 0 -1 - 1 -1 - 2 -1 - 3 -1 - 4 -1 - dtype: int8 - """ - - warnings.warn( - "Series.label_encoding is deprecated and will be removed in the " - "future. Consider using cuML's LabelEncoder instead.", - FutureWarning, - ) - - return self._label_encoding(cats, dtype, na_sentinel) - def _label_encoding(self, cats, dtype=None, na_sentinel=-1): # Private implementation of deprecated public label_encoding method def _return_sentinel_series(): @@ -2413,7 +2233,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): by numba based on the function logic and argument types. See examples for details. args : tuple - Not supported + Positional arguments passed to func after the series value. **kwargs Not supported @@ -2479,20 +2299,9 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): 2 4.5 dtype: float64 """ - if args or kwargs: - raise ValueError( - "UDFs using *args or **kwargs are not yet supported." - ) - - # these functions are generally written as functions of scalar - # values rather than rows. Rather than writing an entirely separate - # numba kernel that is not built around a row object, its simpler - # to just turn this into the equivalent single column dataframe case - name = self.name or "__temp_srname" - df = cudf.DataFrame({name: self}) - f_ = cuda.jit(device=True)(func) - - return df.apply(lambda row: f_(row[name])) + if convert_dtype is not True: + raise ValueError("Series.apply only supports convert_dtype=True") + return self._apply(func, _get_scalar_kernel, *args, **kwargs) def applymap(self, udf, out_dtype=None): """Apply an elementwise function to transform the values in the Column. 
@@ -3500,12 +3309,8 @@ def rename(self, index=None, copy=True): >>> renamed_series.name 'numeric_series' """ - out = self.copy(deep=False) - out = out.set_index(self.index) - if index: - out.name = index - - return out.copy(deep=copy) + out_data = self._data.copy(deep=copy) + return Series._from_data(out_data, self.index, name=index) def merge( self, diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index e480e31bc4b..ea1917acc10 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,7 +3,6 @@ from __future__ import annotations -import warnings from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union import cupy @@ -143,16 +142,6 @@ def tolist(self): # noqa: D102 to_list = tolist - # TODO: When this method is removed we can also remove - # ColumnBase.to_gpu_array. - def to_gpu_array(self, fillna=None): # noqa: D102 - warnings.warn( - "The to_gpu_array method will be removed in a future cuDF " - "release. Consider using `to_cupy` instead.", - FutureWarning, - ) - return self._column.to_gpu_array(fillna=fillna) - @classmethod def from_arrow(cls, array): """Create from PyArrow Array/ChunkedArray. 
diff --git a/python/cudf/cudf/core/udf/pipeline.py b/python/cudf/cudf/core/udf/pipeline.py deleted file mode 100644 index 2464906be04..00000000000 --- a/python/cudf/cudf/core/udf/pipeline.py +++ /dev/null @@ -1,388 +0,0 @@ -import math -from typing import Callable - -import cachetools -import numpy as np -from numba import cuda, typeof -from numba.np import numpy_support -from numba.types import Poison, Record, Tuple, boolean, int64, void -from nvtx import annotate - -from cudf.core.dtypes import CategoricalDtype -from cudf.core.udf.api import Masked, pack_return -from cudf.core.udf.typing import MaskedType -from cudf.utils import cudautils -from cudf.utils.dtypes import ( - BOOL_TYPES, - DATETIME_TYPES, - NUMERIC_TYPES, - TIMEDELTA_TYPES, -) - -libcudf_bitmask_type = numpy_support.from_dtype(np.dtype("int32")) -MASK_BITSIZE = np.dtype("int32").itemsize * 8 -precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) - -JIT_SUPPORTED_TYPES = ( - NUMERIC_TYPES | BOOL_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES -) - - -def _is_jit_supported_type(dtype): - # category dtype isn't hashable - if isinstance(dtype, CategoricalDtype): - return False - return str(dtype) in JIT_SUPPORTED_TYPES - - -def all_dtypes_from_frame(frame): - return { - colname: col.dtype - if _is_jit_supported_type(col.dtype) - else np.dtype("O") - for colname, col in frame._data.items() - } - - -def supported_dtypes_from_frame(frame): - return { - colname: col.dtype - for colname, col in frame._data.items() - if _is_jit_supported_type(col.dtype) - } - - -def supported_cols_from_frame(frame): - return { - colname: col - for colname, col in frame._data.items() - if _is_jit_supported_type(col.dtype) - } - - -def generate_cache_key(frame, func: Callable): - """Create a cache key that uniquely identifies a compilation. 
- - A new compilation is needed any time any of the following things change: - - The UDF itself as defined in python by the user - - The types of the columns utilized by the UDF - - The existence of the input columns masks - """ - return ( - *cudautils.make_cache_key(func, all_dtypes_from_frame(frame).values()), - *(col.mask is None for col in frame._data.values()), - *frame._data.keys(), - ) - - -def get_frame_row_type(dtype): - """ - Get the numba `Record` type corresponding to a frame. - Models each column and its mask as a MaskedType and - models the row as a dictionary like data structure - containing these MaskedTypes. - - Large parts of this function are copied with comments - from the Numba internals and slightly modified to - account for validity bools to be present in the final - struct. - """ - - # Create the numpy structured type corresponding to the numpy dtype. - - fields = [] - offset = 0 - - sizes = [val[0].itemsize for val in dtype.fields.values()] - for i, (name, info) in enumerate(dtype.fields.items()): - # *info* consists of the element dtype, its offset from the beginning - # of the record, and an optional "title" containing metadata. - # We ignore the offset in info because its value assumes no masking; - # instead, we compute the correct offset based on the masked type. 
- elemdtype = info[0] - title = info[2] if len(info) == 3 else None - ty = numpy_support.from_dtype(elemdtype) - infos = { - "type": MaskedType(ty), - "offset": offset, - "title": title, - } - fields.append((name, infos)) - - # increment offset by itemsize plus one byte for validity - offset += elemdtype.itemsize + 1 - - # Align the next member of the struct to be a multiple of the - # memory access size, per PTX ISA 7.4/5.4.5 - if i < len(sizes) - 1: - next_itemsize = sizes[i + 1] - offset = int(math.ceil(offset / next_itemsize) * next_itemsize) - - # Numba requires that structures are aligned for the CUDA target - _is_aligned_struct = True - return Record(fields, offset, _is_aligned_struct) - - -@annotate("NUMBA JIT", color="green", domain="cudf_python") -def get_udf_return_type(frame, func: Callable, args=()): - - """ - Get the return type of a masked UDF for a given set of argument dtypes. It - is assumed that the function consumes a dictionary whose keys are strings - and whose values are of MaskedType. Initially assume that the UDF may be - written to utilize any field in the row - including those containing an - unsupported dtype. If an unsupported dtype is actually used in the function - the compilation should fail at `compile_udf`. If compilation succeeds, one - can infer that the function does not use any of the columns of unsupported - dtype - meaning we can drop them going forward and the UDF will still end - up getting fed rows containing all the fields it actually needs to use to - compute the answer for that row. - """ - - # present a row containing all fields to the UDF and try and compile - row_type = get_frame_row_type( - np.dtype(list(all_dtypes_from_frame(frame).items())) - ) - compile_sig = (row_type, *(typeof(arg) for arg in args)) - - # Get the return type. The PTX is also returned by compile_udf, but is not - # needed here. 
- ptx, output_type = cudautils.compile_udf(func, compile_sig) - if not isinstance(output_type, MaskedType): - numba_output_type = numpy_support.from_dtype(np.dtype(output_type)) - else: - numba_output_type = output_type - - return ( - numba_output_type - if not isinstance(numba_output_type, MaskedType) - else numba_output_type.value_type - ) - - -def masked_array_type_from_col(col): - """ - Return a type representing a tuple of arrays, - the first element an array of the numba type - corresponding to `dtype`, and the second an - array of bools representing a mask. - """ - nb_scalar_ty = numpy_support.from_dtype(col.dtype) - if col.mask is None: - return nb_scalar_ty[::1] - else: - return Tuple((nb_scalar_ty[::1], libcudf_bitmask_type[::1])) - - -def construct_signature(frame, return_type, args): - """ - Build the signature of numba types that will be used to - actually JIT the kernel itself later, accounting for types - and offsets. Skips columns with unsupported dtypes. - """ - - # Tuple of arrays, first the output data array, then the mask - return_type = Tuple((return_type[::1], boolean[::1])) - offsets = [] - sig = [return_type, int64] - for col in supported_cols_from_frame(frame).values(): - sig.append(masked_array_type_from_col(col)) - offsets.append(int64) - - # return_type, size, data, masks, offsets, extra args - sig = void(*(sig + offsets + [typeof(arg) for arg in args])) - - return sig - - -@cuda.jit(device=True) -def mask_get(mask, pos): - return (mask[pos // MASK_BITSIZE] >> (pos % MASK_BITSIZE)) & 1 - - -kernel_template = """\ -def _kernel(retval, size, {input_columns}, {input_offsets}, {extra_args}): - i = cuda.grid(1) - ret_data_arr, ret_mask_arr = retval - if i < size: - # Create a structured array with the desired fields - rows = cuda.local.array(1, dtype=row_type) - - # one element of that array - row = rows[0] - -{masked_input_initializers} -{row_initializers} - - # pass the assembled row into the udf - ret = f_(row, {extra_args}) - - # pack up 
the return values and set them - ret_masked = pack_return(ret) - ret_data_arr[i] = ret_masked.value - ret_mask_arr[i] = ret_masked.valid -""" - -unmasked_input_initializer_template = """\ - d_{idx} = input_col_{idx} - masked_{idx} = Masked(d_{idx}[i], True) -""" - -masked_input_initializer_template = """\ - d_{idx}, m_{idx} = input_col_{idx} - masked_{idx} = Masked(d_{idx}[i], mask_get(m_{idx}, i + offset_{idx})) -""" - -row_initializer_template = """\ - row["{name}"] = masked_{idx} -""" - - -def _define_function(frame, row_type, args): - """ - The kernel we want to JIT compile looks something like the following, - which is an example for two columns that both have nulls present - - def _kernel(retval, input_col_0, input_col_1, offset_0, offset_1, size): - i = cuda.grid(1) - ret_data_arr, ret_mask_arr = retval - if i < size: - rows = cuda.local.array(1, dtype=row_type) - row = rows[0] - - d_0, m_0 = input_col_0 - masked_0 = Masked(d_0[i], mask_get(m_0, i + offset_0)) - d_1, m_1 = input_col_1 - masked_1 = Masked(d_1[i], mask_get(m_1, i + offset_1)) - - row["a"] = masked_0 - row["b"] = masked_1 - - ret = f_(row) - - ret_masked = pack_return(ret) - ret_data_arr[i] = ret_masked.value - ret_mask_arr[i] = ret_masked.valid - - However we do not always have two columns and columns do not always have - an associated mask. Ideally, we would just write one kernel and make use - of `*args` - and then one function would work for any number of columns, - currently numba does not support `*args` and treats functions it JITs as - if `*args` is a singular argument. Thus we are forced to write the right - functions dynamically at runtime and define them using `exec`. 
- """ - # Create argument list for kernel - frame = supported_cols_from_frame(frame) - - input_columns = ", ".join([f"input_col_{i}" for i in range(len(frame))]) - input_offsets = ", ".join([f"offset_{i}" for i in range(len(frame))]) - extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) - - # Generate the initializers for each device function argument - initializers = [] - row_initializers = [] - for i, (colname, col) in enumerate(frame.items()): - idx = str(i) - if col.mask is not None: - template = masked_input_initializer_template - else: - template = unmasked_input_initializer_template - - initializer = template.format(idx=idx) - - initializers.append(initializer) - - row_initializer = row_initializer_template.format( - idx=idx, name=colname - ) - row_initializers.append(row_initializer) - - # Incorporate all of the above into the kernel code template - d = { - "input_columns": input_columns, - "input_offsets": input_offsets, - "extra_args": extra_args, - "masked_input_initializers": "\n".join(initializers), - "row_initializers": "\n".join(row_initializers), - "numba_rectype": row_type, # from global - } - - return kernel_template.format(**d) - - -@annotate("UDF COMPILATION", color="darkgreen", domain="cudf_python") -def compile_or_get(frame, func, args): - """ - Return a compiled kernel in terms of MaskedTypes that launches a - kernel equivalent of `f` for the dtypes of `df`. The kernel uses - a thread for each row and calls `f` using that rows data / mask - to produce an output value and output validity for each row. - - If the UDF has already been compiled for this requested dtypes, - a cached version will be returned instead of running compilation. - - CUDA kernels are void and do not return values. Thus, we need to - preallocate a column of the correct dtype and pass it in as one of - the kernel arguments. 
This creates a chicken-and-egg problem where - we need the column type to compile the kernel, but normally we would - be getting that type FROM compiling the kernel (and letting numba - determine it as a return value). As a workaround, we compile the UDF - itself outside the final kernel to invoke a full typing pass, which - unfortunately is difficult to do without running full compilation. - we then obtain the return type from that separate compilation and - use it to allocate an output column of the right dtype. - """ - - # check to see if we already compiled this function - cache_key = generate_cache_key(frame, func) - if precompiled.get(cache_key) is not None: - kernel, masked_or_scalar = precompiled[cache_key] - return kernel, masked_or_scalar - - # precompile the user udf to get the right return type. - # could be a MaskedType or a scalar type. - scalar_return_type = get_udf_return_type(frame, func, args) - - # get_udf_return_type will throw a TypingError if the user tries to use - # a field in the row containing an unsupported dtype, except in the - # edge case where all the function does is return that element: - - # def f(row): - # return row[] - # In this case numba is happy to return MaskedType() - # because it relies on not finding overloaded operators for types to raise - # the exception, so we have to explicitly check for that case. 
- if isinstance(scalar_return_type, Poison): - raise TypeError(str(scalar_return_type)) - - # this is the signature for the final full kernel compilation - sig = construct_signature(frame, scalar_return_type, args) - - # this row type is used within the kernel to pack up the column and - # mask data into the dict like data structure the user udf expects - np_field_types = np.dtype(list(supported_dtypes_from_frame(frame).items())) - row_type = get_frame_row_type(np_field_types) - - f_ = cuda.jit(device=True)(func) - # Dict of 'local' variables into which `_kernel` is defined - local_exec_context = {} - global_exec_context = { - "f_": f_, - "cuda": cuda, - "Masked": Masked, - "mask_get": mask_get, - "pack_return": pack_return, - "row_type": row_type, - } - exec( - _define_function(frame, row_type, args), - global_exec_context, - local_exec_context, - ) - # The python function definition representing the kernel - _kernel = local_exec_context["_kernel"] - kernel = cuda.jit(sig)(_kernel) - np_return_type = numpy_support.as_dtype(scalar_return_type) - precompiled[cache_key] = (kernel, np_return_type) - - return kernel, np_return_type diff --git a/python/cudf/cudf/core/udf/row_function.py b/python/cudf/cudf/core/udf/row_function.py new file mode 100644 index 00000000000..5cda9fb8218 --- /dev/null +++ b/python/cudf/cudf/core/udf/row_function.py @@ -0,0 +1,151 @@ +import math + +import numpy as np +from numba import cuda +from numba.np import numpy_support +from numba.types import Record + +from cudf.core.udf.api import Masked, pack_return +from cudf.core.udf.templates import ( + masked_input_initializer_template, + row_initializer_template, + row_kernel_template, + unmasked_input_initializer_template, +) +from cudf.core.udf.typing import MaskedType +from cudf.core.udf.utils import ( + _all_dtypes_from_frame, + _construct_signature, + _get_kernel, + _get_udf_return_type, + _mask_get, + _supported_cols_from_frame, + _supported_dtypes_from_frame, +) + + +def 
_get_frame_row_type(dtype): + """ + Get the numba `Record` type corresponding to a frame. + Models each column and its mask as a MaskedType and + models the row as a dictionary like data structure + containing these MaskedTypes. + + Large parts of this function are copied with comments + from the Numba internals and slightly modified to + account for validity bools to be present in the final + struct. + + See numba.np.numpy_support.from_struct_dtype for details. + """ + + # Create the numpy structured type corresponding to the numpy dtype. + + fields = [] + offset = 0 + + sizes = [val[0].itemsize for val in dtype.fields.values()] + for i, (name, info) in enumerate(dtype.fields.items()): + # *info* consists of the element dtype, its offset from the beginning + # of the record, and an optional "title" containing metadata. + # We ignore the offset in info because its value assumes no masking; + # instead, we compute the correct offset based on the masked type. + elemdtype = info[0] + title = info[2] if len(info) == 3 else None + ty = numpy_support.from_dtype(elemdtype) + infos = { + "type": MaskedType(ty), + "offset": offset, + "title": title, + } + fields.append((name, infos)) + + # increment offset by itemsize plus one byte for validity + offset += elemdtype.itemsize + 1 + + # Align the next member of the struct to be a multiple of the + # memory access size, per PTX ISA 7.4/5.4.5 + if i < len(sizes) - 1: + next_itemsize = sizes[i + 1] + offset = int(math.ceil(offset / next_itemsize) * next_itemsize) + + # Numba requires that structures are aligned for the CUDA target + _is_aligned_struct = True + return Record(fields, offset, _is_aligned_struct) + + +def _row_kernel_string_from_template(frame, row_type, args): + """ + Function to write numba kernels for `DataFrame.apply` as a string. 
+ Workaround until numba supports functions that use `*args` + + `DataFrame.apply` expects functions of a dict like row as well as + possibly one or more scalar arguments + + def f(row, c, k): + return (row['x'] + c) / k + + Both the number of input columns as well as their nullability and any + scalar arguments may vary, so the kernels vary significantly. See + templates.py for the full row kernel template and more details. + """ + # Create argument list for kernel + frame = _supported_cols_from_frame(frame) + + input_columns = ", ".join([f"input_col_{i}" for i in range(len(frame))]) + input_offsets = ", ".join([f"offset_{i}" for i in range(len(frame))]) + extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) + + # Generate the initializers for each device function argument + initializers = [] + row_initializers = [] + for i, (colname, col) in enumerate(frame.items()): + idx = str(i) + template = ( + masked_input_initializer_template + if col.mask is not None + else unmasked_input_initializer_template + ) + initializers.append(template.format(idx=idx)) + row_initializers.append( + row_initializer_template.format(idx=idx, name=colname) + ) + + return row_kernel_template.format( + input_columns=input_columns, + input_offsets=input_offsets, + extra_args=extra_args, + masked_input_initializers="\n".join(initializers), + row_initializers="\n".join(row_initializers), + numba_rectype=row_type, + ) + + +def _get_row_kernel(frame, func, args): + row_type = _get_frame_row_type( + np.dtype(list(_all_dtypes_from_frame(frame).items())) + ) + scalar_return_type = _get_udf_return_type(row_type, func, args) + + # this is the signature for the final full kernel compilation + sig = _construct_signature(frame, scalar_return_type, args) + + # this row type is used within the kernel to pack up the column and + # mask data into the dict like data structure the user udf expects + np_field_types = np.dtype( + list(_supported_dtypes_from_frame(frame).items()) + ) + row_type 
= _get_frame_row_type(np_field_types) + + # Dict of 'local' variables into which `_kernel` is defined + global_exec_context = { + "cuda": cuda, + "Masked": Masked, + "_mask_get": _mask_get, + "pack_return": pack_return, + "row_type": row_type, + } + kernel_string = _row_kernel_string_from_template(frame, row_type, args) + kernel = _get_kernel(kernel_string, global_exec_context, sig, func) + + return kernel, scalar_return_type diff --git a/python/cudf/cudf/core/udf/scalar_function.py b/python/cudf/cudf/core/udf/scalar_function.py new file mode 100644 index 00000000000..7f3b461a1f0 --- /dev/null +++ b/python/cudf/cudf/core/udf/scalar_function.py @@ -0,0 +1,64 @@ +from numba import cuda +from numba.np import numpy_support + +from cudf.core.udf.api import Masked, pack_return +from cudf.core.udf.templates import ( + masked_input_initializer_template, + scalar_kernel_template, + unmasked_input_initializer_template, +) +from cudf.core.udf.typing import MaskedType +from cudf.core.udf.utils import ( + _construct_signature, + _get_kernel, + _get_udf_return_type, + _mask_get, +) + + +def _scalar_kernel_string_from_template(sr, args): + """ + Function to write numba kernels for `Series.apply` as a string. + Workaround until numba supports functions that use `*args` + + `Series.apply` expects functions of a single variable and possibly + one or more constants, such as: + + def f(x, c, k): + return (x + c) / k + + where the `x` are meant to be the values of the series. Since there + can be only one column, the only thing that varies in the kinds of + kernels that we want is the number of extra_args. See templates.py + for the full kernel template. 
+ """ + extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) + + masked_initializer = ( + masked_input_initializer_template + if sr._column.mask + else unmasked_input_initializer_template + ).format(idx=0) + + return scalar_kernel_template.format( + extra_args=extra_args, masked_initializer=masked_initializer + ) + + +def _get_scalar_kernel(sr, func, args): + sr_type = MaskedType(numpy_support.from_dtype(sr.dtype)) + scalar_return_type = _get_udf_return_type(sr_type, func, args) + + sig = _construct_signature(sr, scalar_return_type, args=args) + f_ = cuda.jit(device=True)(func) + global_exec_context = { + "f_": f_, + "cuda": cuda, + "Masked": Masked, + "_mask_get": _mask_get, + "pack_return": pack_return, + } + kernel_string = _scalar_kernel_string_from_template(sr, args=args) + kernel = _get_kernel(kernel_string, global_exec_context, sig, func) + + return kernel, scalar_return_type diff --git a/python/cudf/cudf/core/udf/templates.py b/python/cudf/cudf/core/udf/templates.py new file mode 100644 index 00000000000..8cb11133323 --- /dev/null +++ b/python/cudf/cudf/core/udf/templates.py @@ -0,0 +1,52 @@ +unmasked_input_initializer_template = """\ + d_{idx} = input_col_{idx} + masked_{idx} = Masked(d_{idx}[i], True) +""" + +masked_input_initializer_template = """\ + d_{idx}, m_{idx} = input_col_{idx} + masked_{idx} = Masked(d_{idx}[i], _mask_get(m_{idx}, i + offset_{idx})) +""" + +row_initializer_template = """\ + row["{name}"] = masked_{idx} +""" + +row_kernel_template = """\ +def _kernel(retval, size, {input_columns}, {input_offsets}, {extra_args}): + i = cuda.grid(1) + ret_data_arr, ret_mask_arr = retval + if i < size: + # Create a structured array with the desired fields + rows = cuda.local.array(1, dtype=row_type) + + # one element of that array + row = rows[0] + +{masked_input_initializers} +{row_initializers} + + # pass the assembled row into the udf + ret = f_(row, {extra_args}) + + # pack up the return values and set them + ret_masked = 
pack_return(ret) + ret_data_arr[i] = ret_masked.value + ret_mask_arr[i] = ret_masked.valid +""" + +scalar_kernel_template = """ +def _kernel(retval, size, input_col_0, offset_0, {extra_args}): + i = cuda.grid(1) + ret_data_arr, ret_mask_arr = retval + + if i < size: + +{masked_initializer} + + ret = f_(masked_0, {extra_args}) + + ret_masked = pack_return(ret) + ret_data_arr[i] = ret_masked.value + ret_mask_arr[i] = ret_masked.valid +""" diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py new file mode 100644 index 00000000000..a98ee40274e --- /dev/null +++ b/python/cudf/cudf/core/udf/utils.py @@ -0,0 +1,216 @@ +from typing import Callable + +import cachetools +import numpy as np +from numba import cuda, typeof +from numba.core.errors import TypingError +from numba.np import numpy_support +from numba.types import Poison, Tuple, boolean, int64, void +from nvtx import annotate + +from cudf.core.dtypes import CategoricalDtype +from cudf.core.udf.typing import MaskedType +from cudf.utils import cudautils +from cudf.utils.dtypes import ( + BOOL_TYPES, + DATETIME_TYPES, + NUMERIC_TYPES, + TIMEDELTA_TYPES, +) + +JIT_SUPPORTED_TYPES = ( + NUMERIC_TYPES | BOOL_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES +) + +libcudf_bitmask_type = numpy_support.from_dtype(np.dtype("int32")) +MASK_BITSIZE = np.dtype("int32").itemsize * 8 + +precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) + + +@annotate("NUMBA JIT", color="green", domain="cudf_python") +def _get_udf_return_type(argty, func: Callable, args=()): + """ + Get the return type of a masked UDF for a given set of argument dtypes. It + is assumed that the function consumes a dictionary whose keys are strings + and whose values are of MaskedType. Initially assume that the UDF may be + written to utilize any field in the row - including those containing an + unsupported dtype. If an unsupported dtype is actually used in the function + the compilation should fail at `compile_udf`. 
If compilation succeeds, one + can infer that the function does not use any of the columns of unsupported + dtype - meaning we can drop them going forward and the UDF will still end + up getting fed rows containing all the fields it actually needs to use to + compute the answer for that row. + """ + + # present a row containing all fields to the UDF and try and compile + compile_sig = (argty, *(typeof(arg) for arg in args)) + + # Get the return type. The PTX is also returned by compile_udf, but is not + # needed here. + ptx, output_type = cudautils.compile_udf(func, compile_sig) + if not isinstance(output_type, MaskedType): + numba_output_type = numpy_support.from_dtype(np.dtype(output_type)) + else: + numba_output_type = output_type + + result = ( + numba_output_type + if not isinstance(numba_output_type, MaskedType) + else numba_output_type.value_type + ) + + # _get_udf_return_type will throw a TypingError if the user tries to use + # a field in the row containing an unsupported dtype, except in the + # edge case where all the function does is return that element: + + # def f(row): + # return row[] + # In this case numba is happy to return MaskedType() + # because it relies on not finding overloaded operators for types to raise + # the exception, so we have to explicitly check for that case. 
+ if isinstance(result, Poison): + raise TypingError(str(result)) + + return result + + +def _is_jit_supported_type(dtype): + # category dtype isn't hashable + if isinstance(dtype, CategoricalDtype): + return False + return str(dtype) in JIT_SUPPORTED_TYPES + + +def _all_dtypes_from_frame(frame): + return { + colname: col.dtype + if _is_jit_supported_type(col.dtype) + else np.dtype("O") + for colname, col in frame._data.items() + } + + +def _supported_dtypes_from_frame(frame): + return { + colname: col.dtype + for colname, col in frame._data.items() + if _is_jit_supported_type(col.dtype) + } + + +def _supported_cols_from_frame(frame): + return { + colname: col + for colname, col in frame._data.items() + if _is_jit_supported_type(col.dtype) + } + + +def _masked_array_type_from_col(col): + """ + Return a type representing a tuple of arrays, + the first element an array of the numba type + corresponding to `dtype`, and the second an + array of bools representing a mask. + """ + nb_scalar_ty = numpy_support.from_dtype(col.dtype) + if col.mask is None: + return nb_scalar_ty[::1] + else: + return Tuple((nb_scalar_ty[::1], libcudf_bitmask_type[::1])) + + +def _construct_signature(frame, return_type, args): + """ + Build the signature of numba types that will be used to + actually JIT the kernel itself later, accounting for types + and offsets. Skips columns with unsupported dtypes. 
+ """ + + # Tuple of arrays, first the output data array, then the mask + return_type = Tuple((return_type[::1], boolean[::1])) + offsets = [] + sig = [return_type, int64] + for col in _supported_cols_from_frame(frame).values(): + sig.append(_masked_array_type_from_col(col)) + offsets.append(int64) + + # return_type, size, data, masks, offsets, extra args + sig = void(*(sig + offsets + [typeof(arg) for arg in args])) + + return sig + + +@cuda.jit(device=True) +def _mask_get(mask, pos): + """Return the validity of mask[pos] as a word.""" + return (mask[pos // MASK_BITSIZE] >> (pos % MASK_BITSIZE)) & 1 + + +def _generate_cache_key(frame, func: Callable): + """Create a cache key that uniquely identifies a compilation. + + A new compilation is needed any time any of the following things change: + - The UDF itself as defined in python by the user + - The types of the columns utilized by the UDF + - The existence of the input columns masks + """ + return ( + *cudautils.make_cache_key( + func, tuple(_all_dtypes_from_frame(frame).values()) + ), + *(col.mask is None for col in frame._data.values()), + *frame._data.keys(), + ) + + +@annotate("UDF COMPILATION", color="darkgreen", domain="cudf_python") +def _compile_or_get(frame, func, args, kernel_getter=None): + """ + Return a compiled kernel in terms of MaskedTypes that launches a + kernel equivalent of `f` for the dtypes of `df`. The kernel uses + a thread for each row and calls `f` using that rows data / mask + to produce an output value and output validity for each row. + + If the UDF has already been compiled for this requested dtypes, + a cached version will be returned instead of running compilation. + + CUDA kernels are void and do not return values. Thus, we need to + preallocate a column of the correct dtype and pass it in as one of + the kernel arguments. 
This creates a chicken-and-egg problem where + we need the column type to compile the kernel, but normally we would + be getting that type FROM compiling the kernel (and letting numba + determine it as a return value). As a workaround, we compile the UDF + itself outside the final kernel to invoke a full typing pass, which + unfortunately is difficult to do without running full compilation. + we then obtain the return type from that separate compilation and + use it to allocate an output column of the right dtype. + """ + + # check to see if we already compiled this function + cache_key = _generate_cache_key(frame, func) + if precompiled.get(cache_key) is not None: + kernel, masked_or_scalar = precompiled[cache_key] + return kernel, masked_or_scalar + + # precompile the user udf to get the right return type. + # could be a MaskedType or a scalar type. + + kernel, scalar_return_type = kernel_getter(frame, func, args) + + np_return_type = numpy_support.as_dtype(scalar_return_type) + precompiled[cache_key] = (kernel, np_return_type) + + return kernel, np_return_type + + +def _get_kernel(kernel_string, globals_, sig, func): + """template kernel compilation helper function""" + f_ = cuda.jit(device=True)(func) + globals_["f_"] = f_ + exec(kernel_string, globals_) + _kernel = globals_["_kernel"] + kernel = cuda.jit(sig)(_kernel) + + return kernel diff --git a/python/cudf/cudf/errors.py b/python/cudf/cudf/errors.py index 8a31afab9cf..5d6f52c0307 100644 --- a/python/cudf/cudf/errors.py +++ b/python/cudf/cudf/errors.py @@ -1,9 +1,5 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
-class UnSupportedGPUError(Exception): - pass - - -class UnSupportedCUDAError(Exception): +class UnsupportedCUDAError(Exception): pass diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 01f1fdf9020..4694243ad18 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -59,17 +59,10 @@ def read_csv( "`read_csv` does not yet support reading multiple files" ) - # Only need to pass byte_ranges to get_filepath_or_buffer - # if `use_python_file_object=False` - byte_ranges = None - if not use_python_file_object and byte_range: - byte_ranges = [byte_range] - filepath_or_buffer, compression = ioutils.get_filepath_or_buffer( path_or_data=filepath_or_buffer, compression=compression, iotypes=(BytesIO, StringIO, NativeFile), - byte_ranges=byte_ranges, use_python_file_object=use_python_file_object, **kwargs, ) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index c1cce3f996f..a09fb1f8e12 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. import datetime import warnings @@ -291,7 +291,12 @@ def read_orc( **kwargs, ): """{docstring}""" - + if decimal_cols_as_float is not None: + warnings.warn( + "`decimal_cols_as_float` is deprecated and will be removed in " + "the future", + FutureWarning, + ) from cudf import DataFrame # Multiple sources are passed as a list. 
If a single source is passed, @@ -395,7 +400,7 @@ def to_orc( df, fname, compression=None, - enable_statistics=True, + statistics="ROWGROUP", stripe_size_bytes=None, stripe_size_rows=None, row_index_stride=None, @@ -431,7 +436,7 @@ def to_orc( df, file_obj, compression, - enable_statistics, + statistics, stripe_size_bytes, stripe_size_rows, row_index_stride, @@ -441,7 +446,7 @@ def to_orc( df, path_or_buf, compression, - enable_statistics, + statistics, stripe_size_bytes, stripe_size_rows, row_index_stride, diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 3e73e0c9e3d..a919b00692d 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1,14 +1,11 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. -import io -import json import warnings from collections import defaultdict from contextlib import ExitStack from typing import Dict, List, Tuple from uuid import uuid4 -import fsspec import numpy as np from pyarrow import dataset as ds, parquet as pq @@ -310,103 +307,6 @@ def _process_dataset( ) -def _get_byte_ranges(file_list, row_groups, columns, fs, **kwargs): - - # This utility is used to collect the footer metadata - # from a parquet file. This metadata is used to define - # the exact byte-ranges that will be needed to read the - # target column-chunks from the file. - # - # This utility is only used for remote storage. - # - # The calculated byte-range information is used within - # cudf.io.ioutils.get_filepath_or_buffer (which uses - # _fsspec_data_transfer to convert non-local fsspec file - # objects into local byte buffers). 
- - if row_groups is None: - if columns is None: - return None, None, None # No reason to construct this - row_groups = [None for path in file_list] - - # Construct a list of required byte-ranges for every file - all_byte_ranges, all_footers, all_sizes = [], [], [] - for path, rgs in zip(file_list, row_groups): - - # Step 0 - Get size of file - if fs is None: - file_size = path.size - else: - file_size = fs.size(path) - - # Step 1 - Get 32 KB from tail of file. - # - # This "sample size" can be tunable, but should - # always be >= 8 bytes (so we can read the footer size) - tail_size = min(kwargs.get("footer_sample_size", 32_000), file_size,) - if fs is None: - path.seek(file_size - tail_size) - footer_sample = path.read(tail_size) - else: - footer_sample = fs.tail(path, tail_size) - - # Step 2 - Read the footer size and re-read a larger - # tail if necessary - footer_size = int.from_bytes(footer_sample[-8:-4], "little") - if tail_size < (footer_size + 8): - if fs is None: - path.seek(file_size - (footer_size + 8)) - footer_sample = path.read(footer_size + 8) - else: - footer_sample = fs.tail(path, footer_size + 8) - - # Step 3 - Collect required byte ranges - byte_ranges = [] - md = pq.ParquetFile(io.BytesIO(footer_sample)).metadata - column_set = None if columns is None else set(columns) - if column_set is not None: - schema = md.schema.to_arrow_schema() - has_pandas_metadata = ( - schema.metadata is not None and b"pandas" in schema.metadata - ) - if has_pandas_metadata: - md_index = [ - ind - for ind in json.loads( - schema.metadata[b"pandas"].decode("utf8") - ).get("index_columns", []) - # Ignore RangeIndex information - if not isinstance(ind, dict) - ] - column_set |= set(md_index) - for r in range(md.num_row_groups): - # Skip this row-group if we are targetting - # specific row-groups - if rgs is None or r in rgs: - row_group = md.row_group(r) - for c in range(row_group.num_columns): - column = row_group.column(c) - name = column.path_in_schema - # Skip this 
column if we are targetting a - # specific columns - split_name = name.split(".")[0] - if ( - column_set is None - or name in column_set - or split_name in column_set - ): - file_offset0 = column.dictionary_page_offset - if file_offset0 is None: - file_offset0 = column.data_page_offset - num_bytes = column.total_compressed_size - byte_ranges.append((file_offset0, num_bytes)) - - all_byte_ranges.append(byte_ranges) - all_footers.append(footer_sample) - all_sizes.append(file_size) - return all_byte_ranges, all_footers, all_sizes - - @ioutils.doc_read_parquet() def read_parquet( filepath_or_buffer, @@ -418,13 +318,24 @@ def read_parquet( num_rows=None, strings_to_categorical=False, use_pandas_metadata=True, - use_python_file_object=False, + use_python_file_object=True, categorical_partitions=True, + open_file_options=None, *args, **kwargs, ): """{docstring}""" + # Do not allow the user to set file-opening options + # when `use_python_file_object=False` is specified + if use_python_file_object is False: + if open_file_options: + raise ValueError( + "open_file_options is not currently supported when " + "use_python_file_object is set to False." + ) + open_file_options = {} + # Multiple sources are passed as a list. If a single source is passed, # wrap it in a list for unified processing downstream. if not is_list_like(filepath_or_buffer): @@ -470,38 +381,18 @@ def read_parquet( raise ValueError("cudf cannot apply filters to open file objects.") filepath_or_buffer = paths if paths else filepath_or_buffer - # Check if we should calculate the specific byte-ranges - # needed for each parquet file. We always do this when we - # have a file-system object to work with and it is not a - # local filesystem object. 
We can also do it without a - # file-system object for `AbstractBufferedFile` buffers - byte_ranges, footers, file_sizes = None, None, None - if not use_python_file_object: - need_byte_ranges = fs is not None and not ioutils._is_local_filesystem( - fs - ) - if need_byte_ranges or ( - filepath_or_buffer - and isinstance( - filepath_or_buffer[0], fsspec.spec.AbstractBufferedFile, - ) - ): - byte_ranges, footers, file_sizes = _get_byte_ranges( - filepath_or_buffer, row_groups, columns, fs, **kwargs - ) - filepaths_or_buffers = [] + if use_python_file_object: + open_file_options = _default_open_file_options( + open_file_options, columns, row_groups, fs=fs, + ) for i, source in enumerate(filepath_or_buffer): - tmp_source, compression = ioutils.get_filepath_or_buffer( path_or_data=source, compression=None, fs=fs, - byte_ranges=byte_ranges[i] if byte_ranges else None, - footer=footers[i] if footers else None, - file_size=file_sizes[i] if file_sizes else None, - add_par1_magic=True, use_python_file_object=use_python_file_object, + open_file_options=open_file_options, **kwargs, ) @@ -953,3 +844,41 @@ def __enter__(self): def __exit__(self, *args): self.close() + + +def _default_open_file_options( + open_file_options, columns, row_groups, fs=None +): + """ + Set default fields in open_file_options. + + Copies and updates `open_file_options` to + include column and row-group information + under the "precache_options" key. 
By default, + we set "method" to "parquet", but precaching + will be disabled if the user chooses `method=None` + + Parameters + ---------- + open_file_options : dict or None + columns : list + row_groups : list + fs : fsspec.AbstractFileSystem, Optional + """ + if fs and ioutils._is_local_filesystem(fs): + # Quick return for local fs + return open_file_options or {} + # Assume remote storage if `fs` was not specified + open_file_options = (open_file_options or {}).copy() + precache_options = open_file_options.pop("precache_options", {}).copy() + if precache_options.get("method", "parquet") == "parquet": + precache_options.update( + { + "method": "parquet", + "engine": precache_options.get("engine", "pyarrow"), + "columns": columns, + "row_groups": row_groups, + } + ) + open_file_options["precache_options"] = precache_options + return open_file_options diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index cc5aec36853..41dac26edf8 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -40,6 +40,17 @@ ALL_TYPES = sorted(list(dtypeutils.ALL_TYPES)) +def set_random_null_mask_inplace(series, null_probability=0.5, seed=None): + """Randomly nullify elements in series with the provided probability.""" + probs = [null_probability, 1 - null_probability] + rng = np.random.default_rng(seed=seed) + mask = rng.choice([False, True], size=len(series), p=probs) + series[mask] = None + + +# TODO: This function should be removed. Anywhere that it is being used should +# instead be generating a random boolean array (bytemask) and use the public +# APIs to set those elements to None. 
def random_bitmask(size): """ Parameters diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 13be158ed78..e1c7b42c7a3 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -133,7 +133,25 @@ def _generate_column(column_params, num_rows): else: arrow_type = None - if not isinstance(arrow_type, pa.lib.Decimal128Type): + if isinstance(column_params.dtype, cudf.StructDtype): + vals = pa.StructArray.from_arrays( + column_params.generator, + names=column_params.dtype.fields.keys(), + mask=pa.array( + np.random.choice( + [True, False], + size=num_rows, + p=[ + column_params.null_frequency, + 1 - column_params.null_frequency, + ], + ) + ) + if column_params.null_frequency > 0.0 + else None, + ) + return vals + elif not isinstance(arrow_type, pa.lib.Decimal128Type): vals = pa.array( column_params.generator, size=column_params.cardinality, @@ -352,6 +370,30 @@ def rand_dataframe( dtype=dtype, ) ) + elif dtype == "struct": + nesting_max_depth = meta["nesting_max_depth"] + max_types_at_each_level = meta["max_types_at_each_level"] + max_null_frequency = meta["max_null_frequency"] + nesting_depth = np.random.randint(1, nesting_max_depth) + structDtype = create_nested_struct_type( + max_types_at_each_level=max_types_at_each_level, + nesting_level=nesting_depth, + ) + + column_params.append( + ColumnParameters( + cardinality=cardinality, + null_frequency=null_frequency, + generator=struct_generator( + dtype=structDtype, + cardinality=cardinality, + size=rows, + max_null_frequency=max_null_frequency, + ), + is_sorted=False, + dtype=structDtype, + ) + ) elif dtype == "decimal64": max_precision = meta.get( "max_precision", cudf.Decimal64Dtype.MAX_PRECISION @@ -600,11 +642,15 @@ def decimal_generator(dtype, size): ) -def get_values_for_nested_data(dtype, lists_max_length): +def get_values_for_nested_data(dtype, lists_max_length=None, size=None): """ Returns list of 
values based on dtype. """ - cardinality = np.random.randint(0, lists_max_length) + if size is None: + cardinality = np.random.randint(0, lists_max_length) + else: + cardinality = size + dtype = cudf.dtype(dtype) if dtype.kind in ("i", "u"): values = int_generator(dtype=dtype, size=cardinality)() @@ -628,12 +674,7 @@ def get_values_for_nested_data(dtype, lists_max_length): else: raise TypeError(f"Unsupported dtype: {dtype}") - # To ensure numpy arrays are not passed as input to - # list constructor, returning a python list object here. - if isinstance(values, np.ndarray): - return values.tolist() - else: - return values + return values def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): @@ -657,9 +698,40 @@ def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): top_level_list = get_values_for_nested_data( dtype=dtype, lists_max_length=lists_max_length ) + # To ensure numpy arrays are not passed as input to + # list constructor, returning a python list object here. + if isinstance(top_level_list, np.ndarray): + top_level_list = top_level_list.tolist() + return top_level_list +def make_array_for_struct(dtype, cardinality, size, max_null_frequency): + """ + Helper to create a pa.array with `size` and `dtype` + for a `StructArray`. 
+ """ + + null_frequency = np.random.uniform(low=0, high=max_null_frequency) + local_cardinality = max(np.random.randint(low=0, high=cardinality), 1) + data = get_values_for_nested_data( + dtype=dtype.type.to_pandas_dtype(), size=local_cardinality + ) + vals = np.random.choice(data, size=size) + + return pa.array( + vals, + mask=np.random.choice( + [True, False], size=size, p=[null_frequency, 1 - null_frequency], + ) + if null_frequency > 0.0 + else None, + size=size, + safe=False, + type=dtype.type, + ) + + def get_nested_lists(dtype, size, nesting_depth, lists_max_length): """ Returns a list of nested lists with random nesting @@ -680,6 +752,34 @@ def get_nested_lists(dtype, size, nesting_depth, lists_max_length): return list_of_lists +def get_nested_structs(dtype, cardinality, size, max_null_frequency): + """ + Returns a list of arrays with random data + corresponding to the dtype provided. + ``dtype`` here should be a ``cudf.StructDtype`` + """ + list_of_arrays = [] + + for name, col_dtype in dtype.fields.items(): + if isinstance(col_dtype, cudf.StructDtype): + result_arrays = get_nested_structs( + col_dtype, cardinality, size, max_null_frequency + ) + result_arrays = pa.StructArray.from_arrays( + result_arrays, names=col_dtype.fields.keys() + ) + else: + result_arrays = make_array_for_struct( + dtype=dtype._typ[name], + cardinality=cardinality, + size=size, + max_null_frequency=max_null_frequency, + ) + list_of_arrays.append(result_arrays) + + return list_of_arrays + + def list_generator(dtype, size, nesting_depth, lists_max_length): """ Generator for list data @@ -690,3 +790,29 @@ def list_generator(dtype, size, nesting_depth, lists_max_length): nesting_depth=nesting_depth, lists_max_length=lists_max_length, ) + + +def struct_generator(dtype, cardinality, size, max_null_frequency): + """ + Generator for struct data + """ + return lambda: get_nested_structs( + dtype=dtype, + cardinality=cardinality, + size=size, + max_null_frequency=max_null_frequency, + ) + + 
+def create_nested_struct_type(max_types_at_each_level, nesting_level): + dtypes_list = cudf.utils.dtypes.ALL_TYPES + picked_types = np.random.choice(list(dtypes_list), max_types_at_each_level) + type_dict = {} + for name, type_ in enumerate(picked_types): + if type_ == "struct": + type_dict[str(name)] = create_nested_struct_type( + max_types_at_each_level, nesting_level - 1 + ) + else: + type_dict[str(name)] = cudf.dtype(type_) + return cudf.StructDtype(type_dict) diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index 925c9ef720c..ff6e79e7804 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -24,14 +24,10 @@ def test_applymap_round(nelem, masked): boolmask = np.asarray( utils.expand_bits_to_bytes(bitmask), dtype=np.bool_ )[:nelem] - data[~boolmask] = np.nan + data[~boolmask] = None sr = Series(data) - if masked: - # Mask the Series - sr = sr.set_mask(bitmask) - # Call applymap out = sr.applymap( lambda x: (floor(x) + 1 if x - floor(x) >= 0.5 else floor(x)) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index e01b952be94..748cf958ac3 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -437,7 +437,7 @@ def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype): expect_mask = [True if x is not pd.NA else False for x in pd_data["a"]] got_mask = mask_to_bools( gd_data["a"]._column.base_mask, 0, len(gd_data) - ).to_array() + ).values_host np.testing.assert_array_equal(expect_mask, got_mask) @@ -475,7 +475,7 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype): expect_mask = [True if x is not pd.NA else False for x in pd_data] got_mask = mask_to_bools( gd_data._column.base_mask, 0, len(gd_data) - ).to_array() + ).values_host np.testing.assert_array_equal(expect_mask, got_mask) diff --git a/python/cudf/cudf/tests/test_dataframe.py 
b/python/cudf/cudf/tests/test_dataframe.py index d81451e9d54..d0a25fd3e8c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -968,7 +968,7 @@ def test_dataframe_dir_and_getattr(): df.not_a_column -def test_empty_dataframe_to_array(): +def test_empty_dataframe_to_cupy(): df = cudf.DataFrame() # Check fully empty dataframe. @@ -1023,7 +1023,7 @@ def test_dataframe_to_cupy_null_values(): for k in "abcd": df[k] = data = np.random.random(nelem) bitmask = utils.random_bitmask(nelem) - df[k] = df[k].set_mask(bitmask) + df[k] = df[k]._column.set_mask(bitmask) boolmask = np.asarray( utils.expand_bits_to_bytes(bitmask)[:nelem], dtype=np.bool_ ) @@ -1194,7 +1194,7 @@ def test_dataframe_hash_partition_masked_value(nrows): gdf["val"] = np.arange(nrows) + 100 bitmask = utils.random_bitmask(nrows) bytemask = utils.expand_bits_to_bytes(bitmask) - gdf["val"] = gdf["val"].set_mask(bitmask) + gdf["val"] = gdf["val"]._column.set_mask(bitmask) parted = gdf.partition_by_hash(["key"], nparts=3) # Verify that the valid mask is correct for p in parted: @@ -1215,7 +1215,7 @@ def test_dataframe_hash_partition_masked_keys(nrows): gdf["val"] = np.arange(nrows) + 100 bitmask = utils.random_bitmask(nrows) bytemask = utils.expand_bits_to_bytes(bitmask) - gdf["key"] = gdf["key"].set_mask(bitmask) + gdf["key"] = gdf["key"]._column.set_mask(bitmask) parted = gdf.partition_by_hash(["key"], nparts=3, keep_index=False) # Verify that the valid mask is correct for p in parted: @@ -1463,11 +1463,8 @@ def test_nonmatching_index_setitem(nrows): test_values = np.random.randint(2147483647, size=nrows) gdf["c"] = test_values assert len(test_values) == len(gdf["c"]) - assert ( - gdf["c"] - .to_pandas() - .equals(cudf.Series(test_values).set_index(gdf._index).to_pandas()) - ) + gdf_series = cudf.Series(test_values, index=gdf.index, name="c") + assert_eq(gdf["c"].to_pandas(), gdf_series.to_pandas()) def test_from_pandas(): @@ -5473,12 +5470,17 @@ def 
test_memory_usage_list(): @pytest.mark.parametrize("rows", [10, 100]) def test_memory_usage_multi(rows): - deep = True + # We need to sample without replacement to guarantee that the size of the + # levels are always the same. df = pd.DataFrame( { "A": np.arange(rows, dtype="int32"), - "B": np.random.choice(np.arange(3, dtype="int64"), rows), - "C": np.random.choice(np.arange(3, dtype="float64"), rows), + "B": np.random.choice( + np.arange(rows, dtype="int64"), rows, replace=False + ), + "C": np.random.choice( + np.arange(rows, dtype="float64"), rows, replace=False + ), } ).set_index(["B", "C"]) gdf = cudf.from_pandas(df) @@ -5486,10 +5488,10 @@ def test_memory_usage_multi(rows): # of the underlying columns, levels, and codes expect = rows * 16 # Source Columns expect += rows * 16 # Codes - expect += 3 * 8 # Level 0 - expect += 3 * 8 # Level 1 + expect += rows * 8 # Level 0 + expect += rows * 8 # Level 1 - assert expect == gdf.index.memory_usage(deep=deep) + assert expect == gdf.index.memory_usage(deep=True) @pytest.mark.parametrize( @@ -9078,6 +9080,7 @@ def test_dataframe_assign_cp_np_array(): assert_eq(pdf, gdf) + @pytest.mark.parametrize( "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}], ) @@ -9090,6 +9093,7 @@ def test_dataframe_nunique(data): assert_eq(expected, actual) + @pytest.mark.parametrize( "data", [{ "key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}], ) @@ -9101,3 +9105,12 @@ def test_dataframe_nunique_index(data): expected = pdf.index.nunique() assert_eq(expected, actual) + + +def test_dataframe_rename_duplicate_column(): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) + with pytest.raises( + ValueError, match="Duplicate column names are not allowed" + ): + gdf.rename(columns={"a": "b"}, inplace=True) + diff --git a/python/cudf/cudf/tests/test_fill.py b/python/cudf/cudf/tests/test_fill.py deleted file mode 100644 index 224db2b39d1..00000000000 --- a/python/cudf/cudf/tests/test_fill.py +++ /dev/null @@ -1,64 +0,0 @@ -import pandas as pd 
-import pytest - -import cudf -from cudf.testing._utils import assert_eq - - -@pytest.mark.parametrize( - "fill_value,data", - [ - (7, [6, 3, 4]), - ("x", ["a", "b", "c", "d", "e", "f"]), - (7, [6, 3, 4, 2, 1, 7, 8, 5]), - (0.8, [0.6, 0.3, 0.4, 0.2, 0.1, 0.7, 0.8, 0.5]), - ("b", pd.Categorical(["a", "b", "c"])), - (None, [0.0, 1.0, 2.0, 3.0]), - ], -) -@pytest.mark.parametrize( - "begin,end", - [ - (0, -1), - (0, 4), - (1, -1), - (1, 4), - (-2, 1), - (-2, -1), - (10, 12), - (8, 10), - (10, 8), - (-10, -8), - (-2, 6), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fill(data, fill_value, begin, end, inplace): - gs = cudf.Series(data) - ps = gs.to_pandas() - - if inplace: - actual = gs - gs[begin:end] = fill_value - else: - # private impl doesn't take care of rounding or bounds check - if begin < 0: - begin += len(gs) - - if end < 0: - end += len(gs) - - begin = max(0, min(len(gs), begin)) - end = max(0, min(len(gs), end)) - actual = gs.fill(fill_value, begin, end, False) - assert actual is not gs - - ps[begin:end] = fill_value - - assert_eq(ps, actual) - - -@pytest.mark.xfail(raises=ValueError) -def test_fill_new_category(): - gs = cudf.Series(pd.Categorical(["a", "b", "c"])) - gs[0:1] = "d" diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index c73e96de470..f5decd62ea9 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2362,6 +2362,28 @@ def test_groupby_get_group(pdf, group, name, obj): assert_groupby_results_equal(expected, actual) +@pytest.mark.parametrize( + "by", + [ + "a", + ["a", "b"], + pd.Series([2, 1, 1, 2, 2]), + pd.Series(["b", "a", "a", "b", "b"]), + ], +) +@pytest.mark.parametrize("agg", ["sum", "mean", lambda df: df.mean()]) +def test_groupby_transform_aggregation(by, agg): + gdf = cudf.DataFrame( + {"a": [2, 2, 1, 2, 1], "b": [1, 1, 1, 2, 2], "c": [1, 2, 3, 4, 5]} + ) + pdf = gdf.to_pandas() + + expected = 
pdf.groupby(by).transform(agg) + actual = gdf.groupby(by).transform(agg) + + assert_groupby_results_equal(expected, actual) + + def test_groupby_select_then_ffill(): pdf = pd.DataFrame( { diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index e452dc5d7f7..102e5b57e8e 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -783,8 +783,8 @@ def test_dataframe_masked_slicing(nelem, slice_start, slice_end): gdf = cudf.DataFrame() gdf["a"] = list(range(nelem)) gdf["b"] = list(range(nelem, 2 * nelem)) - gdf["a"] = gdf["a"].set_mask(utils.random_bitmask(nelem)) - gdf["b"] = gdf["b"].set_mask(utils.random_bitmask(nelem)) + gdf["a"] = gdf["a"]._column.set_mask(utils.random_bitmask(nelem)) + gdf["b"] = gdf["b"]._column.set_mask(utils.random_bitmask(nelem)) def do_slice(x): return x[slice_start:slice_end] diff --git a/python/cudf/cudf/tests/test_label_encode.py b/python/cudf/cudf/tests/test_label_encode.py deleted file mode 100644 index e5c6bacf1d1..00000000000 --- a/python/cudf/cudf/tests/test_label_encode.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. 
- -import random -from itertools import product - -import numpy as np -import pytest - -import cudf -from cudf import DataFrame, Series - - -def _random_float(nelem, dtype): - return np.random.random(nelem).astype(dtype) - - -def _random_int(nelem, dtype): - return np.random.randint(low=0, high=nelem, size=nelem, dtype=dtype) - - -def _random(nelem, dtype): - dtype = cudf.dtype(dtype) - if dtype.kind in {"i", "u"}: - return _random_int(nelem, dtype) - elif dtype.kind == "f": - return _random_float(nelem, dtype) - - -_param_sizes = [1, 7, 10, 100, 1000] -_param_dtypes = [np.int32, np.float32] - - -@pytest.mark.filterwarnings("ignore:DataFrame.label_encoding is deprecated") -@pytest.mark.filterwarnings("ignore:Series.label_encoding is deprecated") -@pytest.mark.parametrize( - "nelem,dtype", list(product(_param_sizes, _param_dtypes)) -) -def test_label_encode(nelem, dtype): - df = DataFrame() - np.random.seed(0) - - # initialize data frame - df["cats"] = _random(nelem, dtype) - vals = df["cats"].unique() - lab = dict({vals[i]: i for i in range(len(vals))}) - - # label encode series - ncol = df["cats"].label_encoding(cats=vals) - arr = ncol.to_numpy() - - # verify labels of new column - for i in range(arr.size): - np.testing.assert_equal(arr[i], lab.get(df.cats[i], None)) - - # label encode data frame - df2 = df.label_encoding(column="cats", prefix="cats", cats=vals) - - assert df2.columns[0] == "cats" - assert df2.columns[1] == "cats_labels" - - -@pytest.mark.filterwarnings("ignore:DataFrame.label_encoding is deprecated") -@pytest.mark.filterwarnings("ignore:Series.label_encoding is deprecated") -def test_label_encode_drop_one(): - random.seed(0) - np.random.seed(0) - - df = DataFrame() - - # initialize data frame - df["cats"] = np.random.randint(7, size=10, dtype=np.int32) - vals = df["cats"].unique() - # drop 1 randomly - vals = vals[vals.index != random.randrange(len(vals))].reset_index( - drop=True - ) - - lab = dict({vals[i]: i for i in range(len(vals))}) - - # 
label encode series - ncol = df["cats"].label_encoding(cats=vals, dtype="float32") - arr = ncol.to_numpy() - - # verify labels of new column - - for i in range(arr.size): - # assuming -1 is used for missing value - np.testing.assert_equal(arr[i], lab.get(df.cats[i], -1)) - - # label encode data frame - df2 = df.label_encoding( - column="cats", prefix="cats", cats=vals, dtype="float32" - ) - - assert df2.columns[0] == "cats" - assert df2.columns[1] == "cats_labels" - - -@pytest.mark.filterwarnings("ignore:DataFrame.label_encoding is deprecated") -def test_label_encode_float_output(): - random.seed(0) - np.random.seed(0) - - df = DataFrame() - - # initialize data frame - df["cats"] = arr = np.random.randint(7, size=10, dtype=np.int32) - cats = [1, 2, 3, 4] - encoder = {c: i for i, c in enumerate(cats)} - df2 = df.label_encoding( - column="cats", - prefix="cats", - cats=cats, - dtype=np.float32, - na_sentinel=np.nan, - ) - - got = df2["cats_labels"].to_numpy(na_value=np.nan) - - handcoded = np.array([encoder.get(v, np.nan) for v in arr]) - np.testing.assert_equal(got, handcoded) - - -@pytest.mark.filterwarnings("ignore:Series.label_encoding is deprecated") -@pytest.mark.parametrize( - "ncats,cat_dtype", [(10, np.int8), (127, np.int8), (128, np.int16)] -) -def test_label_encode_dtype(ncats, cat_dtype): - s = Series([str(i % ncats) for i in range(ncats + 1)]) - cats = s.unique().astype(s.dtype) - encoded_col = s.label_encoding(cats=cats) - np.testing.assert_equal(encoded_col.dtype, cat_dtype) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 44812f5aba4..8689f773a02 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -391,6 +391,64 @@ def test_orc_writer(datadir, tmpdir, reference_file, columns, compression): assert_eq(expect, got) +@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) +def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): + reference_file 
= "TestOrcFile.demo-12-zlib.orc" + pdf_fname = datadir / reference_file + gdf_fname = tmpdir.join("gdf.orc") + + try: + orcfile = pa.orc.ORCFile(pdf_fname) + except Exception as excpr: + if type(excpr).__name__ == "ArrowIOError": + pytest.skip(".orc file is not found") + else: + print(type(excpr).__name__) + + expect = orcfile.read().to_pandas() + cudf.from_pandas(expect).to_orc(gdf_fname.strpath, statistics=stats_freq) + got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + + assert_eq(expect, got) + + +@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) +def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): + reference_file = "TestOrcFile.test1.orc" + pdf_fname = datadir / reference_file + gdf_fname = tmpdir.join("chunked_gdf.orc") + + try: + orcfile = pa.orc.ORCFile(pdf_fname) + except Exception as excpr: + if type(excpr).__name__ == "ArrowIOError": + pytest.skip(".orc file is not found") + else: + print(type(excpr).__name__) + + columns = [ + "boolean1", + "byte1", + "short1", + "int1", + "long1", + "float1", + "double1", + ] + pdf = orcfile.read(columns=columns).to_pandas() + gdf = cudf.from_pandas(pdf) + expect = pd.concat([pdf, pdf]).reset_index(drop=True) + + writer = ORCWriter(gdf_fname, statistics=stats_freq) + writer.write_table(gdf) + writer.write_table(gdf) + writer.close() + + got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + + assert_eq(expect, got) + + @pytest.mark.parametrize("compression", [None, "snappy"]) @pytest.mark.parametrize( "reference_file, columns", @@ -592,8 +650,9 @@ def normalized_equals(value1, value2): return value1 == value2 +@pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) @pytest.mark.parametrize("nrows", [1, 100, 6000000]) -def test_orc_write_statistics(tmpdir, datadir, nrows): +def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): supported_stat_types = supported_numpy_dtypes + ["str"] # Can't write random bool columns until issue #6763 is fixed if nrows 
== 6000000: @@ -609,7 +668,7 @@ def test_orc_write_statistics(tmpdir, datadir, nrows): fname = tmpdir.join("gdf.orc") # Write said dataframe to ORC with cuDF - gdf.to_orc(fname.strpath) + gdf.to_orc(fname.strpath, statistics=stats_freq) # Read back written ORC's statistics orc_file = pa.orc.ORCFile(fname) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 519f24b7ca6..80ab0671a0d 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -28,7 +28,7 @@ TIMEDELTA_TYPES, assert_eq, assert_exceptions_equal, - random_bitmask, + set_random_null_mask_inplace, ) @@ -748,7 +748,10 @@ def test_parquet_reader_arrow_nativefile(parquet_path_or_buf): assert_eq(expect, got) -def test_parquet_reader_use_python_file_object(parquet_path_or_buf): +@pytest.mark.parametrize("use_python_file_object", [True, False]) +def test_parquet_reader_use_python_file_object( + parquet_path_or_buf, use_python_file_object +): # Check that the non-default `use_python_file_object=True` # option works as expected expect = cudf.read_parquet(parquet_path_or_buf("filepath")) @@ -756,11 +759,15 @@ def test_parquet_reader_use_python_file_object(parquet_path_or_buf): # Pass open fsspec file with fs.open(paths[0], mode="rb") as fil: - got1 = cudf.read_parquet(fil, use_python_file_object=True) + got1 = cudf.read_parquet( + fil, use_python_file_object=use_python_file_object + ) assert_eq(expect, got1) # Pass path only - got2 = cudf.read_parquet(paths[0], use_python_file_object=True) + got2 = cudf.read_parquet( + paths[0], use_python_file_object=use_python_file_object + ) assert_eq(expect, got2) @@ -2117,7 +2124,7 @@ def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): gdf = cudf.from_pandas(pdf) if add_nulls: for col in gdf: - gdf[col] = gdf[col].set_mask(random_bitmask(len(gdf))) + set_random_null_mask_inplace(gdf[col]) gdf.to_parquet(file_path, index=False) # Read back from pyarrow diff --git 
a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 82020f30f7c..ca02ee55df0 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -20,10 +20,8 @@ @pytest.mark.parametrize("nrows", [0, 5, 10]) def test_null_series(nrows, dtype): size = 5 - mask = utils.random_bitmask(size) - data = cudf.Series(np.random.randint(1, 9, size)) - column = data.set_mask(mask) - sr = cudf.Series(column).astype(dtype) + sr = cudf.Series(np.random.randint(1, 9, size)).astype(dtype) + sr[np.random.choice([False, True], size=size)] = None if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}: ps = pd.Series( sr._column.data_array_view.copy_to_host(), @@ -62,10 +60,8 @@ def test_null_dataframe(ncols): size = 20 gdf = cudf.DataFrame() for idx, dtype in enumerate(dtype_categories): - mask = utils.random_bitmask(size) - data = cudf.Series(np.random.randint(0, 128, size)) - column = data.set_mask(mask) - sr = cudf.Series(column).astype(dtype) + sr = cudf.Series(np.random.randint(0, 128, size)).astype(dtype) + sr[np.random.choice([False, True], size=size)] = None gdf[dtype] = sr pdf = gdf.to_pandas() pd.options.display.max_columns = int(ncols) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 5738e1f0d00..da1ffc1fc16 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -131,6 +131,9 @@ def pdf_ext(scope="module"): df["Integer"] = np.array([i for i in range(size)]) df["List"] = [[i] for i in range(size)] df["Struct"] = [{"a": i} for i in range(size)] + df["String"] = (["Alpha", "Beta", "Gamma", "Delta"] * (-(size // -4)))[ + :size + ] return df @@ -225,9 +228,16 @@ def test_write_csv(s3_base, s3so, pdf, chunksize): @pytest.mark.parametrize("bytes_per_thread", [32, 1024]) @pytest.mark.parametrize("columns", [None, ["Float", "String"]]) -@pytest.mark.parametrize("use_python_file_object", [False, True]) +@pytest.mark.parametrize("precache", [None, 
"parquet"]) +@pytest.mark.parametrize("use_python_file_object", [True, False]) def test_read_parquet( - s3_base, s3so, pdf, bytes_per_thread, columns, use_python_file_object + s3_base, + s3so, + pdf, + bytes_per_thread, + columns, + precache, + use_python_file_object, ): fname = "test_parquet_reader.parquet" bname = "parquet" @@ -239,10 +249,15 @@ def test_read_parquet( with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): got1 = cudf.read_parquet( "s3://{}/{}".format(bname, fname), - use_python_file_object=use_python_file_object, + open_file_options=( + {"precache_options": {"method": precache}} + if use_python_file_object + else None + ), storage_options=s3so, bytes_per_thread=bytes_per_thread, columns=columns, + use_python_file_object=use_python_file_object, ) expect = pdf[columns] if columns else pdf assert_eq(expect, got1) @@ -256,25 +271,18 @@ def test_read_parquet( with fs.open("s3://{}/{}".format(bname, fname), mode="rb") as f: got2 = cudf.read_parquet( f, - use_python_file_object=use_python_file_object, bytes_per_thread=bytes_per_thread, columns=columns, + use_python_file_object=use_python_file_object, ) assert_eq(expect, got2) @pytest.mark.parametrize("bytes_per_thread", [32, 1024]) @pytest.mark.parametrize("columns", [None, ["List", "Struct"]]) -@pytest.mark.parametrize("use_python_file_object", [False, True]) @pytest.mark.parametrize("index", [None, "Integer"]) def test_read_parquet_ext( - s3_base, - s3so, - pdf_ext, - bytes_per_thread, - columns, - use_python_file_object, - index, + s3_base, s3so, pdf_ext, bytes_per_thread, columns, index, ): fname = "test_parquet_reader_ext.parquet" bname = "parquet" @@ -290,7 +298,6 @@ def test_read_parquet_ext( with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): got1 = cudf.read_parquet( "s3://{}/{}".format(bname, fname), - use_python_file_object=use_python_file_object, storage_options=s3so, bytes_per_thread=bytes_per_thread, footer_sample_size=3200, @@ -326,12 +333,12 @@ def 
test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns): assert_eq(expect, got) -@pytest.mark.parametrize("python_file", [True, False]) -def test_read_parquet_filters(s3_base, s3so, pdf, python_file): +@pytest.mark.parametrize("precache", [None, "parquet"]) +def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache): fname = "test_parquet_reader_filters.parquet" bname = "parquet" buffer = BytesIO() - pdf.to_parquet(path=buffer) + pdf_ext.to_parquet(path=buffer) buffer.seek(0) filters = [("String", "==", "Omega")] with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): @@ -339,11 +346,11 @@ def test_read_parquet_filters(s3_base, s3so, pdf, python_file): "s3://{}/{}".format(bname, fname), storage_options=s3so, filters=filters, - use_python_file_object=python_file, + open_file_options={"precache_options": {"method": precache}}, ) # All row-groups should be filtered out - assert_eq(pdf.iloc[:0], got.reset_index(drop=True)) + assert_eq(pdf_ext.iloc[:0], got.reset_index(drop=True)) @pytest.mark.parametrize("partition_cols", [None, ["String"]]) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index ee891828b3c..358484d79b9 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -561,7 +561,9 @@ def test_series_value_counts(dropna, normalize): for size in [10 ** x for x in range(5)]: arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 - sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) + sr = cudf.Series.from_masked_array( + arr, cudf.Series(mask)._column.as_mask() + ) sr.name = "col" expect = ( @@ -1518,6 +1520,7 @@ def test_series_transpose(data): assert_eq(pd_property, cudf_property) assert_eq(cudf_transposed, csr) + @pytest.mark.parametrize( "data", [1, 3, 5, 7, 7], ) @@ -1529,7 +1532,8 @@ def test_series_nunique(data): expected = pd_s.nunique() assert_eq(expected, actual) - + + @pytest.mark.parametrize( "data", [1, 3, 5, 
7, 7], ) @@ -1540,4 +1544,49 @@ def test_series_nunique_index(data): actual = cd_s.index.nunique() expected = pd_s.index.nunique() - assert_eq(expected, actual) \ No newline at end of file + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "fill_value,data", + [ + (7, [6, 3, 4]), + ("x", ["a", "b", "c", "d", "e", "f"]), + (7, [6, 3, 4, 2, 1, 7, 8, 5]), + (0.8, [0.6, 0.3, 0.4, 0.2, 0.1, 0.7, 0.8, 0.5]), + ("b", pd.Categorical(["a", "b", "c"])), + (None, [0.0, 1.0, 2.0, 3.0]), + ], +) +@pytest.mark.parametrize( + "begin,end", + [ + (0, -1), + (0, 4), + (1, -1), + (1, 4), + (-2, 1), + (-2, -1), + (10, 12), + (8, 10), + (10, 8), + (-10, -8), + (-2, 6), + ], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_fill(data, fill_value, begin, end, inplace): + gs = cudf.Series(data) + ps = gs.to_pandas() + + actual = gs + gs[begin:end] = fill_value + ps[begin:end] = fill_value + + assert_eq(ps, actual) + + +@pytest.mark.xfail(raises=ValueError) +def test_fill_new_category(): + gs = cudf.Series(pd.Categorical(["a", "b", "c"])) + gs[0:1] = "d" diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 142ca6c6831..cb3a369d067 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -32,7 +32,8 @@ def test_series_reductions(method, dtype, skipna): arr = arr.astype(dtype) if dtype in (np.float32, np.float64): arr[[2, 5, 14, 19, 50, 70]] = np.nan - sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) + sr = cudf.Series(arr) + sr[~mask] = None psr = sr.to_pandas() psr[~mask] = np.nan @@ -83,7 +84,8 @@ def test_series_unique(): for size in [10 ** x for x in range(5)]: arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 - sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) + sr = cudf.Series(arr) + sr[~mask] = None assert set(arr[mask]) == set(sr.unique().dropna().to_numpy()) assert len(set(arr[mask])) == sr.nunique() @@ -298,7 +300,8 
@@ def test_series_median(dtype, num_na): mask = np.arange(100) >= num_na arr = arr.astype(dtype) - sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) + sr = cudf.Series(arr) + sr[~mask] = None arr2 = arr[mask] ps = pd.Series(arr2, dtype=dtype) diff --git a/python/cudf/cudf/tests/test_udf_binops.py b/python/cudf/cudf/tests/test_udf_binops.py index 935c3868a68..c5cd8f8b717 100644 --- a/python/cudf/cudf/tests/test_udf_binops.py +++ b/python/cudf/cudf/tests/test_udf_binops.py @@ -49,4 +49,4 @@ def generic_function(a, b): result = lhs_arr ** 3 + rhs_arr - np.testing.assert_almost_equal(result, out_col.to_array()) + np.testing.assert_almost_equal(result, out_col.values_host) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index c9c2c440632..56090c8eacf 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -13,6 +13,7 @@ comparison_ops, unary_ops, ) +from cudf.core.udf.utils import precompiled from cudf.testing._utils import NUMERIC_TYPES, _decimal_series, assert_eq @@ -485,7 +486,7 @@ def outer(row): {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} ) - with pytest.raises(AttributeError): + with pytest.raises(ValueError): gdf.apply(outer, axis=1) pdf = gdf.to_pandas(nullable=True) @@ -538,7 +539,7 @@ def func(row): return row["unsupported_col"] # check that we fail when an unsupported type is used within a function - with pytest.raises(TypeError): + with pytest.raises(ValueError): data.apply(func, axis=1) # also check that a DF containing unsupported dtypes can still run a @@ -595,6 +596,44 @@ def func(row, c, k): run_masked_udf_test(func, data, args=(1, 2), check_dtype=False) +@pytest.mark.parametrize( + "data", + [ + [1, cudf.NA, 3], + [0.5, 2.0, cudf.NA, cudf.NA, 5.0], + [True, False, cudf.NA], + ], +) +@pytest.mark.parametrize("op", arith_ops + comparison_ops) +def test_mask_udf_scalar_args_binops_series(data, op): + 
data = cudf.Series(data) + + def func(x, c): + return x + c + + run_masked_udf_series(func, data, args=(1,), check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + [1, cudf.NA, 3], + [0.5, 2.0, cudf.NA, cudf.NA, 5.0], + [True, False, cudf.NA], + ], +) +@pytest.mark.parametrize("op", arith_ops + comparison_ops) +def test_masked_udf_scalar_args_binops_multiple_series(data, op): + data = cudf.Series(data) + + def func(data, c, k): + x = op(data, c) + y = op(x, k) + return y + + run_masked_udf_series(func, data, args=(1, 2), check_dtype=False) + + def test_masked_udf_caching(): # Make sure similar functions that differ # by simple things like constants actually @@ -612,3 +651,16 @@ def test_masked_udf_caching(): expect = data ** 3 got = data.applymap(lambda x: x ** 3) assert_eq(expect, got, check_dtype=False) + + # make sure we get a hit when reapplying + def f(x): + return x + 1 + + precompiled.clear() + assert precompiled.currsize == 0 + data.apply(f) + + assert precompiled.currsize == 1 + data.apply(f) + + assert precompiled.currsize == 1 diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index fa5cde76524..3cbbc1e1ce7 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -173,7 +173,9 @@ def run(self, df, **launch_params): outputs[k], index=outdf.index, nan_as_null=False ) if out_mask is not None: - outdf[k] = outdf[k].set_mask(out_mask.data_array_view) + outdf._data[k] = outdf[k]._column.set_mask( + out_mask.data_array_view + ) return outdf diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py index dbdd68f2df8..bd3da4ea2ba 100644 --- a/python/cudf/cudf/utils/gpu_utils.py +++ b/python/cudf/cudf/utils/gpu_utils.py @@ -15,10 +15,10 @@ def validate_setup(): import warnings + from cuda.cudart import cudaDeviceAttr, cudaError_t + from rmm._cuda.gpu import ( CUDARuntimeError, - cudaDeviceAttr, - cudaError, deviceGetName, driverGetVersion, 
getDeviceAttribute, @@ -26,34 +26,26 @@ def validate_setup(): runtimeGetVersion, ) - def _try_get_old_or_new_symbols(): - try: - # CUDA 10.2+ symbols - return [ - cudaError.cudaErrorDeviceUninitialized, - cudaError.cudaErrorTimeout, - ] - except AttributeError: - # CUDA 10.1 symbols - return [cudaError.cudaErrorDeviceUninitilialized] + from cudf.errors import UnsupportedCUDAError notify_caller_errors = { - cudaError.cudaErrorInitializationError, - cudaError.cudaErrorInsufficientDriver, - cudaError.cudaErrorInvalidDeviceFunction, - cudaError.cudaErrorInvalidDevice, - cudaError.cudaErrorStartupFailure, - cudaError.cudaErrorInvalidKernelImage, - cudaError.cudaErrorAlreadyAcquired, - cudaError.cudaErrorOperatingSystem, - cudaError.cudaErrorNotPermitted, - cudaError.cudaErrorNotSupported, - cudaError.cudaErrorSystemNotReady, - cudaError.cudaErrorSystemDriverMismatch, - cudaError.cudaErrorCompatNotSupportedOnDevice, - *_try_get_old_or_new_symbols(), - cudaError.cudaErrorUnknown, - cudaError.cudaErrorApiFailureBase, + cudaError_t.cudaErrorInitializationError, + cudaError_t.cudaErrorInsufficientDriver, + cudaError_t.cudaErrorInvalidDeviceFunction, + cudaError_t.cudaErrorInvalidDevice, + cudaError_t.cudaErrorStartupFailure, + cudaError_t.cudaErrorInvalidKernelImage, + cudaError_t.cudaErrorAlreadyAcquired, + cudaError_t.cudaErrorOperatingSystem, + cudaError_t.cudaErrorNotPermitted, + cudaError_t.cudaErrorNotSupported, + cudaError_t.cudaErrorSystemNotReady, + cudaError_t.cudaErrorSystemDriverMismatch, + cudaError_t.cudaErrorCompatNotSupportedOnDevice, + cudaError_t.cudaErrorDeviceUninitialized, + cudaError_t.cudaErrorTimeout, + cudaError_t.cudaErrorUnknown, + cudaError_t.cudaErrorApiFailureBase, } try: @@ -68,49 +60,42 @@ def _try_get_old_or_new_symbols(): # Cupy throws RunTimeException to get GPU count, # hence obtaining GPU count by in-house cpp api above - # 75 - Indicates to get "cudaDevAttrComputeCapabilityMajor" attribute - # 0 - Get GPU 0 major_version = 
getDeviceAttribute( cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, 0 ) - if major_version >= 6: - # You have a GPU with NVIDIA Pascal™ architecture or better + if major_version < 6: + # A GPU with NVIDIA Pascal™ architecture or newer is required. + # Reference: https://developer.nvidia.com/cuda-gpus # Hardware Generation Compute Capability + # Ampere 8.x # Turing 7.5 - # Volta 7.x + # Volta 7.0, 7.2 # Pascal 6.x - # Maxwell 5.x + # Maxwell 5.x # Kepler 3.x # Fermi 2.x - pass - else: device_name = deviceGetName(0) minor_version = getDeviceAttribute( cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, 0 ) warnings.warn( - f"You will need a GPU with NVIDIA Pascal™ or " - f"newer architecture" - f"\nDetected GPU 0: {device_name} \n" - f"Detected Compute Capability: " - f"{major_version}.{minor_version}" + "A GPU with NVIDIA Pascal™ (Compute Capability 6.0) " + "or newer architecture is required.\n" + f"Detected GPU 0: {device_name}\n" + f"Detected Compute Capability: {major_version}.{minor_version}" ) cuda_runtime_version = runtimeGetVersion() - if cuda_runtime_version >= 10000: - # CUDA Runtime Version Check: Runtime version is greater than 10000 - pass - else: - from cudf.errors import UnSupportedCUDAError - - minor_version = cuda_runtime_version % 100 - major_version = (cuda_runtime_version - minor_version) // 1000 - raise UnSupportedCUDAError( - f"Detected CUDA Runtime version is " - f"{major_version}.{str(minor_version)[0]}" - f"Please update your CUDA Runtime to 10.0 or above" + if cuda_runtime_version < 11000: + # Require CUDA Runtime version 11.0 or greater. + major_version = cuda_runtime_version // 1000 + minor_version = (cuda_runtime_version % 1000) // 10 + raise UnsupportedCUDAError( + "Detected CUDA Runtime version is " + f"{major_version}.{minor_version}. " + "Please update your CUDA Runtime to 11.0 or above." 
) cuda_driver_supported_rt_version = driverGetVersion() @@ -126,15 +111,12 @@ def _try_get_old_or_new_symbols(): # https://docs.nvidia.com/deploy/cuda-compatibility/index.html if cuda_driver_supported_rt_version == 0: - from cudf.errors import UnSupportedCUDAError - - raise UnSupportedCUDAError( - "We couldn't detect the GPU driver " - "properly. Please follow the linux installation guide to " - "ensure your driver is properly installed " - ": https://docs.nvidia.com/cuda/cuda-installation-guide-linux/" + raise UnsupportedCUDAError( + "We couldn't detect the GPU driver properly. Please follow " + "the installation guide to ensure your driver is properly " + "installed: " + "https://docs.nvidia.com/cuda/cuda-installation-guide-linux/" ) - elif cuda_driver_supported_rt_version >= cuda_runtime_version: # CUDA Driver Version Check: # Driver Runtime version is >= Runtime version @@ -149,17 +131,12 @@ def _try_get_old_or_new_symbols(): # version 450.80.02 supports. pass else: - from cudf.errors import UnSupportedCUDAError - - raise UnSupportedCUDAError( - f"Please update your NVIDIA GPU Driver to support CUDA " - f"Runtime.\n" - f"Detected CUDA Runtime version : {cuda_runtime_version}" - f"\n" - f"Latest version of CUDA supported by current " + raise UnsupportedCUDAError( + "Please update your NVIDIA GPU Driver to support CUDA " + "Runtime.\n" + f"Detected CUDA Runtime version : {cuda_runtime_version}\n" + "Latest version of CUDA supported by current " f"NVIDIA GPU Driver : {cuda_driver_supported_rt_version}" ) - else: - warnings.warn("No NVIDIA GPU detected") diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 6f958860dad..8f8a40ae4ab 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -3,6 +3,7 @@ import datetime import os import urllib +import warnings from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper from threading import Thread @@ -17,6 +18,13 @@ from cudf.utils.docutils import 
docfmt_partial +try: + import fsspec.parquet as fsspec_parquet + +except ImportError: + fsspec_parquet = None + + _docstring_remote_sources = """ - cuDF supports local and remote data stores. See configuration details for available sources @@ -160,10 +168,17 @@ use_pandas_metadata : boolean, default True If True and dataset has custom PANDAS schema metadata, ensure that index columns are also loaded. -use_python_file_object : boolean, default False +use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. This option is likely to improve - performance when making small reads from larger parquet files. + AbstractBufferedFile objects at IO time. Setting this argument to `False` + will require the entire file to be copied to host memory, and is highly + discouraged. +open_file_options : dict, optional + Dictionary of key-value pairs to pass to the function used to open remote + files. By default, this will be `fsspec.parquet.open_parquet_file`. To + deactivate optimized precaching, set the "method" to `None` under the + "precache_options" key. Note that the `open_file_func` key can also be + used to specify a custom file-open function. Returns ------- @@ -1220,6 +1235,100 @@ def _get_filesystem_and_paths(path_or_data, **kwargs): return fs, return_paths +def _set_context(obj, stack): + # Helper function to place open file on context stack + if stack is None: + return obj + return stack.enter_context(obj) + + +def _open_remote_files( + paths, + fs, + context_stack=None, + open_file_func=None, + precache_options=None, + **kwargs, +): + """Return a list of open file-like objects given + a list of remote file paths. + + Parameters + ---------- + paths : list(str) + List of file-path strings. + fs : fsspec.AbstractFileSystem + Fsspec file-system object. + context_stack : contextlib.ExitStack, Optional + Context manager to use for open files. 
+ open_file_func : Callable, Optional + Call-back function to use for opening. If this argument + is specified, all other arguments will be ignored. + precache_options : dict, optional + Dictionary of key-word arguments to use for + precaching. Unless the input contains ``{"method": None}``, + ``fsspec.parquet.open_parquet_file`` will be used for remote + storage. + **kwargs : + Key-word arguments to be passed to format-specific + open functions. + """ + + # Just use call-back function if one was specified + if open_file_func is not None: + return [ + _set_context(open_file_func(path, **kwargs), context_stack) + for path in paths + ] + + # Check if the "precache" option is supported. + # In the future, fsspec should do this check for us + precache_options = (precache_options or {}).copy() + precache = precache_options.pop("method", None) + if precache not in ("parquet", None): + raise ValueError(f"{precache} not a supported `precache` option.") + + # Check that "parts" caching (used for all format-aware file handling) + # is supported by the installed fsspec/s3fs version + if precache == "parquet" and not fsspec_parquet: + warnings.warn( + f"This version of fsspec ({fsspec.__version__}) does " + f"not support parquet-optimized precaching. Please upgrade " + f"to the latest fsspec version for better performance." + ) + precache = None + + if precache == "parquet": + # Use fsspec.parquet module. + # TODO: Use `cat_ranges` to collect "known" + # parts for all files at once. 
+ row_groups = precache_options.pop("row_groups", None) or ( + [None] * len(paths) + ) + return [ + ArrowPythonFile( + _set_context( + fsspec_parquet.open_parquet_file( + path, + fs=fs, + row_groups=rgs, + **precache_options, + **kwargs, + ), + context_stack, + ) + ) + for path, rgs in zip(paths, row_groups) + ] + + # Default open - Use pyarrow filesystem API + pa_fs = PyFileSystem(FSSpecHandler(fs)) + return [ + _set_context(pa_fs.open_input_file(fpath), context_stack) + for fpath in paths + ] + + def get_filepath_or_buffer( path_or_data, compression, @@ -1228,6 +1337,7 @@ def get_filepath_or_buffer( iotypes=(BytesIO, NativeFile), byte_ranges=None, use_python_file_object=False, + open_file_options=None, **kwargs, ): """Return either a filepath string to data, or a memory buffer of data. @@ -1249,6 +1359,9 @@ def get_filepath_or_buffer( use_python_file_object : boolean, default False If True, Arrow-backed PythonFile objects will be used in place of fsspec AbstractBufferedFile objects. + open_file_options : dict, optional + Optional dictionary of key-word arguments to pass to + `_open_remote_files` (used for remote storage only). 
Returns ------- @@ -1282,19 +1395,14 @@ def get_filepath_or_buffer( else: if use_python_file_object: - pa_fs = PyFileSystem(FSSpecHandler(fs)) - path_or_data = [ - pa_fs.open_input_file(fpath) for fpath in paths - ] + path_or_data = _open_remote_files( + paths, fs, **(open_file_options or {}), + ) else: path_or_data = [ BytesIO( _fsspec_data_transfer( - fpath, - fs=fs, - mode=mode, - byte_ranges=byte_ranges, - **kwargs, + fpath, fs=fs, mode=mode, **kwargs, ) ) for fpath in paths @@ -1309,9 +1417,7 @@ def get_filepath_or_buffer( path_or_data = ArrowPythonFile(path_or_data) else: path_or_data = BytesIO( - _fsspec_data_transfer( - path_or_data, mode=mode, byte_ranges=byte_ranges, **kwargs - ) + _fsspec_data_transfer(path_or_data, mode=mode, **kwargs) ) return path_or_data, compression @@ -1545,10 +1651,7 @@ def _ensure_filesystem(passed_filesystem, path, **kwargs): def _fsspec_data_transfer( path_or_fob, fs=None, - byte_ranges=None, - footer=None, file_size=None, - add_par1_magic=None, bytes_per_thread=256_000_000, max_gap=64_000, mode="rb", @@ -1568,48 +1671,22 @@ def _fsspec_data_transfer( file_size = file_size or fs.size(path_or_fob) # Check if a direct read makes the most sense - if not byte_ranges and bytes_per_thread >= file_size: + if bytes_per_thread >= file_size: if file_like: return path_or_fob.read() else: - return fs.open(path_or_fob, mode=mode, cache_type="none").read() + return fs.open(path_or_fob, mode=mode, cache_type="all").read() # Threaded read into "local" buffer buf = np.zeros(file_size, dtype="b") - if byte_ranges: - - # Optimize/merge the ranges - byte_ranges = _merge_ranges( - byte_ranges, max_block=bytes_per_thread, max_gap=max_gap, - ) - - # Call multi-threaded data transfer of - # remote byte-ranges to local buffer - _read_byte_ranges( - path_or_fob, byte_ranges, buf, fs=fs, **kwargs, - ) - - # Add Header & Footer bytes - if footer is not None: - footer_size = len(footer) - buf[-footer_size:] = np.frombuffer( - footer[-footer_size:], 
dtype="b" - ) - # Add parquet magic bytes (optional) - if add_par1_magic: - buf[:4] = np.frombuffer(b"PAR1", dtype="b") - if footer is None: - buf[-4:] = np.frombuffer(b"PAR1", dtype="b") - - else: - byte_ranges = [ - (b, min(bytes_per_thread, file_size - b)) - for b in range(0, file_size, bytes_per_thread) - ] - _read_byte_ranges( - path_or_fob, byte_ranges, buf, fs=fs, **kwargs, - ) + byte_ranges = [ + (b, min(bytes_per_thread, file_size - b)) + for b in range(0, file_size, bytes_per_thread) + ] + _read_byte_ranges( + path_or_fob, byte_ranges, buf, fs=fs, **kwargs, + ) return buf.tobytes() diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index d23094ef3f9..add4ecd8f01 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -242,7 +242,6 @@ def _fillna_natwise(col): return column.build_column( data=result.base_data, dtype=result.dtype, - mask=col.base_mask, size=result.size, offset=result.offset, children=result.base_children, diff --git a/python/cudf_kafka/cudf_kafka/_lib/.kafka.pxd.swo b/python/cudf_kafka/cudf_kafka/_lib/.kafka.pxd.swo deleted file mode 100644 index 624b60798ae..00000000000 Binary files a/python/cudf_kafka/cudf_kafka/_lib/.kafka.pxd.swo and /dev/null differ diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx index 24d072c544e..bff60e63fdb 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx @@ -14,7 +14,12 @@ from cudf_kafka._lib.kafka cimport kafka_consumer # To avoid including in libcudf_kafka # we introduce this wrapper in Cython cdef map[string, string] oauth_callback_wrapper(void *ctx): - return ((ctx))() + resp = ((ctx))() + cdef map[string, string] c_resp + c_resp[str.encode("token")] = str.encode(resp["token"]) + c_resp[str.encode("token_expiration_in_epoch")] \ + = str(resp["token_expiration_in_epoch"]).encode() + return c_resp cdef class KafkaDatasource(Datasource): diff 
--git a/python/custreamz/dev_requirements.txt b/python/custreamz/dev_requirements.txt index 6f1c09947d5..b4cd7a649ee 100644 --- a/python/custreamz/dev_requirements.txt +++ b/python/custreamz/dev_requirements.txt @@ -3,8 +3,8 @@ flake8==3.8.3 black==19.10b0 isort==5.6.4 -dask>=2021.11.1,<=2021.11.2 -distributed>=2021.11.1,<=2021.11.2 +dask>=2021.11.1,<=2022.01.0 +distributed>=2021.11.1,<=2022.01.0 streamz python-confluent-kafka pytest diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index a49d73493ec..ac5795fa2ec 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -20,7 +20,9 @@ import cudf from cudf.core.column import as_column, build_categorical_column from cudf.io import write_to_dataset +from cudf.io.parquet import _default_open_file_options from cudf.utils.dtypes import cudf_dtype_from_pa_type +from cudf.utils.ioutils import _is_local_filesystem, _open_remote_files class CudfEngine(ArrowDatasetEngine): @@ -64,6 +66,7 @@ def _read_paths( partitions=None, partitioning=None, partition_keys=None, + open_file_options=None, **kwargs, ): @@ -75,15 +78,15 @@ def _read_paths( # Non-local filesystem handling paths_or_fobs = paths - if not cudf.utils.ioutils._is_local_filesystem(fs): - - # Convert paths to file objects for remote data - paths_or_fobs = [ - stack.enter_context( - fs.open(path, mode="rb", cache_type="none") - ) - for path in paths - ] + if not _is_local_filesystem(fs): + paths_or_fobs = _open_remote_files( + paths_or_fobs, + fs, + context_stack=stack, + **_default_open_file_options( + open_file_options, columns, row_groups + ), + ) # Use cudf to read in data df = cudf.read_parquet( @@ -150,6 +153,7 @@ def read_partition( partitions=(), partitioning=None, schema=None, + open_file_options=None, **kwargs, ): @@ -168,7 +172,10 @@ def read_partition( if not isinstance(pieces, list): pieces = [pieces] + # Extract supported kwargs from `kwargs` strings_to_cats = 
kwargs.get("strings_to_categorical", False) + read_kwargs = kwargs.get("read", {}) + read_kwargs.update(open_file_options or {}) # Assume multi-piece read paths = [] @@ -192,7 +199,7 @@ def read_partition( partitions=partitions, partitioning=partitioning, partition_keys=last_partition_keys, - **kwargs.get("read", {}), + **read_kwargs, ) ) paths = rgs = [] @@ -215,13 +222,13 @@ def read_partition( partitions=partitions, partitioning=partitioning, partition_keys=last_partition_keys, - **kwargs.get("read", {}), + **read_kwargs, ) ) df = cudf.concat(dfs) if len(dfs) > 1 else dfs[0] # Re-set "object" dtypes align with pa schema - set_object_dtypes_from_pa_schema(df, kwargs.get("schema", None)) + set_object_dtypes_from_pa_schema(df, schema) if index and (index[0] in df.columns): df = df.set_index(index[0]) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index ad53f5cfe0f..83ff1273b36 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -6,6 +6,7 @@ from io import BytesIO import pandas as pd +import pyarrow.fs as pa_fs import pytest import dask_cudf @@ -115,7 +116,15 @@ def test_read_csv(s3_base, s3so): assert df.a.sum().compute() == 4 -def test_read_parquet(s3_base, s3so): +@pytest.mark.parametrize( + "open_file_options", + [ + {"precache_options": {"method": None}}, + {"precache_options": {"method": "parquet"}}, + {"open_file_func": None}, + ], +) +def test_read_parquet(s3_base, s3so, open_file_options): pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2.1, 2.2, 2.3, 2.4]}) buffer = BytesIO() pdf.to_parquet(path=buffer) @@ -123,8 +132,15 @@ def test_read_parquet(s3_base, s3so): with s3_context( s3_base=s3_base, bucket="daskparquet", files={"file.parq": buffer} ): + if "open_file_func" in open_file_options: + fs = pa_fs.S3FileSystem( + endpoint_override=s3so["client_kwargs"]["endpoint_url"], + ) + open_file_options["open_file_func"] = fs.open_input_file 
df = dask_cudf.read_parquet( - "s3://daskparquet/*.parq", storage_options=s3so + "s3://daskparquet/*.parq", + storage_options=s3so, + open_file_options=open_file_options, ) assert df.a.sum().compute() == 10 assert df.b.sum().compute() == 9 diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 1521ce41806..c7342818610 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -41,7 +41,7 @@ def test_series(data): sr = Series(pdsr) dsr = dgd.from_cudf(sr, npartitions=5) - np.testing.assert_equal(np.array(pdsr), dsr.compute().to_array()) + np.testing.assert_equal(np.array(pdsr), dsr.compute().values_host) @pytest.mark.parametrize("data", [data_dt_1()]) @@ -114,7 +114,7 @@ def test_categorical_basic(data): sr = Series(cat) dsr = dgd.from_cudf(sr, npartitions=2) result = dsr.compute() - np.testing.assert_array_equal(cat.codes, result.to_array()) + np.testing.assert_array_equal(cat.codes, result.cat.codes.values_host) assert dsr.dtype.to_pandas() == pdsr.dtype # Test attributes @@ -122,7 +122,9 @@ def test_categorical_basic(data): assert_eq(pdsr.cat.categories, dsr.cat.categories) - np.testing.assert_array_equal(pdsr.cat.codes.values, result.to_array()) + np.testing.assert_array_equal( + pdsr.cat.codes.values, result.cat.codes.values_host + ) string = str(result) expect_str = """ @@ -207,12 +209,12 @@ def test_categorical_compare_ordered(data): # Test equality out = dsr1 == dsr1 assert out.dtype == np.bool_ - assert np.all(out.compute().to_array()) + assert np.all(out.compute().values_host) assert np.all(pdsr1 == pdsr1) # Test inequality out = dsr1 != dsr1 - assert not np.any(out.compute().to_array()) + assert not np.any(out.compute().values_host) assert not np.any(pdsr1 != pdsr1) assert dsr1.cat.ordered @@ -220,10 +222,10 @@ def test_categorical_compare_ordered(data): # Test ordered operators np.testing.assert_array_equal( - pdsr1 < pdsr2, 
(dsr1 < dsr2).compute().to_array() + pdsr1 < pdsr2, (dsr1 < dsr2).compute().values_host ) np.testing.assert_array_equal( - pdsr1 > pdsr2, (dsr1 > dsr2).compute().to_array() + pdsr1 > pdsr2, (dsr1 > dsr2).compute().values_host ) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index ace9701b677..67fed62c582 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -284,7 +284,7 @@ def test_assign(): got = dgf.assign(z=newcol) dd.assert_eq(got.loc[:, ["x", "y"]], df) - np.testing.assert_array_equal(got["z"].compute().to_array(), pdcol) + np.testing.assert_array_equal(got["z"].compute().values_host, pdcol) @pytest.mark.parametrize("data_type", ["int8", "int16", "int32", "int64"]) diff --git a/python/dask_cudf/dev_requirements.txt b/python/dask_cudf/dev_requirements.txt index d8b0745be79..d5ba79d4987 100644 --- a/python/dask_cudf/dev_requirements.txt +++ b/python/dask_cudf/dev_requirements.txt @@ -1,7 +1,7 @@ # Copyright (c) 2021, NVIDIA CORPORATION. -dask>=2021.11.1 -distributed>=2021.11.1 +dask>=2021.11.1,<=2022.01.0 +distributed>=2021.11.1,<=2022.01.0 fsspec>=0.6.0 numba>=0.53.1 numpy diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index 425839772eb..39491a45e7e 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -10,8 +10,8 @@ install_requires = [ "cudf", - "dask>=2021.11.1", - "distributed>=2021.11.1", + "dask>=2021.11.1,<=2022.01.0", + "distributed>=2021.11.1,<=2022.01.0", "fsspec>=0.6.0", "numpy", "pandas>=1.0,<1.4.0dev0",