diff --git a/build.sh b/build.sh index 45074a6645f..c2eba134c35 100755 --- a/build.sh +++ b/build.sh @@ -230,6 +230,7 @@ if buildAll || hasArg libcudf; then fi echo "$MSG" python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${LIB_BUILD_DIR}/ninja_log.html + cp ${LIB_BUILD_DIR}/.ninja_log ${LIB_BUILD_DIR}/ninja.log fi if [[ ${INSTALL_TARGET} != "" ]]; then diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 13f7f0e6267..9fb86b0b3c5 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -14,7 +14,7 @@ LANG=C.UTF-8 . /opt/conda/etc/profile.d/conda.sh conda activate rapids -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/cmake-format-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.04/cmake-format-rapids-cmake.json export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index f23296038f2..6f19f174da0 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -85,6 +85,7 @@ if [ "$BUILD_LIBCUDF" == '1' ]; then gpuci_logger "Copying build metrics results" mkdir -p "$WORKSPACE/build-metrics" cp "$LIBCUDF_BUILD_DIR/ninja_log.html" "$WORKSPACE/build-metrics/BuildMetrics.html" + cp "$LIBCUDF_BUILD_DIR/ninja.log" "$WORKSPACE/build-metrics/ninja.log" fi gpuci_logger "Build conda pkg for libcudf_kafka" diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh index 8a2c9d9be7c..1699fc16a47 100755 --- a/ci/cpu/prebuild.sh +++ b/ci/cpu/prebuild.sh @@ -3,32 +3,11 @@ # Copyright (c) 2020, NVIDIA CORPORATION. set -e -DEFAULT_CUDA_VER="11.5" -DEFAULT_PYTHON_VER="3.8" - -#Always upload cudf Python package +#Always upload cudf packages export UPLOAD_CUDF=1 - -#Upload libcudf once per CUDA -if [[ "$PYTHON" == "${DEFAULT_PYTHON_VER}" ]]; then - export UPLOAD_LIBCUDF=1 -else - export UPLOAD_LIBCUDF=0 -fi - -# upload cudf_kafka for all versions of Python -if [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then - export UPLOAD_CUDF_KAFKA=1 -else - export UPLOAD_CUDF_KAFKA=0 -fi - -#We only want to upload libcudf_kafka once per python/CUDA combo -if [[ "$PYTHON" == "${DEFAULT_PYTHON_VER}" ]] && [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then - export UPLOAD_LIBCUDF_KAFKA=1 -else - export UPLOAD_LIBCUDF_KAFKA=0 -fi +export UPLOAD_LIBCUDF=1 +export UPLOAD_CUDF_KAFKA=1 +export UPLOAD_LIBCUDF_KAFKA=1 if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then #If project flash is not activate, always build both diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 1105b9c194d..5575b69c226 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -42,10 +42,13 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' # rapids-cmake version sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake +# cmake-format rapids-cmake definitions +sed_runner 's/'"branch-.*\/cmake-format-rapids-cmake.json"'/'"branch-${NEXT_SHORT_TAG}\/cmake-format-rapids-cmake.json"'/g' ci/checks/style.sh + # doxyfile update sed_runner 's/PROJECT_NUMBER = .*/PROJECT_NUMBER = '${NEXT_FULL_TAG}'/g' cpp/doxygen/Doxyfile -# RTD update +# sphinx docs update sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/cudf/source/conf.py sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' 
docs/cudf/source/conf.py diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e4637408110..90e94ffcc7b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -425,13 +425,11 @@ add_library( src/strings/copying/concatenate.cu src/strings/copying/copying.cu src/strings/copying/shift.cu + src/strings/count_matches.cu src/strings/extract/extract.cu src/strings/extract/extract_all.cu src/strings/filling/fill.cu src/strings/filter_chars.cu - src/strings/findall.cu - src/strings/find.cu - src/strings/find_multiple.cu src/strings/padding.cu src/strings/json/json_path.cu src/strings/regex/regcomp.cpp @@ -441,6 +439,10 @@ add_library( src/strings/replace/multi_re.cu src/strings/replace/replace.cu src/strings/replace/replace_re.cu + src/strings/search/findall.cu + src/strings/search/findall_record.cu + src/strings/search/find.cu + src/strings/search/find_multiple.cu src/strings/split/partition.cu src/strings/split/split.cu src/strings/split/split_record.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 57592de59af..13ef02efc99 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -123,7 +123,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- -ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp) +ConfigureNVBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates.cpp) # ################################################################################################## # * join benchmark -------------------------------------------------------------------------------- diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index ca3a748ccad..5f23cbbafdd 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -32,8 +32,9 @@ inline auto make_cuda() { return std::make_shared inline auto make_pool_instance() { static rmm::mr::cuda_memory_resource cuda_mr; - static rmm::mr::pool_memory_resource pool_mr{&cuda_mr}; - return std::shared_ptr(&pool_mr); + static auto pool_mr = + std::make_shared>(&cuda_mr); + return pool_mr; } } // namespace diff --git a/cpp/benchmarks/io/csv/csv_reader.cpp b/cpp/benchmarks/io/csv/csv_reader.cpp index 7cbdb8261b8..241ba4d5954 100644 --- a/cpp/benchmarks/io/csv/csv_reader.cpp +++ b/cpp/benchmarks/io/csv/csv_reader.cpp @@ -43,9 +43,7 @@ void BM_csv_read_varying_input(benchmark::State& state) cuio_source_sink_pair source_sink(source_type); cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view) - .include_header(true) - .rows_per_chunk(1 << 14); // TODO: remove once default is sensible + cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true); cudf_io::write_csv(options); cudf_io::csv_reader_options const read_options = @@ -59,6 +57,7 @@ void BM_csv_read_varying_input(benchmark::State& state) 
state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } void BM_csv_read_varying_options(benchmark::State& state) @@ -79,23 +78,22 @@ void BM_csv_read_varying_options(benchmark::State& state) auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); - std::vector csv_data; + cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(cudf_io::sink_info{&csv_data}, view) + cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view) .include_header(true) - .line_terminator("\r\n") - .rows_per_chunk(1 << 14); // TODO: remove once default is sensible + .line_terminator("\r\n"); cudf_io::write_csv(options); cudf_io::csv_reader_options read_options = - cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_data.data(), csv_data.size()}) + cudf_io::csv_reader_options::builder(source_sink.make_source_info()) .use_cols_indexes(cols_to_read) .thousands('\'') .windowslinetermination(true) .comment('#') .prefix("BM_"); - size_t const chunk_size = csv_data.size() / num_chunks; + size_t const chunk_size = source_sink.size() / num_chunks; cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { @@ -132,6 +130,7 @@ void BM_csv_read_varying_options(benchmark::State& state) auto const data_processed = data_size * cols_to_read.size() / view.num_columns(); state.SetBytesProcessed(data_processed * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define CSV_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \ diff --git a/cpp/benchmarks/io/csv/csv_writer.cpp b/cpp/benchmarks/io/csv/csv_writer.cpp index ad1fafb7f0d..413a269bcb2 100644 --- a/cpp/benchmarks/io/csv/csv_writer.cpp +++ b/cpp/benchmarks/io/csv/csv_writer.cpp @@ -46,14 +46,13 @@ void BM_csv_write_varying_inout(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view) - .include_header(true) - .rows_per_chunk(1 << 14); // TODO: remove once default is sensible + cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true); cudf_io::write_csv(options); } state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } void BM_csv_write_varying_options(benchmark::State& state) @@ -71,12 +70,12 @@ void BM_csv_write_varying_options(benchmark::State& state) auto const view = tbl->view(); std::string const na_per(na_per_len, '#'); - std::vector csv_data; + cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(cudf_io::sink_info{&csv_data}, view) + cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view) .include_header(true) .na_rep(na_per) .rows_per_chunk(rows_per_chunk); @@ -85,6 +84,7 @@ void 
BM_csv_write_varying_options(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define CSV_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \ diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index e035cc10dc1..3743be8bd5a 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -16,6 +16,7 @@ #include +#include #include #include @@ -53,13 +54,25 @@ cudf_io::source_info cuio_source_sink_pair::make_source_info() cudf_io::sink_info cuio_source_sink_pair::make_sink_info() { switch (type) { - case io_type::VOID: return cudf_io::sink_info(); + case io_type::VOID: return cudf_io::sink_info(&void_sink); case io_type::FILEPATH: return cudf_io::sink_info(file_name); case io_type::HOST_BUFFER: return cudf_io::sink_info(&buffer); default: CUDF_FAIL("invalid output type"); } } +size_t cuio_source_sink_pair::size() +{ + switch (type) { + case io_type::VOID: return void_sink.bytes_written(); + case io_type::FILEPATH: + return static_cast( + std::ifstream(file_name, std::ifstream::ate | std::ifstream::binary).tellg()); + case io_type::HOST_BUFFER: return buffer.size(); + default: CUDF_FAIL("invalid output type"); + } +} + std::vector dtypes_for_column_selection(std::vector const& data_types, column_selection col_sel) { diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index 7107585dbcc..c74ee191d4e 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -39,6 +39,15 @@ std::string random_file_in_dir(std::string const& dir_path); * @brief Class to create a coupled `source_info` and `sink_info` of given type. 
*/ class cuio_source_sink_pair { + class bytes_written_only_sink : public cudf::io::data_sink { + size_t _bytes_written = 0; + + public: + void host_write(void const* data, size_t size) override { _bytes_written += size; } + void flush() override {} + size_t bytes_written() override { return _bytes_written; } + }; + public: cuio_source_sink_pair(io_type type); ~cuio_source_sink_pair() @@ -66,12 +75,15 @@ class cuio_source_sink_pair { */ cudf::io::sink_info make_sink_info(); + [[nodiscard]] size_t size(); + private: static temp_directory const tmpdir; io_type const type; std::vector buffer; std::string const file_name; + bytes_written_only_sink void_sink; }; /** diff --git a/cpp/benchmarks/io/orc/orc_reader.cpp b/cpp/benchmarks/io/orc/orc_reader.cpp index 56cca8d80a4..e15513275ee 100644 --- a/cpp/benchmarks/io/orc/orc_reader.cpp +++ b/cpp/benchmarks/io/orc/orc_reader.cpp @@ -66,13 +66,13 @@ void BM_orc_read_varying_input(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } -std::vector get_col_names(std::vector const& orc_data) +std::vector get_col_names(cudf_io::source_info const& source) { cudf_io::orc_reader_options const read_options = - cudf_io::orc_reader_options::builder(cudf_io::source_info{orc_data.data(), orc_data.size()}) - .num_rows(1); + cudf_io::orc_reader_options::builder(source).num_rows(1); return cudf_io::read_orc(read_options).metadata.column_names; } @@ -88,25 +88,26 @@ void BM_orc_read_varying_options(benchmark::State& state) auto const use_np_dtypes = (flags & 2) != 0; auto const ts_type = cudf::data_type{static_cast(state.range(state_idx++))}; + // skip_rows is not supported on nested types auto const data_types = dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), int32_t(type_group_id::FLOATING_POINT), int32_t(type_group_id::FIXED_POINT), int32_t(type_group_id::TIMESTAMP), - int32_t(cudf::type_id::STRING), - int32_t(cudf::type_id::LIST)}), + int32_t(cudf::type_id::STRING)}), col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); - std::vector orc_data; + cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); cudf_io::orc_writer_options options = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{&orc_data}, view); + cudf_io::orc_writer_options::builder(source_sink.make_sink_info(), view); cudf_io::write_orc(options); - auto const cols_to_read = select_column_names(get_col_names(orc_data), col_sel); + auto const cols_to_read = + select_column_names(get_col_names(source_sink.make_source_info()), col_sel); cudf_io::orc_reader_options read_options = - cudf_io::orc_reader_options::builder(cudf_io::source_info{orc_data.data(), orc_data.size()}) + cudf_io::orc_reader_options::builder(source_sink.make_source_info()) .columns(cols_to_read) .use_index(use_index) .use_np_dtypes(use_np_dtypes) @@ -148,6 +149,7 @@ void BM_orc_read_varying_options(benchmark::State& state) auto const data_processed = data_size * cols_to_read.size() / view.num_columns(); state.SetBytesProcessed(data_processed * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define ORC_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \ @@ -179,11 +181,12 @@ BENCHMARK_REGISTER_F(OrcRead, column_selection) 
->Unit(benchmark::kMillisecond) ->UseManualTime(); +// Need an API to get the number of stripes to enable row_selection::STRIPES here BENCHMARK_DEFINE_F(OrcRead, row_selection) (::benchmark::State& state) { BM_orc_read_varying_options(state); } BENCHMARK_REGISTER_F(OrcRead, row_selection) ->ArgsProduct({{int32_t(column_selection::ALL)}, - {int32_t(row_selection::STRIPES), int32_t(row_selection::NROWS)}, + {int32_t(row_selection::NROWS)}, {1, 8}, {0b11}, // defaults {int32_t(cudf::type_id::EMPTY)}}) diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index 594feac41b1..50ae76e867c 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -62,6 +62,7 @@ void BM_orc_write_varying_inout(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } void BM_orc_write_varying_options(benchmark::State& state) @@ -98,6 +99,7 @@ void BM_orc_write_varying_options(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define ORC_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \ diff --git a/cpp/benchmarks/io/parquet/parquet_reader.cpp b/cpp/benchmarks/io/parquet/parquet_reader.cpp index b4f8dc8c450..09194931498 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader.cpp @@ -66,14 +66,13 @@ void BM_parq_read_varying_input(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } -std::vector get_col_names(std::vector const& parquet_data) +std::vector get_col_names(cudf::io::source_info const& source) { cudf_io::parquet_reader_options const read_options = - cudf_io::parquet_reader_options::builder( - cudf_io::source_info{parquet_data.data(), parquet_data.size()}) - .num_rows(1); + cudf_io::parquet_reader_options::builder(source).num_rows(1); return cudf_io::read_parquet(read_options).metadata.column_names; } @@ -89,26 +88,26 @@ void BM_parq_read_varying_options(benchmark::State& state) auto const use_pandas_metadata = (flags & 2) != 0; auto const ts_type = cudf::data_type{static_cast(state.range(state_idx++))}; + // No nested types here, because of https://github.com/rapidsai/cudf/issues/9970 auto const data_types = dtypes_for_column_selection( get_type_or_group({static_cast(type_group_id::INTEGRAL), static_cast(type_group_id::FLOATING_POINT), static_cast(type_group_id::FIXED_POINT), static_cast(type_group_id::TIMESTAMP), - static_cast(cudf::type_id::STRING), - static_cast(cudf::type_id::LIST)}), + static_cast(cudf::type_id::STRING)}), col_sel); auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); auto const view = tbl->view(); - std::vector parquet_data; + cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); cudf_io::parquet_writer_options options = - cudf_io::parquet_writer_options::builder(cudf_io::sink_info{&parquet_data}, view); + cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view); cudf_io::write_parquet(options); - auto const cols_to_read = select_column_names(get_col_names(parquet_data), col_sel); + auto const 
cols_to_read = + select_column_names(get_col_names(source_sink.make_source_info()), col_sel); cudf_io::parquet_reader_options read_options = - cudf_io::parquet_reader_options::builder( - cudf_io::source_info{parquet_data.data(), parquet_data.size()}) + cudf_io::parquet_reader_options::builder(source_sink.make_source_info()) .columns(cols_to_read) .convert_strings_to_categories(str_to_categories) .use_pandas_metadata(use_pandas_metadata) @@ -150,6 +149,7 @@ void BM_parq_read_varying_options(benchmark::State& state) auto const data_processed = data_size * cols_to_read.size() / view.num_columns(); state.SetBytesProcessed(data_processed * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define PARQ_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \ @@ -181,20 +181,18 @@ BENCHMARK_REGISTER_F(ParquetRead, column_selection) ->Unit(benchmark::kMillisecond) ->UseManualTime(); -// Disabled until we add an API to read metadata from a parquet file and determine num row groups. -// https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863 -/* +// row_selection::ROW_GROUPS disabled until we add an API to read metadata from a parquet file and +// determine num row groups. https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863 BENCHMARK_DEFINE_F(ParquetRead, row_selection) (::benchmark::State& state) { BM_parq_read_varying_options(state); } BENCHMARK_REGISTER_F(ParquetRead, row_selection) ->ArgsProduct({{int32_t(column_selection::ALL)}, - {int32_t(row_selection::ROW_GROUPS), int32_t(row_selection::NROWS)}, + {int32_t(row_selection::NROWS)}, {1, 4}, {0b01}, // defaults {int32_t(cudf::type_id::EMPTY)}}) ->Unit(benchmark::kMillisecond) ->UseManualTime(); -*/ BENCHMARK_DEFINE_F(ParquetRead, misc_options) (::benchmark::State& state) { BM_parq_read_varying_options(state); } diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 937198d9048..8287c27f804 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -61,6 +61,7 @@ void BM_parq_write_varying_inout(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } void BM_parq_write_varying_options(benchmark::State& state) @@ -93,6 +94,7 @@ void BM_parq_write_varying_options(benchmark::State& state) state.SetBytesProcessed(data_size * state.iterations()); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define PARQ_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type) \ diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp index cc41c0237c2..98eaba213e5 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -48,15 +49,17 @@ void PQ_write(benchmark::State& state) cudf::table_view view = tbl->view(); auto mem_stats_logger = cudf::memory_stats_logger(); + cuio_source_sink_pair source_sink(io_type::VOID); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::parquet_writer_options opts = - 
cudf_io::parquet_writer_options::builder(cudf_io::sink_info(), view); + cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view); cudf_io::write_parquet(opts); } state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0)); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } void PQ_write_chunked(benchmark::State& state) @@ -71,10 +74,11 @@ void PQ_write_chunked(benchmark::State& state) } auto mem_stats_logger = cudf::memory_stats_logger(); + cuio_source_sink_pair source_sink(io_type::VOID); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::chunked_parquet_writer_options opts = - cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info()); + cudf_io::chunked_parquet_writer_options::builder(source_sink.make_sink_info()); cudf_io::parquet_chunked_writer writer(opts); std::for_each(tables.begin(), tables.end(), [&writer](std::unique_ptr const& tbl) { writer.write(*tbl); @@ -84,6 +88,7 @@ void PQ_write_chunked(benchmark::State& state) state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0)); state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); + state.counters["encoded_file_size"] = source_sink.size(); } #define PWBM_BENCHMARK_DEFINE(name, size, num_columns) \ diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates.cpp index 8039d7d065f..317db92ae8b 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,64 +15,102 @@ */ #include -#include +#include #include #include #include -#include -#include + +#include + +#include #include #include -class Compaction : public cudf::benchmark { -}; +// necessary for custom enum types +// see: https://github.com/NVIDIA/nvbench/blob/main/examples/enums.cu +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + // Enum type: + cudf::duplicate_keep_option, + // Callable to generate input strings: + [](cudf::duplicate_keep_option option) { + switch (option) { + case cudf::duplicate_keep_option::KEEP_FIRST: return "KEEP_FIRST"; + case cudf::duplicate_keep_option::KEEP_LAST: return "KEEP_LAST"; + case cudf::duplicate_keep_option::KEEP_NONE: return "KEEP_NONE"; + default: return "ERROR"; + } + }, + // Callable to generate descriptions: + [](auto) { return std::string{}; }) + +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); + +template +void nvbench_drop_duplicates(nvbench::state& state, + nvbench::type_list>) +{ + if constexpr (not std::is_same_v and + Keep != cudf::duplicate_keep_option::KEEP_FIRST) { + state.skip("Skip unwanted benchmarks."); + } + + cudf::rmm_pool_raii pool_raii; + + auto const num_rows = state.get_int64("NumRows"); + + cudf::test::UniformRandomGenerator rand_gen(0, 100); + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [&rand_gen](auto row) { return rand_gen.generate(); }); + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 100 == 0 ? 
false : true; }); + cudf::test::fixed_width_column_wrapper values(elements, elements + num_rows, valids); + + auto input_column = cudf::column_view(values); + auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::detail::drop_duplicates( + input_table, {0}, Keep, cudf::null_equality::EQUAL, cudf::null_order::BEFORE, stream_view); + }); +} template -void BM_compaction(benchmark::State& state, cudf::duplicate_keep_option keep) +void nvbench_unordered_drop_duplicates(nvbench::state& state, nvbench::type_list) { - auto const n_rows = static_cast(state.range(0)); + cudf::rmm_pool_raii pool_raii; + + auto const num_rows = state.get_int64("NumRows"); cudf::test::UniformRandomGenerator rand_gen(0, 100); auto elements = cudf::detail::make_counting_transform_iterator( 0, [&rand_gen](auto row) { return rand_gen.generate(); }); auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 100 == 0 ? false : true; }); - cudf::test::fixed_width_column_wrapper values(elements, elements + n_rows, valids); + cudf::test::fixed_width_column_wrapper values(elements, elements + num_rows, valids); auto input_column = cudf::column_view(values); auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); - for (auto _ : state) { - cuda_event_timer timer(state, true); - auto result = cudf::drop_duplicates(input_table, {0}, keep); - } + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::detail::unordered_drop_duplicates( + input_table, {0}, cudf::null_equality::EQUAL, stream_view); + }); } -#define concat(a, b, c) a##b##c -#define get_keep(op) cudf::duplicate_keep_option::KEEP_##op - -// TYPE, OP -#define RBM_BENCHMARK_DEFINE(name, type, keep) \ - BENCHMARK_DEFINE_F(Compaction, name)(::benchmark::State & state) \ - { \ - BM_compaction(state, get_keep(keep)); \ - } \ - BENCHMARK_REGISTER_F(Compaction, name) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ - -#define COMPACTION_BENCHMARK_DEFINE(type, keep) \ - RBM_BENCHMARK_DEFINE(concat(type, _, keep), type, keep) - -COMPACTION_BENCHMARK_DEFINE(bool, NONE); -COMPACTION_BENCHMARK_DEFINE(int8_t, NONE); -COMPACTION_BENCHMARK_DEFINE(int32_t, NONE); -COMPACTION_BENCHMARK_DEFINE(int32_t, FIRST); -COMPACTION_BENCHMARK_DEFINE(int32_t, LAST); -using cudf::timestamp_ms; -COMPACTION_BENCHMARK_DEFINE(timestamp_ms, NONE); -COMPACTION_BENCHMARK_DEFINE(float, NONE); +using data_type = nvbench::type_list; +using keep_option = nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(nvbench_drop_duplicates, NVBENCH_TYPE_AXES(data_type, keep_option)) + .set_name("drop_duplicates") + .set_type_axes_names({"Type", "KeepOption"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_unordered_drop_duplicates, NVBENCH_TYPE_AXES(data_type)) + .set_name("unordered_drop_duplicates") + .set_type_axes_names({"Type"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index 980b353406a..fbcfabb4532 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -46,7 +46,7 @@ static void 
BM_contains(benchmark::State& state, contains_type ct) cudf::strings::count_re(input, "\\d+"); break; case contains_type::findall: // returns occurrences of matches - cudf::strings::findall_re(input, "\\d+"); + cudf::strings::findall(input, "\\d+"); break; } } diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index c964c85156c..5a20f78b798 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -21,7 +21,7 @@ function(find_and_configure_cucollections) cuco 0.0 GLOBAL_TARGETS cuco::cuco CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 + GIT_TAG 6ec8b6dcdeceea07ab4456d32461a05c18864411 OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" ) diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md index 5e465ed6991..eeebe38d873 100644 --- a/cpp/docs/DEVELOPER_GUIDE.md +++ b/cpp/docs/DEVELOPER_GUIDE.md @@ -347,7 +347,9 @@ implemented using asynchronous APIs on the default stream (e.g., stream 0). The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API has the same parameters as the -public API, plus a `rmm::cuda_stream_view` parameter at the end defaulted to +public API, plus a `rmm::cuda_stream_view` parameter at the end with no default value. If the +detail API also accepts a memory resource parameter, the stream parameter should be ideally placed +just *before* the memory resource. The public API will call the detail API and provide `rmm::cuda_stream_default`. The implementation should be wholly contained in the `detail` API definition and use only asynchronous versions of CUDA APIs with the stream parameter. @@ -362,14 +364,14 @@ void external_function(...); // cpp/include/cudf/detail/header.hpp namespace detail{ -void external_function(..., rmm::cuda_stream_view stream = rmm::cuda_stream_default) +void external_function(..., rmm::cuda_stream_view stream) } // namespace detail // cudf/src/implementation.cpp namespace detail{ - // defaulted stream parameter + // Use the stream parameter in the detail implementation. void external_function(..., rmm::cuda_stream_view stream){ - // implementation uses stream w/ async APIs + // Implementation uses the stream with async APIs. rmm::device_buffer buff(...,stream); CUDA_TRY(cudaMemcpyAsync(...,stream.value())); kernel<<<..., stream>>>(...); @@ -378,8 +380,8 @@ namespace detail{ } // namespace detail void external_function(...){ - CUDF_FUNC_RANGE(); // Auto generates NVTX range for lifetime of this function - detail::external_function(...); + CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function. + detail::external_function(..., rmm::cuda_stream_default); } ``` diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 87823d71c6f..3d065556827 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -75,6 +75,18 @@ std::unique_ptr<table> drop_duplicates(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @copydoc cudf::unordered_drop_duplicates
+ *
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<table>
unordered_drop_duplicates( + table_view const& input, + std::vector const& keys, + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) * @@ -94,5 +106,24 @@ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @copydoc cudf::unordered_distinct_count(column_view const&, null_policy, nan_policy) + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +/** + * @copydoc cudf::unordered_distinct_count(table_view const&, null_equality) + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type unordered_distinct_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 6a85428d8f0..a7112ae415d 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -387,9 +387,9 @@ class fixed_point { /** * @brief operator + (for adding two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are added
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are added + * If `_scale`s are equal, `_value`s are added. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are added. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -402,9 +402,9 @@ class fixed_point { /** * @brief operator - (for subtracting two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are subtracted
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are subtracted + * If `_scale`s are equal, `_value`s are subtracted. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are subtracted. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -417,7 +417,7 @@ class fixed_point { /** * @brief operator * (for multiplying two `fixed_point` numbers) * - * `_scale`s are added and `_value`s are multiplied + * `_scale`s are added and `_value`s are multiplied. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -430,7 +430,7 @@ class fixed_point { /** * @brief operator / (for dividing two `fixed_point` numbers) * - * `_scale`s are subtracted and `_value`s are divided + * `_scale`s are subtracted and `_value`s are divided. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -443,9 +443,9 @@ class fixed_point { /** * @brief operator == (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -458,9 +458,9 @@ class fixed_point { /** * @brief operator != (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -473,9 +473,9 @@ class fixed_point { /** * @brief operator <= (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -488,9 +488,9 @@ class fixed_point { /** * @brief operator >= (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -503,9 +503,9 @@ class fixed_point { /** * @brief operator < (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the - * greater `_scale`, and then `_value`s are compared + * If `_scale`s are equal, `_value`s are compared. + * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the + * smaller `_scale`, and then the `_value`s are compared. * * @tparam Rep1 Representation type of number being added to `this` * @tparam Rad1 Radix (base) type of number being added to `this` @@ -518,9 +518,9 @@ class fixed_point { /** * @brief operator > (for comparing two `fixed_point` numbers) * - * If `_scale`s are equal, `_value`s are compared
- * If `_scale`s are not equal, number with smaller `_scale` is shifted to the
- * greater `_scale`, and then `_value`s are compared
+ * If `_scale`s are equal, `_value`s are compared.
+ * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
+ * smaller `_scale`, and then the `_value`s are compared.
  *
  * @tparam Rep1 Representation type of number being added to `this`
  * @tparam Rad1 Radix (base) type of number being added to `this`
@@ -534,7 +534,7 @@ class fixed_point {
   * @brief Method for creating a `fixed_point` number with a new `scale`
   *
   * The `fixed_point` number returned will have the same value, underlying representation and
-  * radix as `this`, the only thing changed is the scale
+  * radix as `this`, the only thing changed is the scale.
   *
   * @param scale The `scale` of the returned `fixed_point` number
   * @return `fixed_point` number with a new `scale`
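The scale-alignment rule above is easy to misread, so here is a small self-contained sketch of what the corrected wording implies. This is illustrative only and not part of the change set; it assumes the public `numeric::fixed_point` API from this header, including the `value()` accessor.

```cpp
#include <cudf/fixed_point/fixed_point.hpp>

#include <cassert>

int main()
{
  using namespace numeric;
  // 1.23 stored as 123 with scale -2; 4.5 stored as 45 with scale -1.
  decimal32 const a{scaled_integer<int32_t>{123, scale_type{-2}}};
  decimal32 const b{scaled_integer<int32_t>{45, scale_type{-1}}};

  // The operand with the larger `_scale` (-1) is shifted to the smaller
  // `_scale` (-2): 45 becomes 450. The `_value`s are then added:
  // 123 + 450 = 573, which represents 5.73 at scale -2.
  auto const sum = a + b;
  assert(sum.value() == 573);

  // Comparison operators align scales the same way before comparing values.
  assert(sum > b);
  return 0;
}
```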
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index 108251dd646..c2187f056cf 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -72,7 +72,6 @@ class orc_reader_options {
   // Columns that should be read as Decimal128
   std::vector<std::string> _decimal128_columns;
-  bool _enable_decimal128 = true;
 
   friend orc_reader_options_builder;
 
@@ -152,11 +151,6 @@ class orc_reader_options {
    */
   std::vector<std::string> const& get_decimal128_columns() const { return _decimal128_columns; }
 
-  /**
-   * @brief Whether to use row index to speed-up reading.
-   */
-  bool is_enabled_decimal128() const { return _enable_decimal128; }
-
   // Setters
 
   /**
@@ -226,18 +220,13 @@ class orc_reader_options {
    *
    * @param val Vector of fully qualified column names.
    */
-  void set_decimal_cols_as_float(std::vector<std::string> val)
+  [[deprecated(
+    "Decimal to float conversion is deprecated and will be removed in a future release")]] void
+  set_decimal_cols_as_float(std::vector<std::string> val)
   {
     _decimal_cols_as_float = std::move(val);
   }
 
-  /**
-   * @brief Enable/Disable the use of decimal128 type
-   *
-   * @param use Boolean value to enable/disable.
-   */
-  void enable_decimal128(bool use) { _enable_decimal128 = use; }
-
   /**
    * @brief Set columns that should be read as 128-bit Decimal
    *
@@ -357,7 +346,10 @@ class orc_reader_options_builder {
    * @param val Vector of column names.
    * @return this for chaining.
    */
-  orc_reader_options_builder& decimal_cols_as_float(std::vector<std::string> val)
+  [[deprecated(
+    "Decimal to float conversion is deprecated and will be removed in a future "
+    "release")]] orc_reader_options_builder&
+  decimal_cols_as_float(std::vector<std::string> val)
   {
     options._decimal_cols_as_float = std::move(val);
     return *this;
@@ -375,17 +367,6 @@ class orc_reader_options_builder {
     return *this;
   }
 
-  /**
-   * @brief Enable/Disable use of decimal128 type
-   *
-   * @param use Boolean value to enable/disable.
-   */
-  orc_reader_options_builder& decimal128(bool use)
-  {
-    options.enable_decimal128(use);
-    return *this;
-  }
-
   /**
    * @brief move orc_reader_options member once it's built.
    */
diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp
index 7551511d281..94039d81f31 100644
--- a/cpp/include/cudf/stream_compaction.hpp
+++ b/cpp/include/cudf/stream_compaction.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -189,7 +189,7 @@ std::unique_ptr<table> drop_nans(
  * @note if @p input.num_rows() is zero, there is no error, and an empty table
  * is returned.
  *
- * @throws cudf::logic_error if The `input` size and `boolean_mask` size mismatches.
+ * @throws cudf::logic_error if `input.num_rows() != boolean_mask.size()`.
  * @throws cudf::logic_error if `boolean_mask` is not `type_id::BOOL8` type.
  *
  * @param[in] input The input table_view to filter
@@ -214,7 +214,10 @@ enum class duplicate_keep_option {
 };
 
 /**
- * @brief Create a new table without duplicate rows
+ * @brief Create a new table without duplicate rows.
+ *
+ * The output table is sorted according to the lexicographic ordering of the data in the columns
+ * indexed by `keys`.
  *
  * Given an `input` table_view, each row is copied to output table if the corresponding
  * row of `keys` columns is unique, where the definition of unique depends on the value of @p keep:
  * - KEEP_FIRST: only the first of a sequence of duplicate rows is copied
  * - KEEP_LAST: only the last of a sequence of duplicate rows is copied
  * - KEEP_NONE: no duplicate rows are copied
  *
- * @throws cudf::logic_error if The `input` row size mismatches with `keys`.
+ * @throws cudf::logic_error if the `keys` column indices are out of bounds in the `input` table.
  *
  * @param[in] input input table_view to copy only unique rows
  * @param[in] keys vector of indices representing key columns from `input`
- * @param[in] keep keep first entry, last entry, or no entries if duplicates found
+ * @param[in] keep keep first row, last row, or no rows of the found duplicates
  * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not
  *            equal if null_equality::UNEQUAL
  * @param[in] null_precedence flag to denote nulls should appear before or after non-null items
  * @param[in] mr Device memory resource used to allocate the returned table's device
- *           memory
+ *            memory
  *
- * @return Table with unique rows as per specified `keep`.
+ * @return Table with sorted unique rows as specified by `keep`.
  */
 std::unique_ptr<table> drop_duplicates(
   table_view const& input,
@@ -244,37 +247,95 @@ std::unique_ptr<table> drop_duplicates(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Count the unique elements in the column_view
+ * @brief Create a new table without duplicate rows with hash-based algorithms.
+ *
+ * Given an `input` table_view, each row is copied to output table if the corresponding
+ * row of `keys` columns is unique. If duplicate rows are present, it is unspecified which
+ * row is copied.
  *
- * Given an input column_view, number of unique elements in this column_view is returned
+ * The order of elements in the output table is not specified.
+ *
+ * @param[in] input input table_view to copy only unique rows
+ * @param[in] keys vector of indices representing key columns from `input`
+ * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not
+ *            equal if null_equality::UNEQUAL
+ * @param[in] mr Device memory resource used to allocate the returned table's device
+ *            memory
+ *
+ * @return Table with unique rows in an unspecified order.
+ */
+std::unique_ptr<table> unordered_drop_duplicates(
+  table_view const& input,
+  std::vector<size_type> const& keys,
+  null_equality nulls_equal           = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
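For reviewers, a minimal call-site sketch of the hash-based API declared above; the signature and defaults come from this header, while the wrapper function itself is hypothetical.

```cpp
#include <cudf/stream_compaction.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>
#include <vector>

// Drop duplicate rows keyed on column 0. Unlike cudf::drop_duplicates, the
// result is not sorted and it is unspecified which row of each duplicate
// group survives; that relaxation is what lets the hash-based path skip
// the sort entirely.
std::unique_ptr<cudf::table> dedup_on_first_column(cudf::table_view const& input)
{
  return cudf::unordered_drop_duplicates(input, std::vector<cudf::size_type>{0});
}
```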
+
 /**
- * @brief Count the unique elements in the column_view
+ * @brief Count the number of consecutive groups of equivalent elements in a column.
  *
  * If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both
  * `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and
- * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in unique
- * count.
+ * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in count.
+ *
+ * `null`s are handled as equal.
  *
- * @param[in] input The column_view whose unique elements will be counted.
+ * @param[in] input The column_view whose number of distinct consecutive groups will be counted
  * @param[in] null_handling flag to include or ignore `null` while counting
- * @param[in] nan_handling flag to consider `NaN==null` or not.
+ * @param[in] nan_handling flag to consider `NaN==null` or not
  *
- * @return number of unique elements
+ * @return number of distinct consecutive groups in the column
  */
 cudf::size_type distinct_count(column_view const& input,
                                null_policy null_handling,
                                nan_policy nan_handling);
 
 /**
- * @brief Count the unique rows in a table.
- *
+ * @brief Count the number of consecutive groups of equivalent elements in a table.
  *
- * @param[in] input Table whose unique rows will be counted.
- * @param[in] nulls_equal flag to denote if null elements should be considered equal
- *            nulls are not equal if null_equality::UNEQUAL
+ * @param[in] input Table whose number of distinct consecutive groups will be counted
+ * @param[in] nulls_equal flag to denote if null elements should be considered equal.
+ *            nulls are not equal if null_equality::UNEQUAL.
  *
- * @return number of unique rows in the table
+ * @return number of distinct consecutive groups in the table
  */
 cudf::size_type distinct_count(table_view const& input,
                                null_equality nulls_equal = null_equality::EQUAL);
 
+/**
+ * @brief Count the unique elements in the column_view.
+ *
+ * If `nulls_equal == nulls_equal::UNEQUAL`, all `null`s are unique.
+ *
+ * Given an input column_view, number of unique elements in this column_view is returned.
+ *
+ * If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both
+ * `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and
+ * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in unique
+ * count.
+ *
+ * `null`s are handled as equal.
+ *
+ * @param[in] input The column_view whose unique elements will be counted
+ * @param[in] null_handling flag to include or ignore `null` while counting
+ * @param[in] nan_handling flag to consider `NaN==null` or not
+ *
+ * @return number of unique elements
+ */
+cudf::size_type unordered_distinct_count(column_view const& input,
+                                         null_policy null_handling,
+                                         nan_policy nan_handling);
+
+/**
+ * @brief Count the unique rows in a table.
+ *
+ * @param[in] input Table whose unique rows will be counted
+ * @param[in] nulls_equal flag to denote if null elements should be considered equal.
+ *            nulls are not equal if null_equality::UNEQUAL.
+ *
+ * @return number of unique rows in the table
+ */
+cudf::size_type unordered_distinct_count(table_view const& input,
+                                         null_equality nulls_equal = null_equality::EQUAL);
+
 /** @} */
 }  // namespace cudf
diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp
index 4cfd0b75cd4..0964e713592 100644
--- a/cpp/include/cudf/strings/find_multiple.hpp
+++ b/cpp/include/cudf/strings/find_multiple.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,30 +27,32 @@ namespace strings {
  */
 
 /**
- * @brief Returns a column with character position values where each
+ * @brief Returns a lists column with character position values where each
  * of the target strings are found in each string.
  *
- * The size of the output column is targets.size() * strings.size().
- * output[i] contains the position of target[i % targets.size()] in string[i/targets.size()]
+ * The size of the output column is `input.size()`.
+ * Each row of the output column is of size `targets.size()`.
+ *
+ * `output[i,j]` contains the position of `targets[j]` in `input[i]`
  *
  * @code{.pseudo}
  * Example:
- * s = ["abc","def"]
- * t = ["a","c","e"]
- * r = find_multiple(s,t)
- * r is now [ 0, 2,-1,   // for "abc": "a" at pos 0, "c" at pos 2, "e" not found
- *           -1,-1, 1 ]  // for "def": "a" and "b" not found, "e" at pos 1
+ * s = ["abc", "def"]
+ * t = ["a", "c", "e"]
+ * r = find_multiple(s, t)
+ * r is now {[ 0, 2,-1],  // for "abc": "a" at pos 0, "c" at pos 2, "e" not found
+ *           [-1,-1, 1 ]} // for "def": "a" and "b" not found, "e" at pos 1
  * @endcode
  *
- * @throw cudf::logic_error targets is empty or contains nulls
+ * @throw cudf::logic_error if `targets` is empty or contains nulls
  *
- * @param strings Strings instance for this operation.
+ * @param input Strings instance for this operation.
  * @param targets Strings to search for in each string.
  * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New integer column with character position values.
+ * @return Lists column with character position values.
  */
 std::unique_ptr<column> find_multiple(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   strings_column_view const& targets,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp
index 6c3139747af..4207cddbafb 100644
--- a/cpp/include/cudf/strings/findall.hpp
+++ b/cpp/include/cudf/strings/findall.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -52,7 +52,37 @@ namespace strings {
  * @param mr Device memory resource used to allocate the returned table's device memory.
  * @return New table of strings columns.
  */
-std::unique_ptr<table> findall_re(
+std::unique_ptr<table> findall(
   strings_column_view const& strings,
   std::string const& pattern,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns a lists column of strings for each matching occurrence of the
+ * regex pattern within each string.
+ *
+ * @code{.pseudo}
+ * Example:
+ *   s = ["bunny", "rabbit", "hare", "dog"]
+ *   r = findall_record(s, "[ab]")
+ *   r is now a lists column like:
+ *    [ ["b"]
+ *      ["a","b","b"]
+ *      ["a"]
+ *      null ]
+ * @endcode
+ *
+ * A null output row results if the pattern is not found in the corresponding row
+ * input string.
+ *
+ * See the @ref md_regex "Regex Features" page for details on patterns supported by this API.
+ *
+ * @param strings Strings instance for this operation.
+ * @param pattern Regex pattern to match within each string.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return New lists column of strings.
+ */
+std::unique_ptr<column> findall_record(
+  strings_column_view const& strings,
+  std::string const& pattern,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
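A short sketch contrasting the renamed `findall` with the new `findall_record`. Illustrative only; it leans on the cudf test column wrappers for input construction, which is an assumption of convenience rather than part of this change.

```cpp
#include <cudf/strings/findall.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <cudf_test/column_wrapper.hpp>

void findall_demo()
{
  // Same data as the doc example: ["bunny", "rabbit", "hare", "dog"]
  cudf::test::strings_column_wrapper input({"bunny", "rabbit", "hare", "dog"});
  auto const view = cudf::strings_column_view(input);

  // findall (formerly findall_re) returns a table of strings columns,
  // one column per match position found.
  auto const table_result = cudf::strings::findall(view, "[ab]");

  // findall_record returns a lists column with one row per input row:
  // [["b"], ["a","b","b"], ["a"], null]
  auto const lists_result = cudf::strings::findall_record(view, "[ab]");
}
```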
diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp
index 43cc059eddd..9d75295cd39 100644
--- a/cpp/include/nvtext/subword_tokenize.hpp
+++ b/cpp/include/nvtext/subword_tokenize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,9 +19,6 @@
 #include
 #include
 
-#include
-#include
-
 namespace nvtext {
 
 /**
@@ -43,6 +40,8 @@ struct hashed_vocabulary {
   std::unique_ptr<cudf::column> table;             // uint64
   std::unique_ptr<cudf::column> bin_coefficients;  // uint64
   std::unique_ptr<cudf::column> bin_offsets;       // uint16
+  std::unique_ptr<cudf::column> cp_metadata;       // uint32
+  std::unique_ptr<cudf::column> aux_cp_table;      // uint64
 };
 
 /**
diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py
index bac6697da82..33c369b254f 100755
--- a/cpp/scripts/sort_ninja_log.py
+++ b/cpp/scripts/sort_ninja_log.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 #
 import argparse
 import os
@@ -34,49 +34,63 @@
 # build a map of the log entries
 entries = {}
 with open(log_file, "r") as log:
+    last = 0
+    files = {}
     for line in log:
         entry = line.split()
         if len(entry) > 4:
-            elapsed = int(entry[1]) - int(entry[0])
             obj_file = entry[3]
             file_size = (
                 os.path.getsize(os.path.join(log_path, obj_file))
                 if os.path.exists(obj_file)
                 else 0
             )
-            entries[entry[3]] = (elapsed, file_size)
+            start = int(entry[0])
+            end = int(entry[1])
+            # logic based on ninjatracing
+            if end < last:
+                files = {}
+            last = end
+            files.setdefault(entry[4], (entry[3], start, end, file_size))
 
-# check file could be loaded
+    # build entries from files dict
+    for entry in files.values():
+        entries[entry[0]] = (entry[1], entry[2], entry[3])
+
+# check file could be loaded and we have entries to report
 if len(entries) == 0:
     print("Could not parse", log_file)
     exit()
 
-# sort the keys by build time (descending order)
-keys = list(entries.keys())
-sl = sorted(keys, key=lambda k: entries[k][0], reverse=True)
+# sort the entries by build-time (descending order)
+sorted_list = sorted(
+    list(entries.keys()),
+    key=lambda k: entries[k][1] - entries[k][0],
+    reverse=True,
+)
 
-if output_fmt == "xml":
-    # output results in XML format
+
+# output results in XML format
+def output_xml(entries, sorted_list, args):
     root = ET.Element("testsuites")
     testsuite = ET.Element(
         "testsuite",
         attrib={
             "name": "build-time",
-            "tests": str(len(keys)),
+            "tests": str(len(sorted_list)),
             "failures": str(0),
             "errors": str(0),
         },
     )
     root.append(testsuite)
-    for key in sl:
-        entry = entries[key]
-        elapsed = float(entry[0]) / 1000
+    for name in sorted_list:
+        entry = entries[name]
+        build_time = float(entry[1] - entry[0]) / 1000
         item = ET.Element(
             "testcase",
             attrib={
                 "classname": "BuildTime",
-                "name": key,
-                "time": str(elapsed),
+                "name": name,
+                "time": str(build_time),
             },
         )
         testsuite.append(item)
@@ -85,62 +99,219 @@
     xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent="   ")
     print(xmlstr)
 
-elif output_fmt == "html":
-    # output results in HTML format
-    print("<html><head><title>Sorted Ninja Build Times</title>")
-    # Note: Jenkins does not support style defined in the html
+
+# utility converts a millisecond value to a column width in pixels
+def time_to_width(value, end):
+    # map a value from (0,end) to (0,1000)
+    r = (float(value) / float(end)) * 1000.0
+    return int(r)
+
+
+# assign each entry to a thread by analyzing the start/end times and
+# slotting them into thread buckets where they fit
+def assign_entries_to_threads(entries):
+    # first sort the entries' keys by end timestamp
+    sorted_keys = sorted(
+        list(entries.keys()), key=lambda k: entries[k][1], reverse=True
+    )
+
+    # build the chart data by assigning entries to threads
+    results = {}
+    threads = []
+    for name in sorted_keys:
+        entry = entries[name]
+
+        # assign this entry by finding the first available thread identified
+        # by the thread's current start time greater than the entry's end time
+        tid = -1
+        for t in range(len(threads)):
+            if threads[t] >= entry[1]:
+                threads[t] = entry[0]
+                tid = t
+                break
+
+        # if no current thread found, create a new one with this entry
+        if tid < 0:
+            threads.append(entry[0])
+            tid = len(threads) - 1
+
+        # add entry name to the array associated with this tid
+        if tid not in results.keys():
+            results[tid] = []
+        results[tid].append(name)
+
+    # first entry has the last end time
+    end_time = entries[sorted_keys[0]][1]
+
+    # return the threaded entries and the last end time
+    return (results, end_time)
+
+
+# output chart results in HTML format
+def output_html(entries, sorted_list, args): + print("Build Metrics Report") + # Note: Jenkins does not support javascript nor style defined in the html # https://www.jenkins.io/doc/book/security/configuring-content-security-policy/ print("") if args.msg is not None: print("

", args.msg, "

") - print("
") - print( - "", - "", - "", - sep="", - ) - summary = {"red": 0, "yellow": 0, "green": 0} + + # map entries to threads + # the end_time is used to scale all the entries to a fixed output width + threads, end_time = assign_entries_to_threads(entries) + + # color ranges for build times + summary = {"red": 0, "yellow": 0, "green": 0, "white": 0} red = "bgcolor='#FFBBD0'" yellow = "bgcolor='#FFFF80'" green = "bgcolor='#AAFFBD'" - for key in sl: - result = entries[key] - elapsed = result[0] - color = green - if elapsed > 300000: # 5 minutes - color = red - summary["red"] += 1 - elif elapsed > 120000: # 2 minutes - color = yellow - summary["yellow"] += 1 - else: - summary["green"] += 1 + white = "bgcolor='#FFFFFF'" + + # create the build-time chart + print("
FileCompile time
(ms)
Size
(bytes)
") + for tid in range(len(threads)): + names = threads[tid] + # sort the names for this thread by start time + names = sorted(names, key=lambda k: entries[k][0]) + + # use the last entry's end time as the total row size + # (this is an estimate and does not have to be exact) + last_entry = entries[names[len(names) - 1]] + last_time = time_to_width(last_entry[1], end_time) print( - "", + "") + + # done with the chart + print("
", - key, - "", - result[0], - "", - result[1], - "
", sep="", ) - print("

") + + prev_end = 0 # used for spacing between entries + + # write out each entry for this thread as a column for a single row + for name in names: + entry = entries[name] + start = entry[0] + end = entry[1] + + # this handles minor gaps between end of the + # previous entry and the start of the next + if prev_end > 0 and start > prev_end: + size = time_to_width(start - prev_end, end_time) + print("") + # adjust for the cellspacing + prev_end = end + int(end_time / 500) + + # format the build-time + build_time = end - start + build_time_str = str(build_time) + " ms" + if build_time > 120000: # 2 minutes + minutes = int(build_time / 60000) + seconds = int(((build_time / 60000) - minutes) * 60) + build_time_str = "{:d}:{:02d} min".format(minutes, seconds) + elif build_time > 1000: + build_time_str = "{:.3f} s".format(build_time / 1000) + + # assign color and accumulate legend values + color = white + if build_time > 300000: # 5 minutes + color = red + summary["red"] += 1 + elif build_time > 120000: # 2 minutes + color = yellow + summary["yellow"] += 1 + elif build_time > 1000: # 1 second + color = green + summary["green"] += 1 + else: + summary["white"] += 1 + + # compute the pixel width based on build-time + size = max(time_to_width(build_time, end_time), 2) + # output the column for this entry + print("") + # update the entry with just the computed output info + entries[name] = (build_time_str, color, entry[2]) + + # add a filler column at the end of each row + print("
", end="") + # use a slightly smaller, fixed-width font + print("", end="") + + # add the file-name if it fits, otherwise, truncate the name + file_name = os.path.basename(name) + if len(file_name) + 3 > size / 7: + abbr_size = int(size / 7) - 3 + if abbr_size > 1: + print(file_name[:abbr_size], "...", sep="", end="") + else: + print(file_name, end="") + # done with this entry + print("

") + + # output detail table in build-time descending order + print("") + print( + "", + "", + "", + sep="", + ) + for name in sorted_list: + entry = entries[name] + build_time_str = entry[0] + color = entry[1] + file_size = entry[2] + + # format file size + file_size_str = "" + if file_size > 1000000: + file_size_str = "{:.3f} MB".format(file_size / 1000000) + elif file_size > 1000: + file_size_str = "{:.3f} KB".format(file_size / 1000) + elif file_size > 0: + file_size_str = str(file_size) + " bytes" + + # output entry row + print("", sep="", end="") + print("", sep="", end="") + print("", sep="") + + print("
FileCompile timeSize
", name, "", build_time_str, "", file_size_str, "

") + # include summary table with color legend + print("") print("time > 5 minutes") print("") print("2 minutes < time < 5 minutes") print("") - print("time < 2 minutes") + print("1 second < time < 2 minutes") print("") + print("time < 1 second") + print("") print("
", summary["red"], "
", summary["yellow"], "
", summary["green"], "
", summary["white"], "
") -else: - # output results in CSV format + +# output results in CSV format +def output_csv(entries, sorted_list, args): print("time,size,file") - for key in sl: - result = entries[key] - print(result[0], result[1], key, sep=",") + for name in sorted_list: + entry = entries[name] + build_time = entry[1] - entry[0] + file_size = entry[2] + print(build_time, file_size, name, sep=",") + + +if output_fmt == "xml": + output_xml(entries, sorted_list, args) +elif output_fmt == "html": + output_html(entries, sorted_list, args) +else: + output_csv(entries, sorted_list, args) diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index e3d1ea88ece..96b7fd48dc9 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -57,26 +58,29 @@ std::unique_ptr add_keys( // [a,b,c,d,f] + [d,b,e] = [a,b,c,d,f,d,b,e] auto combined_keys = cudf::detail::concatenate(std::vector{old_keys, new_keys}, stream); - // sort and remove any duplicates from the combined keys - // drop_duplicates([a,b,c,d,f,d,b,e]) = [a,b,c,d,e,f] - auto table_keys = cudf::detail::drop_duplicates(table_view{{combined_keys->view()}}, - std::vector{0}, // only one key column - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + + // Drop duplicates from the combined keys, then sort the result. + // sort(unordered_drop_duplicates([a,b,c,d,f,d,b,e])) = [a,b,c,d,e,f] + auto table_keys = + cudf::detail::unordered_drop_duplicates(table_view{{combined_keys->view()}}, + std::vector{0}, // only one key column + null_equality::EQUAL, + stream, + mr); + std::vector column_order{order::ASCENDING}; + std::vector null_precedence{null_order::AFTER}; // should be no nulls here + auto sorted_keys = + cudf::detail::sort(table_keys->view(), column_order, null_precedence, stream, mr)->release(); + + std::unique_ptr keys_column(std::move(sorted_keys.front())); // create a map for the indices // lower_bound([a,b,c,d,e,f],[a,b,c,d,f]) = [0,1,2,3,5] - auto map_indices = cudf::detail::lower_bound( - table_view{{keys_column->view()}}, - table_view{{old_keys}}, - std::vector{order::ASCENDING}, - std::vector{null_order::AFTER}, // should be no nulls here - stream, - mr); + auto map_indices = cudf::detail::lower_bound(table_view{{keys_column->view()}}, + table_view{{old_keys}}, + column_order, + null_precedence, + stream, + mr); // now create the indices column -- map old values to the new ones // gather([4,0,3,1,2,2,2,4,0],[0,1,2,3,5]) = [5,0,3,1,2,2,2,5,0] column_view indices_view(dictionary_column.indices().type(), diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index fd86d8ec7d4..301338fa1a8 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -216,15 +217,15 @@ std::unique_ptr concatenate(host_span columns, // sort keys and remove duplicates; // this becomes the keys child for the output dictionary column - auto table_keys = cudf::detail::drop_duplicates(table_view{{all_keys->view()}}, - std::vector{0}, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + auto table_keys = cudf::detail::unordered_drop_duplicates( + table_view{{all_keys->view()}}, std::vector{0}, null_equality::EQUAL, stream, mr); + auto sorted_keys = cudf::detail::sort(table_keys->view(), + std::vector{order::ASCENDING}, + std::vector{null_order::BEFORE}, + stream, + mr) + ->release(); + std::unique_ptr keys_column(std::move(sorted_keys.front())); // next, concatenate the indices std::vector indices_views(columns.size()); diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 72f6e034479..c1fb1fa2180 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -120,16 +121,17 @@ std::unique_ptr set_keys( auto keys = dictionary_column.keys(); CUDF_EXPECTS(keys.type() == new_keys.type(), "keys types must match"); - // copy the keys -- use drop_duplicates to make sure they are sorted and unique - auto table_keys = cudf::detail::drop_duplicates(table_view{{new_keys}}, - std::vector{0}, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + // copy the keys -- use unordered_drop_duplicates to make sure they are unique, then + // sort the results. + auto unique_keys = cudf::detail::unordered_drop_duplicates( + table_view{{new_keys}}, std::vector{0}, null_equality::EQUAL, stream, mr); + auto sorted_keys = cudf::detail::sort(unique_keys->view(), + std::vector{order::ASCENDING}, + std::vector{null_order::BEFORE}, + stream, + mr) + ->release(); + std::unique_ptr keys_column(std::move(sorted_keys.front())); // compute the new nulls auto matches = cudf::detail::contains(keys, keys_column->view(), stream, mr); diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index e8b4a8b1cbf..57bb222aaa0 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
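set_keys gets the same treatment just above: de-duplicate via the hash-based path, then sort. Its observable behavior is unchanged; a usage sketch, assuming the public entry point declared in cudf/dictionary/update_keys.hpp:

#include <cudf/column/column.hpp>
#include <cudf/dictionary/dictionary_column_view.hpp>
#include <cudf/dictionary/update_keys.hpp>

std::unique_ptr<cudf::column> with_new_keys(cudf::column_view const& dict_col,
                                            cudf::column_view const& new_keys)
{
  // Rows whose current key is absent from new_keys become null
  // (computed via the detail::contains call shown above).
  return cudf::dictionary::set_keys(cudf::dictionary_column_view(dict_col), new_keys);
}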
@@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,6 @@ #include -#include #include #include @@ -219,20 +219,18 @@ std::pair, std::vector> groupby::scan groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto grouped_keys = helper().sorted_keys(rmm::cuda_stream_default, mr); + auto const stream = rmm::cuda_stream_default; + auto grouped_keys = helper().sorted_keys(stream, mr); - auto const& group_offsets = helper().group_offsets(rmm::cuda_stream_default); - std::vector group_offsets_vector(group_offsets.size()); - thrust::copy(thrust::device_pointer_cast(group_offsets.begin()), - thrust::device_pointer_cast(group_offsets.end()), - group_offsets_vector.begin()); + auto const& group_offsets = helper().group_offsets(stream); + auto const group_offsets_vector = cudf::detail::make_std_vector_sync(group_offsets, stream); - if (values.num_columns()) { + if (not values.is_empty()) { auto grouped_values = cudf::detail::gather(values, - helper().key_sort_order(rmm::cuda_stream_default), + helper().key_sort_order(stream), cudf::out_of_bounds_policy::DONT_CHECK, cudf::detail::negative_index_policy::NOT_ALLOWED, - rmm::cuda_stream_default, + stream, mr); return groupby::groups{ std::move(grouped_keys), std::move(group_offsets_vector), std::move(grouped_values)}; diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 817b9fd7b01..f133b79a27e 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -232,7 +232,6 @@ size_t gather_stream_info(const size_t stripe_index, */ auto decimal_column_type(std::vector const& float64_columns, std::vector const& decimal128_columns, - bool is_decimal128_enabled, cudf::io::orc::detail::aggregate_orc_metadata const& metadata, int column_index) { @@ -244,7 +243,7 @@ auto decimal_column_type(std::vector const& float64_columns, }; auto const user_selected_float64 = is_column_in(float64_columns); - auto const user_selected_decimal128 = is_decimal128_enabled and is_column_in(decimal128_columns); + auto const user_selected_decimal128 = is_column_in(decimal128_columns); CUDF_EXPECTS(not user_selected_float64 or not user_selected_decimal128, "Both decimal128 and float64 types selected for column " + column_path); @@ -255,9 +254,6 @@ auto decimal_column_type(std::vector const& float64_columns, .precision.value_or(cuda::std::numeric_limits::digits10); if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL32; if (precision <= cuda::std::numeric_limits::digits10) return type_id::DECIMAL64; - CUDF_EXPECTS(is_decimal128_enabled, - "Decimal precision too high for decimal64, use `decimal_cols_as_float` or enable " - "decimal128 use"); return type_id::DECIMAL128; } @@ -754,8 +750,7 @@ std::unique_ptr reader::impl::create_empty_column(const size_type orc_co _metadata.get_schema(orc_col_id), _use_np_dtypes, _timestamp_type.id(), - decimal_column_type( - _decimal_cols_as_float, decimal128_columns, is_decimal128_enabled, _metadata, orc_col_id)); + decimal_column_type(_decimal_cols_as_float, decimal128_columns, _metadata, orc_col_id)); int32_t scale = 0; std::vector> child_columns; std::unique_ptr out_col = nullptr; @@ -900,7 +895,6 @@ reader::impl::impl(std::vector>&& sources, // Control decimals conversion _decimal_cols_as_float = options.get_decimal_cols_as_float(); decimal128_columns = options.get_decimal128_columns(); - is_decimal128_enabled = options.is_enabled_decimal128(); } timezone_table 
reader::impl::compute_timezone_table( @@ -964,8 +958,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, _metadata.get_col_type(col.id), _use_np_dtypes, _timestamp_type.id(), - decimal_column_type( - _decimal_cols_as_float, decimal128_columns, is_decimal128_enabled, _metadata, col.id)); + decimal_column_type(_decimal_cols_as_float, decimal128_columns, _metadata, col.id)); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or col_type == type_id::DECIMAL128) { diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index e8aa298012b..1e586bcde00 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -223,7 +223,6 @@ class reader::impl { bool _use_np_dtypes{true}; std::vector _decimal_cols_as_float; std::vector decimal128_columns; - bool is_decimal128_enabled{true}; data_type _timestamp_type{type_id::EMPTY}; reader_column_meta _col_meta{}; }; diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index c2115c3caa4..21bfd8120f7 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -68,7 +68,7 @@ class make_pair_function { { // Compute the hash value of row `i` auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel); - return cuco::make_pair(std::move(row_hash_value), std::move(i)); + return cuco::make_pair(row_hash_value, i); } private: diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index f38e653c4a6..e492968b8a6 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -45,7 +45,7 @@ struct make_pair_function_semi { { // The value is irrelevant since we only ever use the hash map to check for // membership of a particular row index. - return cuco::make_pair(i, 0); + return cuco::make_pair(static_cast(i), 0); } }; diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 8563a2a3bd3..39fe0b60c8c 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -49,7 +49,7 @@ struct make_pair_function { { // The value is irrelevant since we only ever use the hash map to check for // membership of a particular row index. - return cuco::make_pair(i, 0); + return cuco::make_pair(static_cast(i), 0); } }; diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 6f9149a47e2..234eaf51f96 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -93,7 +93,7 @@ struct reduce_dispatch_functor { case aggregation::NUNIQUE: { auto nunique_agg = dynamic_cast(agg.get()); return make_fixed_width_scalar( - detail::distinct_count( + detail::unordered_distinct_count( col, nunique_agg->_null_handling, nan_policy::NAN_IS_VALID, stream), stream, mr); diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 5c695f8a16f..2c7488084b5 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
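With the NUNIQUE reduction above now delegating to unordered_distinct_count, the hash-based count is also reachable directly. A sketch against the public overload added later in this diff (defaults assumed):

#include <cudf/column/column_view.hpp>
#include <cudf/stream_compaction.hpp>

cudf::size_type nunique(cudf::column_view const& col)
{
  // Hash-based distinct count: no sort pass, nulls excluded, NaN kept as a value.
  return cudf::unordered_distinct_count(
    col, cudf::null_policy::EXCLUDE, cudf::nan_policy::NAN_IS_VALID);
}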
@@ -14,14 +14,18 @@ * limitations under the License. */ +#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" + #include #include #include +#include +#include #include #include #include #include -#include #include #include @@ -30,39 +34,19 @@ #include #include +#include + +#include +#include +#include +#include #include namespace cudf { namespace detail { - -cudf::size_type distinct_count(table_view const& keys, - null_equality nulls_equal, - rmm::cuda_stream_view stream) -{ - // sort only indices - auto sorted_indices = sorted_order(keys, - std::vector{}, - std::vector{}, - stream, - rmm::mr::get_current_device_resource()); - - // count unique elements - auto sorted_row_index = sorted_indices->view().data(); - auto device_input_table = cudf::table_device_view::create(keys, stream); - - row_equality_comparator comp( - nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal); - return thrust::count_if( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(keys.num_rows()), - [sorted_row_index, comp] __device__(cudf::size_type i) { - return (i == 0 || not comp(sorted_row_index[i], sorted_row_index[i - 1])); - }); -} - +namespace { /** - * @brief Functor to check for `NAN` at an index in a `column_device_view`. + * @brief Functor to check for `NaN` at an index in a `column_device_view`. * * @tparam T The type of `column_device_view` */ @@ -76,97 +60,199 @@ struct check_for_nan { check_for_nan(cudf::column_device_view input) : _input{input} {} /** - * @brief Operator to be called to check for `NAN` at `index` in `_input` + * @brief Operator to be called to check for `NaN` at `index` in `_input` * - * @param[in] index The index at which the `NAN` needs to be checked in `input` + * @param[in] index The index at which the `NaN` needs to be checked in `input` * - * @returns bool true if value at `index` is `NAN` and not null, else false + * @returns bool true if value at `index` is `NaN` and not null, else false */ - __device__ bool operator()(size_type index) + __device__ bool operator()(size_type index) const noexcept { return std::isnan(_input.data()[index]) and _input.is_valid(index); } - protected: cudf::column_device_view _input; }; /** * @brief A structure to be used along with type_dispatcher to check if a - * `column_view` has `NAN`. + * `column_view` has `NaN`. */ struct has_nans { /** - * @brief Checks if `input` has `NAN` + * @brief Checks if `input` has `NaN` * * @note This will be applicable only for floating point type columns. * - * @param[in] input The `column_view` which will be checked for `NAN` + * @param[in] input The `column_view` which will be checked for `NaN` * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
* - * @returns bool true if `input` has `NAN` else false + * @returns bool true if `input` has `NaN` else false */ - template ::value>* = nullptr> + template >* = nullptr> bool operator()(column_view const& input, rmm::cuda_stream_view stream) { auto input_device_view = cudf::column_device_view::create(input, stream); auto device_view = *input_device_view; - auto count = thrust::count_if(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - check_for_nan(device_view)); - return count > 0; + return thrust::any_of(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + check_for_nan(device_view)); } /** - * @brief Checks if `input` has `NAN` + * @brief Checks if `input` has `NaN` * * @note This will be applicable only for non-floating point type columns. And - * non-floating point columns can never have `NAN`, so it will always return + * non-floating point columns can never have `NaN`, so it will always return * false * - * @param[in] input The `column_view` which will be checked for `NAN` + * @param[in] input The `column_view` which will be checked for `NaN` * @param[in] stream CUDA stream used for device memory operations and kernel launches. * - * @returns bool Always false as non-floating point columns can't have `NAN` + * @returns bool Always false as non-floating point columns can't have `NaN` */ - template ::value>* = nullptr> - bool operator()(column_view const& input, rmm::cuda_stream_view stream) + template >* = nullptr> + bool operator()(column_view const&, rmm::cuda_stream_view) + { + return false; + } +}; + +/** + * @brief A functor to be used along with device type_dispatcher to check if + * the row `index` of `column_device_view` is `NaN`. + */ +struct check_nan { + // Check if it's `NaN` for floating point type columns + template >* = nullptr> + __device__ inline bool operator()(column_device_view const& input, size_type index) + { + return std::isnan(input.data()[index]); + } + // Non-floating point type columns can never have `NaN`, so it will always return false. 
+ template >* = nullptr> + __device__ inline bool operator()(column_device_view const&, size_type) { return false; } }; +} // namespace + +cudf::size_type distinct_count(table_view const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream) +{ + auto table_ptr = cudf::table_device_view::create(keys, stream); + row_equality_comparator comp( + nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal); + return thrust::count_if( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(keys.num_rows()), + [comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); }); +} + +cudf::size_type unordered_distinct_count(table_view const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream) +{ + auto table_ptr = cudf::table_device_view::create(keys, stream); + auto const num_rows = table_ptr->num_rows(); + auto const has_null = nullate::DYNAMIC{cudf::has_nulls(keys)}; + + hash_map_type key_map{compute_hash_table_size(num_rows), + COMPACTION_EMPTY_KEY_SENTINEL, + COMPACTION_EMPTY_VALUE_SENTINEL, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + compaction_hash hash_key{has_null, *table_ptr}; + row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_type i) { return cuco::make_pair(i, i); }); + + // when nulls are equal, insert non-null rows only to improve efficiency + if (nulls_equal == null_equality::EQUAL and has_null) { + thrust::counting_iterator stencil(0); + auto const [row_bitmask, null_count] = cudf::detail::bitmask_or(keys, stream); + row_validity pred{static_cast(row_bitmask.data())}; + + key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value()); + return key_map.get_size() + static_cast((null_count > 0) ? 1 : 0); + } + // otherwise, insert all + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + return key_map.get_size(); +} cudf::size_type distinct_count(column_view const& input, null_policy null_handling, nan_policy nan_handling, rmm::cuda_stream_view stream) { - if (0 == input.size() || input.null_count() == input.size()) { return 0; } - - cudf::size_type nrows = input.size(); - - bool has_nan = false; - // Check for Nans - // Checking for nulls in input and flag nan_handling, as the count will - // only get affected if these two conditions are true. NAN will only be - // be an extra if nan_handling was NAN_IS_NULL and input also had null, which - // will increase the count by 1. 
- if (input.has_nulls() and nan_handling == nan_policy::NAN_IS_NULL) { - has_nan = cudf::type_dispatcher(input.type(), has_nans{}, input, stream); - } + auto const num_rows = input.size(); - auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream); + if (num_rows == 0 or num_rows == input.null_count()) { return 0; } - // if nan is considered null and there are already null values - if (nan_handling == nan_policy::NAN_IS_NULL and has_nan and input.has_nulls()) --count; + auto const count_nulls = null_handling == null_policy::INCLUDE; + auto const nan_is_null = nan_handling == nan_policy::NAN_IS_NULL; + auto const should_check_nan = cudf::is_floating_point(input.type()); + auto input_device_view = cudf::column_device_view::create(input, stream); + auto device_view = *input_device_view; + auto input_table_view = table_view{{input}}; + auto table_ptr = cudf::table_device_view::create(input_table_view, stream); + row_equality_comparator comp(nullate::DYNAMIC{cudf::has_nulls(input_table_view)}, + *table_ptr, + *table_ptr, + null_equality::EQUAL); - if (null_handling == null_policy::EXCLUDE and input.has_nulls()) - return --count; - else - return count; + return thrust::count_if( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + [count_nulls, nan_is_null, should_check_nan, device_view, comp] __device__(cudf::size_type i) { + auto const is_null = device_view.is_null(i); + auto const is_nan = nan_is_null and should_check_nan and + cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i); + if (not count_nulls and (is_null or (nan_is_null and is_nan))) { return false; } + if (i == 0) { return true; } + if (count_nulls and nan_is_null and (is_nan or is_null)) { + auto const prev_is_nan = + should_check_nan and + cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1); + return not(prev_is_nan or device_view.is_null(i - 1)); + } + return not comp(i, i - 1); + }); } +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream) +{ + if (0 == input.size() or input.null_count() == input.size()) { return 0; } + + auto count = detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); + + // Check for nulls. If the null policy is EXCLUDE and null values were found, + // we decrement the count. + auto const has_null = input.has_nulls(); + if (null_handling == null_policy::EXCLUDE and has_null) { --count; } + + // Check for NaNs. There are two cases that can lead to decrementing the + // count. The first case is when the input has no nulls, but has NaN values + // handled as a null via NAN_IS_NULL and has a policy to EXCLUDE null values + // from the count. The second case is when the input has null values and NaN + // values handled as nulls via NAN_IS_NULL. Regardless of whether the null + // policy is set to EXCLUDE, we decrement the count to avoid double-counting + // null and NaN as distinct entities. 
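// A concrete walk-through of the decrement rules above (values hypothetical):
//   input = [1.0, NaN, null]  ->  table-level unordered_distinct_count = 3
//   NAN_IS_NULL  + INCLUDE: NaN and null merge into one entity  -> 3 - 1 = 2
//   NAN_IS_NULL  + EXCLUDE: the merged null entity is dropped   -> 3 - 1 - 1 = 1
//   NAN_IS_VALID + EXCLUDE: only the null row is dropped        -> 3 - 1 = 2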
+ auto const has_nan_as_null = (nan_handling == nan_policy::NAN_IS_NULL) and + cudf::type_dispatcher(input.type(), has_nans{}, input, stream); + if (has_nan_as_null and (has_null or null_handling == null_policy::EXCLUDE)) { --count; } + return count; +} } // namespace detail cudf::size_type distinct_count(column_view const& input, @@ -183,4 +269,18 @@ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equa return detail::distinct_count(input, nulls_equal); } +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling) +{ + CUDF_FUNC_RANGE(); + return detail::unordered_distinct_count(input, null_handling, nan_handling); +} + +cudf::size_type unordered_distinct_count(table_view const& input, null_equality nulls_equal) +{ + CUDF_FUNC_RANGE(); + return detail::unordered_distinct_count(input, nulls_equal); +} + } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index abc34663aee..2fd1f530b6d 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,16 @@ * limitations under the License. */ -#include +#include "drop_duplicates.cuh" +#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" #include #include #include #include #include +#include #include #include #include @@ -37,6 +40,7 @@ #include #include +#include #include namespace cudf { @@ -85,12 +89,12 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, auto comp = row_equality_comparator( nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal); - auto result_end = unique_copy(sorted_indices->view().begin(), - sorted_indices->view().end(), - unique_indices.begin(), - comp, - keep, - stream); + auto result_end = cudf::detail::unique_copy(sorted_indices->view().begin(), + sorted_indices->view().end(), + unique_indices.begin(), + comp, + keep, + stream); return cudf::detail::slice(column_view(unique_indices), 0, @@ -106,7 +110,7 @@ std::unique_ptr drop_duplicates(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (0 == input.num_rows() || 0 == input.num_columns() || 0 == keys.size()) { + if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); } @@ -130,6 +134,62 @@ std::unique_ptr
drop_duplicates(table_view const& input, mr); } +std::unique_ptr<table>
unordered_drop_duplicates(table_view const& input, + std::vector const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) { + return empty_like(input); + } + + auto keys_view = input.select(keys); + auto table_ptr = cudf::table_device_view::create(keys_view, stream); + auto has_null = nullate::DYNAMIC{cudf::has_nulls(keys_view)}; + auto const num_rows{table_ptr->num_rows()}; + + hash_map_type key_map{compute_hash_table_size(num_rows), + COMPACTION_EMPTY_KEY_SENTINEL, + COMPACTION_EMPTY_VALUE_SENTINEL, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + compaction_hash hash_key{has_null, *table_ptr}; + row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); + + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_type i) { return cuco::make_pair(i, i); }); + // insert unique indices into the map. + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + + auto counting_iter = thrust::make_counting_iterator(0); + rmm::device_uvector index_exists_in_map(num_rows, stream, mr); + // enumerate all indices to check if they are present in the map. + key_map.contains(counting_iter, counting_iter + num_rows, index_exists_in_map.begin(), hash_key); + + auto const output_size{key_map.get_size()}; + + // write unique indices to a numeric column + auto unique_indices = cudf::make_numeric_column( + data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr); + auto mutable_view = mutable_column_device_view::create(*unique_indices, stream); + thrust::copy_if(rmm::exec_policy(stream), + counting_iter, + counting_iter + num_rows, + index_exists_in_map.begin(), + mutable_view->begin(), + thrust::identity{}); + + // run gather operation to establish new order + return detail::gather(input, + unique_indices->view(), + out_of_bounds_policy::DONT_CHECK, + detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} + } // namespace detail std::unique_ptr
drop_duplicates(table_view const& input, @@ -144,4 +204,13 @@ std::unique_ptr<table>
drop_duplicates(table_view const& input, input, keys, keep, nulls_equal, null_precedence, rmm::cuda_stream_default, mr); } +std::unique_ptr<table>
unordered_drop_duplicates(table_view const& input, + std::vector const& keys, + null_equality nulls_equal, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::unordered_drop_duplicates(input, keys, nulls_equal, rmm::cuda_stream_default, mr); +} + } // namespace cudf diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh new file mode 100644 index 00000000000..8ba9223a1bc --- /dev/null +++ b/cpp/src/stream_compaction/stream_compaction_common.cuh @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "stream_compaction_common.hpp" + +namespace cudf { +namespace detail { + +/** + * @brief Device callable to hash a given row. + */ +template +class compaction_hash { + public: + compaction_hash(Nullate has_nulls, table_device_view t) : _hash{has_nulls, t} {} + + __device__ inline auto operator()(size_type i) const noexcept + { + auto hash = _hash(i); + return (hash == COMPACTION_EMPTY_KEY_SENTINEL) ? (hash - 1) : hash; + } + + private: + row_hash _hash; +}; + +/** + * @brief Device functor to determine if a row is valid. + */ +class row_validity { + public: + row_validity(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} + + __device__ inline bool operator()(const size_type& i) const noexcept + { + return cudf::bit_is_set(_row_bitmask, i); + } + + private: + bitmask_type const* _row_bitmask; +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp new file mode 100644 index 00000000000..1d743eccdbe --- /dev/null +++ b/cpp/src/stream_compaction/stream_compaction_common.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
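The hash remap in compaction_hash above exists because cuco::static_map reserves the empty-key sentinel, so any row that happens to hash to that value must be nudged off it. A scalar sketch of the idea (sentinel width assumed):

#include <cstdint>
#include <limits>

constexpr auto EMPTY_KEY_SENTINEL = std::numeric_limits<std::uint32_t>::max();

// A computed hash equal to the reserved sentinel is shifted by one so it can
// never collide with the map's "empty slot" marker.
std::uint32_t remap_hash(std::uint32_t h)
{
  return (h == EMPTY_KEY_SENTINEL) ? (h - 1) : h;
}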
+ */ +#pragma once + +#include +#include +#include + +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace detail { + +constexpr auto COMPACTION_EMPTY_KEY_SENTINEL = std::numeric_limits::max(); +constexpr auto COMPACTION_EMPTY_VALUE_SENTINEL = std::numeric_limits::min(); + +using hash_type = cuco::detail::MurmurHash3_32; + +using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; + +using hash_map_type = + cuco::static_map; + +using row_hash = cudf::row_hasher; + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/strings/count_matches.cuh b/cpp/src/strings/count_matches.cu similarity index 87% rename from cpp/src/strings/count_matches.cuh rename to cpp/src/strings/count_matches.cu index c14142f4779..d0a6825666b 100644 --- a/cpp/src/strings/count_matches.cuh +++ b/cpp/src/strings/count_matches.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,16 +14,13 @@ * limitations under the License. */ -#pragma once - +#include #include -#include #include #include #include -#include #include #include @@ -32,6 +29,7 @@ namespace cudf { namespace strings { namespace detail { +namespace { /** * @brief Functor counts the total matches to the given regex in each string. */ @@ -50,12 +48,13 @@ struct count_matches_fn { int32_t end = d_str.length(); while ((begin < end) && (prog.find(idx, d_str, begin, end) > 0)) { ++count; - begin = end; + begin = end + (begin == end); end = d_str.length(); } return count; } }; +} // namespace /** * @brief Returns a column of regex match counts for each string in the given column. @@ -67,11 +66,10 @@ struct count_matches_fn { * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. */ -std::unique_ptr count_matches( - column_device_view const& d_strings, - reprog_device const& d_prog, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr count_matches(column_device_view const& d_strings, + reprog_device const& d_prog, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Create output column auto counts = make_numeric_column( diff --git a/cpp/src/strings/count_matches.hpp b/cpp/src/strings/count_matches.hpp new file mode 100644 index 00000000000..1339f2b1ebd --- /dev/null +++ b/cpp/src/strings/count_matches.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { + +class column_device_view; + +namespace strings { +namespace detail { + +class reprog_device; + +/** + * @brief Returns a column of regex match counts for each string in the given column. 
+ * + * A null entry will result in a zero count for that output row. + * + * @param d_strings Device view of the input strings column. + * @param d_prog Regex instance to evaluate on each string. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + */ +std::unique_ptr count_matches( + column_device_view const& d_strings, + reprog_device const& d_prog, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index 584741298c2..c4749eae003 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 7be88d01387..244cec1d780 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -830,6 +830,7 @@ class regex_compiler { m_prog.set_start_inst(andstack[andstack.size() - 1].id_first); m_prog.optimize1(); m_prog.optimize2(); + m_prog.check_for_errors(); m_prog.set_groups_count(cursubid); } }; @@ -926,6 +927,68 @@ void reprog::optimize2() _startinst_ids.push_back(-1); // terminator mark } +/** + * @brief Check a specific instruction for errors. + * + * Currently this is checking for an infinite-loop condition as documented in this issue: + * https://github.com/rapidsai/cudf/issues/10006 + * + * Example instructions list created from pattern `(A?)+` + * ``` + * 0: CHAR c='A', next=2 + * 1: OR right=0, left=2, next=2 + * 2: RBRA id=1, next=4 + * 3: LBRA id=1, next=1 + * 4: OR right=3, left=5, next=5 + * 5: END + * ``` + * + * Following the example above, the instruction at `id==1` (OR) + * is being checked. If the instruction path returns to `id==1` + * without including the `0==CHAR` or `5==END` as in this example, + * then this would cause the runtime to go into an infinite-loop. + * + * It appears this example pattern is not valid. But Python interprets + * its behavior similarly to pattern `(A*)`. Handling this in the same + * way does not look feasible with the current implementation. + * + * @throw cudf::logic_error if instruction logic error is found + * + * @param id Instruction to check if repeated. + * @param next_id Next instruction to process. + */ +void reprog::check_for_errors(int32_t id, int32_t next_id) +{ + auto inst = inst_at(next_id); + while (inst.type == LBRA || inst.type == RBRA) { + next_id = inst.u2.next_id; + inst = inst_at(next_id); + } + if (inst.type == OR) { + CUDF_EXPECTS(next_id != id, "Unsupported regex pattern"); + check_for_errors(id, inst.u2.left_id); + check_for_errors(id, inst.u1.right_id); + } +} + +/** + * @brief Check regex instruction set for any errors. + * + * Currently, this checks for OR instructions that eventually point back to themselves with only + * intervening capture group instructions between causing an infinite-loop during runtime + * evaluation. 
+ */ +void reprog::check_for_errors() +{ + for (auto id = 0; id < insts_count(); ++id) { + auto const inst = inst_at(id); + if (inst.type == OR) { + check_for_errors(id, inst.u2.left_id); + check_for_errors(id, inst.u1.right_id); + } + } +} + #ifndef NDEBUG void reprog::print(regex_flags const flags) { @@ -933,83 +996,81 @@ void reprog::print(regex_flags const flags) printf("Instructions:\n"); for (std::size_t i = 0; i < _insts.size(); i++) { const reinst& inst = _insts[i]; - printf("%zu :", i); + printf("%3zu: ", i); switch (inst.type) { - default: printf("Unknown instruction: %d, nextid= %d", inst.type, inst.u2.next_id); break; + default: printf("Unknown instruction: %d, next=%d", inst.type, inst.u2.next_id); break; case CHAR: - if (inst.u1.c <= 32 || inst.u1.c >= 127) - printf( - "CHAR, c = '0x%02x', nextid= %d", static_cast(inst.u1.c), inst.u2.next_id); - else - printf("CHAR, c = '%c', nextid= %d", inst.u1.c, inst.u2.next_id); + if (inst.u1.c <= 32 || inst.u1.c >= 127) { + printf(" CHAR c='0x%02x', next=%d", static_cast(inst.u1.c), inst.u2.next_id); + } else { + printf(" CHAR c='%c', next=%d", inst.u1.c, inst.u2.next_id); + } break; - case RBRA: printf("RBRA, subid= %d, nextid= %d", inst.u1.subid, inst.u2.next_id); break; - case LBRA: printf("LBRA, subid= %d, nextid= %d", inst.u1.subid, inst.u2.next_id); break; + case RBRA: printf(" RBRA id=%d, next=%d", inst.u1.subid, inst.u2.next_id); break; + case LBRA: printf(" LBRA id=%d, next=%d", inst.u1.subid, inst.u2.next_id); break; case OR: - printf("OR, rightid=%d, leftid=%d, nextid=%d", - inst.u1.right_id, - inst.u2.left_id, - inst.u2.next_id); + printf( + " OR right=%d, left=%d, next=%d", inst.u1.right_id, inst.u2.left_id, inst.u2.next_id); break; - case STAR: printf("STAR, nextid= %d", inst.u2.next_id); break; - case PLUS: printf("PLUS, nextid= %d", inst.u2.next_id); break; - case QUEST: printf("QUEST, nextid= %d", inst.u2.next_id); break; - case ANY: printf("ANY, nextid= %d", inst.u2.next_id); break; - case ANYNL: printf("ANYNL, nextid= %d", inst.u2.next_id); break; - case NOP: printf("NOP, nextid= %d", inst.u2.next_id); break; + case STAR: printf(" STAR next=%d", inst.u2.next_id); break; + case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break; + case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break; + case ANY: printf(" ANY next=%d", inst.u2.next_id); break; + case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break; + case NOP: printf(" NOP next=%d", inst.u2.next_id); break; case BOL: { - printf("BOL, c = "); + printf(" BOL c="); if (inst.u1.c == '\n') { printf("'\\n'"); } else { printf("'%c'", inst.u1.c); } - printf(", nextid= %d", inst.u2.next_id); + printf(", next=%d", inst.u2.next_id); break; } case EOL: { - printf("EOL, c = "); + printf(" EOL c="); if (inst.u1.c == '\n') { printf("'\\n'"); } else { printf("'%c'", inst.u1.c); } - printf(", nextid= %d", inst.u2.next_id); + printf(", next=%d", inst.u2.next_id); break; } - case CCLASS: printf("CCLASS, cls_id=%d , nextid= %d", inst.u1.cls_id, inst.u2.next_id); break; - case NCCLASS: - printf("NCCLASS, cls_id=%d , nextid= %d", inst.u1.cls_id, inst.u2.next_id); - break; - case BOW: printf("BOW, nextid= %d", inst.u2.next_id); break; - case NBOW: printf("NBOW, nextid= %d", inst.u2.next_id); break; - case END: printf("END"); break; + case CCLASS: printf(" CCLASS cls=%d , next=%d", inst.u1.cls_id, inst.u2.next_id); break; + case NCCLASS: printf("NCCLASS cls=%d, next=%d", inst.u1.cls_id, inst.u2.next_id); break; + case BOW: printf(" BOW next=%d", inst.u2.next_id); break; + 
case NBOW: printf(" NBOW next=%d", inst.u2.next_id); break; + case END: printf(" END"); break; } printf("\n"); } printf("startinst_id=%d\n", _startinst_id); if (_startinst_ids.size() > 0) { - printf("startinst_ids:"); - for (size_t i = 0; i < _startinst_ids.size(); i++) + printf("startinst_ids: ["); + for (size_t i = 0; i < _startinst_ids.size(); i++) { printf(" %d", _startinst_ids[i]); - printf("\n"); + } + printf("]\n"); } int count = static_cast(_classes.size()); printf("\nClasses %d\n", count); for (int i = 0; i < count; i++) { const reclass& cls = _classes[i]; - int len = static_cast(cls.literals.size()); + auto const size = static_cast(cls.literals.size()); printf("%2d: ", i); - for (int j = 0; j < len; j += 2) { + for (int j = 0; j < size; j += 2) { char32_t c1 = cls.literals[j]; char32_t c2 = cls.literals[j + 1]; - if (c1 <= 32 || c1 >= 127 || c2 <= 32 || c2 >= 127) + if (c1 <= 32 || c1 >= 127 || c2 <= 32 || c2 >= 127) { printf("0x%02x-0x%02x", static_cast(c1), static_cast(c2)); - else + } else { printf("%c-%c", static_cast(c1), static_cast(c2)); - if ((j + 2) < len) printf(", "); + } + if ((j + 2) < size) { printf(", "); } } printf("\n"); if (cls.builtins) { @@ -1024,7 +1085,7 @@ void reprog::print(regex_flags const flags) } printf("\n"); } - if (_num_capturing_groups) printf("Number of capturing groups: %d\n", _num_capturing_groups); + if (_num_capturing_groups) { printf("Number of capturing groups: %d\n", _num_capturing_groups); } } #endif diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index 3131767de59..18735d0f980 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,14 +116,19 @@ class reprog { void optimize1(); void optimize2(); + void check_for_errors(); +#ifndef NDEBUG void print(regex_flags const flags); +#endif private: std::vector _insts; std::vector _classes; int32_t _startinst_id; std::vector _startinst_ids; // short-cut to speed-up ORs - int32_t _num_capturing_groups; + int32_t _num_capturing_groups{}; + + void check_for_errors(int32_t id, int32_t next_id); }; } // namespace detail diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/search/find.cu similarity index 100% rename from cpp/src/strings/find.cu rename to cpp/src/strings/search/find.cu diff --git a/cpp/src/strings/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu similarity index 67% rename from cpp/src/strings/find_multiple.cu rename to cpp/src/strings/search/find_multiple.cu index 72e7081cb7a..5756c239f1c 100644 --- a/cpp/src/strings/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -31,37 +33,32 @@ namespace cudf { namespace strings { namespace detail { std::unique_ptr find_multiple( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(type_id::INT32); - auto targets_count = targets.size(); + auto const strings_count = input.size(); + auto const targets_count = targets.size(); CUDF_EXPECTS(targets_count > 0, "Must include at least one search target"); CUDF_EXPECTS(!targets.has_nulls(), "Search targets cannot contain null strings"); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; auto targets_column = column_device_view::create(targets.parent(), stream); auto d_targets = *targets_column; + auto const total_count = strings_count * targets_count; + // create output column - auto total_count = strings_count * targets_count; - auto results = make_numeric_column(data_type{type_id::INT32}, - total_count, - rmm::device_buffer{0, stream, mr}, - 0, - stream, - mr); // no nulls - auto results_view = results->mutable_view(); - auto d_results = results_view.data(); + auto results = make_numeric_column( + data_type{type_id::INT32}, total_count, rmm::device_buffer{0, stream, mr}, 0, stream, mr); + // fill output column with position values thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(total_count), - d_results, + results->mutable_view().begin(), [d_strings, d_targets, targets_count] __device__(size_type idx) { size_type str_idx = idx / targets_count; if (d_strings.is_null(str_idx)) return -1; @@ -70,18 +67,30 @@ std::unique_ptr find_multiple( return d_str.find(d_tgt); }); results->set_null_count(0); - return results; + + auto offsets = cudf::detail::sequence(strings_count + 1, + numeric_scalar(0), + numeric_scalar(targets_count), + stream, + mr); + return make_lists_column(strings_count, + std::move(offsets), + std::move(results), + 0, + rmm::device_buffer{0, stream, mr}, + stream, + mr); } } // namespace detail // external API -std::unique_ptr find_multiple(strings_column_view const& strings, +std::unique_ptr find_multiple(strings_column_view const& input, strings_column_view const& targets, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::find_multiple(strings, targets, rmm::cuda_stream_default, mr); + return detail::find_multiple(input, targets, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/search/findall.cu similarity index 95% rename from cpp/src/strings/findall.cu rename to cpp/src/strings/search/findall.cu index c82ab4f81c3..8fb754848d4 100644 --- a/cpp/src/strings/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -109,11 +109,11 @@ struct findall_count_fn : public findall_fn { } // namespace // -std::unique_ptr
findall_re( +std::unique_ptr<column>
findall( strings_column_view const& strings, std::string const& pattern, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - rmm::cuda_stream_view stream = rmm::cuda_stream_default) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto const strings_count = strings.size(); auto const d_strings = column_device_view::create(strings.parent(), stream); @@ -205,12 +205,12 @@ std::unique_ptr<column>
findall_re( // external API -std::unique_ptr<column>
findall_re(strings_column_view const& strings, - std::string const& pattern, - rmm::mr::device_memory_resource* mr) +std::unique_ptr<column>
findall(strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::findall_re(strings, pattern, mr); + return detail::findall(strings, pattern, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/search/findall_record.cu b/cpp/src/strings/search/findall_record.cu new file mode 100644 index 00000000000..9ffdb33f5f2 --- /dev/null +++ b/cpp/src/strings/search/findall_record.cu @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +using string_index_pair = thrust::pair; + +namespace { + +/** + * @brief This functor handles extracting matched strings by applying the compiled regex pattern + * and creating string_index_pairs for all the substrings. + */ +template +struct findall_fn { + column_device_view const d_strings; + reprog_device prog; + offset_type const* d_offsets; + string_index_pair* d_indices; + + __device__ void operator()(size_type const idx) + { + if (d_strings.is_null(idx)) { return; } + auto const d_str = d_strings.element(idx); + + auto d_output = d_indices + d_offsets[idx]; + size_type output_idx = 0; + + int32_t begin = 0; + int32_t end = d_str.length(); + while ((begin < end) && (prog.find(idx, d_str, begin, end) > 0)) { + auto const spos = d_str.byte_offset(begin); // convert + auto const epos = d_str.byte_offset(end); // to bytes + + d_output[output_idx++] = string_index_pair{d_str.data() + spos, (epos - spos)}; + + begin = end + (begin == end); + end = d_str.length(); + } + } +}; + +} // namespace + +// +std::unique_ptr findall_record( + strings_column_view const& strings, + std::string const& pattern, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto const strings_count = strings.size(); + auto const d_strings = column_device_view::create(strings.parent(), stream); + + // compile regex into device object + auto const d_prog = + reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + + // Create lists offsets column + auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + + // Compute null output rows + auto [null_mask, null_count] = cudf::detail::valid_if( + d_offsets, + d_offsets + strings_count, + [] __device__(auto const v) { return v > 0; }, + stream, + mr); + + auto const valid_count = strings_count - null_count; + // Return an empty lists column if there are no valid rows + if (valid_count == 0) { + return make_lists_column(0, + make_empty_column(type_to_id()), + make_empty_column(type_id::STRING), + 0, + rmm::device_buffer{}, + stream, + mr); + } + + // Convert counts into offsets + 
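// For illustration (hypothetical counts): per-row match counts [1, 3, 1, 0]
// exclusive-scan in place to offsets [0, 1, 4, 5, 5]; row i's matches then
// occupy slots [offsets[i], offsets[i+1]) of the child strings column, and
// the final element is the total number of matches read back below.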
thrust::exclusive_scan( + rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + + // Create indices vector with the total number of groups that will be extracted + auto total_matches = cudf::detail::get_value(offsets->view(), strings_count, stream); + + rmm::device_uvector indices(total_matches, stream); + auto d_indices = indices.data(); + auto begin = thrust::make_counting_iterator(0); + + // Build the string indices + auto const regex_insts = d_prog->insts_counts(); + if (regex_insts <= RX_SMALL_INSTS) { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else if (regex_insts <= RX_LARGE_INSTS) { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } else { + findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); + } + + // Build the child strings column from the resulting indices + auto strings_output = make_strings_column(indices.begin(), indices.end(), stream, mr); + + // Build the lists column from the offsets and the strings + return make_lists_column(strings_count, + std::move(offsets), + std::move(strings_output), + null_count, + std::move(null_mask), + stream, + mr); +} + +} // namespace detail + +// external API + +std::unique_ptr findall_record(strings_column_view const& strings, + std::string const& pattern, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::findall_record(strings, pattern, rmm::cuda_stream_default, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index c6dd11c1d82..62fd98d2027 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
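A short usage sketch for the findall_record API added above (illustrative only; it assumes the declaration lives in the public cudf/strings/findall.hpp header next to findall):

#include <cudf/column/column.hpp>
#include <cudf/strings/findall.hpp>
#include <cudf/strings/strings_column_view.hpp>

// One LIST<STRING> row per input row, holding every non-overlapping match of
// the pattern; rows whose string is null or has no match come back as null
// rows, per the valid_if computation in the detail implementation above.
std::unique_ptr<cudf::column> dashed_tokens(cudf::strings_column_view const& input)
{
  return cudf::strings::findall_record(input, "(\\d+)-(\\w+)");
}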
@@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -199,12 +200,14 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); // create the normalizer and call it - data_normalizer normalizer(stream, do_lower_case); - auto result = [&strings, &normalizer, stream] { - auto const offsets = strings.offsets(); - auto const d_offsets = offsets.data() + strings.offset(); - auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); - auto const d_chars = strings.chars().data() + offset; + auto result = [&] { + auto const cp_metadata = get_codepoint_metadata(stream); + auto const aux_table = get_aux_codepoint_data(stream); + auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case); + auto const offsets = strings.offsets(); + auto const d_offsets = offsets.data() + strings.offset(); + auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); + auto const d_chars = strings.chars().data() + offset; return normalizer.normalize(d_chars, d_offsets, strings.size(), stream); }(); diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index f3b642132e3..5af87f4de0e 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -261,17 +261,17 @@ __global__ void kernel_data_normalizer(unsigned char const* strings, } // namespace -data_normalizer::data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case) - : do_lower_case(do_lower_case) +data_normalizer::data_normalizer(codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case) + : d_cp_metadata{cp_metadata}, d_aux_table{aux_table}, do_lower_case{do_lower_case} { - d_cp_metadata = detail::get_codepoint_metadata(stream); - d_aux_table = detail::get_aux_codepoint_data(stream); } uvector_pair data_normalizer::normalize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream) const { if (num_strings == 0) return std::make_pair(std::make_unique>(0, stream), diff --git a/cpp/src/text/subword/detail/data_normalizer.hpp b/cpp/src/text/subword/detail/data_normalizer.hpp index 1a9eb5ba997..927de5a74f9 100644 --- a/cpp/src/text/subword/detail/data_normalizer.hpp +++ b/cpp/src/text/subword/detail/data_normalizer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,14 +48,17 @@ namespace detail { class data_normalizer { public: /** - * @brief Transfer to the GPU the metadata needed to normalize characters. + * @brief Create instance of the normalizer. * - * @param stream CUDA stream used for device memory operations and kernel launches. + * @param cp_metadata The code point metadata table to use for normalization. + * @param aux_table The auxiliary code point table. * @param do_lower_case If true, the normalizer will convert uppercase characters in the * input stream to lower case and strip accents from those characters. 
* If false, accented and uppercase characters are not transformed. */ - data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case = true); + data_normalizer(codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case = true); /** * @brief Normalize a vector of strings. @@ -84,7 +87,7 @@ class data_normalizer { uvector_pair normalize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - rmm::cuda_stream_view stream); + rmm::cuda_stream_view stream) const; private: bool const do_lower_case; diff --git a/cpp/src/text/subword/detail/tokenizer_utils.cuh b/cpp/src/text/subword/detail/tokenizer_utils.cuh index dcd241fc045..5e8de1ba244 100644 --- a/cpp/src/text/subword/detail/tokenizer_utils.cuh +++ b/cpp/src/text/subword/detail/tokenizer_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include #include @@ -57,22 +58,16 @@ struct update_strings_lengths_fn { /** * @brief Retrieve the code point metadata table. * - * This is a singleton instance that copies a large table of integers into - * device memory on the very first call. - * * @param stream CUDA stream used for device memory operations and kernel launches. */ -codepoint_metadata_type const* get_codepoint_metadata(rmm::cuda_stream_view stream); +rmm::device_uvector get_codepoint_metadata(rmm::cuda_stream_view stream); /** - * @brief Retrieve the aux code point metadata table. - * - * This is a singleton instance that copies a large table of integers into - * device memory on the very first call. + * @brief Retrieve the auxiliary code point metadata table. * * @param stream CUDA stream used for device memory operations and kernel launches. */ -aux_codepoint_data_type const* get_aux_codepoint_data(rmm::cuda_stream_view stream); +rmm::device_uvector get_aux_codepoint_data(rmm::cuda_stream_view stream); } // namespace detail } // namespace nvtext diff --git a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp index 0259e8ce4f4..b5ad9724d72 100644 --- a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp +++ b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,7 +61,6 @@ class wordpiece_tokenizer { * @param do_lower_case If true, the tokenizer will convert uppercase characters in the * input stream to lowercase and strip accents from those characters. * If false, accented and uppercase characters are not transformed. - * @param stream CUDA stream used for device memory operations and kernel launches. * @param max_word_length The length of the longest word that will be tokenized. Words * longer than this will simply be replaced by the unknown token * specified in the `vocab_file`. 
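The practical effect of these header changes, sketched below: the caller builds the metadata tables once, and the normalizer merely borrows device pointers, so constructing it no longer launches kernels or takes a stream. Names are taken from the surrounding diffs; this is a sketch, not a verbatim call site:

// Build the tables up front, then hand the normalizer raw device pointers.
auto const cp_metadata = nvtext::detail::get_codepoint_metadata(stream);
auto const aux_table   = nvtext::detail::get_aux_codepoint_data(stream);
data_normalizer const normalizer(cp_metadata.data(), aux_table.data(), /*do_lower_case=*/true);
auto normalized = normalizer.normalize(d_chars, d_offsets, num_strings, stream);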
@@ -72,7 +71,6 @@ class wordpiece_tokenizer { uint32_t stride, bool do_truncate, bool do_lower_case, - rmm::cuda_stream_view stream, uint32_t max_word_length = 200); /** diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu index 75c79381032..7cfdb4dea96 100644 --- a/cpp/src/text/subword/load_hash_file.cu +++ b/cpp/src/text/subword/load_hash_file.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -37,87 +36,32 @@ namespace nvtext { namespace detail { -namespace { -struct get_codepoint_metadata_init { - rmm::cuda_stream_view stream; - - rmm::device_uvector* operator()() const - { - auto table_vector = - new rmm::device_uvector(codepoint_metadata_size, stream); - auto table = table_vector->data(); - thrust::fill(rmm::exec_policy(stream), - table + cp_section1_end, - table + codepoint_metadata_size, - codepoint_metadata_default_value); - CUDA_TRY(cudaMemcpyAsync(table, - codepoint_metadata, - cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + cp_section2_begin, - cp_metadata_917505_917999, - (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section - cudaMemcpyHostToDevice, - stream.value())); - return table_vector; - }; -}; - -struct get_aux_codepoint_data_init { - rmm::cuda_stream_view stream; - - rmm::device_uvector* operator()() const - { - auto table_vector = - new rmm::device_uvector(aux_codepoint_data_size, stream); - auto table = table_vector->data(); - thrust::fill(rmm::exec_policy(stream), - table + aux_section1_end, - table + aux_codepoint_data_size, - aux_codepoint_default_value); - CUDA_TRY(cudaMemcpyAsync(table, - aux_codepoint_data, - aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + aux_section2_begin, - aux_cp_data_44032_55203, - (aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + aux_section3_begin, - aux_cp_data_70475_71099, - (aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + aux_section4_begin, - aux_cp_data_119134_119232, - (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section - cudaMemcpyHostToDevice, - stream.value())); - return table_vector; - } -}; -} // namespace - /** * @brief Retrieve the code point metadata table. 
* * Build the code point metadata table in device memory * using the vector pieces from codepoint_metadata.ah */ -const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stream) +rmm::device_uvector get_codepoint_metadata(rmm::cuda_stream_view stream) { - static cudf::strings::detail::thread_safe_per_context_cache< - rmm::device_uvector> - g_codepoint_metadata; - - return g_codepoint_metadata.find_or_initialize(get_codepoint_metadata_init{stream})->data(); + auto table_vector = rmm::device_uvector(codepoint_metadata_size, stream); + auto table = table_vector.data(); + thrust::fill(rmm::exec_policy(stream), + table + cp_section1_end, + table + codepoint_metadata_size, + codepoint_metadata_default_value); + CUDA_TRY(cudaMemcpyAsync(table, + codepoint_metadata, + cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + cp_section2_begin, + cp_metadata_917505_917999, + (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section + cudaMemcpyHostToDevice, + stream.value())); + return table_vector; } /** @@ -126,13 +70,38 @@ const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stre * Build the aux code point data table in device memory * using the vector pieces from codepoint_metadata.ah */ -const aux_codepoint_data_type* get_aux_codepoint_data(rmm::cuda_stream_view stream) +rmm::device_uvector get_aux_codepoint_data(rmm::cuda_stream_view stream) { - static cudf::strings::detail::thread_safe_per_context_cache< - rmm::device_uvector> - g_aux_codepoint_data; - - return g_aux_codepoint_data.find_or_initialize(get_aux_codepoint_data_init{stream})->data(); + auto table_vector = rmm::device_uvector(aux_codepoint_data_size, stream); + auto table = table_vector.data(); + thrust::fill(rmm::exec_policy(stream), + table + aux_section1_end, + table + aux_codepoint_data_size, + aux_codepoint_default_value); + CUDA_TRY(cudaMemcpyAsync(table, + aux_codepoint_data, + aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + aux_section2_begin, + aux_cp_data_44032_55203, + (aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + aux_section3_begin, + aux_cp_data_70475_71099, + (aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + aux_section4_begin, + aux_cp_data_119134_119232, + (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section + cudaMemcpyHostToDevice, + stream.value())); + return table_vector; } namespace { @@ -293,10 +262,15 @@ std::unique_ptr load_vocabulary_file( cudaMemcpyHostToDevice, stream.value())); - // this just initializes some constant tables into device memory - // to help speed up the runtime - detail::get_codepoint_metadata(stream); - detail::get_aux_codepoint_data(stream); + auto cp_metadata = detail::get_codepoint_metadata(stream); + auto const cp_metadata_size = static_cast(cp_metadata.size()); + result.cp_metadata = std::make_unique( + cudf::data_type{cudf::type_id::UINT32}, cp_metadata_size, cp_metadata.release()); + + auto aux_cp_table = detail::get_aux_codepoint_data(stream); + auto const aux_cp_table_size = static_cast(aux_cp_table.size()); + 
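+  // (Note: release() hands the uvector's device buffer to the new
+  //  cudf::column, so each table's lifetime is now tied to this
+  //  hashed_vocabulary object rather than to the process-wide singleton
+  //  cache that the deleted code above maintained.)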
result.aux_cp_table = std::make_unique( + cudf::data_type{cudf::type_id::UINT64}, aux_cp_table_size, aux_cp_table.release()); return std::make_unique(std::move(result)); } diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 193cd80d9a6..1ac7dd0d8a1 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -153,7 +153,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, // Create tokenizer wordpiece_tokenizer tokenizer( - vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case, stream); + vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case); // Run tokenizer auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream); // assign output components diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index 00798e7e4e2..afd82f0bb5d 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -394,10 +394,11 @@ wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table, uint32_t stride, bool do_truncate, bool do_lower_case, - rmm::cuda_stream_view stream, uint32_t max_word_length) : vocab_table(vocab_table), - normalizer(stream, do_lower_case), + normalizer(vocab_table.cp_metadata->view().data(), + vocab_table.aux_cp_table->view().data(), + do_lower_case), max_sequence_length{max_sequence_length}, stride(stride), do_truncate(do_truncate), diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index dadeaf7d1e0..405c83ab872 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
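Taken together with the tokenizer changes above, vocabulary load is now the only place the tables are materialized. A construction sketch using the calls visible in this diff:

// The vocabulary carries the cp_metadata and aux_cp_table columns, so the
// tokenizer can be constructed without a stream and without touching the GPU.
wordpiece_tokenizer tokenizer(
  vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case);
auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream);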
@@ -30,7 +30,10 @@ #include #include +#include #include +#include +#include namespace cudf { namespace detail { @@ -38,29 +41,23 @@ namespace detail { std::pair, std::unique_ptr> encode( table_view const& input_table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - std::vector drop_keys(input_table.num_columns()); + auto const num_cols = input_table.num_columns(); + + std::vector drop_keys(num_cols); std::iota(drop_keys.begin(), drop_keys.end(), 0); - // side effects of this function we are now dependent on: - // - resulting column elements are sorted ascending - // - nulls are sorted to the beginning - auto keys_table = cudf::detail::drop_duplicates(input_table, - drop_keys, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::AFTER, - stream, - mr); + auto unique_keys = cudf::detail::unordered_drop_duplicates( + input_table, drop_keys, null_equality::EQUAL, stream, mr); + + std::vector column_order(num_cols, order::ASCENDING); + std::vector null_precedence(num_cols, null_order::AFTER); + auto sorted_unique_keys = + cudf::detail::sort(unique_keys->view(), column_order, null_precedence, stream, mr); - auto indices_column = - cudf::detail::lower_bound(keys_table->view(), - input_table, - std::vector(input_table.num_columns(), order::ASCENDING), - std::vector(input_table.num_columns(), null_order::AFTER), - stream, - mr); + auto indices_column = cudf::detail::lower_bound( + sorted_unique_keys->view(), input_table, column_order, null_precedence, stream, mr); - return std::make_pair(std::move(keys_table), std::move(indices_column)); + return std::make_pair(std::move(sorted_unique_keys), std::move(indices_column)); } } // namespace detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e23403e68e4..6b5670630ec 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -292,6 +292,7 @@ ConfigureTest( ConfigureTest( STREAM_COMPACTION_TEST stream_compaction/apply_boolean_mask_tests.cpp + stream_compaction/distinct_count_tests.cpp stream_compaction/drop_nulls_tests.cpp stream_compaction/drop_nans_tests.cpp stream_compaction/drop_duplicates_tests.cpp diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp new file mode 100644 index 00000000000..78b52db5255 --- /dev/null +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +using cudf::nan_policy; +using cudf::null_equality; +using cudf::null_policy; + +constexpr int32_t XXX{70}; // Mark for null elements +constexpr int32_t YYY{3}; // Mark for null elements + +template +struct DistinctCountCommon : public cudf::test::BaseFixture { +}; + +TYPED_TEST_SUITE(DistinctCountCommon, cudf::test::NumericTypes); + +TYPED_TEST(DistinctCountCommon, NoNull) +{ + using T = TypeParam; + + auto const input = cudf::test::make_type_param_vector( + {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); + + // explicit instantiation to one particular type (`double`) to reduce build time + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + // explicit instantiation to one particular type (`double`) to reduce build time + std::vector input_data(input.begin(), input.end()); + auto const new_end = std::unique(input_data.begin(), input_data.end()); + auto const gold_ordered = std::distance(input_data.begin(), new_end); + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TYPED_TEST(DistinctCountCommon, TableNoNull) +{ + using T = TypeParam; + + auto const input1 = cudf::test::make_type_param_vector( + {1, 3, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); + auto const input2 = cudf::test::make_type_param_vector( + {3, 3, 3, 4, 31, 1, 8, 5, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}); + + std::vector> pair_input; + std::transform( + input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { + return std::make_pair(a, b); + }); + + cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); + cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); + cudf::table_view input_table({input_col1, input_col2}); + + auto const expected = static_cast( + std::set>(pair_input.begin(), pair_input.end()).size()); + EXPECT_EQ(expected, cudf::unordered_distinct_count(input_table, null_equality::EQUAL)); + + auto const new_end = std::unique(pair_input.begin(), pair_input.end()); + auto const gold_ordered = std::distance(pair_input.begin(), new_end); + EXPECT_EQ(gold_ordered, cudf::distinct_count(input_table, null_equality::EQUAL)); +} + +struct DistinctCount : public cudf::test::BaseFixture { +}; + +TEST_F(DistinctCount, WithNull) +{ + using T = int32_t; + + std::vector input = {1, 3, 3, XXX, 31, 1, 8, 2, 0, XXX, XXX, + XXX, 10, 40, 31, 42, 0, 42, 8, 5, XXX}; + std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); + + // explicit instantiation to one particular type (`double`) to reduce build time + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + auto const gold_ordered = std::distance(input.begin(), new_end) - 3; + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::EXCLUDE, 
nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, IgnoringNull) +{ + using T = int32_t; + + std::vector input = {1, YYY, YYY, XXX, 31, 1, 8, 2, 0, XXX, 1, + XXX, 10, 40, 31, 42, 0, 42, 8, 5, XXX}; + std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); + + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + // Removing 2 from expected to remove count for `XXX` and `YYY` + EXPECT_EQ( + expected - 2, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + // -1 since `YYY, YYY, XXX` is in the same group of equivalent rows + auto const gold_ordered = std::distance(input.begin(), new_end) - 1; + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, WithNansAndNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31, 1, 8, 2, 0, XXX, 1, + XXX, 10, 40, 31, NAN, 0, NAN, 8, 5, XXX}; + std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); + EXPECT_EQ( + expected + 1, // +1 since `NAN` is not in std::set + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + auto const gold_ordered = std::distance(input.begin(), new_end); + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + input = {NAN, NAN, XXX}; + valid = {1, 1, 0}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 2; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, WithNansOnly) +{ + using T = float; + + std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 1, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 5; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 1; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, NansAsNullWithNoNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 1, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 5; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, 
null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 1; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, NansAsNullWithNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31}; + std::vector valid = {1, 1, 1, 0, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 4; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, XXX}; + valid = {1, 1, 0}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_null = 1; + EXPECT_EQ( + expected_all_null, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_null, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, NansAsNullWithIgnoreNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, XXX, 31}; + std::vector valid = {1, 1, 1, 0, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected = 3; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 0; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, EmptyColumn) +{ + using T = float; + + cudf::test::fixed_width_column_wrapper input_col{}; + + constexpr auto expected = 0; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, StringColumnWithNull) +{ + cudf::test::strings_column_wrapper input_col{ + {"", "this", "is", "this", "This", "a", "column", "of", "the", "strings"}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1}}; + + cudf::size_type const expected = + (std::vector{"", "this", "is", "This", "a", "column", "of", "strings"}).size(); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, TableWithNull) +{ + cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 
1, 1, 0, 0, 1, 0}}; + cudf::table_view input{{col1, col2}}; + + EXPECT_EQ(8, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, EmptyColumnedTable) +{ + std::vector cols{}; + + cudf::table_view input(cols); + + EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, TableMixedTypes) +{ + cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col3{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0}}; + cudf::table_view input{{col1, col2, col3}}; + + EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, TableWithStringColumnWithNull) +{ + cudf::test::fixed_width_column_wrapper col1{{0, 9, 8, 9, 6, 5, 4, 3, 2, 1, 0}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::strings_column_wrapper col2{ + {"", "this", "is", "this", "this", "a", "column", "of", "the", "strings", ""}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + + cudf::table_view input{{col1, col2}}; + EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index 916d2a33b97..d49b8208094 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,254 +14,98 @@ * limitations under the License. 
*/ -#include -#include -#include #include +#include #include #include #include #include + #include #include #include #include #include +#include +#include + using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; -template -struct DistinctCountCommon : public cudf::test::BaseFixture { -}; - -TYPED_TEST_SUITE(DistinctCountCommon, cudf::test::NumericTypes); - -TYPED_TEST(DistinctCountCommon, NoNull) -{ - using T = TypeParam; - - auto const input = cudf::test::make_type_param_vector( - {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TYPED_TEST(DistinctCountCommon, TableNoNull) -{ - using T = TypeParam; - - auto const input1 = cudf::test::make_type_param_vector( - {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); - auto const input2 = cudf::test::make_type_param_vector( - {3, 3, 4, 31, 1, 8, 5, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}); - std::vector> pair_input; - std::transform( - input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { - return std::make_pair(a, b); - }); - - cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); - cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); - - std::vector cols{input_col1, input_col2}; - cudf::table_view input_table(cols); - - cudf::size_type expected = std::set>(pair_input.begin(), pair_input.end()).size(); - EXPECT_EQ(expected, cudf::distinct_count(input_table, null_equality::EQUAL)); -} - -struct DistinctCount : public cudf::test::BaseFixture { +struct DropDuplicatesCommon : public cudf::test::BaseFixture { }; -TEST_F(DistinctCount, WithNull) -{ - using T = int32_t; - - // Considering 70 as null - std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; - std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, IgnoringNull) -{ - using T = int32_t; - - // Considering 70 and 3 as null - std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; - std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - // Removing 2 from expected to remove count for 70 and 3 - EXPECT_EQ(expected - 2, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, WithNansAndNull) +TEST_F(DropDuplicatesCommon, StringKeyColumn) { - using T = float; - - std::vector input = {1, 3, NAN, 70, 31, 1, 8, 2, 0, 70, 1, - 70, 10, 40, 31, NAN, 0, NAN, 8, 5, 70}; - std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = std::set(input.begin(), 
input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, WithNansOnly) -{ - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 1, 1}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = 5; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, NansAsNullWithNoNull) -{ - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 1, 1}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = 5; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); -} + cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 5, 5, 8, 1}, {1, 0, 0, 1, 1, 1, 1}}; + cudf::test::strings_column_wrapper key_col{{"all", "new", "new", "all", "new", "the", "strings"}, + {1, 1, 1, 1, 0, 1, 1}}; + cudf::table_view input{{col, key_col}}; + std::vector keys{1}; -TEST_F(DistinctCount, NansAsNullWithNull) -{ - using T = float; + cudf::test::fixed_width_column_wrapper exp_sort_col{{5, 5, 4, 1, 8}, {1, 1, 0, 1, 1}}; + cudf::test::strings_column_wrapper exp_sort_key_col{{"new", "all", "new", "strings", "the"}, + {0, 1, 1, 1, 1}}; + cudf::table_view expected_sort{{exp_sort_col, exp_sort_key_col}}; - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 0, 1}; + auto got_sort = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, got_sort->view()); - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + auto got_unordered = unordered_drop_duplicates(input, keys); + auto key_view = got_unordered->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(got_unordered->view(), key_view); - cudf::size_type expected = 4; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, sorted_result->view()); } -TEST_F(DistinctCount, NansAsNullWithIgnoreNull) +TEST_F(DropDuplicatesCommon, EmptyInputTable) { - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 0, 1}; + cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); + cudf::table_view input{{col}}; + std::vector keys{1, 2}; - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); - cudf::size_type expected = 3; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } -TEST_F(DistinctCount, EmptyColumn) +TEST_F(DropDuplicatesCommon, NoColumnInputTable) { - using T = float; - - cudf::test::fixed_width_column_wrapper input_col{}; - - cudf::size_type expected = 0; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); -} + cudf::table_view input{std::vector()}; + std::vector keys{1, 2}; -TEST_F(DistinctCount, 
StringColumnWithNull) -{ - cudf::test::strings_column_wrapper input_col{ - {"", "this", "is", "this", "This", "a", "column", "of", "the", "strings"}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1}}; - - cudf::size_type expected = - (std::vector{"", "this", "is", "This", "a", "column", "of", "strings"}).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); -} + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); -TEST_F(DistinctCount, TableWithNull) -{ - cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; - cudf::table_view input{{col1, col2}}; - - EXPECT_EQ(8, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } -TEST_F(DistinctCount, EmptyColumnedTable) +TEST_F(DropDuplicatesCommon, EmptyKeys) { - std::vector cols{}; - - cudf::table_view input(cols); - - EXPECT_EQ(0, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::distinct_count(input, null_equality::UNEQUAL)); - EXPECT_EQ(0, cudf::distinct_count(cudf::table_view{}, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::distinct_count(cudf::table_view{}, null_equality::UNEQUAL)); -} + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper empty_col{}; + cudf::table_view input{{col}}; + std::vector keys{}; -TEST_F(DistinctCount, TableMixedTypes) -{ - cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col3{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0}}; - cudf::table_view input{{col1, col2, col3}}; - - EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); -} + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); -TEST_F(DistinctCount, TableWithStringColumnWithNull) -{ - cudf::test::fixed_width_column_wrapper col1{{0, 9, 8, 9, 6, 5, 4, 3, 2, 1, 0}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::strings_column_wrapper col2{ - {"", "this", "is", "this", "this", "a", "column", "of", "the", "strings", ""}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - - cudf::table_view input{{col1, col2}}; - EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got_unordered->view()); } -struct DropDuplicate : public cudf::test::BaseFixture { +struct DropDuplicates : public cudf::test::BaseFixture { }; -TEST_F(DropDuplicate, NonNullTable) +TEST_F(DropDuplicates, NonNullTable) { cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; 
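+  // (For reference: KEEP_FIRST retains the first row of each group of
+  //  duplicate keys, KEEP_LAST the last, and KEEP_NONE drops every row whose
+  //  key appears more than once; the three expected tables below exercise
+  //  those options in that order.)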
cudf::test::fixed_width_column_wrapper col2{{4, 5, 3, 4, 9, 4}}; @@ -271,7 +115,7 @@ TEST_F(DropDuplicate, NonNullTable) cudf::table_view input{{col1, col2, col1_key, col2_key}}; std::vector keys{2, 3}; - // Keep first of duplicate + // Keep the first duplicate row // The expected table would be sorted in ascending order with respect to keys cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 5, 5, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 4, 4, 3, 9}}; @@ -284,7 +128,7 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - // keep last of duplicate + // Keep the last duplicate row cudf::test::fixed_width_column_wrapper exp_col1_last{{5, 5, 4, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_last{{4, 4, 5, 3, 9}}; cudf::test::fixed_width_column_wrapper exp_col1_key_last{{9, 19, 20, 20, 21}}; @@ -296,7 +140,7 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - // Keep unique + // Keep no duplicate rows cudf::test::fixed_width_column_wrapper exp_col1_unique{{5, 5, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_unique{{4, 4, 3, 9}}; cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{9, 19, 20, 21}}; @@ -309,88 +153,145 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); } -TEST_F(DropDuplicate, WithNull) +TEST_F(DropDuplicates, KeepFirstWithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; cudf::table_view input{{col, key}}; std::vector keys{1}; - // Keep first of duplicate - cudf::test::fixed_width_column_wrapper exp_col_first{{4, 5, 5, 8}, {0, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_first{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_first{{exp_col_first, exp_key_col_first}}; - auto got_first = + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_first_equal{{3, 5, 5, 8}, {1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_equal{{20, 19, 20, 21}, + {0, 1, 1, 1}}; + cudf::table_view expected_first_equal{{exp_col_first_equal, exp_key_col_first_equal}}; + auto got_first_equal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_equal, got_first_equal->view()); - // Keep last of duplicate - cudf::test::fixed_width_column_wrapper exp_col_last{{3, 1, 5, 8}, {1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_last{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_first_unequal{{3, 2, 5, 5, 8}, + {1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_unequal{{20, 20, 19, 20, 21}, + {0, 0, 1, 1, 1}}; + cudf::table_view expected_first_unequal{{exp_col_first_unequal, exp_key_col_first_unequal}}; + auto got_first_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, 
null_equality::UNEQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - - // Keep unique of duplicate - cudf::test::fixed_width_column_wrapper exp_col_unique{{5, 8}, {1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_unique{{20, 21}, {1, 1}}; - cudf::table_view expected_unique{{exp_col_unique, exp_key_col_unique}}; - auto got_unique = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_unequal, got_first_unequal->view()); } -TEST_F(DropDuplicate, StringKeyColumn) +TEST_F(DropDuplicates, KeepLastWithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::strings_column_wrapper key_col{{"all", "new", "all", "new", "the", "strings"}, - {1, 1, 1, 0, 1, 1}}; - cudf::table_view input{{col, key_col}}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; std::vector keys{1}; - cudf::test::fixed_width_column_wrapper exp_col_last{{5, 3, 4, 1, 8}, {1, 1, 0, 1, 1}}; - cudf::test::strings_column_wrapper exp_key_col_last{{"new", "all", "new", "strings", "the"}, - {0, 1, 1, 1, 1}}; - cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_last_equal{{2, 1, 4, 8}, {1, 1, 0, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last_equal{{20, 19, 20, 21}, + {0, 1, 1, 1}}; + cudf::table_view expected_last_equal{{exp_col_last_equal, exp_key_col_last_equal}}; + auto got_last_equal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::EQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_equal, got_last_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_last_unequal{{3, 2, 1, 4, 8}, + {1, 1, 1, 0, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last_unequal{{20, 20, 19, 20, 21}, + {0, 0, 1, 1, 1}}; + cudf::table_view expected_last_unequal{{exp_col_last_unequal, exp_key_col_last_unequal}}; + auto got_last_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::UNEQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_unequal, got_last_unequal->view()); } -TEST_F(DropDuplicate, EmptyInputTable) +TEST_F(DropDuplicates, KeepNoneWithNull) { - cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); - cudf::table_view input{{col}}; - std::vector keys{1, 2}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_unique_equal{{8}, {1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_equal{{21}, {1}}; + cudf::table_view expected_unique_equal{{exp_col_unique_equal, exp_key_col_unique_equal}}; + auto got_unique_equal = + drop_duplicates(input, keys, 
cudf::duplicate_keep_option::KEEP_NONE, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_equal, got_unique_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_unique_unequal{{3, 2, 8}, {1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_unequal{{20, 20, 21}, + {0, 0, 1}}; + cudf::table_view expected_unique_unequal{{exp_col_unique_unequal, exp_key_col_unique_unequal}}; + auto got_unique_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::UNEQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_unequal, got_unique_unequal->view()); } -TEST_F(DropDuplicate, NoColumnInputTable) +struct UnorderedDropDuplicates : public cudf::test::BaseFixture { +}; + +TEST_F(UnorderedDropDuplicates, NonNullTable) { - cudf::table_view input{std::vector()}; - std::vector keys{1, 2}; + cudf::test::fixed_width_column_wrapper col1{{6, 6, 3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper col2{{6, 6, 3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper col1_key{{20, 20, 20, 19, 21, 9}}; + cudf::test::fixed_width_column_wrapper col2_key{{19, 19, 20, 20, 9, 21}}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + cudf::table_view input{{col1, col2, col1_key, col2_key}}; + std::vector keys{2, 3}; - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + // The expected table would be sorted in ascending order with respect to keys + cudf::test::fixed_width_column_wrapper exp_col1{{5, 5, 6, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2{{4, 4, 6, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key{{9, 19, 20, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key{{21, 20, 19, 20, 9}}; + cudf::table_view expected{{exp_col1, exp_col2, exp_col1_key, exp_col2_key}}; + + auto result = unordered_drop_duplicates(input, keys); + auto key_view = result->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(result->view(), key_view); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); } -TEST_F(DropDuplicate, EmptyKeys) +TEST_F(UnorderedDropDuplicates, WithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper empty_col{}; - cudf::table_view input{{col}}; - std::vector keys{}; - - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 1, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_equal_col{{4, 1, 5, 8}, {0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_equal_key_col{{20, 19, 20, 21}, {0, 1, 1, 1}}; + cudf::table_view expected_equal{{exp_equal_col, exp_equal_key_col}}; + auto res_equal = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + auto equal_keys = res_equal->select(keys.begin(), keys.end()); + auto sorted_equal = cudf::sort_by_key(res_equal->view(), equal_keys); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_equal, sorted_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_unequal_col{{4, 1, 4, 5, 8}, {0, 1, 1, 
1, 1}}; + cudf::test::fixed_width_column_wrapper exp_unequal_key_col{{20, 19, 20, 20, 21}, + {0, 1, 0, 1, 1}}; + cudf::table_view expected_unequal{{exp_unequal_col, exp_unequal_key_col}}; + auto res_unequal = unordered_drop_duplicates(input, keys, null_equality::UNEQUAL); + auto sorted_unequal = cudf::sort(res_unequal->view()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unequal, sorted_unequal->view()); } diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 48c4aac9e8a..12a00aa35ab 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -274,6 +274,15 @@ TEST_F(StringsContainsTests, EmbeddedNullCharacter) CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } +TEST_F(StringsContainsTests, Errors) +{ + cudf::test::strings_column_wrapper input({"3", "33"}); + auto strings_view = cudf::strings_column_view(input); + + EXPECT_THROW(cudf::strings::contains_re(strings_view, "(3?)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::contains_re(strings_view, "3?+"), cudf::logic_error); +} + TEST_F(StringsContainsTests, CountTest) { std::vector h_strings{ diff --git a/cpp/tests/strings/find_multiple_tests.cpp b/cpp/tests/strings/find_multiple_tests.cpp index a4cb27b7a9f..7b9f639f965 100644 --- a/cpp/tests/strings/find_multiple_tests.cpp +++ b/cpp/tests/strings/find_multiple_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,13 +41,16 @@ TEST_F(StringsFindMultipleTest, FindMultiple) cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); auto targets_view = cudf::strings_column_view(targets); - auto results = cudf::strings::find_multiple(strings_view, targets_view); - cudf::size_type total_count = static_cast(h_strings.size() * h_targets.size()); - EXPECT_EQ(total_count, results->size()); + auto results = cudf::strings::find_multiple(strings_view, targets_view); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{1, -1, -1, -1, 4, -1, -1}, + LCW{4, -1, 2, -1, -1, -1, 2}, + LCW{-1, -1, -1, -1, -1, -1, -1}, + LCW{-1, 2, 1, -1, -1, -1, -1}, + LCW{-1, -1, 1, 8, -1, -1, 1}, + LCW{-1, -1, -1, -1, -1, -1, -1}}); - cudf::test::fixed_width_column_wrapper expected( - {1, -1, -1, -1, 4, -1, -1, 4, -1, 2, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, - -1, 2, 1, -1, -1, -1, -1, -1, -1, 1, 8, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index d7bf162d36f..4b1305a870a 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
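A sketch of the shape the reworked find_multiple test above now expects (illustrative; names follow the diff): the result is a LIST column with one row per input string instead of a flat fixed-width column of strings.size() * targets.size() entries.

#include <cudf/column/column.hpp>
#include <cudf/strings/find_multiple.hpp>
#include <cudf/strings/strings_column_view.hpp>

// Each output row has targets.size() entries; entry i is the character
// position of the first occurrence of targets[i] in that row's string,
// or -1 when the target does not occur.
std::unique_ptr<cudf::column> first_positions(cudf::strings_column_view const& strings,
                                              cudf::strings_column_view const& targets)
{
  return cudf::strings::find_multiple(strings, targets);
}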
@@ -56,7 +56,7 @@ TEST_F(StringsFindallTests, FindallTest) nullptr}; std::string pattern = "(\\w+)"; - auto results = cudf::strings::findall_re(strings_view, pattern); + auto results = cudf::strings::findall(strings_view, pattern); EXPECT_TRUE(results->num_columns() == 2); cudf::test::strings_column_wrapper expected1( @@ -75,6 +75,28 @@ TEST_F(StringsFindallTests, FindallTest) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } +TEST_F(StringsFindallTests, FindallRecord) +{ + cudf::test::strings_column_wrapper input( + {"3-A", "4-May 5-Day 6-Hay", "12-Dec-2021-Jan", "Feb-March", "4 ABC", "", "", "25-9000-Hal"}, + {1, 1, 1, 1, 1, 0, 1, 1}); + + auto results = cudf::strings::findall_record(cudf::strings_column_view(input), "(\\d+)-(\\w+)"); + + bool valids[] = {1, 1, 1, 0, 0, 0, 0, 1}; + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"3-A"}, + LCW{"4-May", "5-Day", "6-Hay"}, + LCW{"12-Dec", "2021-Jan"}, + LCW{}, + LCW{}, + LCW{}, + LCW{}, + LCW{"25-9000"}}, + valids); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsFindallTests, MediumRegex) { // This results in 15 regex instructions and falls in the 'medium' range. @@ -87,7 +109,7 @@ TEST_F(StringsFindallTests, MediumRegex) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::findall_re(strings_view, medium_regex); + auto results = cudf::strings::findall(strings_view, medium_regex); EXPECT_TRUE(results->num_columns() == 2); std::vector h_expected1{"first words 1234", nullptr}; @@ -115,9 +137,11 @@ TEST_F(StringsFindallTests, LargeRegex) std::vector h_strings{ "hello @abc @def world The quick brown @fox jumps over the lazy @dog hello " "http://www.world.com I'm here @home zzzz", - "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234" + "12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012" + "34" "5678901234567890", - "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnop" + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmn" + "op" "qrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), @@ -125,7 +149,7 @@ TEST_F(StringsFindallTests, LargeRegex) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::findall_re(strings_view, large_regex); + auto results = cudf::strings::findall(strings_view, large_regex); EXPECT_TRUE(results->num_columns() == 1); std::vector h_expected{large_regex.c_str(), nullptr, nullptr}; diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 2a4dd5ff9c8..d705504cc0c 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -22,7 +22,6 @@ Properties :toctree: api/ Index.empty - Index.gpu_values Index.has_duplicates Index.is_monotonic Index.is_monotonic_increasing @@ -93,9 +92,9 @@ Conversion :toctree: api/ Index.astype - Index.to_array Index.to_arrow Index.to_list + Index.to_numpy Index.to_series Index.to_frame Index.to_pandas diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 891bb3a1e61..cf5dd4a2a1d 100644 --- 
a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -408,13 +408,13 @@ Serialization / IO / conversion .. autosummary:: :toctree: api/ - Series.to_array Series.to_arrow + Series.to_cupy Series.to_dlpack Series.to_frame - Series.to_gpu_array Series.to_hdf Series.to_json + Series.to_numpy Series.to_pandas Series.to_string Series.from_arrow diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index a7e959a05a7..0034584a6f7 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -4550,7 +4550,7 @@ } ], "source": [ - "df['a'].to_array()" + "df['a'].to_numpy()" ] }, { @@ -4571,7 +4571,7 @@ } ], "source": [ - "ddf['a'].compute().to_array()" + "ddf['a'].compute().to_numpy()" ] }, { diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 61981b34615..cb3234bf706 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -45,7 +45,6 @@ public final class ColumnVector extends ColumnView { NativeDepsLoader.loadNativeDeps(); } - private final OffHeapState offHeap; private Optional nullCount = Optional.empty(); private int refCount; @@ -56,14 +55,23 @@ public final class ColumnVector extends ColumnView { * owned by this instance. */ public ColumnVector(long nativePointer) { - super(getColumnViewFromColumn(nativePointer)); + super(new OffHeapState(nativePointer)); assert nativePointer != 0; - offHeap = new OffHeapState(nativePointer); MemoryCleaner.register(this, offHeap); this.refCount = 0; incRefCountInternal(true); } + private static OffHeapState makeOffHeap(DType type, long rows, Optional nullCount, + DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, + DeviceMemoryBuffer offsetBuffer) { + long viewHandle = initViewHandle( + type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), + dataBuffer, validityBuffer, offsetBuffer, null); + return new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, + offsetBuffer, null, viewHandle); + } + /** * Create a new column vector based off of data already on the device. 
* @param type the type of the vector @@ -81,24 +89,29 @@ public ColumnVector(long nativePointer) { public ColumnVector(DType type, long rows, Optional nullCount, DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, DeviceMemoryBuffer offsetBuffer) { - super(ColumnVector.initViewHandle( - type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), - dataBuffer, validityBuffer, offsetBuffer, null)); + super(makeOffHeap(type, rows, nullCount, dataBuffer, validityBuffer, offsetBuffer)); assert !type.equals(DType.LIST) : "This constructor should not be used for list type"; if (!type.equals(DType.STRING)) { assert offsetBuffer == null : "offsets are only supported for STRING"; } assert (nullCount.isPresent() && nullCount.get() <= Integer.MAX_VALUE) || !nullCount.isPresent(); - offHeap = new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, - offsetBuffer, null, viewHandle); MemoryCleaner.register(this, offHeap); this.nullCount = nullCount; - this.refCount = 0; incRefCountInternal(true); } + private static OffHeapState makeOffHeap(DType type, long rows, Optional nullCount, + DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, + DeviceMemoryBuffer offsetBuffer, List toClose, long[] childHandles) { + long viewHandle = initViewHandle(type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), + dataBuffer, validityBuffer, + offsetBuffer, childHandles); + return new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, offsetBuffer, + toClose, viewHandle); + } + /** * Create a new column vector based off of data already on the device with child columns. * @param type the type of the vector, typically a nested type @@ -118,16 +131,12 @@ public ColumnVector(DType type, long rows, Optional nullCount, public ColumnVector(DType type, long rows, Optional nullCount, DeviceMemoryBuffer dataBuffer, DeviceMemoryBuffer validityBuffer, DeviceMemoryBuffer offsetBuffer, List toClose, long[] childHandles) { - super(initViewHandle(type, (int)rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), - dataBuffer, validityBuffer, - offsetBuffer, childHandles)); + super(makeOffHeap(type, rows, nullCount, dataBuffer, validityBuffer, offsetBuffer, toClose, childHandles)); if (!type.equals(DType.STRING) && !type.equals(DType.LIST)) { assert offsetBuffer == null : "offsets are only supported for STRING, LISTS"; } assert (nullCount.isPresent() && nullCount.get() <= Integer.MAX_VALUE) || !nullCount.isPresent(); - offHeap = new OffHeapState(type, (int) rows, dataBuffer, validityBuffer, offsetBuffer, - toClose, viewHandle); MemoryCleaner.register(this, offHeap); this.refCount = 0; @@ -143,8 +152,7 @@ public ColumnVector(DType type, long rows, Optional nullCount, * @param contiguousBuffer the buffer that this is based off of. */ private ColumnVector(long viewAddress, DeviceMemoryBuffer contiguousBuffer) { - super(viewAddress); - offHeap = new OffHeapState(viewAddress, contiguousBuffer); + super(new OffHeapState(viewAddress, contiguousBuffer)); MemoryCleaner.register(this, offHeap); // TODO we may want to ask for the null count anyways... this.nullCount = Optional.empty(); diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index a2e080e02f6..8155fe79080 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,6 +40,7 @@ public class ColumnView implements AutoCloseable, BinaryOperable { protected final DType type; protected final long rows; protected final long nullCount; + protected final ColumnVector.OffHeapState offHeap; /** * Constructs a Column View given a native view address @@ -50,6 +51,22 @@ public class ColumnView implements AutoCloseable, BinaryOperable { this.type = DType.fromNative(ColumnView.getNativeTypeId(viewHandle), ColumnView.getNativeTypeScale(viewHandle)); this.rows = ColumnView.getNativeRowCount(viewHandle); this.nullCount = ColumnView.getNativeNullCount(viewHandle); + this.offHeap = null; + } + + + /** + * Intended to be called from ColumnVector when it is being constructed. Because state creates a + * cudf::column_view instance and will close it in all cases, we don't want to have to double + * close it. + * @param state the state this view is based off of. + */ + protected ColumnView(ColumnVector.OffHeapState state) { + offHeap = state; + viewHandle = state.getViewHandle(); + type = DType.fromNative(ColumnView.getNativeTypeId(viewHandle), ColumnView.getNativeTypeScale(viewHandle)); + rows = ColumnView.getNativeRowCount(viewHandle); + nullCount = ColumnView.getNativeNullCount(viewHandle); } /** @@ -265,7 +282,10 @@ public long getDeviceMemorySize() { @Override public void close() { - ColumnView.deleteColumnView(viewHandle); + // close the view handle so long as offHeap is not going to do it for us. + if (offHeap == null) { + ColumnView.deleteColumnView(viewHandle); + } viewHandle = 0; } @@ -2331,13 +2351,27 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) { * Null string entries return corresponding null output columns. * @param delimiter UTF-8 encoded string identifying the split points in each string. * An empty string indicates split on whitespace. + * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. * @return New table of strings columns. */ - public final Table stringSplit(Scalar delimiter) { + public final Table stringSplit(Scalar delimiter, int maxSplit) { assert type.equals(DType.STRING) : "column type must be a String"; assert delimiter != null : "delimiter may not be null"; assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; - return new Table(stringSplit(this.getNativeView(), delimiter.getScalarHandle())); + return new Table(stringSplit(this.getNativeView(), delimiter.getScalarHandle(), maxSplit)); + } + + /** + * Returns a list of columns by splitting each string using the specified delimiter. + * The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. + * Null string entries return corresponding null output columns. + * @param delimiter UTF-8 encoded string identifying the split points in each string. + * An empty string indicates split on whitespace. + * @return New table of strings columns. 
+ */ + public final Table stringSplit(Scalar delimiter) { + return stringSplit(delimiter, -1); } /** @@ -2349,7 +2383,7 @@ public final Table stringSplit(Scalar delimiter) { */ public final Table stringSplit() { try (Scalar emptyString = Scalar.fromString("")) { - return stringSplit(emptyString); + return stringSplit(emptyString, -1); } } @@ -2362,7 +2396,7 @@ public final ColumnVector stringSplitRecord() { /** * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. - * @param maxSplit the maximum number of records to split, or -1 for all of them. + * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. */ public final ColumnVector stringSplitRecord(int maxSplit) { try (Scalar emptyString = Scalar.fromString("")) { @@ -2384,7 +2418,7 @@ public final ColumnVector stringSplitRecord(Scalar delimiter) { * string using the specified delimiter. * @param delimiter UTF-8 encoded string identifying the split points in each string. * An empty string indicates split on whitespace. - * @param maxSplit the maximum number of records to split, or -1 for all of them. + * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. * @return New table of strings columns. */ public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit) { @@ -3234,7 +3268,7 @@ public final ColumnVector listIndexOf(Scalar key, FindOptions findOption) { * The index is set to null if one of the following is true: * 1. The search key row is null. * 2. The list row is null. - * @param key ColumnView of search keys. + * @param keys ColumnView of search keys. * @param findOption Whether to find the first index of the key, or the last. * @return The resultant column of int32 indices */ @@ -3270,6 +3304,17 @@ public final Scalar getScalarElement(int index) { return new Scalar(getType(), getElement(getNativeView(), index)); } + /** + * Get the number of bytes needed to allocate a validity buffer for the given number of rows. + * According to cudf::bitmask_allocation_size_bytes, the padding boundary for the null mask is 64 bytes. + */ + static long getValidityBufferSize(int numRows) { + // number of bytes required = Math.ceil(number of bits / 8) + long actualBytes = ((long) numRows + 7) >> 3; + // pad to a multiple of the padding boundary (64 bytes) + return ((actualBytes + 63) >> 6) << 6; + } + ///////////////////////////////////////////////////////////////////////////// // INTERNAL/NATIVE ACCESS ///////////////////////////////////////////////////////////////////////////// @@ -3490,8 +3535,9 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle * delimiter. * @param columnView native handle of the cudf::column_view being operated on. * @param delimiter UTF-8 encoded string identifying the split points in each string. + * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. */ - private static native long[] stringSplit(long columnView, long delimiter); + private static native long[] stringSplit(long columnView, long delimiter, int maxSplit); private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit); @@ -3686,7 +3732,7 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat * Native method to find the first (or last) index of each search key in the specified column, * in each row of a list column.
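The new getValidityBufferSize helper mirrors the C++ function its comment cites: one validity bit per row, rounded up to whole bytes, then padded to a 64-byte boundary. A quick worked example against the C++ side, assuming cudf::bitmask_allocation_size_bytes from <cudf/null_mask.hpp> with its default 64-byte padding:

    #include <cudf/null_mask.hpp>
    #include <cstddef>

    void validity_size_example() {
      // 100 rows -> ceil(100 / 8) = 13 bytes -> padded up to the 64-byte boundary
      std::size_t const bytes_100 = cudf::bitmask_allocation_size_bytes(100);  // 64
      // 600 rows -> ceil(600 / 8) = 75 bytes -> padded up to 128
      std::size_t const bytes_600 = cudf::bitmask_allocation_size_bytes(600);  // 128
    }

The Java bit-shift version computes the same values: ((100 + 7) >> 3) = 13, then ((13 + 63) >> 6) << 6 = 64.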
* @param nativeView the column view handle of the list - * @param scalarColumnHandle handle to the search key column + * @param keyColumnHandle handle to the search key column * @param isFindFirst Whether to find the first index of the key, or the last. * @return column handle of the resultant column of int32 indices */ @@ -3866,11 +3912,6 @@ private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] vi private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, long boolColumnViewHandle) throws CudfException; - /** - * Get the number of bytes needed to allocate a validity buffer for the given number of rows. - */ - static native long getNativeValidPointerSize(int size); - //////// // Native cudf::column_view life cycle and metadata access methods. Life cycle methods // should typically only be called from the OffHeap inner class. @@ -3960,7 +4001,7 @@ static ColumnVector createColumnVector(DType type, int rows, HostMemoryBuffer da DeviceMemoryBuffer mainValidDevBuff = null; DeviceMemoryBuffer mainOffsetsDevBuff = null; if (mainColValid != null) { - long validLen = getNativeValidPointerSize(mainColRows); + long validLen = getValidityBufferSize(mainColRows); mainValidDevBuff = DeviceMemoryBuffer.allocate(validLen); mainValidDevBuff.copyFromHostBuffer(mainColValid, 0, validLen); } @@ -4069,7 +4110,7 @@ private static NestedColumnVector createNestedColumnVector(DType type, long rows data.copyFromHostBuffer(dataBuffer, 0, dataLen); } if (validityBuffer != null) { - long validLen = getNativeValidPointerSize((int)rows); + long validLen = getValidityBufferSize((int)rows); valid = DeviceMemoryBuffer.allocate(validLen); valid.copyFromHostBuffer(validityBuffer, 0, validLen); } diff --git a/java/src/main/java/ai/rapids/cudf/DType.java b/java/src/main/java/ai/rapids/cudf/DType.java index 742501be375..2e5b0202dc5 100644 --- a/java/src/main/java/ai/rapids/cudf/DType.java +++ b/java/src/main/java/ai/rapids/cudf/DType.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -307,7 +307,7 @@ public static DType fromJavaBigDecimal(BigDecimal dec) { return new DType(DTypeEnum.DECIMAL128, -dec.scale()); } throw new IllegalArgumentException("Precision " + dec.precision() + - " exceeds max precision cuDF can support " + DECIMAL64_MAX_PRECISION); + " exceeds max precision cuDF can support " + DECIMAL128_MAX_PRECISION); } /** diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java index 0fe7d7a5df8..3abc6db385d 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java @@ -199,7 +199,7 @@ public ColumnVector copyToDevice() { } HostMemoryBuffer hvalid = this.offHeap.valid; if (hvalid != null) { - long validLen = ColumnView.getNativeValidPointerSize((int) rows); + long validLen = ColumnView.getValidityBufferSize((int) rows); valid = DeviceMemoryBuffer.allocate(validLen); valid.copyFromHostBuffer(hvalid, 0, validLen); } @@ -858,7 +858,7 @@ public static HostColumnVector timestampNanoSecondsFromBoxedLongs(Long... 
values * Build */ - public static final class ColumnBuilder implements AutoCloseable { private DType type; private HostMemoryBuffer data; @@ -869,28 +869,78 @@ public static final class ColumnBuilder implements AutoCloseable { private boolean nullable; private long rows; private long estimatedRows; + private long rowCapacity = 0L; + private long validCapacity = 0L; private boolean built = false; private List childBuilders = new ArrayList<>(); + private Runnable nullHandler; - private int currentIndex = 0; - private int currentByteIndex = 0; - + // The value of currentIndex can't exceed Int32.Max. It is stored as a long to + // match HostMemoryBuffer.setXXX, which requires a long offset. + private long currentIndex = 0; + // Only for strings: the current position in the byte (data) buffer + private int currentStringByteIndex = 0; + // Use a bit shift instead of a multiply to transform a row offset into a byte offset + private int bitShiftBySize = 0; + private static final int bitShiftByOffset = (int)(Math.log(OFFSET_SIZE) / Math.log(2)); public ColumnBuilder(HostColumnVector.DataType type, long estimatedRows) { this.type = type.getType(); this.nullable = type.isNullable(); this.rows = 0; - this.estimatedRows = estimatedRows; + this.estimatedRows = Math.max(estimatedRows, 1L); + this.bitShiftBySize = (int)(Math.log(this.type.getSizeInBytes()) / Math.log(2)); + + // initialize the null handler according to the data type + this.setupNullHandler(); + for (int i = 0; i < type.getNumChildren(); i++) { childBuilders.add(new ColumnBuilder(type.getChild(i), estimatedRows)); } } + private void setupNullHandler() { + if (this.type == DType.LIST) { + this.nullHandler = () -> { + this.growListBuffersAndRows(); + this.growValidBuffer(); + setNullAt(currentIndex++); + offsets.setInt(currentIndex << bitShiftByOffset, childBuilders.get(0).getCurrentIndex()); + }; + } else if (this.type == DType.STRING) { + this.nullHandler = () -> { + this.growStringBuffersAndRows(0); + this.growValidBuffer(); + setNullAt(currentIndex++); + offsets.setInt(currentIndex << bitShiftByOffset, currentStringByteIndex); + }; + } else if (this.type == DType.STRUCT) { + this.nullHandler = () -> { + this.growStructBuffersAndRows(); + this.growValidBuffer(); + setNullAt(currentIndex++); + for (ColumnBuilder childBuilder : childBuilders) { + childBuilder.appendNull(); + } + }; + } else { + this.nullHandler = () -> { + this.growFixedWidthBuffersAndRows(); + this.growValidBuffer(); + setNullAt(currentIndex++); + }; + } + } + public HostColumnVector build() { List hostColumnVectorCoreList = new ArrayList<>(); for (ColumnBuilder childBuilder : childBuilders) { hostColumnVectorCoreList.add(childBuilder.buildNestedInternal()); } + // Align the valid buffer's size with the other buffers' row size, because the valid buffer grows lazily. + if (valid != null) { + growValidBuffer(); + } HostColumnVector hostColumnVector = new HostColumnVector(type, rows, Optional.of(nullCount), data, valid, offsets, hostColumnVectorCoreList); built = true; @@ -902,6 +952,10 @@ private HostColumnVectorCore buildNestedInternal() { for (ColumnBuilder childBuilder : childBuilders) { hostColumnVectorCoreList.add(childBuilder.buildNestedInternal()); } + // Align the valid buffer's size with the other buffers' row size, because the valid buffer grows lazily.
+ if (valid != null) { + growValidBuffer(); + } return new HostColumnVectorCore(type, rows, Optional.of(nullCount), data, valid, offsets, hostColumnVectorCoreList); } @@ -929,71 +983,113 @@ public ColumnBuilder appendStructValues(StructData... inputList) { } /** - * A method that is responsible for growing the buffers as needed - * and incrementing the row counts when we append values or nulls. - * @param hasNull indicates whether the validity buffer needs to be considered, as the - * nullcount may not have been fully calculated yet - * @param length used for strings + * Grows the valid buffer lazily. The valid buffer won't be materialized until the first null + * value is appended. This method reuses rowCapacity to track the size of the column. + * Therefore, call the specific grow-buffer method to update rowCapacity before calling + * this method. + */ + private void growValidBuffer() { + if (valid == null) { + long maskBytes = ColumnView.getValidityBufferSize((int) rowCapacity); + valid = HostMemoryBuffer.allocate(maskBytes); + valid.setMemory(0, valid.length, (byte) 0xFF); + validCapacity = rowCapacity; + return; + } + if (validCapacity < rowCapacity) { + long maskBytes = ColumnView.getValidityBufferSize((int) rowCapacity); + HostMemoryBuffer newValid = HostMemoryBuffer.allocate(maskBytes); + newValid.setMemory(0, newValid.length, (byte) 0xFF); + valid = copyBuffer(newValid, valid); + validCapacity = rowCapacity; + } + } + + /** + * Automatically grows the data buffer for fixed-width columns as needed and increments + * the row count. Call this method before appending any value or null. */ - private void growBuffersAndRows(boolean hasNull, int length) { assert rows + 1 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; rows++; - long targetDataSize = 0; - if (!type.isNestedType()) { - if (type.equals(DType.STRING)) { - targetDataSize = data == null ? length : currentByteIndex + length; - } else { - targetDataSize = data == null ? estimatedRows * type.getSizeInBytes() : rows * type.getSizeInBytes(); - } + private void growFixedWidthBuffersAndRows() { + assert rows + 1 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + if (data == null) { + data = HostMemoryBuffer.allocate(estimatedRows << bitShiftBySize); + rowCapacity = estimatedRows; + } else if (rows > rowCapacity) { + long newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 1); + data = copyBuffer(HostMemoryBuffer.allocate(newCap << bitShiftBySize), data); + rowCapacity = newCap; } + } - if (targetDataSize > 0) { - if (data == null) { - data = HostMemoryBuffer.allocate(targetDataSize); - } else { - long maxLen; - if (type.equals(DType.STRING)) { - maxLen = Integer.MAX_VALUE; - } else { - maxLen = Integer.MAX_VALUE * (long) type.getSizeInBytes(); - } - long oldLen = data.getLength(); - long newDataLen = Math.max(1, oldLen); - while (targetDataSize > newDataLen) { - newDataLen = newDataLen * 2; - } - if (newDataLen != oldLen) { - newDataLen = Math.min(newDataLen, maxLen); - if (newDataLen < targetDataSize) { - throw new IllegalStateException("A data buffer for strings is not supported over 2GB in size"); - } - HostMemoryBuffer newData = HostMemoryBuffer.allocate(newDataLen); - data = copyBuffer(newData, data); - } - } + /** + * Automatically grows the offsets buffer for list columns as needed and increments + * the row count. Call this method before appending any value or null.
+ */ + private void growListBuffersAndRows() { + assert rows + 2 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + + if (offsets == null) { + offsets = HostMemoryBuffer.allocate((estimatedRows + 1) << bitShiftByOffset); + offsets.setInt(0, 0); + rowCapacity = estimatedRows; + } else if (rows > rowCapacity) { + long newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 2); + offsets = copyBuffer(HostMemoryBuffer.allocate((newCap + 1) << bitShiftByOffset), offsets); + rowCapacity = newCap; } - if (type.equals(DType.LIST) || type.equals(DType.STRING)) { - if (offsets == null) { - offsets = HostMemoryBuffer.allocate((estimatedRows + 1) * OFFSET_SIZE); - offsets.setInt(0, 0); - } else if ((rows +1) * OFFSET_SIZE > offsets.length) { - long newOffsetLen = offsets.length * 2; - HostMemoryBuffer newOffsets = HostMemoryBuffer.allocate(newOffsetLen); - offsets = copyBuffer(newOffsets, offsets); - } + } + + /** + * Automatically grows the offsets and data buffers for string columns as needed and + * increments the row count. Call this method before appending any value or null. + * + * @param stringLength number of bytes required by the next row + */ + private void growStringBuffersAndRows(int stringLength) { + assert rows + 2 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + + if (offsets == null) { + // Initialize data buffer with at least 1 byte in case the first appended value is null. + data = HostMemoryBuffer.allocate(Math.max(1, stringLength)); + offsets = HostMemoryBuffer.allocate((estimatedRows + 1) << bitShiftByOffset); + offsets.setInt(0, 0); + rowCapacity = estimatedRows; + return; } - if (hasNull || nullCount > 0) { - if (valid == null) { - long targetValidSize = ColumnView.getNativeValidPointerSize((int)estimatedRows); - valid = HostMemoryBuffer.allocate(targetValidSize); - valid.setMemory(0, targetValidSize, (byte) 0xFF); - } else if (valid.length < ColumnView.getNativeValidPointerSize((int)rows)) { - long newValidLen = valid.length * 2; - HostMemoryBuffer newValid = HostMemoryBuffer.allocate(newValidLen); - newValid.setMemory(0, newValidLen, (byte) 0xFF); - valid = copyBuffer(newValid, valid); - } + + if (rows > rowCapacity) { + long newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 2); + offsets = copyBuffer(HostMemoryBuffer.allocate((newCap + 1) << bitShiftByOffset), offsets); + rowCapacity = newCap; + } + + long currentLength = currentStringByteIndex + stringLength; + if (currentLength > data.length) { + long requiredLength = data.length; + do { + requiredLength = requiredLength * 2; + } while (currentLength > requiredLength); + data = copyBuffer(HostMemoryBuffer.allocate(requiredLength), data); + } + } + + /** + * For struct columns, we only need to update rows and rowCapacity (for the growth of the + * valid buffer), because struct columns hold no buffers themselves. + * Call this method before appending any value or null.
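All of the grow*BuffersAndRows methods above share one amortized-doubling policy: when the next row would exceed rowCapacity, the capacity doubles (capped just under Integer.MAX_VALUE) and the old buffer is copied into the new one; only the string data buffer loops, since a single long value can outgrow one doubling. A minimal sketch of that policy as a hypothetical helper (not cudf API):

    #include <algorithm>
    #include <cstddef>

    // Double `capacity` until it covers `needed`, never exceeding `cap`.
    // Mirrors `newCap = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 1)` and the
    // do/while loop in growStringBuffersAndRows; callers guarantee capacity >= 1.
    std::size_t grow_capacity(std::size_t capacity, std::size_t needed, std::size_t cap) {
      while (capacity < needed && capacity < cap) {
        capacity = std::min(capacity * 2, cap);
      }
      return capacity;  // caller must still check capacity >= needed if cap was hit
    }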
+ */ + private void growStructBuffersAndRows() { + assert rows + 1 <= Integer.MAX_VALUE : "Row count cannot go over Integer.MAX_VALUE"; + rows++; + + if (rowCapacity == 0) { + rowCapacity = estimatedRows; + } else if (rows > rowCapacity) { + rowCapacity = Math.min(rowCapacity * 2, Integer.MAX_VALUE - 1); } } @@ -1015,29 +1111,13 @@ private HostMemoryBuffer copyBuffer(HostMemoryBuffer targetBuffer, HostMemoryBuf * Method that sets the null bit in the validity vector * @param index the row index at which the null is marked */ - private void setNullAt(int index) { + private void setNullAt(long index) { assert index < rows : "Index for null value should fit the column with " + rows + " rows"; nullCount += BitVectorHelper.setNullAt(valid, index); } public final ColumnBuilder appendNull() { - growBuffersAndRows(true, 0); - setNullAt(currentIndex); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); - if (type.hasOffsets()) { - if (type.equals(DType.LIST)) { - offsets.setInt(currentIndex * OFFSET_SIZE, childBuilders.get(0).getCurrentIndex()); - } else { - // It is a String - offsets.setInt(currentIndex * OFFSET_SIZE, currentByteIndex); - } - } else if (type.equals(DType.STRUCT)) { - // structs propagate nulls to children and even further down if needed - for (ColumnBuilder childBuilder : childBuilders) { - childBuilder.appendNull(); - } - } + nullHandler.run(); return this; } @@ -1081,7 +1161,7 @@ public ColumnBuilder endStruct() { assert type.equals(DType.STRUCT) : "This only works for structs"; assert allChildrenHaveSameIndex() : "Appending structs data appears to be off " + childBuilders + " should all have the same currentIndex " + type; - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growStructBuffersAndRows(); currentIndex++; return this; } @@ -1095,9 +1175,8 @@ assert allChildrenHaveSameIndex() : "Appending structs data appears to be off " */ public ColumnBuilder endList() { assert type.equals(DType.LIST); - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); - currentIndex++; - offsets.setInt(currentIndex * OFFSET_SIZE, childBuilders.get(0).getCurrentIndex()); + growListBuffersAndRows(); + offsets.setInt(++currentIndex << bitShiftByOffset, childBuilders.get(0).getCurrentIndex()); return this; } @@ -1155,80 +1234,67 @@ public void incrCurrentIndex() { } public int getCurrentIndex() { - return currentIndex; + return (int) currentIndex; } + @Deprecated public int getCurrentByteIndex() { - return currentByteIndex; + return currentStringByteIndex; } public final ColumnBuilder append(byte value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByByte(); assert currentIndex < rows; - data.setByte(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setByte(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(short value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByShort(); assert currentIndex < rows; - data.setShort(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setShort(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(int value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + 
type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByInt(); assert currentIndex < rows; - data.setInt(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setInt(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(long value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.isBackedByLong(); assert currentIndex < rows; - data.setLong(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setLong(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(float value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.equals(DType.FLOAT32); assert currentIndex < rows; - data.setFloat(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setFloat(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(double value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.equals(DType.FLOAT64); assert currentIndex < rows; - data.setDouble(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setDouble(currentIndex++ << bitShiftBySize, value); return this; } public final ColumnBuilder append(boolean value) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert type.equals(DType.BOOL8); assert currentIndex < rows; - data.setBoolean(currentIndex * type.getSizeInBytes(), value); - currentIndex++; - currentByteIndex += type.getSizeInBytes(); + data.setBoolean(currentIndex++ << bitShiftBySize, value); return this; } @@ -1237,22 +1303,19 @@ public ColumnBuilder append(BigDecimal value) { } public ColumnBuilder append(BigInteger unscaledVal) { - growBuffersAndRows(false, currentIndex * type.getSizeInBytes() + type.getSizeInBytes()); + growFixedWidthBuffersAndRows(); assert currentIndex < rows; if (type.typeId == DType.DTypeEnum.DECIMAL32) { - data.setInt(currentIndex * type.getSizeInBytes(), unscaledVal.intValueExact()); + data.setInt(currentIndex++ << bitShiftBySize, unscaledVal.intValueExact()); } else if (type.typeId == DType.DTypeEnum.DECIMAL64) { - data.setLong(currentIndex * type.getSizeInBytes(), unscaledVal.longValueExact()); + data.setLong(currentIndex++ << bitShiftBySize, unscaledVal.longValueExact()); } else if (type.typeId == DType.DTypeEnum.DECIMAL128) { - assert currentIndex < rows; byte[] unscaledValueBytes = unscaledVal.toByteArray(); byte[] result = convertDecimal128FromJavaToCudf(unscaledValueBytes); - data.setBytes(currentIndex*DType.DTypeEnum.DECIMAL128.sizeInBytes, result, 0, result.length); - } else { + data.setBytes(currentIndex++ << bitShiftBySize, result, 0, result.length); + } else { throw new IllegalStateException(type + " is not a supported decimal type."); } - currentIndex++; - currentByteIndex += type.getSizeInBytes(); return this; } @@ -1271,14 +1334,13 @@ public ColumnBuilder appendUTF8String(byte[] value, int srcOffset, int length) { assert length >= 0; assert value.length + srcOffset <= length; assert type.equals(DType.STRING) : " type " + type + " is not 
String"; - currentIndex++; - growBuffersAndRows(false, length); - assert currentIndex < rows + 1; + growStringBuffersAndRows(length); + assert currentIndex < rows; if (length > 0) { - data.setBytes(currentByteIndex, value, srcOffset, length); + data.setBytes(currentStringByteIndex, value, srcOffset, length); } - currentByteIndex += length; - offsets.setInt(currentIndex * OFFSET_SIZE, currentByteIndex); + currentStringByteIndex += length; + offsets.setInt(++currentIndex << bitShiftByOffset, currentStringByteIndex); return this; } @@ -1822,7 +1884,7 @@ public final Builder append(HostColumnVector columnVector) { } private void allocateBitmaskAndSetDefaultValues() { - long bitmaskSize = ColumnView.getNativeValidPointerSize((int) rows); + long bitmaskSize = ColumnView.getValidityBufferSize((int) rows); valid = HostMemoryBuffer.allocate(bitmaskSize); valid.setMemory(0, bitmaskSize, (byte) 0xFF); } diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 317ef152492..a45716a89b3 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -395,6 +395,9 @@ template class native_jpointerArray { T **data() { return reinterpret_cast(wrapped.data()); } + T *const *begin() const { return data(); } + T *const *end() const { return data() + size(); } + const jlongArray get_jArray() const { return wrapped.get_jArray(); } jlongArray get_jArray() { return wrapped.get_jArray(); } diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 38c6bb3740e..63247eb0066 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -561,17 +561,17 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, jlong column_view, - jlong delimiter) { + jlong delimiter_ptr, + jint max_split) { JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0); + JNI_NULL_CHECK(env, delimiter_ptr, "string scalar delimiter is null", 0); try { cudf::jni::auto_set_device(env); - cudf::column_view *cv = reinterpret_cast(column_view); - cudf::strings_column_view scv(*cv); - cudf::string_scalar *ss_scalar = reinterpret_cast(delimiter); + cudf::strings_column_view const scv{*reinterpret_cast(column_view)}; + auto delimiter = reinterpret_cast(delimiter_ptr); - std::unique_ptr table_result = cudf::strings::split(scv, *ss_scalar); - return cudf::jni::convert_table_for_return(env, table_result); + return cudf::jni::convert_table_for_return(env, + cudf::strings::split(scv, *delimiter, max_split)); } CATCH_STD(env, 0); } @@ -1409,13 +1409,12 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_extractRe(JNIEnv *en try { cudf::jni::auto_set_device(env); - cudf::column_view *column_view = reinterpret_cast(j_view_handle); - cudf::strings_column_view strings_column(*column_view); + cudf::strings_column_view const strings_column{ + *reinterpret_cast(j_view_handle)}; cudf::jni::native_jstring pattern(env, patternObj); - std::unique_ptr table_result = - cudf::strings::extract(strings_column, pattern.get()); - return cudf::jni::convert_table_for_return(env, table_result); + return cudf::jni::convert_table_for_return( + env, cudf::strings::extract(strings_column, pattern.get())); } CATCH_STD(env, 0); } @@ -1791,16 +1790,6 @@ JNIEXPORT jlong JNICALL 
Java_ai_rapids_cudf_ColumnView_getNativeValidityLength(J CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeValidPointerSize(JNIEnv *env, - jobject j_object, - jint size) { - try { - cudf::jni::auto_set_device(env); - return static_cast(cudf::bitmask_allocation_size_bytes(size)); - } - CATCH_STD(env, 0); -} - JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getDeviceMemorySize(JNIEnv *env, jclass, jlong handle) { JNI_NULL_CHECK(env, handle, "native handle is null", 0); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 10f295e27bf..aeac1856db0 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -599,37 +599,27 @@ class native_arrow_ipc_reader_handle final { void close() { source->Close(); } }; -/** - * Take a table returned by some operation and turn it into an array of column* so we can track them - * ourselves in java instead of having their life tied to the table. - * @param table_result the table to convert for return - * @param extra_columns columns not in the table that will be added to the result at the end. - */ -static jlongArray -convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, - std::vector> &extra_columns) { +jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &&table_result, + std::vector> &&extra_columns) { std::vector> ret = table_result->release(); int table_cols = ret.size(); int num_columns = table_cols + extra_columns.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); - for (int i = 0; i < table_cols; i++) { - outcol_handles[i] = release_as_jlong(ret[i]); - } - for (size_t i = 0; i < extra_columns.size(); i++) { - outcol_handles[i + table_cols] = release_as_jlong(extra_columns[i]); - } + std::transform(ret.begin(), ret.end(), outcol_handles.begin(), + [](auto &col) { return release_as_jlong(col); }); + std::transform(extra_columns.begin(), extra_columns.end(), outcol_handles.begin() + table_cols, + [](auto &col) { return release_as_jlong(col); }); return outcol_handles.get_jArray(); } -jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result) { - std::vector> extra; - return convert_table_for_return(env, table_result, extra); +jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, + std::vector> &&extra_columns) { + return convert_table_for_return(env, std::move(table_result), std::move(extra_columns)); } jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &first_table, std::unique_ptr &second_table) { - std::vector> second_tmp = second_table->release(); - return convert_table_for_return(env, first_table, second_tmp); + return convert_table_for_return(env, first_table, second_table->release()); } // Convert the JNI boolean array of key column sort order to a vector of cudf::order @@ -1068,6 +1058,7 @@ cudf::table_view remove_validity_if_needed(cudf::table_view *input_table_view) { } // namespace jni } // namespace cudf +using cudf::jni::convert_table_for_return; using cudf::jni::ptr_as_jlong; using cudf::jni::release_as_jlong; @@ -1223,9 +1214,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_orderBy(JNIEnv *env, jcla std::vector sort_keys = n_sort_keys_columns.get_dereferenced(); auto sorted_col = cudf::sorted_order(cudf::table_view{sort_keys}, order, null_order); - cudf::table_view *input_table = reinterpret_cast(j_input_table); - std::unique_ptr result = cudf::gather(*input_table, sorted_col->view()); - return 
cudf::jni::convert_table_for_return(env, result); + auto const input_table = reinterpret_cast(j_input_table); + return convert_table_for_return(env, cudf::gather(*input_table, sorted_col->view())); } CATCH_STD(env, NULL); } @@ -1267,8 +1257,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass n_are_nulls_smallest.transform_if_else(cudf::null_order::BEFORE, cudf::null_order::AFTER); std::vector tables = n_table_handles.get_dereferenced(); - std::unique_ptr result = cudf::merge(tables, indexes, order, null_order); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::merge(tables, indexes, order, null_order)); } CATCH_STD(env, NULL); } @@ -1344,8 +1333,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( .comment(comment) .build(); - cudf::io::table_with_metadata result = cudf::io::read_csv(opts); - return cudf::jni::convert_table_for_return(env, result.tbl); + return convert_table_for_return(env, cudf::io::read_csv(opts).tbl); } CATCH_STD(env, NULL); } @@ -1425,7 +1413,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( // there is no need to re-order columns when inferring schema if (result.metadata.column_names.empty() || n_col_names.size() <= 0) { - return cudf::jni::convert_table_for_return(env, result.tbl); + return convert_table_for_return(env, result.tbl); } else { // json reader will not return the correct column order, // so we need to re-order the column of table according to table meta. @@ -1453,11 +1441,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( if (!match) { // can't find some input column names in table meta, return what json reader reads. - return cudf::jni::convert_table_for_return(env, result.tbl); + return convert_table_for_return(env, result.tbl); } else { auto tbv = result.tbl->view().select(std::move(indices)); auto table = std::make_unique(tbv); - return cudf::jni::convert_table_for_return(env, table); + return convert_table_for_return(env, table); } } } @@ -1501,8 +1489,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env, .convert_strings_to_categories(false) .timestamp_type(cudf::data_type(static_cast(unit))) .build(); - cudf::io::table_with_metadata result = cudf::io::read_parquet(opts); - return cudf::jni::convert_table_for_return(env, result.tbl); + return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl); } CATCH_STD(env, NULL); } @@ -1672,8 +1659,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC( .timestamp_type(cudf::data_type(static_cast(unit))) .decimal128_columns(n_dec128_col_names.as_cpp_vector()) .build(); - cudf::io::table_with_metadata result = cudf::io::read_orc(opts); - return cudf::jni::convert_table_for_return(env, result.tbl); + return convert_table_for_return(env, cudf::io::read_orc(opts).tbl); } CATCH_STD(env, NULL); } @@ -1956,8 +1942,7 @@ Java_ai_rapids_cudf_Table_convertArrowTableToCudf(JNIEnv *env, jclass, jlong arr try { cudf::jni::auto_set_device(env); - std::unique_ptr result = cudf::from_arrow(*(handle->get())); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::from_arrow(*(handle->get()))); } CATCH_STD(env, 0) } @@ -2142,7 +2127,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftSemiJoin( static_cast(compare_nulls_equal) ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, result); } CATCH_STD(env, NULL); } @@ -2171,7 +2156,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoin( static_cast(compare_nulls_equal) ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, result); } CATCH_STD(env, NULL); } @@ -2706,12 +2691,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); - cudf::table_view *n_left_table = reinterpret_cast(left_table); - cudf::table_view *n_right_table = reinterpret_cast(right_table); - - std::unique_ptr result = cudf::cross_join(*n_left_table, *n_right_table); - - return cudf::jni::convert_table_for_return(env, result); + auto const left = reinterpret_cast(left_table); + auto const right = reinterpret_cast(right_table); + return convert_table_for_return(env, cudf::cross_join(*left, *right)); } CATCH_STD(env, NULL); } @@ -2734,18 +2716,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_concatenate(JNIEnv *env, try { cudf::jni::auto_set_device(env); cudf::jni::native_jpointerArray tables(env, table_handles); - - int num_tables = tables.size(); - // There are some issues with table_view and std::vector. We cannot give the - // vector a size or it will not compile. - std::vector to_concat; - to_concat.reserve(num_tables); - for (int i = 0; i < num_tables; i++) { - JNI_NULL_CHECK(env, tables[i], "input table included a null", NULL); - to_concat.push_back(*tables[i]); - } - std::unique_ptr table_result = cudf::concatenate(to_concat); - return cudf::jni::convert_table_for_return(env, table_result); + std::vector const to_concat = tables.get_dereferenced(); + return convert_table_for_return(env, cudf::concatenate(to_concat)); } CATCH_STD(env, NULL); } @@ -2763,20 +2735,19 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_partition(JNIEnv *env, jc try { cudf::jni::auto_set_device(env); - cudf::table_view *n_input_table = reinterpret_cast(input_table); - cudf::column_view *n_part_column = reinterpret_cast(partition_column); - cudf::jni::native_jintArray n_output_offsets(env, output_offsets); + auto const n_input_table = reinterpret_cast(input_table); + auto const n_part_column = reinterpret_cast(partition_column); - auto result = cudf::partition(*n_input_table, *n_part_column, number_of_partitions); + auto [partitioned_table, partition_offsets] = + cudf::partition(*n_input_table, *n_part_column, number_of_partitions); - for (size_t i = 0; i < result.second.size() - 1; i++) { - // for what ever reason partition returns the length of the result at then - // end and hash partition/round robin do not, so skip the last entry for - // consistency - n_output_offsets[i] = result.second[i]; - } + // For whatever reason, partition returns the length of the result at the + // end while hash partition/round robin do not, so skip the last entry for + // consistency. + cudf::jni::native_jintArray n_output_offsets(env, output_offsets); + std::copy(partition_offsets.begin(), partition_offsets.end() - 1, n_output_offsets.begin()); - return cudf::jni::convert_table_for_return(env, result.first); + return convert_table_for_return(env, partitioned_table); } CATCH_STD(env, NULL); } @@ -2792,26 +2763,21 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition( try { cudf::jni::auto_set_device(env); 
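The Table_partition refactor above swaps pair.first/.second access for structured bindings, which names both results at the call site. The core pattern in isolation, as a sketch with assumed parameters (`input`, `parts`, `num_partitions` are placeholders, not names from the patch):

    #include <cudf/partitioning.hpp>

    // `parts` is an INT32 column mapping each row of `input` to a partition.
    void partition_example(cudf::table_view const& input, cudf::column_view const& parts,
                           cudf::size_type num_partitions) {
      // cudf::partition returns std::pair<std::unique_ptr<cudf::table>,
      // std::vector<cudf::size_type>>; structured bindings unpack both pieces.
      auto [partitioned_table, partition_offsets] =
          cudf::partition(input, parts, num_partitions);
      // As the comment in the patch notes, the offsets vector carries a trailing
      // total-length entry, hence copying only begin() .. end() - 1 to the caller.
    }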
- cudf::hash_id hash_func = static_cast(hash_function); - cudf::table_view *n_input_table = reinterpret_cast(input_table); + auto const hash_func = static_cast(hash_function); + auto const n_input_table = reinterpret_cast(input_table); cudf::jni::native_jintArray n_columns_to_hash(env, columns_to_hash); - cudf::jni::native_jintArray n_output_offsets(env, output_offsets); - JNI_ARG_CHECK(env, n_columns_to_hash.size() > 0, "columns_to_hash is zero", NULL); - std::vector columns_to_hash_vec(n_columns_to_hash.size()); - for (int i = 0; i < n_columns_to_hash.size(); i++) { - columns_to_hash_vec[i] = n_columns_to_hash[i]; - } + std::vector columns_to_hash_vec(n_columns_to_hash.begin(), + n_columns_to_hash.end()); - std::pair, std::vector> result = + auto [partitioned_table, partition_offsets] = cudf::hash_partition(*n_input_table, columns_to_hash_vec, number_of_partitions, hash_func); - for (size_t i = 0; i < result.second.size(); i++) { - n_output_offsets[i] = result.second[i]; - } + cudf::jni::native_jintArray n_output_offsets(env, output_offsets); + std::copy(partition_offsets.begin(), partition_offsets.end(), n_output_offsets.begin()); - return cudf::jni::convert_table_for_return(env, result.first); + return convert_table_for_return(env, partitioned_table); } CATCH_STD(env, NULL); } @@ -2827,15 +2793,14 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_roundRobinPartition( try { cudf::jni::auto_set_device(env); auto n_input_table = reinterpret_cast(input_table); - cudf::jni::native_jintArray n_output_offsets(env, output_offsets); - auto result = cudf::round_robin_partition(*n_input_table, num_partitions, start_partition); + auto [partitioned_table, partition_offsets] = + cudf::round_robin_partition(*n_input_table, num_partitions, start_partition); - for (size_t i = 0; i < result.second.size(); i++) { - n_output_offsets[i] = result.second[i]; - } + cudf::jni::native_jintArray n_output_offsets(env, output_offsets); + std::copy(partition_offsets.begin(), partition_offsets.end(), n_output_offsets.begin()); - return cudf::jni::convert_table_for_return(env, result.first); + return convert_table_for_return(env, partitioned_table); } CATCH_STD(env, NULL); } @@ -2905,7 +2870,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByAggregate( result_columns.push_back(std::move(result.second[agg_result_index].results[col_agg_index])); } } - return cudf::jni::convert_table_for_return(env, result.first, result_columns); + return convert_table_for_return(env, result.first, std::move(result_columns)); } CATCH_STD(env, NULL); } @@ -2975,7 +2940,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByScan( result_columns.push_back(std::move(result.second[agg_result_index].results[col_agg_index])); } } - return cudf::jni::convert_table_for_return(env, result.first, result_columns); + return convert_table_for_return(env, result.first, std::move(result_columns)); } CATCH_STD(env, NULL); } @@ -3020,10 +2985,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByReplaceNulls( std::vector policies = n_is_preceding.transform_if_else( cudf::replace_policy::PRECEDING, cudf::replace_policy::FOLLOWING); - std::pair, std::unique_ptr> result = - grouper.replace_nulls(n_replace_table, policies); - - return cudf::jni::convert_table_for_return(env, result.first, result.second); + auto [keys, results] = grouper.replace_nulls(n_replace_table, policies); + return convert_table_for_return(env, keys, results); } CATCH_STD(env, NULL); } @@ -3034,10 +2997,9 @@ JNIEXPORT jlongArray JNICALL 
Java_ai_rapids_cudf_Table_filter(JNIEnv *env, jclas JNI_NULL_CHECK(env, mask_jcol, "mask column is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input = reinterpret_cast(input_jtable); - cudf::column_view *mask = reinterpret_cast(mask_jcol); - std::unique_ptr result = cudf::apply_boolean_mask(*input, *mask); - return cudf::jni::convert_table_for_return(env, result); + auto const input = reinterpret_cast(input_jtable); + auto const mask = reinterpret_cast(mask_jcol); + return convert_table_for_return(env, cudf::apply_boolean_mask(*input, *mask)); } CATCH_STD(env, 0); } @@ -3063,7 +3025,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates( nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL, nulls_before ? cudf::null_order::BEFORE : cudf::null_order::AFTER, rmm::mr::get_current_device_resource()); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, result); } CATCH_STD(env, 0); } @@ -3074,12 +3036,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas JNI_NULL_CHECK(env, j_map, "map column is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input = reinterpret_cast(j_input); - cudf::column_view *map = reinterpret_cast(j_map); + auto const input = reinterpret_cast(j_input); + auto const map = reinterpret_cast(j_map); auto bounds_policy = check_bounds ? cudf::out_of_bounds_policy::NULLIFY : cudf::out_of_bounds_policy::DONT_CHECK; - std::unique_ptr result = cudf::gather(*input, *map, bounds_policy); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::gather(*input, *map, bounds_policy)); } CATCH_STD(env, 0); } @@ -3090,7 +3051,7 @@ Java_ai_rapids_cudf_Table_convertToRowsFixedWidthOptimized(JNIEnv *env, jclass, try { cudf::jni::auto_set_device(env); - cudf::table_view *n_input_table = reinterpret_cast(input_table); + auto const n_input_table = reinterpret_cast(input_table); std::vector> cols = cudf::jni::convert_to_rows_fixed_width_optimized(*n_input_table); int num_columns = cols.size(); @@ -3114,8 +3075,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterTable(JNIEnv *env, auto const input = reinterpret_cast(j_input); auto const map = reinterpret_cast(j_map); auto const target = reinterpret_cast(j_target); - auto result = cudf::scatter(*input, *map, *target, check_bounds); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::scatter(*input, *map, *target, check_bounds)); } CATCH_STD(env, 0); } @@ -3131,13 +3091,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterScalars(JNIEnv *en cudf::jni::auto_set_device(env); auto const scalars_array = cudf::jni::native_jpointerArray(env, j_input); std::vector> input; - for (int i = 0; i < scalars_array.size(); ++i) { - input.emplace_back(*scalars_array[i]); - } + std::transform(scalars_array.begin(), scalars_array.end(), std::back_inserter(input), + [](auto &scalar) { return std::ref(*scalar); }); auto const map = reinterpret_cast(j_map); auto const target = reinterpret_cast(j_target); - auto result = cudf::scatter(input, *map, *target, check_bounds); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::scatter(input, *map, *target, check_bounds)); } CATCH_STD(env, 0); } @@ -3148,7 +3106,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env try { cudf::jni::auto_set_device(env); - 
cudf::table_view *n_input_table = reinterpret_cast(input_table); + auto const n_input_table = reinterpret_cast(input_table); std::vector> cols = cudf::jni::convert_to_rows(*n_input_table); int num_columns = cols.size(); cudf::jni::native_jlongArray outcol_handles(env, num_columns); @@ -3166,8 +3124,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidth try { cudf::jni::auto_set_device(env); - cudf::column_view *input = reinterpret_cast(input_column); - cudf::lists_column_view list_input(*input); + cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; cudf::jni::native_jintArray n_types(env, types); cudf::jni::native_jintArray n_scale(env, scale); if (n_types.size() != n_scale.size()) { @@ -3179,7 +3136,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRowsFixedWidth [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); std::unique_ptr result = cudf::jni::convert_from_rows_fixed_width_optimized(list_input, types_vec); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, result); } CATCH_STD(env, 0); } @@ -3193,8 +3150,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e try { cudf::jni::auto_set_device(env); - cudf::column_view *input = reinterpret_cast(input_column); - cudf::lists_column_view list_input(*input); + cudf::lists_column_view const list_input{*reinterpret_cast(input_column)}; cudf::jni::native_jintArray n_types(env, types); cudf::jni::native_jintArray n_scale(env, scale); if (n_types.size() != n_scale.size()) { @@ -3205,7 +3161,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *e std::transform(n_types.begin(), n_types.end(), n_scale.begin(), std::back_inserter(types_vec), [](jint type, jint scale) { return cudf::jni::make_data_type(type, scale); }); std::unique_ptr result = cudf::jni::convert_from_rows(list_input, types_vec); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, result); } CATCH_STD(env, 0); } @@ -3216,9 +3172,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_repeatStaticCount(JNIEnv JNI_NULL_CHECK(env, input_jtable, "input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input = reinterpret_cast(input_jtable); - std::unique_ptr result = cudf::repeat(*input, count); - return cudf::jni::convert_table_for_return(env, result); + auto const input = reinterpret_cast(input_jtable); + return convert_table_for_return(env, cudf::repeat(*input, count)); } CATCH_STD(env, 0); } @@ -3231,10 +3186,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_repeatColumnCount(JNIEnv JNI_NULL_CHECK(env, count_jcol, "count column is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input = reinterpret_cast(input_jtable); - cudf::column_view *count = reinterpret_cast(count_jcol); - std::unique_ptr result = cudf::repeat(*input, *count, check_count); - return cudf::jni::convert_table_for_return(env, result); + auto const input = reinterpret_cast(input_jtable); + auto const count = reinterpret_cast(count_jcol); + return convert_table_for_return(env, cudf::repeat(*input, *count, check_count)); } CATCH_STD(env, 0); } @@ -3351,7 +3305,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rollingWindowAggregate( } auto result_table = std::make_unique(std::move(result_columns)); - return cudf::jni::convert_table_for_return(env, result_table); + return 
convert_table_for_return(env, result_table); } CATCH_STD(env, NULL); } @@ -3444,7 +3398,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rangeRollingWindowAggrega } auto result_table = std::make_unique(std::move(result_columns)); - return cudf::jni::convert_table_for_return(env, result_table); + return convert_table_for_return(env, result_table); } CATCH_STD(env, NULL); } @@ -3455,10 +3409,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explode(JNIEnv *env, jcla JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input_table = reinterpret_cast(input_jtable); - cudf::size_type col_index = static_cast(column_index); - std::unique_ptr exploded = cudf::explode(*input_table, col_index); - return cudf::jni::convert_table_for_return(env, exploded); + auto const input_table = reinterpret_cast(input_jtable); + auto const col_index = static_cast(column_index); + return convert_table_for_return(env, cudf::explode(*input_table, col_index)); } CATCH_STD(env, 0); } @@ -3469,10 +3422,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodePosition(JNIEnv *e JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input_table = reinterpret_cast(input_jtable); - cudf::size_type col_index = static_cast(column_index); - std::unique_ptr exploded = cudf::explode_position(*input_table, col_index); - return cudf::jni::convert_table_for_return(env, exploded); + auto const input_table = reinterpret_cast(input_jtable); + auto const col_index = static_cast(column_index); + return convert_table_for_return(env, cudf::explode_position(*input_table, col_index)); } CATCH_STD(env, 0); } @@ -3483,10 +3435,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuter(JNIEnv *env, JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input_table = reinterpret_cast(input_jtable); - cudf::size_type col_index = static_cast(column_index); - std::unique_ptr exploded = cudf::explode_outer(*input_table, col_index); - return cudf::jni::convert_table_for_return(env, exploded); + auto const input_table = reinterpret_cast(input_jtable); + auto const col_index = static_cast(column_index); + return convert_table_for_return(env, cudf::explode_outer(*input_table, col_index)); } CATCH_STD(env, 0); } @@ -3497,10 +3448,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuterPosition(JNIE JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input_table = reinterpret_cast(input_jtable); - cudf::size_type col_index = static_cast(column_index); - std::unique_ptr exploded = cudf::explode_outer_position(*input_table, col_index); - return cudf::jni::convert_table_for_return(env, exploded); + auto const input_table = reinterpret_cast(input_jtable); + auto const col_index = static_cast(column_index); + return convert_table_for_return(env, cudf::explode_outer_position(*input_table, col_index)); } CATCH_STD(env, 0); } @@ -3509,8 +3459,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_rowBitCount(JNIEnv *env, jclas JNI_NULL_CHECK(env, j_table, "table is null", 0); try { cudf::jni::auto_set_device(env); - auto t = reinterpret_cast(j_table); - return release_as_jlong(cudf::row_bit_count(*t)); + auto const input_table = reinterpret_cast(j_table); + return 
release_as_jlong(cudf::row_bit_count(*input_table)); } CATCH_STD(env, 0); } @@ -3528,7 +3478,7 @@ JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups( try { cudf::jni::auto_set_device(env); cudf::jni::native_jintArray n_key_indices(env, jkey_indices); - cudf::table_view *input_table = reinterpret_cast(jinput_table); + auto const input_table = reinterpret_cast(jinput_table); // Prepares arguments for the groupby: // (keys, null_handling, keys_are_sorted, column_order, null_precedence) @@ -3622,11 +3572,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_sample(JNIEnv *env, jclas JNI_NULL_CHECK(env, j_input, "input table is null", 0); try { cudf::jni::auto_set_device(env); - cudf::table_view *input = reinterpret_cast(j_input); + auto const input = reinterpret_cast(j_input); auto sample_with_replacement = replacement ? cudf::sample_with_replacement::TRUE : cudf::sample_with_replacement::FALSE; - std::unique_ptr result = cudf::sample(*input, n, sample_with_replacement, seed); - return cudf::jni::convert_table_for_return(env, result); + return convert_table_for_return(env, cudf::sample(*input, n, sample_with_replacement, seed)); } CATCH_STD(env, 0); } diff --git a/java/src/main/native/src/cudf_jni_apis.hpp b/java/src/main/native/src/cudf_jni_apis.hpp index fbcca0c82ee..12fd45b831a 100644 --- a/java/src/main/native/src/cudf_jni_apis.hpp +++ b/java/src/main/native/src/cudf_jni_apis.hpp @@ -23,7 +23,28 @@ namespace cudf { namespace jni { -jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result); +/** + * @brief Detach all columns from the specified table, and pointers to them as an array. + * + * This function takes a table (presumably returned by some operation), and turns it into an + * array of column* (as jlongs). + * The lifetime of the columns is decoupled from that of the table, and is managed by the caller. + * + * @param env The JNI environment + * @param table_result the table to convert for return + * @param extra_columns columns not in the table that will be appended to the result. + */ +jlongArray +convert_table_for_return(JNIEnv *env, std::unique_ptr &table_result, + std::vector> &&extra_columns = {}); + +/** + * @copydoc convert_table_for_return(JNIEnv*, std::unique_ptr&, + * std::vector>&&) + */ +jlongArray +convert_table_for_return(JNIEnv *env, std::unique_ptr &&table_result, + std::vector> &&extra_columns = {}); // // ContiguousTable APIs diff --git a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java index a26dbec4907..7b476c31b95 100644 --- a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -39,21 +40,34 @@ public void testCreateColumnVectorBuilder() { @Test public void testArrayAllocation() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { - assertFalse(byteColumnVector.hasNulls()); - assertEquals(byteColumnVector.getByte(0), 2); - assertEquals(byteColumnVector.getByte(1), 3); - assertEquals(byteColumnVector.getByte(2), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(cv.getByte(0), 2); + assertEquals(cv.getByte(1), 3); + assertEquals(cv.getByte(2), 5); + }; + try (HostColumnVector bcv = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(true, new byte[]{2, 3, 5})) { + verify.accept(bcv); } } @Test public void testUnsignedArrayAllocation() { - try (HostColumnVector v = HostColumnVector.fromUnsignedBytes(new byte[]{(byte)0xff, (byte)128, 5})) { - assertFalse(v.hasNulls()); - assertEquals(0xff, Byte.toUnsignedInt(v.getByte(0)), 0xff); - assertEquals(128, Byte.toUnsignedInt(v.getByte(1)), 128); - assertEquals(5, Byte.toUnsignedInt(v.getByte(2)), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(0xff, Byte.toUnsignedInt(cv.getByte(0)), 0xff); + assertEquals(128, Byte.toUnsignedInt(cv.getByte(1)), 128); + assertEquals(5, Byte.toUnsignedInt(cv.getByte(2)), 5); + }; + try (HostColumnVector bcv = HostColumnVector.fromUnsignedBytes(new byte[]{(byte)0xff, (byte)128, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(false, + new byte[]{(byte)0xff, (byte)128, 5})) { + verify.accept(bcv); } } @@ -70,47 +84,73 @@ public void testAppendRepeatingValues() { @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { - assertThrows(AssertionError.class, () -> byteColumnVector.getByte(3)); - assertFalse(byteColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getByte(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector bcv = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(true, new byte[]{2, 3, 5})) { + verify.accept(bcv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { - assertFalse(byteColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> byteColumnVector.getByte(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getByte(-1)); + }; + try (HostColumnVector bcv = HostColumnVector.fromBytes(new byte[]{2, 3, 5})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBytes(true, new byte[]{2, 3, 5})) { + verify.accept(bcv); } } @Test public void testAddingNullValues() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBoxedBytes( - new Byte[]{2, 3, 4, 5, 6, 7, null, null})) { - assertTrue(byteColumnVector.hasNulls()); - assertEquals(2, byteColumnVector.getNullCount()); + Consumer verify = (cv) -> { + assertTrue(cv.hasNulls()); + assertEquals(2, cv.getNullCount()); for (int i 
= 0; i < 6; i++) { - assertFalse(byteColumnVector.isNull(i)); + assertFalse(cv.isNull(i)); } - assertTrue(byteColumnVector.isNull(6)); - assertTrue(byteColumnVector.isNull(7)); + assertTrue(cv.isNull(6)); + assertTrue(cv.isNull(7)); + }; + try (HostColumnVector bcv = HostColumnVector.fromBoxedBytes( + new Byte[]{2, 3, 4, 5, 6, 7, null, null})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBoxedBytes(true, + new Byte[]{2, 3, 4, 5, 6, 7, null, null})) { + verify.accept(bcv); } } @Test public void testAddingUnsignedNullValues() { - try (HostColumnVector byteColumnVector = HostColumnVector.fromBoxedUnsignedBytes( - new Byte[]{2, 3, 4, 5, (byte)128, (byte)254, null, null})) { - assertTrue(byteColumnVector.hasNulls()); - assertEquals(2, byteColumnVector.getNullCount()); + Consumer verify = (cv) -> { + assertTrue(cv.hasNulls()); + assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { - assertFalse(byteColumnVector.isNull(i)); + assertFalse(cv.isNull(i)); } - assertEquals(128, Byte.toUnsignedInt(byteColumnVector.getByte(4))); - assertEquals(254, Byte.toUnsignedInt(byteColumnVector.getByte(5))); - assertTrue(byteColumnVector.isNull(6)); - assertTrue(byteColumnVector.isNull(7)); + assertEquals(128, Byte.toUnsignedInt(cv.getByte(4))); + assertEquals(254, Byte.toUnsignedInt(cv.getByte(5))); + assertTrue(cv.isNull(6)); + assertTrue(cv.isNull(7)); + }; + try (HostColumnVector bcv = HostColumnVector.fromBoxedUnsignedBytes( + new Byte[]{2, 3, 4, 5, (byte)128, (byte)254, null, null})) { + verify.accept(bcv); + } + try (HostColumnVector bcv = ColumnBuilderHelper.fromBoxedBytes(false, + new Byte[]{2, 3, 4, 5, (byte)128, (byte)254, null, null})) { + verify.accept(bcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnBuilderHelper.java b/java/src/test/java/ai/rapids/cudf/ColumnBuilderHelper.java new file mode 100644 index 00000000000..263244b2413 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/ColumnBuilderHelper.java @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Objects; +import java.util.function.Consumer; + +/** + * ColumnBuilderHelper helps to test ColumnBuilder with existed ColumnVector tests. 
+ */ +public class ColumnBuilderHelper { + + public static HostColumnVector build( + HostColumnVector.DataType type, + int rows, + Consumer init) { + try (HostColumnVector.ColumnBuilder b = new HostColumnVector.ColumnBuilder(type, rows)) { + init.accept(b); + return b.build(); + } + } + + public static ColumnVector buildOnDevice( + HostColumnVector.DataType type, + int rows, + Consumer init) { + try (HostColumnVector.ColumnBuilder b = new HostColumnVector.ColumnBuilder(type, rows)) { + init.accept(b); + return b.buildAndPutOnDevice(); + } + } + + public static HostColumnVector fromBoxedBytes(boolean signed, Byte... values) { + DType dt = signed ? DType.INT8 : DType.UINT8; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, dt), + values.length, + (b) -> { + for (Byte v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBoxedDoubles(Double... values) { + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, DType.FLOAT64), + values.length, + (b) -> { + for (Double v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBoxedInts(boolean signed, Integer... values) { + DType dt = signed ? DType.INT32 : DType.UINT32; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, dt), + values.length, + (b) -> { + for (Integer v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBoxedLongs(boolean signed, Long... values) { + DType dt = signed ? DType.INT64 : DType.UINT64; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, dt), + values.length, + (b) -> { + for (Long v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromBytes(boolean signed, byte... values) { + DType dt = signed ? DType.INT8 : DType.UINT8; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, dt), + values.length, + (b) -> { + for (byte v : values) b.append(v); + }); + } + + public static HostColumnVector fromDecimals(BigDecimal... values) { + // Simply copy from HostColumnVector.fromDecimals + BigDecimal maxDec = Arrays.stream(values).filter(Objects::nonNull) + .max(Comparator.comparingInt(BigDecimal::precision)) + .orElse(BigDecimal.ZERO); + int maxScale = Arrays.stream(values).filter(Objects::nonNull) + .map(decimal -> decimal.scale()) + .max(Comparator.naturalOrder()) + .orElse(0); + maxDec = maxDec.setScale(maxScale, RoundingMode.UNNECESSARY); + + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(true, DType.fromJavaBigDecimal(maxDec)), + values.length, + (b) -> { + for (BigDecimal v : values) + if (v == null) b.appendNull(); + else b.append(v); + }); + } + + public static HostColumnVector fromDoubles(double... values) { + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, DType.FLOAT64), + values.length, + (b) -> { + for (double v : values) b.append(v); + }); + } + + public static HostColumnVector fromInts(boolean signed, int... values) { + DType dt = signed ? DType.INT32 : DType.UINT32; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, dt), + values.length, + (b) -> { + for (int v : values) b.append(v); + }); + } + + public static HostColumnVector fromLongs(boolean signed, long... values) { + DType dt = signed ? 
DType.INT64 : DType.UINT64; + return ColumnBuilderHelper.build( + new HostColumnVector.BasicType(false, dt), + values.length, + (b) -> { + for (long v : values) b.append(v); + }); + } +} diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 8d4bbff1542..8f39c3c51ce 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3559,7 +3559,6 @@ void testCastDecimal64ToString() { for (int scale : new int[]{-5, -2, -1, 0, 1, 2, 5}) { for (int i = 0; i < strDecimalValues.length; i++) { strDecimalValues[i] = dumpDecimal(unScaledValues[i], scale); - System.out.println(strDecimalValues[i]); } testCastFixedWidthToStringsAndBack(DType.create(DType.DTypeEnum.DECIMAL64, scale), @@ -4703,13 +4702,21 @@ void testStringSplitRecord() { @Test void testStringSplit() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", null, "", "ARé some", "test strings"); - Table expected = new Table.TestBuilder().column("Héllo", "thésé", null, "", "ARé", "test") + try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", "ARé some things", "test strings here"); + Table expectedSplitOnce = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") + .column("there all", null, null, null, "some things", "strings here") + .build(); + Table expectedSplitAll = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") .column("there", null, null, null, "some", "strings") + .column("all", null, null, null, "things", "here") .build(); Scalar pattern = Scalar.fromString(" "); - Table result = v.stringSplit(pattern)) { - assertTablesAreEqual(expected, result); + Table resultSplitOnce = v.stringSplit(pattern, 1); + Table resultSplitAll = v.stringSplit(pattern)) { + assertTablesAreEqual(expectedSplitOnce, resultSplitOnce); + assertTablesAreEqual(expectedSplitAll, resultSplitAll); } } diff --git a/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java index c2772520f57..994066c5df0 100644 --- a/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/DecimalColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,10 +22,12 @@ import org.junit.jupiter.api.Test; import java.math.BigDecimal; +import java.math.BigInteger; import java.math.RoundingMode; import java.util.Arrays; import java.util.Objects; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.*; @@ -33,9 +35,11 @@ public class DecimalColumnVectorTest extends CudfTestBase { private static final Random rdSeed = new Random(1234); private static final int dec32Scale = 4; private static final int dec64Scale = 10; + private static final int dec128Scale = 30; private static final BigDecimal[] decimal32Zoo = new BigDecimal[20]; private static final BigDecimal[] decimal64Zoo = new BigDecimal[20]; + private static final BigDecimal[] decimal128Zoo = new BigDecimal[20]; private static final int[] unscaledDec32Zoo = new int[decimal32Zoo.length]; private static final long[] unscaledDec64Zoo = new long[decimal64Zoo.length]; @@ -45,6 +49,9 @@ public class DecimalColumnVectorTest extends CudfTestBase { private final BigDecimal[] boundaryDecimal64 = new BigDecimal[]{ new BigDecimal("999999999999999999"), new BigDecimal("-999999999999999999")}; + private final BigDecimal[] boundaryDecimal128 = new BigDecimal[]{ + new BigDecimal("99999999999999999999999999999999999999"), new BigDecimal("-99999999999999999999999999999999999999")}; + private final BigDecimal[] overflowDecimal32 = new BigDecimal[]{ BigDecimal.valueOf(Integer.MAX_VALUE), BigDecimal.valueOf(Integer.MIN_VALUE)}; @@ -72,6 +79,12 @@ public static void setup() { } else { decimal64Zoo[i] = null; } + if (rdSeed.nextBoolean()) { + BigInteger unscaledVal = BigInteger.valueOf(rdSeed.nextLong()).multiply(BigInteger.valueOf(rdSeed.nextLong())); + decimal128Zoo[i] = new BigDecimal(unscaledVal, dec128Scale); + } else { + decimal128Zoo[i] = null; + } } } @@ -190,27 +203,44 @@ public void testDecimalGeneral() { @Test public void testDecimalFromDecimals() { - DecimalColumnVectorTest.testDecimalImpl(false, dec32Scale, decimal32Zoo); - DecimalColumnVectorTest.testDecimalImpl(true, dec64Scale, decimal64Zoo); - DecimalColumnVectorTest.testDecimalImpl(false, 0, boundaryDecimal32); - DecimalColumnVectorTest.testDecimalImpl(true, 0, boundaryDecimal64); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL32, dec32Scale, decimal32Zoo); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL64, dec64Scale, decimal64Zoo); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL128, dec128Scale, decimal128Zoo); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL32, 0, boundaryDecimal32); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL64, 0, boundaryDecimal64); + DecimalColumnVectorTest.testDecimalImpl(DType.DTypeEnum.DECIMAL128, 0, boundaryDecimal128); } - private static void testDecimalImpl(boolean isInt64, int scale, BigDecimal[] decimalZoo) { - try (ColumnVector cv = ColumnVector.fromDecimals(decimalZoo)) { - try (HostColumnVector hcv = cv.copyToHost()) { - assertEquals(-scale, hcv.getType().getScale()); - assertEquals(isInt64, hcv.getType().typeId == DType.DTypeEnum.DECIMAL64); - assertEquals(decimalZoo.length, hcv.rows); - for (int i = 0; i < decimalZoo.length; i++) { - assertEquals(decimalZoo[i] == null, hcv.isNull(i)); - if (decimalZoo[i] != null) { - assertEquals(decimalZoo[i].floatValue(), hcv.getBigDecimal(i).floatValue()); - long backValue = isInt64 ? 
hcv.getLong(i) : hcv.getInt(i); - assertEquals(decimalZoo[i].setScale(scale, RoundingMode.UNNECESSARY), BigDecimal.valueOf(backValue, scale)); + private static void testDecimalImpl(DType.DTypeEnum decimalType, int scale, BigDecimal[] decimalZoo) { + Consumer assertions = (hcv) -> { + assertEquals(-scale, hcv.getType().getScale()); + assertEquals(hcv.getType().typeId, decimalType); + assertEquals(decimalZoo.length, hcv.rows); + for (int i = 0; i < decimalZoo.length; i++) { + assertEquals(decimalZoo[i] == null, hcv.isNull(i)); + if (decimalZoo[i] != null) { + BigDecimal actual; + switch (decimalType) { + case DECIMAL32: + actual = BigDecimal.valueOf(hcv.getInt(i), scale); + break; + case DECIMAL64: + actual = BigDecimal.valueOf(hcv.getLong(i), scale); + break; + default: + actual = hcv.getBigDecimal(i); } + assertEquals(decimalZoo[i].subtract(actual).longValueExact(), 0L); } } + }; + try (ColumnVector cv = ColumnVector.fromDecimals(decimalZoo)) { + try (HostColumnVector hcv = cv.copyToHost()) { + assertions.accept(hcv); + } + } + try (HostColumnVector hcv = ColumnBuilderHelper.fromDecimals(decimalZoo)) { + assertions.accept(hcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java index d82565e1d2d..fa34429685e 100644 --- a/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/DoubleColumnVectorTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -40,34 +41,51 @@ public void testCreateColumnVectorBuilder() { @Test public void testArrayAllocation() { - try (HostColumnVector doubleColumnVector = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { - assertFalse(doubleColumnVector.hasNulls()); - assertEqualsWithinPercentage(doubleColumnVector.getDouble(0), 2.1, 0.01); - assertEqualsWithinPercentage(doubleColumnVector.getDouble(1), 3.02, 0.01); - assertEqualsWithinPercentage(doubleColumnVector.getDouble(2), 5.003, 0.001); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEqualsWithinPercentage(cv.getDouble(0), 2.1, 0.01); + assertEqualsWithinPercentage(cv.getDouble(1), 3.02, 0.01); + assertEqualsWithinPercentage(cv.getDouble(2), 5.003, 0.001); + }; + try (HostColumnVector dcv = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); } } @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector doubleColumnVector = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { - assertThrows(AssertionError.class, () -> doubleColumnVector.getDouble(3)); - assertFalse(doubleColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getDouble(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector dcv = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); } } @Test public void 
testLowerIndexOutOfBoundsException() { - try (HostColumnVector doubleColumnVector = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { - assertFalse(doubleColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> doubleColumnVector.getDouble(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getDouble(-1)); + }; + try (HostColumnVector dcv = HostColumnVector.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromDoubles(2.1, 3.02, 5.003)) { + verify.accept(dcv); } } @Test public void testAddingNullValues() { - try (HostColumnVector cv = - HostColumnVector.fromBoxedDoubles(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -75,6 +93,14 @@ public void testAddingNullValues() { } assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector dcv = + HostColumnVector.fromBoxedDoubles(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, null, null)) { + verify.accept(dcv); + } + try (HostColumnVector dcv = ColumnBuilderHelper.fromBoxedDoubles( + 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, null, null)) { + verify.accept(dcv); } } diff --git a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java index 2fb8164534b..7d6311fb24c 100644 --- a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -34,47 +35,75 @@ public void testCreateColumnVectorBuilder() { try (ColumnVector intColumnVector = ColumnVector.build(DType.INT32, 3, (b) -> b.append(1))) { assertFalse(intColumnVector.hasNulls()); } + try (ColumnVector intColumnVector = ColumnBuilderHelper.buildOnDevice( + new HostColumnVector.BasicType(true, DType.INT32), 3, (b) -> b.append(1))) { + assertFalse(intColumnVector.hasNulls()); + } } @Test public void testArrayAllocation() { - try (HostColumnVector intColumnVector = HostColumnVector.fromInts(2, 3, 5)) { - assertFalse(intColumnVector.hasNulls()); - assertEquals(intColumnVector.getInt(0), 2); - assertEquals(intColumnVector.getInt(1), 3); - assertEquals(intColumnVector.getInt(2), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(cv.getInt(0), 2); + assertEquals(cv.getInt(1), 3); + assertEquals(cv.getInt(2), 5); + }; + try (HostColumnVector cv = HostColumnVector.fromInts(2, 3, 5)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromInts(true, 2, 3, 5)) { + verify.accept(cv); } } @Test public void testUnsignedArrayAllocation() { - try (HostColumnVector v = HostColumnVector.fromUnsignedInts(0xfedcba98, 0x80000000, 5)) { - assertFalse(v.hasNulls()); - assertEquals(0xfedcba98L, Integer.toUnsignedLong(v.getInt(0))); - assertEquals(0x80000000L, Integer.toUnsignedLong(v.getInt(1))); - assertEquals(5, Integer.toUnsignedLong(v.getInt(2))); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(0xfedcba98L, Integer.toUnsignedLong(cv.getInt(0))); + assertEquals(0x80000000L, Integer.toUnsignedLong(cv.getInt(1))); + assertEquals(5, Integer.toUnsignedLong(cv.getInt(2))); + }; + try (HostColumnVector cv = HostColumnVector.fromUnsignedInts(0xfedcba98, 0x80000000, 5)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromInts(false, 0xfedcba98, 0x80000000, 5)) { + verify.accept(cv); } } @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector intColumnVector = HostColumnVector.fromInts(2, 3, 5)) { - assertThrows(AssertionError.class, () -> intColumnVector.getInt(3)); - assertFalse(intColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getInt(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector icv = HostColumnVector.fromInts(2, 3, 5)) { + verify.accept(icv); + } + try (HostColumnVector icv = ColumnBuilderHelper.fromInts(true, 2, 3, 5)) { + verify.accept(icv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector intColumnVector = HostColumnVector.fromInts(2, 3, 5)) { - assertFalse(intColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> intColumnVector.getInt(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getInt(-1)); + }; + try (HostColumnVector icv = HostColumnVector.fromInts(2, 3, 5)) { + verify.accept(icv); + } + try (HostColumnVector icv = ColumnBuilderHelper.fromInts(true, 2, 3, 5)) { + verify.accept(icv); } } @Test public void testAddingNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedInts(2, 3, 4, 5, 6, 7, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) 
{ @@ -82,13 +111,18 @@ public void testAddingNullValues() { } assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector cv = HostColumnVector.fromBoxedInts(2, 3, 4, 5, 6, 7, null, null)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromBoxedInts(true, 2, 3, 4, 5, 6, 7, null, null)) { + verify.accept(cv); } } @Test public void testAddingUnsignedNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedUnsignedInts( - 2, 3, 4, 5, 0xfedbca98, 0x80000000, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -98,6 +132,14 @@ public void testAddingUnsignedNullValues() { assertEquals(0x80000000L, Integer.toUnsignedLong(cv.getInt(5))); assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector cv = HostColumnVector.fromBoxedUnsignedInts( + 2, 3, 4, 5, 0xfedbca98, 0x80000000, null, null)) { + verify.accept(cv); + } + try (HostColumnVector cv = ColumnBuilderHelper.fromBoxedInts(false, + 2, 3, 4, 5, 0xfedbca98, 0x80000000, null, null)) { + verify.accept(cv); } } diff --git a/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java index 43c2b5a99c2..193992f5304 100644 --- a/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/LongColumnVectorTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ import org.junit.jupiter.api.Test; import java.util.Random; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -38,46 +39,71 @@ public void testCreateColumnVectorBuilder() { @Test public void testArrayAllocation() { - try (HostColumnVector longColumnVector = HostColumnVector.fromLongs(2L, 3L, 5L)) { - assertFalse(longColumnVector.hasNulls()); - assertEquals(longColumnVector.getLong(0), 2); - assertEquals(longColumnVector.getLong(1), 3); - assertEquals(longColumnVector.getLong(2), 5); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertEquals(cv.getLong(0), 2); + assertEquals(cv.getLong(1), 3); + assertEquals(cv.getLong(2), 5); + }; + try (HostColumnVector lcv = HostColumnVector.fromLongs(2L, 3L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(true,2L, 3L, 5L)) { + verify.accept(lcv); } } @Test public void testUnsignedArrayAllocation() { - try (HostColumnVector longColumnVector = HostColumnVector.fromUnsignedLongs( - 0xfedcba9876543210L, 0x8000000000000000L, 5L)) { - assertFalse(longColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); assertEquals(Long.toUnsignedString(0xfedcba9876543210L), - Long.toUnsignedString(longColumnVector.getLong(0))); + Long.toUnsignedString(cv.getLong(0))); assertEquals(Long.toUnsignedString(0x8000000000000000L), - Long.toUnsignedString(longColumnVector.getLong(1))); - assertEquals(5L, longColumnVector.getLong(2)); + Long.toUnsignedString(cv.getLong(1))); + assertEquals(5L, cv.getLong(2)); + }; + try (HostColumnVector lcv = HostColumnVector.fromUnsignedLongs( + 0xfedcba9876543210L, 0x8000000000000000L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = 
ColumnBuilderHelper.fromLongs(false, + 0xfedcba9876543210L, 0x8000000000000000L, 5L)) { + verify.accept(lcv); } } @Test public void testUpperIndexOutOfBoundsException() { - try (HostColumnVector longColumnVector = HostColumnVector.fromLongs(2L, 3L, 5L)) { - assertThrows(AssertionError.class, () -> longColumnVector.getLong(3)); - assertFalse(longColumnVector.hasNulls()); + Consumer verify = (cv) -> { + assertThrows(AssertionError.class, () -> cv.getLong(3)); + assertFalse(cv.hasNulls()); + }; + try (HostColumnVector lcv = HostColumnVector.fromLongs(2L, 3L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(true, 2L, 3L, 5L)) { + verify.accept(lcv); } } @Test public void testLowerIndexOutOfBoundsException() { - try (HostColumnVector longColumnVector = HostColumnVector.fromLongs(2L, 3L, 5L)) { - assertFalse(longColumnVector.hasNulls()); - assertThrows(AssertionError.class, () -> longColumnVector.getLong(-1)); + Consumer verify = (cv) -> { + assertFalse(cv.hasNulls()); + assertThrows(AssertionError.class, () -> cv.getLong(-1)); + }; + try (HostColumnVector lcv = HostColumnVector.fromLongs(2L, 3L, 5L)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromLongs(true, 2L, 3L, 5L)) { + verify.accept(lcv); } } @Test public void testAddingNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedLongs(2L, 3L, 4L, 5L, 6L, 7L, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -85,13 +111,19 @@ public void testAddingNullValues() { } assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector lcv = HostColumnVector.fromBoxedLongs(2L, 3L, 4L, 5L, 6L, 7L, null, null)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromBoxedLongs(true, + 2L, 3L, 4L, 5L, 6L, 7L, null, null)) { + verify.accept(lcv); } } @Test public void testAddingUnsignedNullValues() { - try (HostColumnVector cv = HostColumnVector.fromBoxedUnsignedLongs( - 2L, 3L, 4L, 5L, 0xfedcba9876543210L, 0x8000000000000000L, null, null)) { + Consumer verify = (cv) -> { assertTrue(cv.hasNulls()); assertEquals(2, cv.getNullCount()); for (int i = 0; i < 6; i++) { @@ -103,6 +135,14 @@ public void testAddingUnsignedNullValues() { Long.toUnsignedString(cv.getLong(5))); assertTrue(cv.isNull(6)); assertTrue(cv.isNull(7)); + }; + try (HostColumnVector lcv = HostColumnVector.fromBoxedUnsignedLongs( + 2L, 3L, 4L, 5L, 0xfedcba9876543210L, 0x8000000000000000L, null, null)) { + verify.accept(lcv); + } + try (HostColumnVector lcv = ColumnBuilderHelper.fromBoxedLongs(false, + 2L, 3L, 4L, 5L, 0xfedcba9876543210L, 0x8000000000000000L, null, null)) { + verify.accept(lcv); } } diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index e5a8bb926c1..0c2f971a26c 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -37,7 +37,6 @@ cdef extern from "cudf/io/orc.hpp" \ void enable_use_np_dtypes(bool val) except+ void set_timestamp_type(data_type type) except+ void set_decimal_cols_as_float(vector[string] val) except+ - void enable_decimal128(bool val) except+ @staticmethod orc_reader_options_builder builder( @@ -59,7 +58,6 @@ cdef extern from "cudf/io/orc.hpp" \ orc_reader_options_builder& decimal_cols_as_float( vector[string] val ) except+ - orc_reader_options_builder& decimal128(bool val) except+ orc_reader_options build() except+ diff --git 
a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index 5b81d369ef5..897b61f8001 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -33,11 +33,13 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ column_view boolean_mask ) except + - cdef unique_ptr[table] drop_duplicates(table_view source_table, - vector[size_type] keys, - duplicate_keep_option keep, - null_equality nulls_equal) except + - - cdef size_type distinct_count(column_view source_table, - null_policy null_handling, - nan_policy nan_handling) except + + cdef unique_ptr[table] drop_duplicates( + table_view source_table, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal) except + + + cdef size_type unordered_distinct_count( + column_view source_table, + null_policy null_handling, + nan_policy nan_handling) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/findall.pxd b/python/cudf/cudf/_lib/cpp/strings/findall.pxd index 189d0770b81..5533467d72a 100644 --- a/python/cudf/cudf/_lib/cpp/strings/findall.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/findall.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -10,6 +10,10 @@ from cudf._lib.cpp.table.table cimport table cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil: - cdef unique_ptr[table] findall_re( - column_view source_strings, - string pattern) except + + cdef unique_ptr[table] findall( + const column_view& source_strings, + const string& pattern) except + + + cdef unique_ptr[column] findall_record( + const column_view& source_strings, + const string& pattern) except + diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 4330c565982..c4f885382f3 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import pandas as pd @@ -11,10 +11,10 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, - distinct_count as cpp_distinct_count, drop_duplicates as cpp_drop_duplicates, drop_nulls as cpp_drop_nulls, duplicate_keep_option, + unordered_distinct_count as cpp_unordered_distinct_count, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -190,7 +190,7 @@ def distinct_count(Column source_column, ignore_nulls=True, nan_as_null=False): cdef column_view source_column_view = source_column.view() with nogil: - count = cpp_distinct_count( + count = cpp_unordered_distinct_count( source_column_view, cpp_null_handling, cpp_nan_handling diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index 80af18e7c71..b17988018a6 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -8,7 +8,10 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport string_scalar -from cudf._lib.cpp.strings.findall cimport findall_re as cpp_findall_re +from cudf._lib.cpp.strings.findall cimport ( + findall as cpp_findall, + findall_record as cpp_findall_record, +) from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport data_from_unique_ptr @@ -25,7 +28,7 @@ def findall(Column source_strings, pattern): cdef string pattern_string = str(pattern).encode() with nogil: - c_result = move(cpp_findall_re( + c_result = move(cpp_findall( source_view, pattern_string )) @@ -34,3 +37,22 @@ def findall(Column source_strings, pattern): move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) + + +def findall_record(Column source_strings, pattern): + """ + Returns data with all non-overlapping matches of `pattern` + in each string of `source_strings` as a lists column. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + cdef string pattern_string = str(pattern).encode() + + with nogil: + c_result = move(cpp_findall_record( + source_view, + pattern_string + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index b1335c7c076..6569184e90b 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -569,17 +569,6 @@ def to_dlpack(self): return cudf.io.dlpack.to_dlpack(self) - @property - def gpu_values(self): - """ - View the data as a numba device array object - """ - warnings.warn( - "The gpu_values property is deprecated and will be removed.", - FutureWarning, - ) - return self._values.data_array_view - def append(self, other): """ Append a collection of Index options together. @@ -1254,10 +1243,6 @@ def astype(self, dtype, copy=False): self.copy(deep=copy)._values.astype(dtype), name=self.name ) - # TODO: This method is deprecated and can be removed. - def to_array(self, fillna=None): - return self._values.to_array(fillna=fillna) - def to_series(self, index=None, name=None): """ Create a Series with both index and values equal to the index keys. @@ -1536,14 +1521,6 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None): "`allow_fill` and `fill_value` are unsupported." ) - indices = cudf.core.column.as_column(indices) - if is_bool_dtype(indices): - warnings.warn( - "Calling take with a boolean array is deprecated and will be " - "removed in the future.", - FutureWarning, - ) - return self._apply_boolean_mask(indices) return self._gather(indices) def _apply_boolean_mask(self, boolean_mask): diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index de06e62cbb1..24f9dc83ca9 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -46,6 +46,9 @@ ) +_DEFAULT_CATEGORICAL_VALUE = -1 + + class CategoricalAccessor(ColumnMethods): """ Accessor object for categorical properties of the Series values. 
@@ -946,7 +949,11 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> pd.Series: col = self signed_dtype = min_signed_type(len(col.categories)) - codes = col.codes.astype(signed_dtype).fillna(-1).to_array() + codes = ( + col.codes.astype(signed_dtype) + .fillna(_DEFAULT_CATEGORICAL_VALUE) + .values_host + ) if is_interval_dtype(col.categories.dtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. @@ -1015,13 +1022,10 @@ def _encode(self, value) -> ScalarLike: return self.categories.find_first_value(value) def _decode(self, value: int) -> ScalarLike: - if value == self._default_na_value(): + if value == _DEFAULT_CATEGORICAL_VALUE: return None return self.categories.element_indexing(value) - def _default_na_value(self) -> ScalarLike: - return -1 - def find_and_replace( self, to_replace: ColumnLike, @@ -1178,7 +1182,7 @@ def fillna( fill_is_scalar = np.isscalar(fill_value) if fill_is_scalar: - if fill_value == self._default_na_value(): + if fill_value == _DEFAULT_CATEGORICAL_VALUE: fill_value = self.codes.dtype.type(fill_value) else: try: @@ -1578,7 +1582,7 @@ def _create_empty_categorical_column( categories=column.as_column(dtype.categories), codes=column.as_column( cudf.utils.utils.scalar_broadcast_to( - categorical_column._default_na_value(), + _DEFAULT_CATEGORICAL_VALUE, categorical_column.size, categorical_column.codes.dtype, ) @@ -1601,7 +1605,7 @@ def pandas_categorical_as_column( codes = categorical.codes if codes is None else codes codes = column.as_column(codes) - valid_codes = codes != codes.dtype.type(-1) + valid_codes = codes != codes.dtype.type(_DEFAULT_CATEGORICAL_VALUE) mask = None if not valid_codes.all(): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7999fa9039b..82641d83b07 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -314,51 +314,6 @@ def memory_usage(self) -> int: n += bitmask_allocation_size_bytes(self.size) return n - def _default_na_value(self) -> Any: - raise NotImplementedError() - - # TODO: This method is deprecated and can be removed when the associated - # Frame methods are removed. - def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": - """Get a dense numba device array for the data. - - Parameters - ---------- - fillna : scalar, 'pandas', or None - See *fillna* in ``.to_array``. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - """ - if fillna: - return self.fillna(self._default_na_value()).data_array_view - else: - return self.dropna(drop_nan=False).data_array_view - - # TODO: This method is deprecated and can be removed when the associated - # Frame methods are removed. - def to_array(self, fillna=None) -> np.ndarray: - """Get a dense numpy array for the data. - - Parameters - ---------- - fillna : scalar, 'pandas', or None - Defaults to None, which will skip null values. - If it equals "pandas", null values are filled with NaNs. - Non integral dtype is promoted to np.float64. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - """ - - return self.to_gpu_array(fillna=fillna).copy_to_host() - def _fill( self, fill_value: ScalarLike, @@ -1031,7 +986,7 @@ def __array__(self, dtype=None): raise TypeError( "Implicit conversion to a host NumPy array via __array__ is not " "allowed. 
To explicitly construct a host array, consider using " - ".to_array()" + ".to_numpy()" ) @property @@ -1316,6 +1271,12 @@ def column_empty( column_empty(row_count, field_dtype) for field_dtype in dtype.fields.values() ) + elif is_list_dtype(dtype): + data = None + children = ( + full(row_count + 1, 0, dtype="int32"), + column_empty(row_count, dtype=dtype.element_type), + ) elif is_categorical_dtype(dtype): data = None children = ( diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b763790986a..c72fb66addc 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -199,7 +199,7 @@ def to_pandas( # Pandas supports only `datetime64[ns]`, hence the cast. return pd.Series( - self.astype("datetime64[ns]").to_array("NAT"), + self.astype("datetime64[ns]").fillna("NaT").values_host, copy=False, index=index, ) @@ -346,10 +346,6 @@ def as_string_column( column.column_empty(0, dtype="object", masked=False), ) - def _default_na_value(self) -> DatetimeLikeScalar: - """Returns the default NA value for this column""" - return np.datetime64("nat", self.time_unit) - def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: return pd.Timestamp( self.as_numerical.mean(skipna=skipna, dtype=dtype), @@ -488,15 +484,6 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: return False -def binop_offset(lhs, rhs, op): - if rhs._is_no_op: - return lhs - else: - rhs = rhs._generate_column(len(lhs), op) - out = libcudf.datetime.add_months(lhs, rhs) - return out - - def infer_format(element: str, **kwargs) -> str: """ Infers datetime format from a string, also takes cares for `ms` and `ns` diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a7481ce62a3..9b54c4d9acd 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -355,20 +355,6 @@ def _process_for_reduction( skipna=skipna, min_count=min_count ) - def _default_na_value(self) -> ScalarLike: - """Returns the default NA value for this column""" - dkind = self.dtype.kind - if dkind == "f": - return self.dtype.type(np.nan) - elif dkind == "i": - return np.iinfo(self.dtype).min - elif dkind == "u": - return np.iinfo(self.dtype).max - elif dkind == "b": - return self.dtype.type(False) - else: - raise TypeError(f"numeric column of {self.dtype} has no NaN value") - def find_and_replace( self, to_replace: ColumnLike, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9b44b4e6831..6467fd39ddd 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5218,26 +5218,6 @@ def values(self) -> cupy.ndarray: """ raise TypeError("String Arrays is not yet implemented in cudf") - # TODO: This method is deprecated and should be removed when the associated - # Frame methods are removed. - def to_array(self, fillna: bool = None) -> np.ndarray: - """Get a dense numpy array for the data. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. 
- - Raises - ------ - ``NotImplementedError`` if there are nulls - """ - if fillna is not None: - warnings.warn("fillna parameter not supported for string arrays") - - return self.to_arrow().to_pandas().values - def to_pandas( self, index: pd.Index = None, nullable: bool = False, **kwargs ) -> "pd.Series": @@ -5402,9 +5382,6 @@ def normalize_binop_value(self, other) -> "column.ColumnBase": else: raise TypeError(f"cannot broadcast {type(other)}") - def _default_na_value(self) -> ScalarLike: - return None - def binary_operator( self, op: builtins.str, rhs, reflect: bool = False ) -> "column.ColumnBase": diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 4b7a3bcc197..6c8c904e13c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -12,13 +12,7 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import ( - BinaryOperand, - DatetimeLikeScalar, - Dtype, - DtypeObj, - ScalarLike, -) +from cudf._typing import BinaryOperand, DatetimeLikeScalar, Dtype, DtypeObj from cudf.api.types import is_scalar from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, column, string @@ -123,7 +117,8 @@ def to_pandas( # Pandas supports only `timedelta64[ns]`, hence the cast. pd_series = pd.Series( - self.astype("timedelta64[ns]").to_array("NAT"), copy=False + self.astype("timedelta64[ns]").fillna("NaT").values_host, + copy=False, ) if index is not None: @@ -304,10 +299,6 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": ), ) - def _default_na_value(self) -> ScalarLike: - """Returns the default NA value for this column""" - return np.timedelta64("nat", self.time_unit) - @property def time_unit(self) -> str: return self._time_unit diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index c2ea9d756f7..67976ac27d4 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
from __future__ import annotations @@ -523,14 +523,19 @@ def rename_column(x): raise IndexError( f"Too many levels: Index has only 1 level, not {level+1}" ) + if isinstance(mapper, Mapping): - new_names = ( + new_col_names = [ mapper.get(col_name, col_name) for col_name in self.keys() - ) + ] else: - new_names = (mapper(col_name) for col_name in self.keys()) + new_col_names = [mapper(col_name) for col_name in self.keys()] + + if len(new_col_names) != len(set(new_col_names)): + raise ValueError("Duplicate column names are not allowed") + ca = ColumnAccessor( - dict(zip(new_names, self.values())), + dict(zip(new_col_names, self.values())), level_names=self.level_names, multiindex=self.multiindex, ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e973468a322..0035a979760 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -17,7 +17,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from numba import cuda from nvtx import annotate from pandas._config import get_option from pandas.core.dtypes.common import is_float, is_integer @@ -64,6 +63,7 @@ from cudf.core.multiindex import MultiIndex from cudf.core.resample import DataFrameResampler from cudf.core.series import Series +from cudf.core.udf.row_function import _get_row_kernel from cudf.utils import applyutils, docutils, ioutils, queryutils, utils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -269,7 +269,9 @@ def _getitem_tuple_arg(self, arg): else: row_selection = as_column(arg[0]) if is_bool_dtype(row_selection.dtype): - df.index = self._frame.index.take(row_selection) + df.index = self._frame.index._apply_boolean_mask( + row_selection + ) else: df.index = as_index(row_selection) # Step 4: Downcast @@ -3096,124 +3098,6 @@ def add_suffix(self, suffix): ] return out - def as_gpu_matrix(self, columns=None, order="F"): - warnings.warn( - "The as_gpu_matrix method will be removed in a future cuDF " - "release. Consider using `to_cupy` instead.", - FutureWarning, - ) - if columns is None: - columns = self._data.names - - cols = [self._data[k] for k in columns] - ncol = len(cols) - nrow = len(self) - if ncol < 1: - # This is the case for empty dataframe - construct empty cupy array - matrix = cupy.empty( - shape=(0, 0), dtype=cudf.dtype("float64"), order=order - ) - return cuda.as_cuda_array(matrix) - - if any( - (is_categorical_dtype(c) or np.issubdtype(c, cudf.dtype("object"))) - for c in cols - ): - raise TypeError("non-numeric data not yet supported") - - dtype = find_common_type([col.dtype for col in cols]) - for k, c in self._data.items(): - if c.has_nulls(): - raise ValueError( - f"column '{k}' has null values. " - f"hint: use .fillna() to replace null values" - ) - cupy_dtype = dtype - if np.issubdtype(cupy_dtype, np.datetime64): - cupy_dtype = cudf.dtype("int64") - - if order not in ("F", "C"): - raise ValueError( - "order parameter should be 'C' for row major or 'F' for" - "column major GPU matrix" - ) - - matrix = cupy.empty(shape=(nrow, ncol), dtype=cupy_dtype, order=order) - for colidx, inpcol in enumerate(cols): - dense = inpcol.astype(cupy_dtype) - matrix[:, colidx] = cupy.asarray(dense) - return cuda.as_cuda_array(matrix).view(dtype) - - def as_matrix(self, columns=None): - warnings.warn( - "The as_matrix method will be removed in a future cuDF " - "release. 
Consider using `to_numpy` instead.", - FutureWarning, - ) - return self.as_gpu_matrix(columns=columns).copy_to_host() - - def label_encoding( - self, column, prefix, cats, prefix_sep="_", dtype=None, na_sentinel=-1 - ): - """Encode labels in a column with label encoding. - - Parameters - ---------- - column : str - the source column with binary encoding for the data. - prefix : str - the new column name prefix. - cats : sequence of ints - the sequence of categories as integers. - prefix_sep : str - the separator between the prefix and the category. - dtype : - the dtype for the outputs; see Series.label_encoding - na_sentinel : number - Value to indicate missing category. - - Returns - ------- - A new DataFrame with a new column appended for the coded values. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 10, 20]}) - >>> df - a b - 0 1 10 - 1 2 10 - 2 3 20 - >>> df.label_encoding(column="b", prefix="b_col", cats=[10, 20]) - a b b_col_labels - 0 1 10 0 - 1 2 10 0 - 2 3 20 1 - """ - - warnings.warn( - "DataFrame.label_encoding is deprecated and will be removed in " - "the future. Consider using cuML's LabelEncoder instead.", - FutureWarning, - ) - - return self._label_encoding( - column, prefix, cats, prefix_sep, dtype, na_sentinel - ) - - def _label_encoding( - self, column, prefix, cats, prefix_sep="_", dtype=None, na_sentinel=-1 - ): - # Private implementation of deprecated public label_encoding method - newname = prefix_sep.join([prefix, "labels"]) - newcol = self[column]._label_encoding( - cats=cats, dtype=dtype, na_sentinel=na_sentinel - ) - outdf = self.copy() - outdf.insert(len(outdf._data), newname, newcol) - return outdf - def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. @@ -4001,10 +3885,8 @@ def apply( raise ValueError("The `raw` kwarg is not yet supported.") if result_type is not None: raise ValueError("The `result_type` kwarg is not yet supported.") - if kwargs: - raise ValueError("UDFs using **kwargs are not yet supported.") - return self._apply(func, *args) + return self._apply(func, _get_row_kernel, *args, **kwargs) @applyutils.doc_apply() def apply_rows( @@ -5611,7 +5493,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): ) .fillna(np.nan) ) - arr = cupy.asarray(prepared.as_gpu_matrix()) + arr = prepared.to_cupy() if skipna is not False and method in _cupy_nan_methods_map: method = _cupy_nan_methods_map[method] @@ -6220,6 +6102,37 @@ def __dataframe__( self, nan_as_null=nan_as_null, allow_copy=allow_copy ) + def nunique(self, axis=0, dropna=True): + """ + Count number of distinct elements in specified axis. + Return Series with number of distinct elements. Can ignore NaN values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for + column-wise. + dropna : bool, default True + Don't include NaN in the counts. 
+ + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) + >>> df.nunique() + A 3 + B 2 + dtype: int64 + """ + if axis != 0: + raise NotImplementedError("axis parameter is not supported yet.") + + return cudf.Series(super().nunique(method="sort", dropna=dropna)) + def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 69dc5389e7a..7eabc39aa4b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2,6 +2,7 @@ from __future__ import annotations +import builtins import copy import pickle import warnings @@ -45,7 +46,6 @@ ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import Merge, MergeSemi -from cudf.core.udf.pipeline import compile_or_get, supported_cols_from_frame from cudf.core.window import Rolling from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring @@ -1367,39 +1367,6 @@ def _quantiles( result._copy_type_metadata(self) return result - @annotate("APPLY", color="purple", domain="cudf_python") - def _apply(self, func, *args): - """ - Apply `func` across the rows of the frame. - """ - kernel, retty = compile_or_get(self, func, args) - - # Mask and data column preallocated - ans_col = cupy.empty(len(self), dtype=retty) - ans_mask = cudf.core.column.column_empty(len(self), dtype="bool") - launch_args = [(ans_col, ans_mask), len(self)] - offsets = [] - - # if compile_or_get succeeds, it is safe to create a kernel that only - # consumes the columns that are of supported dtype - for col in supported_cols_from_frame(self).values(): - data = col.data - mask = col.mask - if mask is None: - launch_args.append(data) - else: - launch_args.append((data, mask)) - offsets.append(col.offset) - launch_args += offsets - launch_args += list(args) - kernel.forall(len(self))(*launch_args) - - col = as_column(ans_col) - col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) - result = cudf.Series._from_data({None: col}, self._index) - - return result - def rank( self, axis=0, @@ -6016,12 +5983,12 @@ def eq(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.eq(right) - a b c d + a b c d 0 True True 1 True True 2 True True >>> left.eq(right, fill_value=7) - a b c d + a b c d 0 True True True False 1 True True False False 2 True True False False @@ -6091,12 +6058,12 @@ def ne(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.ne(right) - a b c d + a b c d 0 False False 1 False False 2 False False >>> left.ne(right, fill_value=7) - a b c d + a b c d 0 False False False True 1 False False True True 2 False False True True @@ -6166,12 +6133,12 @@ def lt(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.lt(right) - a b c d + a b c d 0 False False 1 False False 2 False False >>> left.lt(right, fill_value=7) - a b c d + a b c d 0 False False False True 1 False False False True 2 False False False True @@ -6241,12 +6208,12 @@ def le(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... 
) >>> left.le(right) - a b c d + a b c d 0 True True 1 True True 2 True True >>> left.le(right, fill_value=7) - a b c d + a b c d 0 True True True True 1 True True False True 2 True True False True @@ -6316,12 +6283,12 @@ def gt(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.gt(right) - a b c d + a b c d 0 False False 1 False False 2 False False >>> left.gt(right, fill_value=7) - a b c d + a b c d 0 False False False False 1 False False True False 2 False False True False @@ -6391,12 +6358,12 @@ def ge(self, other, axis="columns", level=None, fill_value=None): ... 'd': [10, 12, 12]} ... ) >>> left.ge(right) - a b c d + a b c d 0 True True 1 True True 2 True True >>> left.ge(right, fill_value=7) - a b c d + a b c d 0 True True True False 1 True True True False 2 True True True False @@ -6436,6 +6403,28 @@ def ge(self, other, axis="columns", level=None, fill_value=None): other=other, fn="ge", fill_value=fill_value, can_reindex=True ) + def nunique(self, method: builtins.str = "sort", dropna: bool = True): + """ + Returns a per column mapping with counts of unique values for + each column. + + Parameters + ---------- + method : builtins.str, default "sort" + Method used by cpp_distinct_count + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + dict + Name and unique value counts of each column in frame. + """ + return { + name: col.distinct_count(method=method, dropna=dropna) + for name, col in self._data.items() + } + def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 91c7a740699..fc59d15e264 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -682,6 +682,7 @@ def _intersection(self, other, sort=False): return new_index def _gather(self, gather_map, nullify=False, check_bounds=True): + gather_map = cudf.core.column.as_column(gather_map) return Int64Index._from_columns( [self._values.take(gather_map, nullify, check_bounds)], [self.name] ) @@ -771,23 +772,6 @@ def __init__(self, data, **kwargs): name = kwargs.get("name") super().__init__({name: data}) - @classmethod - def deserialize(cls, header, frames): - if "index_column" in header: - warnings.warn( - "Index objects serialized in cudf version " - "21.10 or older will no longer be deserializable " - "after version 21.12. 
Please load and resave any " - "pickles before upgrading to version 22.02.", - FutureWarning, - ) - header["columns"] = [header.pop("index_column")] - header["column_names"] = pickle.dumps( - [pickle.loads(header["name"])] - ) - - return super().deserialize(header, frames) - def _binaryop( self, other: T, @@ -2508,7 +2492,7 @@ def to_pandas(self): def __repr__(self): return ( - f"{self.__class__.__name__}({self._values.to_array()}," + f"{self.__class__.__name__}({self._values.values_host}," f" dtype='object'" + ( f", name={pd.io.formats.printing.default_pprint(self.name)}" diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e9f2de1cb1c..8ecab2c7c65 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -24,11 +24,12 @@ is_integer_dtype, is_list_like, ) -from cudf.core.column import arange +from cudf.core.column import arange, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex +from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame from cudf.utils.utils import cached_property doc_reset_index_template = """ @@ -756,6 +757,51 @@ def add_suffix(self, suffix): Use `Series.add_suffix` or `DataFrame.add_suffix`" ) + @annotate("APPLY", color="purple", domain="cudf_python") + def _apply(self, func, kernel_getter, *args, **kwargs): + """Apply `func` across the rows of the frame.""" + if kwargs: + raise ValueError("UDFs using **kwargs are not yet supported.") + + try: + kernel, retty = _compile_or_get( + self, func, args, kernel_getter=kernel_getter + ) + except Exception as e: + raise ValueError( + "user defined function compilation failed." + ) from e + + # Mask and data column preallocated + ans_col = cp.empty(len(self), dtype=retty) + ans_mask = cudf.core.column.column_empty(len(self), dtype="bool") + launch_args = [(ans_col, ans_mask), len(self)] + offsets = [] + + # if _compile_or_get succeeds, it is safe to create a kernel that only + # consumes the columns that are of supported dtype + for col in _supported_cols_from_frame(self).values(): + data = col.data + mask = col.mask + if mask is None: + launch_args.append(data) + else: + launch_args.append((data, mask)) + offsets.append(col.offset) + launch_args += offsets + launch_args += list(args) + + try: + kernel.forall(len(self))(*launch_args) + except Exception as e: + raise RuntimeError("UDF kernel execution failed.") from e + + col = as_column(ans_col) + col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) + result = cudf.Series._from_data({None: col}, self._index) + + return result + def sort_values( self, by, @@ -1298,9 +1344,7 @@ def dropna( 0 Alfred Batmobile 1940-04-25 """ if axis == 0: - result = self._drop_na_rows( - how=how, subset=subset, thresh=thresh, drop_nan=True - ) + result = self._drop_na_rows(how=how, subset=subset, thresh=thresh) else: result = self._drop_na_columns( how=how, subset=subset, thresh=thresh @@ -1308,9 +1352,7 @@ def dropna( return self._mimic_inplace(result, inplace=inplace) - def _drop_na_rows( - self, how="any", subset=None, thresh=None, drop_nan=False - ): + def _drop_na_rows(self, how="any", subset=None, thresh=None): """ Drop null rows from `self`. @@ -1321,7 +1363,7 @@ def _drop_na_rows( *all* null values. subset : list, optional List of columns to consider when dropping rows. 
- thresh: int, optional + thresh : int, optional If specified, then drops every row containing less than `thresh` non-null values. """ @@ -1341,17 +1383,16 @@ def _drop_na_rows( if len(subset) == 0: return self.copy(deep=True) - if drop_nan: - data_columns = [ - col.nans_to_nulls() - if isinstance(col, cudf.core.column.NumericalColumn) - else col - for col in self._columns - ] + data_columns = [ + col.nans_to_nulls() + if isinstance(col, cudf.core.column.NumericalColumn) + else col + for col in self._columns + ] return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( - list(self._index._data.columns) + data_columns, + [*self._index._data.columns, *data_columns], how=how, keys=self._positions_from_column_names( subset, offset_by_index_columns=True @@ -1418,18 +1459,9 @@ def take(self, indices, axis=0): 0 1.0 a 2 3.0 c """ - axis = self._get_axis_from_axis_arg(axis) - if axis != 0: + if self._get_axis_from_axis_arg(axis) != 0: raise NotImplementedError("Only axis=0 is supported.") - indices = cudf.core.column.as_column(indices) - if is_bool_dtype(indices): - warnings.warn( - "Calling take with a boolean array is deprecated and will be " - "removed in the future.", - FutureWarning, - ) - return self._apply_boolean_mask(indices) return self._gather(indices) def _reset_index(self, level, drop, col_level=0, col_fill=""): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index fa84889adea..adce3c24a83 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -859,28 +859,8 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - if "names" in header: - warnings.warn( - "MultiIndex objects serialized in cudf version " - "21.10 or older will no longer be deserializable " - "after version 21.12. Please load and resave any " - "pickles before upgrading to version 22.02.", - FutureWarning, - ) - header["column_names"] = header["names"] - column_names = pickle.loads(header["column_names"]) - if "source_data" in header: - warnings.warn( - "MultiIndex objects serialized in cudf version " - "21.08 or older will no longer be deserializable " - "after version 21.10. Please load and resave any " - "pickles before upgrading to version 21.12.", - FutureWarning, - ) - df = cudf.DataFrame.deserialize(header["source_data"], frames) - return cls.from_frame(df)._set_names(column_names) - # Spoof the column names to construct the frame, then set manually. 
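
Since `take` above no longer routes boolean arrays through a deprecation path, positional gathers and boolean filtering are now spelled separately. A short sketch with hypothetical data:

    import cudf

    df = cudf.DataFrame({"x": [10, 20, 30]})

    taken = df.take([0, 2])                          # positional gather only
    filtered = df[cudf.Series([True, False, True])]  # booleans use regular selection
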
+ column_names = pickle.loads(header["column_names"]) header["column_names"] = pickle.dumps(range(0, len(column_names))) obj = super().deserialize(header, frames) return obj._set_names(column_names) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 0371c40274f..12a2538b776 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -14,7 +14,6 @@ import cupy import numpy as np import pandas as pd -from numba import cuda from pandas._config import get_option import cudf @@ -67,6 +66,7 @@ doc_reset_index_template, ) from cudf.core.single_column_frame import SingleColumnFrame +from cudf.core.udf.scalar_function import _get_scalar_kernel from cudf.utils import cudautils, docutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -559,19 +559,6 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - if "column" in header: - warnings.warn( - "Series objects serialized in cudf version " - "21.10 or older will no longer be deserializable " - "after version 21.12. Please load and resave any " - "pickles before upgrading to version 22.02.", - FutureWarning, - ) - header["columns"] = [header.pop("column")] - header["column_names"] = pickle.dumps( - [pickle.loads(header["name"])] - ) - index_nframes = header["index_frame_count"] obj = super().deserialize( header, frames[header["index_frame_count"] :] @@ -965,15 +952,6 @@ def to_frame(self, name=None): return cudf.DataFrame({col: self._column}, index=self.index) - def set_mask(self, mask, null_count=None): - warnings.warn( - "Series.set_mask is deprecated and will be removed in the future.", - FutureWarning, - ) - return self._from_data( - {self.name: self._column.set_mask(mask)}, self._index - ) - def memory_usage(self, index=True, deep=False): """ Return the memory usage of the Series. @@ -1623,25 +1601,6 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): return self._mimic_inplace(result, inplace=inplace) - def fill(self, fill_value, begin=0, end=-1, inplace=False): - warnings.warn( - "The fill method will be removed in a future cuDF release.", - FutureWarning, - ) - fill_values = [fill_value] - col_and_fill = zip(self._columns, fill_values) - - if not inplace: - data_columns = (c._fill(v, begin, end) for (c, v) in col_and_fill) - return self.__class__._from_data( - zip(self._column_names, data_columns), self._index - ) - - for (c, v) in col_and_fill: - c.fill(v, begin, end, inplace=True) - - return self - def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1665,15 +1624,6 @@ def fillna( value=value, method=method, axis=axis, inplace=inplace, limit=limit ) - # TODO: When this method is removed we can also remove ColumnBase.to_array. - def to_array(self, fillna=None): - warnings.warn( - "The to_array method will be removed in a future cuDF " - "release. 
Consider using `to_numpy` instead.", - FutureWarning, - ) - return self._column.to_array(fillna=fillna) - def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -1782,27 +1732,6 @@ def nullmask(self): """The gpu buffer for the null-mask""" return cudf.Series(self._column.nullmask) - def as_mask(self): - """Convert booleans to bitmask - - Returns - ------- - device array - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([True, False, True]) - >>> s.as_mask() - - """ - if not is_bool_dtype(self.dtype): - raise TypeError( - f"Series must of boolean dtype, found: {self.dtype}" - ) - - return self._column.as_mask() - def astype(self, dtype, copy=False, errors="raise"): """ Cast the Series to the given dtype @@ -2243,76 +2172,6 @@ def update(self, other): self.mask(mask, other, inplace=True) - def reverse(self): - warnings.warn( - "Series.reverse is deprecated and will be removed in the future.", - FutureWarning, - ) - rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32) - return self._from_data( - {self.name: self._column[rinds]}, self.index._values[rinds] - ) - - def label_encoding(self, cats, dtype=None, na_sentinel=-1): - """Perform label encoding. - - Parameters - ---------- - values : sequence of input values - dtype : numpy.dtype; optional - Specifies the output dtype. If `None` is given, the - smallest possible integer dtype (starting with np.int8) - is used. - na_sentinel : number, default -1 - Value to indicate missing category. - - Returns - ------- - A sequence of encoded labels with value between 0 and n-1 classes(cats) - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([1, 2, 3, 4, 10]) - >>> s.label_encoding([2, 3]) - 0 -1 - 1 0 - 2 1 - 3 -1 - 4 -1 - dtype: int8 - - `na_sentinel` parameter can be used to - control the value when there is no encoding. - - >>> s.label_encoding([2, 3], na_sentinel=10) - 0 10 - 1 0 - 2 1 - 3 10 - 4 10 - dtype: int8 - - When none of `cats` values exist in s, entire - Series will be `na_sentinel`. - - >>> s.label_encoding(['a', 'b', 'c']) - 0 -1 - 1 -1 - 2 -1 - 3 -1 - 4 -1 - dtype: int8 - """ - - warnings.warn( - "Series.label_encoding is deprecated and will be removed in the " - "future. Consider using cuML's LabelEncoder instead.", - FutureWarning, - ) - - return self._label_encoding(cats, dtype, na_sentinel) - def _label_encoding(self, cats, dtype=None, na_sentinel=-1): # Private implementation of deprecated public label_encoding method def _return_sentinel_series(): @@ -2374,7 +2233,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): by numba based on the function logic and argument types. See examples for details. args : tuple - Not supported + Positional arguments passed to func after the series value. **kwargs Not supported @@ -2440,20 +2299,9 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): 2 4.5 dtype: float64 """ - if args or kwargs: - raise ValueError( - "UDFs using *args or **kwargs are not yet supported." - ) - - # these functions are generally written as functions of scalar - # values rather than rows. 
Rather than writing an entirely separate - # numba kernel that is not built around a row object, its simpler - # to just turn this into the equivalent single column dataframe case - name = self.name or "__temp_srname" - df = cudf.DataFrame({name: self}) - f_ = cuda.jit(device=True)(func) - - return df.apply(lambda row: f_(row[name])) + if convert_dtype is not True: + raise ValueError("Series.apply only supports convert_dtype=True") + return self._apply(func, _get_scalar_kernel, *args, **kwargs) def applymap(self, udf, out_dtype=None): """Apply an elementwise function to transform the values in the Column. @@ -2908,7 +2756,7 @@ def nunique(self, method="sort", dropna=True): raise NotImplementedError(msg) if self.null_count == len(self): return 0 - return self._column.distinct_count(method, dropna) + return super().nunique(method, dropna) def value_counts( self, diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 7793a2fdf29..ef479f19363 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,7 +3,7 @@ from __future__ import annotations -import warnings +import builtins from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union import cupy @@ -143,16 +143,6 @@ def tolist(self): # noqa: D102 to_list = tolist - # TODO: When this method is removed we can also remove - # ColumnBase.to_gpu_array. - def to_gpu_array(self, fillna=None): # noqa: D102 - warnings.warn( - "The to_gpu_array method will be removed in a future cuDF " - "release. Consider using `to_cupy` instead.", - FutureWarning, - ) - return self._column.to_gpu_array(fillna=fillna) - @classmethod def from_arrow(cls, array): """Create from PyArrow Array/ChunkedArray. @@ -336,3 +326,21 @@ def _make_operands_for_binop( return NotImplemented return {result_name: (self._column, other, reflect, fill_value)} + + def nunique(self, method: builtins.str = "sort", dropna: bool = True): + """ + Return count of unique values for the column. + + Parameters + ---------- + method : builtins.str, default "sort" + Method used by cpp_distinct_count + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + int + Number of unique values in the column. 
+ """ + return self._column.distinct_count(method=method, dropna=dropna) diff --git a/python/cudf/cudf/core/udf/pipeline.py b/python/cudf/cudf/core/udf/pipeline.py deleted file mode 100644 index 2464906be04..00000000000 --- a/python/cudf/cudf/core/udf/pipeline.py +++ /dev/null @@ -1,388 +0,0 @@ -import math -from typing import Callable - -import cachetools -import numpy as np -from numba import cuda, typeof -from numba.np import numpy_support -from numba.types import Poison, Record, Tuple, boolean, int64, void -from nvtx import annotate - -from cudf.core.dtypes import CategoricalDtype -from cudf.core.udf.api import Masked, pack_return -from cudf.core.udf.typing import MaskedType -from cudf.utils import cudautils -from cudf.utils.dtypes import ( - BOOL_TYPES, - DATETIME_TYPES, - NUMERIC_TYPES, - TIMEDELTA_TYPES, -) - -libcudf_bitmask_type = numpy_support.from_dtype(np.dtype("int32")) -MASK_BITSIZE = np.dtype("int32").itemsize * 8 -precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) - -JIT_SUPPORTED_TYPES = ( - NUMERIC_TYPES | BOOL_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES -) - - -def _is_jit_supported_type(dtype): - # category dtype isn't hashable - if isinstance(dtype, CategoricalDtype): - return False - return str(dtype) in JIT_SUPPORTED_TYPES - - -def all_dtypes_from_frame(frame): - return { - colname: col.dtype - if _is_jit_supported_type(col.dtype) - else np.dtype("O") - for colname, col in frame._data.items() - } - - -def supported_dtypes_from_frame(frame): - return { - colname: col.dtype - for colname, col in frame._data.items() - if _is_jit_supported_type(col.dtype) - } - - -def supported_cols_from_frame(frame): - return { - colname: col - for colname, col in frame._data.items() - if _is_jit_supported_type(col.dtype) - } - - -def generate_cache_key(frame, func: Callable): - """Create a cache key that uniquely identifies a compilation. - - A new compilation is needed any time any of the following things change: - - The UDF itself as defined in python by the user - - The types of the columns utilized by the UDF - - The existence of the input columns masks - """ - return ( - *cudautils.make_cache_key(func, all_dtypes_from_frame(frame).values()), - *(col.mask is None for col in frame._data.values()), - *frame._data.keys(), - ) - - -def get_frame_row_type(dtype): - """ - Get the numba `Record` type corresponding to a frame. - Models each column and its mask as a MaskedType and - models the row as a dictionary like data structure - containing these MaskedTypes. - - Large parts of this function are copied with comments - from the Numba internals and slightly modified to - account for validity bools to be present in the final - struct. - """ - - # Create the numpy structured type corresponding to the numpy dtype. - - fields = [] - offset = 0 - - sizes = [val[0].itemsize for val in dtype.fields.values()] - for i, (name, info) in enumerate(dtype.fields.items()): - # *info* consists of the element dtype, its offset from the beginning - # of the record, and an optional "title" containing metadata. - # We ignore the offset in info because its value assumes no masking; - # instead, we compute the correct offset based on the masked type. 
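
The `nunique` plumbing added above composes across three layers: the column computes a distinct count, `SingleColumnFrame.nunique` returns it as an int, `Frame.nunique` maps column names to counts, and `DataFrame.nunique` wraps that mapping in a Series. A sketch with hypothetical data:

    import cudf

    df = cudf.DataFrame({"A": [4, 5, 6], "B": [4, 1, 1]})

    df.nunique()        # Series: A -> 3, B -> 2
    df["A"].nunique()   # int: 3
    df.index.nunique()  # int: 3
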
- elemdtype = info[0] - title = info[2] if len(info) == 3 else None - ty = numpy_support.from_dtype(elemdtype) - infos = { - "type": MaskedType(ty), - "offset": offset, - "title": title, - } - fields.append((name, infos)) - - # increment offset by itemsize plus one byte for validity - offset += elemdtype.itemsize + 1 - - # Align the next member of the struct to be a multiple of the - # memory access size, per PTX ISA 7.4/5.4.5 - if i < len(sizes) - 1: - next_itemsize = sizes[i + 1] - offset = int(math.ceil(offset / next_itemsize) * next_itemsize) - - # Numba requires that structures are aligned for the CUDA target - _is_aligned_struct = True - return Record(fields, offset, _is_aligned_struct) - - -@annotate("NUMBA JIT", color="green", domain="cudf_python") -def get_udf_return_type(frame, func: Callable, args=()): - - """ - Get the return type of a masked UDF for a given set of argument dtypes. It - is assumed that the function consumes a dictionary whose keys are strings - and whose values are of MaskedType. Initially assume that the UDF may be - written to utilize any field in the row - including those containing an - unsupported dtype. If an unsupported dtype is actually used in the function - the compilation should fail at `compile_udf`. If compilation succeeds, one - can infer that the function does not use any of the columns of unsupported - dtype - meaning we can drop them going forward and the UDF will still end - up getting fed rows containing all the fields it actually needs to use to - compute the answer for that row. - """ - - # present a row containing all fields to the UDF and try and compile - row_type = get_frame_row_type( - np.dtype(list(all_dtypes_from_frame(frame).items())) - ) - compile_sig = (row_type, *(typeof(arg) for arg in args)) - - # Get the return type. The PTX is also returned by compile_udf, but is not - # needed here. - ptx, output_type = cudautils.compile_udf(func, compile_sig) - if not isinstance(output_type, MaskedType): - numba_output_type = numpy_support.from_dtype(np.dtype(output_type)) - else: - numba_output_type = output_type - - return ( - numba_output_type - if not isinstance(numba_output_type, MaskedType) - else numba_output_type.value_type - ) - - -def masked_array_type_from_col(col): - """ - Return a type representing a tuple of arrays, - the first element an array of the numba type - corresponding to `dtype`, and the second an - array of bools representing a mask. - """ - nb_scalar_ty = numpy_support.from_dtype(col.dtype) - if col.mask is None: - return nb_scalar_ty[::1] - else: - return Tuple((nb_scalar_ty[::1], libcudf_bitmask_type[::1])) - - -def construct_signature(frame, return_type, args): - """ - Build the signature of numba types that will be used to - actually JIT the kernel itself later, accounting for types - and offsets. Skips columns with unsupported dtypes. 
- """ - - # Tuple of arrays, first the output data array, then the mask - return_type = Tuple((return_type[::1], boolean[::1])) - offsets = [] - sig = [return_type, int64] - for col in supported_cols_from_frame(frame).values(): - sig.append(masked_array_type_from_col(col)) - offsets.append(int64) - - # return_type, size, data, masks, offsets, extra args - sig = void(*(sig + offsets + [typeof(arg) for arg in args])) - - return sig - - -@cuda.jit(device=True) -def mask_get(mask, pos): - return (mask[pos // MASK_BITSIZE] >> (pos % MASK_BITSIZE)) & 1 - - -kernel_template = """\ -def _kernel(retval, size, {input_columns}, {input_offsets}, {extra_args}): - i = cuda.grid(1) - ret_data_arr, ret_mask_arr = retval - if i < size: - # Create a structured array with the desired fields - rows = cuda.local.array(1, dtype=row_type) - - # one element of that array - row = rows[0] - -{masked_input_initializers} -{row_initializers} - - # pass the assembled row into the udf - ret = f_(row, {extra_args}) - - # pack up the return values and set them - ret_masked = pack_return(ret) - ret_data_arr[i] = ret_masked.value - ret_mask_arr[i] = ret_masked.valid -""" - -unmasked_input_initializer_template = """\ - d_{idx} = input_col_{idx} - masked_{idx} = Masked(d_{idx}[i], True) -""" - -masked_input_initializer_template = """\ - d_{idx}, m_{idx} = input_col_{idx} - masked_{idx} = Masked(d_{idx}[i], mask_get(m_{idx}, i + offset_{idx})) -""" - -row_initializer_template = """\ - row["{name}"] = masked_{idx} -""" - - -def _define_function(frame, row_type, args): - """ - The kernel we want to JIT compile looks something like the following, - which is an example for two columns that both have nulls present - - def _kernel(retval, input_col_0, input_col_1, offset_0, offset_1, size): - i = cuda.grid(1) - ret_data_arr, ret_mask_arr = retval - if i < size: - rows = cuda.local.array(1, dtype=row_type) - row = rows[0] - - d_0, m_0 = input_col_0 - masked_0 = Masked(d_0[i], mask_get(m_0, i + offset_0)) - d_1, m_1 = input_col_1 - masked_1 = Masked(d_1[i], mask_get(m_1, i + offset_1)) - - row["a"] = masked_0 - row["b"] = masked_1 - - ret = f_(row) - - ret_masked = pack_return(ret) - ret_data_arr[i] = ret_masked.value - ret_mask_arr[i] = ret_masked.valid - - However we do not always have two columns and columns do not always have - an associated mask. Ideally, we would just write one kernel and make use - of `*args` - and then one function would work for any number of columns, - currently numba does not support `*args` and treats functions it JITs as - if `*args` is a singular argument. Thus we are forced to write the right - functions dynamically at runtime and define them using `exec`. 
- """ - # Create argument list for kernel - frame = supported_cols_from_frame(frame) - - input_columns = ", ".join([f"input_col_{i}" for i in range(len(frame))]) - input_offsets = ", ".join([f"offset_{i}" for i in range(len(frame))]) - extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) - - # Generate the initializers for each device function argument - initializers = [] - row_initializers = [] - for i, (colname, col) in enumerate(frame.items()): - idx = str(i) - if col.mask is not None: - template = masked_input_initializer_template - else: - template = unmasked_input_initializer_template - - initializer = template.format(idx=idx) - - initializers.append(initializer) - - row_initializer = row_initializer_template.format( - idx=idx, name=colname - ) - row_initializers.append(row_initializer) - - # Incorporate all of the above into the kernel code template - d = { - "input_columns": input_columns, - "input_offsets": input_offsets, - "extra_args": extra_args, - "masked_input_initializers": "\n".join(initializers), - "row_initializers": "\n".join(row_initializers), - "numba_rectype": row_type, # from global - } - - return kernel_template.format(**d) - - -@annotate("UDF COMPILATION", color="darkgreen", domain="cudf_python") -def compile_or_get(frame, func, args): - """ - Return a compiled kernel in terms of MaskedTypes that launches a - kernel equivalent of `f` for the dtypes of `df`. The kernel uses - a thread for each row and calls `f` using that rows data / mask - to produce an output value and output validity for each row. - - If the UDF has already been compiled for this requested dtypes, - a cached version will be returned instead of running compilation. - - CUDA kernels are void and do not return values. Thus, we need to - preallocate a column of the correct dtype and pass it in as one of - the kernel arguments. This creates a chicken-and-egg problem where - we need the column type to compile the kernel, but normally we would - be getting that type FROM compiling the kernel (and letting numba - determine it as a return value). As a workaround, we compile the UDF - itself outside the final kernel to invoke a full typing pass, which - unfortunately is difficult to do without running full compilation. - we then obtain the return type from that separate compilation and - use it to allocate an output column of the right dtype. - """ - - # check to see if we already compiled this function - cache_key = generate_cache_key(frame, func) - if precompiled.get(cache_key) is not None: - kernel, masked_or_scalar = precompiled[cache_key] - return kernel, masked_or_scalar - - # precompile the user udf to get the right return type. - # could be a MaskedType or a scalar type. - scalar_return_type = get_udf_return_type(frame, func, args) - - # get_udf_return_type will throw a TypingError if the user tries to use - # a field in the row containing an unsupported dtype, except in the - # edge case where all the function does is return that element: - - # def f(row): - # return row[] - # In this case numba is happy to return MaskedType() - # because it relies on not finding overloaded operators for types to raise - # the exception, so we have to explicitly check for that case. 
- if isinstance(scalar_return_type, Poison): - raise TypeError(str(scalar_return_type)) - - # this is the signature for the final full kernel compilation - sig = construct_signature(frame, scalar_return_type, args) - - # this row type is used within the kernel to pack up the column and - # mask data into the dict like data structure the user udf expects - np_field_types = np.dtype(list(supported_dtypes_from_frame(frame).items())) - row_type = get_frame_row_type(np_field_types) - - f_ = cuda.jit(device=True)(func) - # Dict of 'local' variables into which `_kernel` is defined - local_exec_context = {} - global_exec_context = { - "f_": f_, - "cuda": cuda, - "Masked": Masked, - "mask_get": mask_get, - "pack_return": pack_return, - "row_type": row_type, - } - exec( - _define_function(frame, row_type, args), - global_exec_context, - local_exec_context, - ) - # The python function definition representing the kernel - _kernel = local_exec_context["_kernel"] - kernel = cuda.jit(sig)(_kernel) - np_return_type = numpy_support.as_dtype(scalar_return_type) - precompiled[cache_key] = (kernel, np_return_type) - - return kernel, np_return_type diff --git a/python/cudf/cudf/core/udf/row_function.py b/python/cudf/cudf/core/udf/row_function.py new file mode 100644 index 00000000000..5cda9fb8218 --- /dev/null +++ b/python/cudf/cudf/core/udf/row_function.py @@ -0,0 +1,151 @@ +import math + +import numpy as np +from numba import cuda +from numba.np import numpy_support +from numba.types import Record + +from cudf.core.udf.api import Masked, pack_return +from cudf.core.udf.templates import ( + masked_input_initializer_template, + row_initializer_template, + row_kernel_template, + unmasked_input_initializer_template, +) +from cudf.core.udf.typing import MaskedType +from cudf.core.udf.utils import ( + _all_dtypes_from_frame, + _construct_signature, + _get_kernel, + _get_udf_return_type, + _mask_get, + _supported_cols_from_frame, + _supported_dtypes_from_frame, +) + + +def _get_frame_row_type(dtype): + """ + Get the numba `Record` type corresponding to a frame. + Models each column and its mask as a MaskedType and + models the row as a dictionary like data structure + containing these MaskedTypes. + + Large parts of this function are copied with comments + from the Numba internals and slightly modified to + account for validity bools to be present in the final + struct. + + See numba.np.numpy_support.from_struct_dtype for details. + """ + + # Create the numpy structured type corresponding to the numpy dtype. + + fields = [] + offset = 0 + + sizes = [val[0].itemsize for val in dtype.fields.values()] + for i, (name, info) in enumerate(dtype.fields.items()): + # *info* consists of the element dtype, its offset from the beginning + # of the record, and an optional "title" containing metadata. + # We ignore the offset in info because its value assumes no masking; + # instead, we compute the correct offset based on the masked type. 
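
To make the offset arithmetic above concrete, here is a hypothetical walk-through for a frame with an int64 column followed by a float64 column (itemsize 8 each); this only traces the layout logic, not the numba Record construction itself:

    import math

    sizes = [8, 8]  # itemsizes of the two fields
    offset = 0
    for i, itemsize in enumerate(sizes):
        print(f"field {i}: offset {offset}")  # field 0: 0, field 1: 16
        offset += itemsize + 1                # data plus one validity byte
        if i < len(sizes) - 1:                # align to the next field's size
            offset = int(math.ceil(offset / sizes[i + 1]) * sizes[i + 1])
    print(f"record size: {offset}")           # 25
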
+ elemdtype = info[0] + title = info[2] if len(info) == 3 else None + ty = numpy_support.from_dtype(elemdtype) + infos = { + "type": MaskedType(ty), + "offset": offset, + "title": title, + } + fields.append((name, infos)) + + # increment offset by itemsize plus one byte for validity + offset += elemdtype.itemsize + 1 + + # Align the next member of the struct to be a multiple of the + # memory access size, per PTX ISA 7.4/5.4.5 + if i < len(sizes) - 1: + next_itemsize = sizes[i + 1] + offset = int(math.ceil(offset / next_itemsize) * next_itemsize) + + # Numba requires that structures are aligned for the CUDA target + _is_aligned_struct = True + return Record(fields, offset, _is_aligned_struct) + + +def _row_kernel_string_from_template(frame, row_type, args): + """ + Function to write numba kernels for `DataFrame.apply` as a string. + Workaround until numba supports functions that use `*args` + + `DataFrame.apply` expects functions of a dict like row as well as + possibly one or more scalar arguments + + def f(row, c, k): + return (row['x'] + c) / k + + Both the number of input columns as well as their nullability and any + scalar arguments may vary, so the kernels vary significantly. See + templates.py for the full row kernel template and more details. + """ + # Create argument list for kernel + frame = _supported_cols_from_frame(frame) + + input_columns = ", ".join([f"input_col_{i}" for i in range(len(frame))]) + input_offsets = ", ".join([f"offset_{i}" for i in range(len(frame))]) + extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) + + # Generate the initializers for each device function argument + initializers = [] + row_initializers = [] + for i, (colname, col) in enumerate(frame.items()): + idx = str(i) + template = ( + masked_input_initializer_template + if col.mask is not None + else unmasked_input_initializer_template + ) + initializers.append(template.format(idx=idx)) + row_initializers.append( + row_initializer_template.format(idx=idx, name=colname) + ) + + return row_kernel_template.format( + input_columns=input_columns, + input_offsets=input_offsets, + extra_args=extra_args, + masked_input_initializers="\n".join(initializers), + row_initializers="\n".join(row_initializers), + numba_rectype=row_type, + ) + + +def _get_row_kernel(frame, func, args): + row_type = _get_frame_row_type( + np.dtype(list(_all_dtypes_from_frame(frame).items())) + ) + scalar_return_type = _get_udf_return_type(row_type, func, args) + + # this is the signature for the final full kernel compilation + sig = _construct_signature(frame, scalar_return_type, args) + + # this row type is used within the kernel to pack up the column and + # mask data into the dict like data structure the user udf expects + np_field_types = np.dtype( + list(_supported_dtypes_from_frame(frame).items()) + ) + row_type = _get_frame_row_type(np_field_types) + + # Dict of 'local' variables into which `_kernel` is defined + global_exec_context = { + "cuda": cuda, + "Masked": Masked, + "_mask_get": _mask_get, + "pack_return": pack_return, + "row_type": row_type, + } + kernel_string = _row_kernel_string_from_template(frame, row_type, args) + kernel = _get_kernel(kernel_string, global_exec_context, sig, func) + + return kernel, scalar_return_type diff --git a/python/cudf/cudf/core/udf/scalar_function.py b/python/cudf/cudf/core/udf/scalar_function.py new file mode 100644 index 00000000000..7f3b461a1f0 --- /dev/null +++ b/python/cudf/cudf/core/udf/scalar_function.py @@ -0,0 +1,64 @@ +from numba import cuda +from numba.np 
import numpy_support + +from cudf.core.udf.api import Masked, pack_return +from cudf.core.udf.templates import ( + masked_input_initializer_template, + scalar_kernel_template, + unmasked_input_initializer_template, +) +from cudf.core.udf.typing import MaskedType +from cudf.core.udf.utils import ( + _construct_signature, + _get_kernel, + _get_udf_return_type, + _mask_get, +) + + +def _scalar_kernel_string_from_template(sr, args): + """ + Function to write numba kernels for `Series.apply` as a string. + Workaround until numba supports functions that use `*args` + + `Series.apply` expects functions of a single variable and possibly + one or more constants, such as: + + def f(x, c, k): + return (x + c) / k + + where the `x` are meant to be the values of the series. Since there + can be only one column, the only thing that varies in the kinds of + kernels that we want is the number of extra_args. See templates.py + for the full kernel template. + """ + extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) + + masked_initializer = ( + masked_input_initializer_template + if sr._column.mask + else unmasked_input_initializer_template + ).format(idx=0) + + return scalar_kernel_template.format( + extra_args=extra_args, masked_initializer=masked_initializer + ) + + +def _get_scalar_kernel(sr, func, args): + sr_type = MaskedType(numpy_support.from_dtype(sr.dtype)) + scalar_return_type = _get_udf_return_type(sr_type, func, args) + + sig = _construct_signature(sr, scalar_return_type, args=args) + f_ = cuda.jit(device=True)(func) + global_exec_context = { + "f_": f_, + "cuda": cuda, + "Masked": Masked, + "_mask_get": _mask_get, + "pack_return": pack_return, + } + kernel_string = _scalar_kernel_string_from_template(sr, args=args) + kernel = _get_kernel(kernel_string, global_exec_context, sig, func) + + return kernel, scalar_return_type diff --git a/python/cudf/cudf/core/udf/templates.py b/python/cudf/cudf/core/udf/templates.py new file mode 100644 index 00000000000..8cb11133323 --- /dev/null +++ b/python/cudf/cudf/core/udf/templates.py @@ -0,0 +1,52 @@ +unmasked_input_initializer_template = """\ + d_{idx} = input_col_{idx} + masked_{idx} = Masked(d_{idx}[i], True) +""" + +masked_input_initializer_template = """\ + d_{idx}, m_{idx} = input_col_{idx} + masked_{idx} = Masked(d_{idx}[i], _mask_get(m_{idx}, i + offset_{idx})) +""" + +row_initializer_template = """\ + row["{name}"] = masked_{idx} +""" + +row_kernel_template = """\ +def _kernel(retval, size, {input_columns}, {input_offsets}, {extra_args}): + i = cuda.grid(1) + ret_data_arr, ret_mask_arr = retval + if i < size: + # Create a structured array with the desired fields + rows = cuda.local.array(1, dtype=row_type) + + # one element of that array + row = rows[0] + +{masked_input_initializers} +{row_initializers} + + # pass the assembled row into the udf + ret = f_(row, {extra_args}) + + # pack up the return values and set them + ret_masked = pack_return(ret) + ret_data_arr[i] = ret_masked.value + ret_mask_arr[i] = ret_masked.valid +""" + +scalar_kernel_template = """ +def _kernel(retval, size, input_col_0, offset_0, {extra_args}): + i = cuda.grid(1) + ret_data_arr, ret_mask_arr = retval + + if i < size: + +{masked_initializer} + + ret = f_(masked_0, {extra_args}) + + ret_masked = pack_return(ret) + ret_data_arr[i] = ret_masked.value + ret_mask_arr[i] = ret_masked.valid +""" diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py new file mode 100644 index 00000000000..a98ee40274e --- /dev/null +++ 
b/python/cudf/cudf/core/udf/utils.py @@ -0,0 +1,216 @@ +from typing import Callable + +import cachetools +import numpy as np +from numba import cuda, typeof +from numba.core.errors import TypingError +from numba.np import numpy_support +from numba.types import Poison, Tuple, boolean, int64, void +from nvtx import annotate + +from cudf.core.dtypes import CategoricalDtype +from cudf.core.udf.typing import MaskedType +from cudf.utils import cudautils +from cudf.utils.dtypes import ( + BOOL_TYPES, + DATETIME_TYPES, + NUMERIC_TYPES, + TIMEDELTA_TYPES, +) + +JIT_SUPPORTED_TYPES = ( + NUMERIC_TYPES | BOOL_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES +) + +libcudf_bitmask_type = numpy_support.from_dtype(np.dtype("int32")) +MASK_BITSIZE = np.dtype("int32").itemsize * 8 + +precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) + + +@annotate("NUMBA JIT", color="green", domain="cudf_python") +def _get_udf_return_type(argty, func: Callable, args=()): + """ + Get the return type of a masked UDF for a given set of argument dtypes. It + is assumed that the function consumes a dictionary whose keys are strings + and whose values are of MaskedType. Initially assume that the UDF may be + written to utilize any field in the row - including those containing an + unsupported dtype. If an unsupported dtype is actually used in the function + the compilation should fail at `compile_udf`. If compilation succeeds, one + can infer that the function does not use any of the columns of unsupported + dtype - meaning we can drop them going forward and the UDF will still end + up getting fed rows containing all the fields it actually needs to use to + compute the answer for that row. + """ + + # present a row containing all fields to the UDF and try and compile + compile_sig = (argty, *(typeof(arg) for arg in args)) + + # Get the return type. The PTX is also returned by compile_udf, but is not + # needed here. + ptx, output_type = cudautils.compile_udf(func, compile_sig) + if not isinstance(output_type, MaskedType): + numba_output_type = numpy_support.from_dtype(np.dtype(output_type)) + else: + numba_output_type = output_type + + result = ( + numba_output_type + if not isinstance(numba_output_type, MaskedType) + else numba_output_type.value_type + ) + + # _get_udf_return_type will throw a TypingError if the user tries to use + # a field in the row containing an unsupported dtype, except in the + # edge case where all the function does is return that element: + + # def f(row): + # return row[] + # In this case numba is happy to return MaskedType() + # because it relies on not finding overloaded operators for types to raise + # the exception, so we have to explicitly check for that case. 
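
At the user level, the scalar path above backs `Series.apply`: the UDF is typed against a single masked value rather than a row, and extra positional arguments are forwarded just as in the row case. A minimal sketch with hypothetical data:

    import cudf

    s = cudf.Series([1, None, 3])

    def f(x, k):
        if x is cudf.NA:
            return 0
        return x + k

    s.apply(f, args=(2,))   # -> [3, 0, 5]
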
+ if isinstance(result, Poison): + raise TypingError(str(result)) + + return result + + +def _is_jit_supported_type(dtype): + # category dtype isn't hashable + if isinstance(dtype, CategoricalDtype): + return False + return str(dtype) in JIT_SUPPORTED_TYPES + + +def _all_dtypes_from_frame(frame): + return { + colname: col.dtype + if _is_jit_supported_type(col.dtype) + else np.dtype("O") + for colname, col in frame._data.items() + } + + +def _supported_dtypes_from_frame(frame): + return { + colname: col.dtype + for colname, col in frame._data.items() + if _is_jit_supported_type(col.dtype) + } + + +def _supported_cols_from_frame(frame): + return { + colname: col + for colname, col in frame._data.items() + if _is_jit_supported_type(col.dtype) + } + + +def _masked_array_type_from_col(col): + """ + Return a type representing a tuple of arrays, + the first element an array of the numba type + corresponding to `dtype`, and the second an + array of bools representing a mask. + """ + nb_scalar_ty = numpy_support.from_dtype(col.dtype) + if col.mask is None: + return nb_scalar_ty[::1] + else: + return Tuple((nb_scalar_ty[::1], libcudf_bitmask_type[::1])) + + +def _construct_signature(frame, return_type, args): + """ + Build the signature of numba types that will be used to + actually JIT the kernel itself later, accounting for types + and offsets. Skips columns with unsupported dtypes. + """ + + # Tuple of arrays, first the output data array, then the mask + return_type = Tuple((return_type[::1], boolean[::1])) + offsets = [] + sig = [return_type, int64] + for col in _supported_cols_from_frame(frame).values(): + sig.append(_masked_array_type_from_col(col)) + offsets.append(int64) + + # return_type, size, data, masks, offsets, extra args + sig = void(*(sig + offsets + [typeof(arg) for arg in args])) + + return sig + + +@cuda.jit(device=True) +def _mask_get(mask, pos): + """Return the validity of mask[pos] as a word.""" + return (mask[pos // MASK_BITSIZE] >> (pos % MASK_BITSIZE)) & 1 + + +def _generate_cache_key(frame, func: Callable): + """Create a cache key that uniquely identifies a compilation. + + A new compilation is needed any time any of the following things change: + - The UDF itself as defined in python by the user + - The types of the columns utilized by the UDF + - The existence of the input columns masks + """ + return ( + *cudautils.make_cache_key( + func, tuple(_all_dtypes_from_frame(frame).values()) + ), + *(col.mask is None for col in frame._data.values()), + *frame._data.keys(), + ) + + +@annotate("UDF COMPILATION", color="darkgreen", domain="cudf_python") +def _compile_or_get(frame, func, args, kernel_getter=None): + """ + Return a compiled kernel in terms of MaskedTypes that launches a + kernel equivalent of `f` for the dtypes of `df`. The kernel uses + a thread for each row and calls `f` using that rows data / mask + to produce an output value and output validity for each row. + + If the UDF has already been compiled for this requested dtypes, + a cached version will be returned instead of running compilation. + + CUDA kernels are void and do not return values. Thus, we need to + preallocate a column of the correct dtype and pass it in as one of + the kernel arguments. This creates a chicken-and-egg problem where + we need the column type to compile the kernel, but normally we would + be getting that type FROM compiling the kernel (and letting numba + determine it as a return value). 
As a workaround, we compile the UDF
+    itself outside the final kernel to invoke a full typing pass, which
+    unfortunately is difficult to do without running full compilation.
+    We then obtain the return type from that separate compilation and
+    use it to allocate an output column of the right dtype.
+    """
+
+    # check to see if we already compiled this function
+    cache_key = _generate_cache_key(frame, func)
+    if precompiled.get(cache_key) is not None:
+        kernel, masked_or_scalar = precompiled[cache_key]
+        return kernel, masked_or_scalar
+
+    # precompile the user udf to get the right return type.
+    # could be a MaskedType or a scalar type.
+
+    kernel, scalar_return_type = kernel_getter(frame, func, args)
+
+    np_return_type = numpy_support.as_dtype(scalar_return_type)
+    precompiled[cache_key] = (kernel, np_return_type)
+
+    return kernel, np_return_type
+
+
+def _get_kernel(kernel_string, globals_, sig, func):
+    """Template kernel compilation helper function."""
+    f_ = cuda.jit(device=True)(func)
+    globals_["f_"] = f_
+    exec(kernel_string, globals_)
+    _kernel = globals_["_kernel"]
+    kernel = cuda.jit(sig)(_kernel)
+
+    return kernel
diff --git a/python/cudf/cudf/errors.py b/python/cudf/cudf/errors.py
index 8a31afab9cf..5d6f52c0307 100644
--- a/python/cudf/cudf/errors.py
+++ b/python/cudf/cudf/errors.py
@@ -1,9 +1,5 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.


-class UnSupportedGPUError(Exception):
-    pass
-
-
-class UnSupportedCUDAError(Exception):
+class UnsupportedCUDAError(Exception):
     pass
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 5c35d004ac0..a09fb1f8e12 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -291,7 +291,12 @@ def read_orc(
     **kwargs,
 ):
     """{docstring}"""
-
+    if decimal_cols_as_float is not None:
+        warnings.warn(
+            "`decimal_cols_as_float` is deprecated and will be removed in "
+            "the future",
+            FutureWarning,
+        )
     from cudf import DataFrame

 # Multiple sources are passed as a list. If a single source is passed,
diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index cc5aec36853..41dac26edf8 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -40,6 +40,17 @@
 ALL_TYPES = sorted(list(dtypeutils.ALL_TYPES))


+def set_random_null_mask_inplace(series, null_probability=0.5, seed=None):
+    """Randomly nullify elements in series with the provided probability."""
+    probs = [null_probability, 1 - null_probability]
+    rng = np.random.default_rng(seed=seed)
+    mask = rng.choice([False, True], size=len(series), p=probs)
+    series[mask] = None
+
+
+# TODO: This function should be removed. Anywhere that it is being used should
+# instead generate a random boolean array (bytemask) and use the public
+# APIs to set those elements to None.
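
Usage of the `set_random_null_mask_inplace` helper added above, with an assumed length and seed; elements are nulled in place with the given probability:

    import numpy as np

    import cudf
    from cudf.testing._utils import set_random_null_mask_inplace

    s = cudf.Series(np.arange(100, dtype="float64"))
    set_random_null_mask_inplace(s, null_probability=0.3, seed=13)
    s.null_count  # roughly 30 elements are now null
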
diff --git a/python/cudf/cudf/errors.py b/python/cudf/cudf/errors.py
index 8a31afab9cf..5d6f52c0307 100644
--- a/python/cudf/cudf/errors.py
+++ b/python/cudf/cudf/errors.py
@@ -1,9 +1,5 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
 
 
-class UnSupportedGPUError(Exception):
-    pass
-
-
-class UnSupportedCUDAError(Exception):
+class UnsupportedCUDAError(Exception):
     pass
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 5c35d004ac0..a09fb1f8e12 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -291,7 +291,12 @@ def read_orc(
     **kwargs,
 ):
     """{docstring}"""
-
+    if decimal_cols_as_float is not None:
+        warnings.warn(
+            "`decimal_cols_as_float` is deprecated and will be removed in "
+            "the future",
+            FutureWarning,
+        )
     from cudf import DataFrame
 
     # Multiple sources are passed as a list. If a single source is passed,
diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index cc5aec36853..41dac26edf8 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -40,6 +40,17 @@
 ALL_TYPES = sorted(list(dtypeutils.ALL_TYPES))
 
 
+def set_random_null_mask_inplace(series, null_probability=0.5, seed=None):
+    """Randomly nullify elements in series with the provided probability."""
+    probs = [1 - null_probability, null_probability]  # P(True) nullified below
+    rng = np.random.default_rng(seed=seed)
+    mask = rng.choice([False, True], size=len(series), p=probs)
+    series[mask] = None
+
+
+# TODO: This function should be removed. Anywhere that it is being used
+# should instead generate a random boolean array (bytemask) and use the
+# public APIs to set those elements to None.
 def random_bitmask(size):
     """
     Parameters
diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py
index 925c9ef720c..ff6e79e7804 100644
--- a/python/cudf/cudf/tests/test_applymap.py
+++ b/python/cudf/cudf/tests/test_applymap.py
@@ -24,14 +24,10 @@ def test_applymap_round(nelem, masked):
         boolmask = np.asarray(
             utils.expand_bits_to_bytes(bitmask), dtype=np.bool_
         )[:nelem]
-        data[~boolmask] = np.nan
+        data[~boolmask] = None
 
     sr = Series(data)
 
-    if masked:
-        # Mask the Series
-        sr = sr.set_mask(bitmask)
-
     # Call applymap
     out = sr.applymap(
         lambda x: (floor(x) + 1 if x - floor(x) >= 0.5 else floor(x))
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index e01b952be94..748cf958ac3 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -437,7 +437,7 @@ def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
     expect_mask = [True if x is not pd.NA else False for x in pd_data["a"]]
     got_mask = mask_to_bools(
         gd_data["a"]._column.base_mask, 0, len(gd_data)
-    ).to_array()
+    ).values_host
 
     np.testing.assert_array_equal(expect_mask, got_mask)
 
@@ -475,7 +475,7 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
     expect_mask = [True if x is not pd.NA else False for x in pd_data]
     got_mask = mask_to_bools(
         gd_data._column.base_mask, 0, len(gd_data)
-    ).to_array()
+    ).values_host
 
     np.testing.assert_array_equal(expect_mask, got_mask)
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 194d838a04e..a3b7bd2373f 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -968,7 +968,7 @@ def test_dataframe_dir_and_getattr():
         df.not_a_column
 
 
-def test_empty_dataframe_to_array():
+def test_empty_dataframe_to_cupy():
     df = cudf.DataFrame()
 
     # Check fully empty dataframe.
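# A short sketch of the idioms the tests above migrate to, assuming cudf
# is importable and a GPU is available: rows are nullified with plain
# boolean-mask assignment instead of the removed `Series.set_mask`, and
# results are copied to host memory with `.values_host` instead of the
# removed `.to_array()`.
import numpy as np

import cudf

sr = cudf.Series([1.0, 2.0, 3.0, 4.0])
boolmask = np.array([True, False, True, False])
sr[~boolmask] = None  # rows 1 and 3 become <NA>
assert sr.null_count == 2

# `.values_host` copies the column's data into a host numpy array.
np.testing.assert_array_equal(
    sr.dropna().values_host, np.array([1.0, 3.0])
)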
@@ -1023,7 +1023,7 @@ def test_dataframe_to_cupy_null_values(): for k in "abcd": df[k] = data = np.random.random(nelem) bitmask = utils.random_bitmask(nelem) - df[k] = df[k].set_mask(bitmask) + df[k] = df[k]._column.set_mask(bitmask) boolmask = np.asarray( utils.expand_bits_to_bytes(bitmask)[:nelem], dtype=np.bool_ ) @@ -1194,7 +1194,7 @@ def test_dataframe_hash_partition_masked_value(nrows): gdf["val"] = np.arange(nrows) + 100 bitmask = utils.random_bitmask(nrows) bytemask = utils.expand_bits_to_bytes(bitmask) - gdf["val"] = gdf["val"].set_mask(bitmask) + gdf["val"] = gdf["val"]._column.set_mask(bitmask) parted = gdf.partition_by_hash(["key"], nparts=3) # Verify that the valid mask is correct for p in parted: @@ -1215,7 +1215,7 @@ def test_dataframe_hash_partition_masked_keys(nrows): gdf["val"] = np.arange(nrows) + 100 bitmask = utils.random_bitmask(nrows) bytemask = utils.expand_bits_to_bytes(bitmask) - gdf["key"] = gdf["key"].set_mask(bitmask) + gdf["key"] = gdf["key"]._column.set_mask(bitmask) parted = gdf.partition_by_hash(["key"], nparts=3, keep_index=False) # Verify that the valid mask is correct for p in parted: @@ -9150,3 +9150,37 @@ def test_dataframe_assign_cp_np_array(): gdf[[f"f_{i}" for i in range(n)]] = cp_ndarray assert_eq(pdf, gdf) + + +@pytest.mark.parametrize( + "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}], +) +def test_dataframe_nunique(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.nunique() + expected = pdf.nunique() + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", [{"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}], +) +def test_dataframe_nunique_index(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.index.nunique() + expected = pdf.index.nunique() + + assert_eq(expected, actual) + + +def test_dataframe_rename_duplicate_column(): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) + with pytest.raises( + ValueError, match="Duplicate column names are not allowed" + ): + gdf.rename(columns={"a": "b"}, inplace=True) diff --git a/python/cudf/cudf/tests/test_fill.py b/python/cudf/cudf/tests/test_fill.py deleted file mode 100644 index 224db2b39d1..00000000000 --- a/python/cudf/cudf/tests/test_fill.py +++ /dev/null @@ -1,64 +0,0 @@ -import pandas as pd -import pytest - -import cudf -from cudf.testing._utils import assert_eq - - -@pytest.mark.parametrize( - "fill_value,data", - [ - (7, [6, 3, 4]), - ("x", ["a", "b", "c", "d", "e", "f"]), - (7, [6, 3, 4, 2, 1, 7, 8, 5]), - (0.8, [0.6, 0.3, 0.4, 0.2, 0.1, 0.7, 0.8, 0.5]), - ("b", pd.Categorical(["a", "b", "c"])), - (None, [0.0, 1.0, 2.0, 3.0]), - ], -) -@pytest.mark.parametrize( - "begin,end", - [ - (0, -1), - (0, 4), - (1, -1), - (1, 4), - (-2, 1), - (-2, -1), - (10, 12), - (8, 10), - (10, 8), - (-10, -8), - (-2, 6), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fill(data, fill_value, begin, end, inplace): - gs = cudf.Series(data) - ps = gs.to_pandas() - - if inplace: - actual = gs - gs[begin:end] = fill_value - else: - # private impl doesn't take care of rounding or bounds check - if begin < 0: - begin += len(gs) - - if end < 0: - end += len(gs) - - begin = max(0, min(len(gs), begin)) - end = max(0, min(len(gs), end)) - actual = gs.fill(fill_value, begin, end, False) - assert actual is not gs - - ps[begin:end] = fill_value - - assert_eq(ps, actual) - - -@pytest.mark.xfail(raises=ValueError) -def test_fill_new_category(): - gs = cudf.Series(pd.Categorical(["a", "b", "c"])) - gs[0:1] = "d" diff --git 
a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index e452dc5d7f7..102e5b57e8e 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -783,8 +783,8 @@ def test_dataframe_masked_slicing(nelem, slice_start, slice_end): gdf = cudf.DataFrame() gdf["a"] = list(range(nelem)) gdf["b"] = list(range(nelem, 2 * nelem)) - gdf["a"] = gdf["a"].set_mask(utils.random_bitmask(nelem)) - gdf["b"] = gdf["b"].set_mask(utils.random_bitmask(nelem)) + gdf["a"] = gdf["a"]._column.set_mask(utils.random_bitmask(nelem)) + gdf["b"] = gdf["b"]._column.set_mask(utils.random_bitmask(nelem)) def do_slice(x): return x[slice_start:slice_end] diff --git a/python/cudf/cudf/tests/test_label_encode.py b/python/cudf/cudf/tests/test_label_encode.py deleted file mode 100644 index e5c6bacf1d1..00000000000 --- a/python/cudf/cudf/tests/test_label_encode.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. - -import random -from itertools import product - -import numpy as np -import pytest - -import cudf -from cudf import DataFrame, Series - - -def _random_float(nelem, dtype): - return np.random.random(nelem).astype(dtype) - - -def _random_int(nelem, dtype): - return np.random.randint(low=0, high=nelem, size=nelem, dtype=dtype) - - -def _random(nelem, dtype): - dtype = cudf.dtype(dtype) - if dtype.kind in {"i", "u"}: - return _random_int(nelem, dtype) - elif dtype.kind == "f": - return _random_float(nelem, dtype) - - -_param_sizes = [1, 7, 10, 100, 1000] -_param_dtypes = [np.int32, np.float32] - - -@pytest.mark.filterwarnings("ignore:DataFrame.label_encoding is deprecated") -@pytest.mark.filterwarnings("ignore:Series.label_encoding is deprecated") -@pytest.mark.parametrize( - "nelem,dtype", list(product(_param_sizes, _param_dtypes)) -) -def test_label_encode(nelem, dtype): - df = DataFrame() - np.random.seed(0) - - # initialize data frame - df["cats"] = _random(nelem, dtype) - vals = df["cats"].unique() - lab = dict({vals[i]: i for i in range(len(vals))}) - - # label encode series - ncol = df["cats"].label_encoding(cats=vals) - arr = ncol.to_numpy() - - # verify labels of new column - for i in range(arr.size): - np.testing.assert_equal(arr[i], lab.get(df.cats[i], None)) - - # label encode data frame - df2 = df.label_encoding(column="cats", prefix="cats", cats=vals) - - assert df2.columns[0] == "cats" - assert df2.columns[1] == "cats_labels" - - -@pytest.mark.filterwarnings("ignore:DataFrame.label_encoding is deprecated") -@pytest.mark.filterwarnings("ignore:Series.label_encoding is deprecated") -def test_label_encode_drop_one(): - random.seed(0) - np.random.seed(0) - - df = DataFrame() - - # initialize data frame - df["cats"] = np.random.randint(7, size=10, dtype=np.int32) - vals = df["cats"].unique() - # drop 1 randomly - vals = vals[vals.index != random.randrange(len(vals))].reset_index( - drop=True - ) - - lab = dict({vals[i]: i for i in range(len(vals))}) - - # label encode series - ncol = df["cats"].label_encoding(cats=vals, dtype="float32") - arr = ncol.to_numpy() - - # verify labels of new column - - for i in range(arr.size): - # assuming -1 is used for missing value - np.testing.assert_equal(arr[i], lab.get(df.cats[i], -1)) - - # label encode data frame - df2 = df.label_encoding( - column="cats", prefix="cats", cats=vals, dtype="float32" - ) - - assert df2.columns[0] == "cats" - assert df2.columns[1] == "cats_labels" - - -@pytest.mark.filterwarnings("ignore:DataFrame.label_encoding is deprecated") -def 
test_label_encode_float_output(): - random.seed(0) - np.random.seed(0) - - df = DataFrame() - - # initialize data frame - df["cats"] = arr = np.random.randint(7, size=10, dtype=np.int32) - cats = [1, 2, 3, 4] - encoder = {c: i for i, c in enumerate(cats)} - df2 = df.label_encoding( - column="cats", - prefix="cats", - cats=cats, - dtype=np.float32, - na_sentinel=np.nan, - ) - - got = df2["cats_labels"].to_numpy(na_value=np.nan) - - handcoded = np.array([encoder.get(v, np.nan) for v in arr]) - np.testing.assert_equal(got, handcoded) - - -@pytest.mark.filterwarnings("ignore:Series.label_encoding is deprecated") -@pytest.mark.parametrize( - "ncats,cat_dtype", [(10, np.int8), (127, np.int8), (128, np.int16)] -) -def test_label_encode_dtype(ncats, cat_dtype): - s = Series([str(i % ncats) for i in range(ncats + 1)]) - cats = s.unique().astype(s.dtype) - encoded_col = s.label_encoding(cats=cats) - np.testing.assert_equal(encoded_col.dtype, cat_dtype) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 44749103b54..fc9ad9711d1 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. + import functools import operator @@ -586,3 +587,18 @@ def test_listcol_setitem_error_cases(data, item, error): sr = cudf.Series(data) with pytest.raises(BaseException, match=error): sr[1] = item + + +def test_listcol_setitem_retain_dtype(): + df = cudf.DataFrame( + {"a": cudf.Series([["a", "b"], []]), "b": [1, 2], "c": [123, 321]} + ) + df1 = df.head(0) + # Performing a setitem on `b` triggers a `column.column_empty_like` call + # which tries to create an empty ListColumn. + df1["b"] = df1["c"] + # Performing a copy to trigger a copy dtype which is obtained by accessing + # `ListColumn.children` that would have been corrupted in previous call + # prior to this fix: https://github.com/rapidsai/cudf/pull/10151/ + df2 = df1.copy() + assert df2["a"].dtype == df["a"].dtype diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 21556aad1eb..80ab0671a0d 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -28,7 +28,7 @@ TIMEDELTA_TYPES, assert_eq, assert_exceptions_equal, - random_bitmask, + set_random_null_mask_inplace, ) @@ -2124,7 +2124,7 @@ def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): gdf = cudf.from_pandas(pdf) if add_nulls: for col in gdf: - gdf[col] = gdf[col].set_mask(random_bitmask(len(gdf))) + set_random_null_mask_inplace(gdf[col]) gdf.to_parquet(file_path, index=False) # Read back from pyarrow diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 82020f30f7c..ca02ee55df0 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -20,10 +20,8 @@ @pytest.mark.parametrize("nrows", [0, 5, 10]) def test_null_series(nrows, dtype): size = 5 - mask = utils.random_bitmask(size) - data = cudf.Series(np.random.randint(1, 9, size)) - column = data.set_mask(mask) - sr = cudf.Series(column).astype(dtype) + sr = cudf.Series(np.random.randint(1, 9, size)).astype(dtype) + sr[np.random.choice([False, True], size=size)] = None if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}: ps = pd.Series( sr._column.data_array_view.copy_to_host(), @@ -62,10 +60,8 @@ def test_null_dataframe(ncols): size = 20 gdf = cudf.DataFrame() for idx, dtype in 
enumerate(dtype_categories): - mask = utils.random_bitmask(size) - data = cudf.Series(np.random.randint(0, 128, size)) - column = data.set_mask(mask) - sr = cudf.Series(column).astype(dtype) + sr = cudf.Series(np.random.randint(0, 128, size)).astype(dtype) + sr[np.random.choice([False, True], size=size)] = None gdf[dtype] = sr pdf = gdf.to_pandas() pd.options.display.max_columns = int(ncols) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index ffdd53c58ac..358484d79b9 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -561,7 +561,9 @@ def test_series_value_counts(dropna, normalize): for size in [10 ** x for x in range(5)]: arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 - sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) + sr = cudf.Series.from_masked_array( + arr, cudf.Series(mask)._column.as_mask() + ) sr.name = "col" expect = ( @@ -1517,3 +1519,74 @@ def test_series_transpose(data): assert_eq(pd_transposed, cudf_transposed) assert_eq(pd_property, cudf_property) assert_eq(cudf_transposed, csr) + + +@pytest.mark.parametrize( + "data", [1, 3, 5, 7, 7], +) +def test_series_nunique(data): + cd_s = cudf.Series(data) + pd_s = cd_s.to_pandas() + + actual = cd_s.nunique() + expected = pd_s.nunique() + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", [1, 3, 5, 7, 7], +) +def test_series_nunique_index(data): + cd_s = cudf.Series(data) + pd_s = cd_s.to_pandas() + + actual = cd_s.index.nunique() + expected = pd_s.index.nunique() + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "fill_value,data", + [ + (7, [6, 3, 4]), + ("x", ["a", "b", "c", "d", "e", "f"]), + (7, [6, 3, 4, 2, 1, 7, 8, 5]), + (0.8, [0.6, 0.3, 0.4, 0.2, 0.1, 0.7, 0.8, 0.5]), + ("b", pd.Categorical(["a", "b", "c"])), + (None, [0.0, 1.0, 2.0, 3.0]), + ], +) +@pytest.mark.parametrize( + "begin,end", + [ + (0, -1), + (0, 4), + (1, -1), + (1, 4), + (-2, 1), + (-2, -1), + (10, 12), + (8, 10), + (10, 8), + (-10, -8), + (-2, 6), + ], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_fill(data, fill_value, begin, end, inplace): + gs = cudf.Series(data) + ps = gs.to_pandas() + + actual = gs + gs[begin:end] = fill_value + ps[begin:end] = fill_value + + assert_eq(ps, actual) + + +@pytest.mark.xfail(raises=ValueError) +def test_fill_new_category(): + gs = cudf.Series(pd.Categorical(["a", "b", "c"])) + gs[0:1] = "d" diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 142ca6c6831..cb3a369d067 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -32,7 +32,8 @@ def test_series_reductions(method, dtype, skipna): arr = arr.astype(dtype) if dtype in (np.float32, np.float64): arr[[2, 5, 14, 19, 50, 70]] = np.nan - sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) + sr = cudf.Series(arr) + sr[~mask] = None psr = sr.to_pandas() psr[~mask] = np.nan @@ -83,7 +84,8 @@ def test_series_unique(): for size in [10 ** x for x in range(5)]: arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 - sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) + sr = cudf.Series(arr) + sr[~mask] = None assert set(arr[mask]) == set(sr.unique().dropna().to_numpy()) assert len(set(arr[mask])) == sr.nunique() @@ -298,7 +300,8 @@ def test_series_median(dtype, num_na): mask = np.arange(100) >= num_na arr = arr.astype(dtype) - sr = cudf.Series.from_masked_array(arr, 
cudf.Series(mask).as_mask()) + sr = cudf.Series(arr) + sr[~mask] = None arr2 = arr[mask] ps = pd.Series(arr2, dtype=dtype) diff --git a/python/cudf/cudf/tests/test_udf_binops.py b/python/cudf/cudf/tests/test_udf_binops.py index 935c3868a68..c5cd8f8b717 100644 --- a/python/cudf/cudf/tests/test_udf_binops.py +++ b/python/cudf/cudf/tests/test_udf_binops.py @@ -49,4 +49,4 @@ def generic_function(a, b): result = lhs_arr ** 3 + rhs_arr - np.testing.assert_almost_equal(result, out_col.to_array()) + np.testing.assert_almost_equal(result, out_col.values_host) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index c9c2c440632..56090c8eacf 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -13,6 +13,7 @@ comparison_ops, unary_ops, ) +from cudf.core.udf.utils import precompiled from cudf.testing._utils import NUMERIC_TYPES, _decimal_series, assert_eq @@ -485,7 +486,7 @@ def outer(row): {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} ) - with pytest.raises(AttributeError): + with pytest.raises(ValueError): gdf.apply(outer, axis=1) pdf = gdf.to_pandas(nullable=True) @@ -538,7 +539,7 @@ def func(row): return row["unsupported_col"] # check that we fail when an unsupported type is used within a function - with pytest.raises(TypeError): + with pytest.raises(ValueError): data.apply(func, axis=1) # also check that a DF containing unsupported dtypes can still run a @@ -595,6 +596,44 @@ def func(row, c, k): run_masked_udf_test(func, data, args=(1, 2), check_dtype=False) +@pytest.mark.parametrize( + "data", + [ + [1, cudf.NA, 3], + [0.5, 2.0, cudf.NA, cudf.NA, 5.0], + [True, False, cudf.NA], + ], +) +@pytest.mark.parametrize("op", arith_ops + comparison_ops) +def test_mask_udf_scalar_args_binops_series(data, op): + data = cudf.Series(data) + + def func(x, c): + return x + c + + run_masked_udf_series(func, data, args=(1,), check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + [1, cudf.NA, 3], + [0.5, 2.0, cudf.NA, cudf.NA, 5.0], + [True, False, cudf.NA], + ], +) +@pytest.mark.parametrize("op", arith_ops + comparison_ops) +def test_masked_udf_scalar_args_binops_multiple_series(data, op): + data = cudf.Series(data) + + def func(data, c, k): + x = op(data, c) + y = op(x, k) + return y + + run_masked_udf_series(func, data, args=(1, 2), check_dtype=False) + + def test_masked_udf_caching(): # Make sure similar functions that differ # by simple things like constants actually @@ -612,3 +651,16 @@ def test_masked_udf_caching(): expect = data ** 3 got = data.applymap(lambda x: x ** 3) assert_eq(expect, got, check_dtype=False) + + # make sure we get a hit when reapplying + def f(x): + return x + 1 + + precompiled.clear() + assert precompiled.currsize == 0 + data.apply(f) + + assert precompiled.currsize == 1 + data.apply(f) + + assert precompiled.currsize == 1 diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index fa5cde76524..3cbbc1e1ce7 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -173,7 +173,9 @@ def run(self, df, **launch_params): outputs[k], index=outdf.index, nan_as_null=False ) if out_mask is not None: - outdf[k] = outdf[k].set_mask(out_mask.data_array_view) + outdf._data[k] = outdf[k]._column.set_mask( + out_mask.data_array_view + ) return outdf diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py index 8947760e052..bd3da4ea2ba 100644 --- 
a/python/cudf/cudf/utils/gpu_utils.py +++ b/python/cudf/cudf/utils/gpu_utils.py @@ -26,16 +26,7 @@ def validate_setup(): runtimeGetVersion, ) - def _try_get_old_or_new_symbols(): - try: - # CUDA 10.2+ symbols - return [ - cudaError_t.cudaErrorDeviceUninitialized, - cudaError_t.cudaErrorTimeout, - ] - except AttributeError: - # CUDA 10.1 symbols - return [cudaError_t.cudaErrorDeviceUninitilialized] + from cudf.errors import UnsupportedCUDAError notify_caller_errors = { cudaError_t.cudaErrorInitializationError, @@ -51,7 +42,8 @@ def _try_get_old_or_new_symbols(): cudaError_t.cudaErrorSystemNotReady, cudaError_t.cudaErrorSystemDriverMismatch, cudaError_t.cudaErrorCompatNotSupportedOnDevice, - *_try_get_old_or_new_symbols(), + cudaError_t.cudaErrorDeviceUninitialized, + cudaError_t.cudaErrorTimeout, cudaError_t.cudaErrorUnknown, cudaError_t.cudaErrorApiFailureBase, } @@ -72,43 +64,38 @@ def _try_get_old_or_new_symbols(): cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, 0 ) - if major_version >= 6: - # You have a GPU with NVIDIA Pascal™ architecture or better + if major_version < 6: + # A GPU with NVIDIA Pascal™ architecture or newer is required. + # Reference: https://developer.nvidia.com/cuda-gpus # Hardware Generation Compute Capability + # Ampere 8.x # Turing 7.5 - # Volta 7.x + # Volta 7.0, 7.2 # Pascal 6.x - # Maxwell 5.x + # Maxwell 5.x # Kepler 3.x # Fermi 2.x - pass - else: device_name = deviceGetName(0) minor_version = getDeviceAttribute( cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, 0 ) warnings.warn( - f"You will need a GPU with NVIDIA Pascal™ or " - f"newer architecture" - f"\nDetected GPU 0: {device_name} \n" - f"Detected Compute Capability: " - f"{major_version}.{minor_version}" + "A GPU with NVIDIA Pascal™ (Compute Capability 6.0) " + "or newer architecture is required.\n" + f"Detected GPU 0: {device_name}\n" + f"Detected Compute Capability: {major_version}.{minor_version}" ) cuda_runtime_version = runtimeGetVersion() - if cuda_runtime_version >= 10000: - # CUDA Runtime Version Check: Runtime version is greater than 10000 - pass - else: - from cudf.errors import UnSupportedCUDAError - - minor_version = cuda_runtime_version % 100 - major_version = (cuda_runtime_version - minor_version) // 1000 - raise UnSupportedCUDAError( - f"Detected CUDA Runtime version is " - f"{major_version}.{str(minor_version)[0]}" - f"Please update your CUDA Runtime to 10.0 or above" + if cuda_runtime_version < 11000: + # Require CUDA Runtime version 11.0 or greater. + major_version = cuda_runtime_version // 1000 + minor_version = (cuda_runtime_version % 1000) // 10 + raise UnsupportedCUDAError( + "Detected CUDA Runtime version is " + f"{major_version}.{minor_version}. " + "Please update your CUDA Runtime to 11.0 or above." ) cuda_driver_supported_rt_version = driverGetVersion() @@ -124,15 +111,12 @@ def _try_get_old_or_new_symbols(): # https://docs.nvidia.com/deploy/cuda-compatibility/index.html if cuda_driver_supported_rt_version == 0: - from cudf.errors import UnSupportedCUDAError - - raise UnSupportedCUDAError( - "We couldn't detect the GPU driver " - "properly. Please follow the linux installation guide to " - "ensure your driver is properly installed " - ": https://docs.nvidia.com/cuda/cuda-installation-guide-linux/" + raise UnsupportedCUDAError( + "We couldn't detect the GPU driver properly. 
Please follow " + "the installation guide to ensure your driver is properly " + "installed: " + "https://docs.nvidia.com/cuda/cuda-installation-guide-linux/" ) - elif cuda_driver_supported_rt_version >= cuda_runtime_version: # CUDA Driver Version Check: # Driver Runtime version is >= Runtime version @@ -147,17 +131,12 @@ def _try_get_old_or_new_symbols(): # version 450.80.02 supports. pass else: - from cudf.errors import UnSupportedCUDAError - - raise UnSupportedCUDAError( - f"Please update your NVIDIA GPU Driver to support CUDA " - f"Runtime.\n" - f"Detected CUDA Runtime version : {cuda_runtime_version}" - f"\n" - f"Latest version of CUDA supported by current " + raise UnsupportedCUDAError( + "Please update your NVIDIA GPU Driver to support CUDA " + "Runtime.\n" + f"Detected CUDA Runtime version : {cuda_runtime_version}\n" + "Latest version of CUDA supported by current " f"NVIDIA GPU Driver : {cuda_driver_supported_rt_version}" ) - else: - warnings.warn("No NVIDIA GPU detected") diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index d23094ef3f9..add4ecd8f01 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -242,7 +242,6 @@ def _fillna_natwise(col): return column.build_column( data=result.base_data, dtype=result.dtype, - mask=col.base_mask, size=result.size, offset=result.offset, children=result.base_children, diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 149d98ebfb9..1bc270a5b9f 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. + import math from operator import getitem from typing import Set @@ -42,19 +43,11 @@ def __init__(self, *args, **kwargs): def __getitem__(self, key): if isinstance(key, list): g = CudfDataFrameGroupBy( - self.obj, - by=self.index, - slice=key, - sort=self.sort, - **self.dropna, + self.obj, by=self.by, slice=key, sort=self.sort, **self.dropna, ) else: g = CudfSeriesGroupBy( - self.obj, - by=self.index, - slice=key, - sort=self.sort, - **self.dropna, + self.obj, by=self.by, slice=key, sort=self.sort, **self.dropna, ) g._meta = g._meta[key] @@ -63,8 +56,8 @@ def __getitem__(self, key): def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, - self.index, - {c: "mean" for c in self.obj.columns if c not in self.index}, + self.by, + {c: "mean" for c in self.obj.columns if c not in self.by}, split_every=split_every, split_out=split_out, dropna=self.dropna, @@ -76,8 +69,8 @@ def mean(self, split_every=None, split_out=1): def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, - self.index, - {c: "collect" for c in self.obj.columns if c not in self.index}, + self.by, + {c: "collect" for c in self.obj.columns if c not in self.by}, split_every=split_every, split_out=split_out, dropna=self.dropna, @@ -94,10 +87,10 @@ def aggregate(self, arg, split_every=None, split_out=1): if ( isinstance(self.obj, DaskDataFrame) and ( - isinstance(self.index, str) + isinstance(self.by, str) or ( - isinstance(self.index, list) - and all(isinstance(x, str) for x in self.index) + isinstance(self.by, list) + and all(isinstance(x, str) for x in self.by) ) ) and _is_supported(arg, SUPPORTED_AGGS) @@ -133,7 +126,7 @@ def __init__(self, *args, **kwargs): def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, - self.index, + self.by, {self._slice: "mean"}, 
split_every=split_every, split_out=split_out, @@ -146,7 +139,7 @@ def mean(self, split_every=None, split_out=1): def std(self, split_every=None, split_out=1): return groupby_agg( self.obj, - self.index, + self.by, {self._slice: "std"}, split_every=split_every, split_out=split_out, @@ -159,7 +152,7 @@ def std(self, split_every=None, split_out=1): def var(self, split_every=None, split_out=1): return groupby_agg( self.obj, - self.index, + self.by, {self._slice: "var"}, split_every=split_every, split_out=split_out, @@ -172,7 +165,7 @@ def var(self, split_every=None, split_out=1): def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, - self.index, + self.by, {self._slice: "collect"}, split_every=split_every, split_out=split_out, @@ -192,12 +185,12 @@ def aggregate(self, arg, split_every=None, split_out=1): if ( isinstance(self.obj, DaskDataFrame) - and isinstance(self.index, (str, list)) + and isinstance(self.by, (str, list)) and _is_supported(arg, SUPPORTED_AGGS) ): return groupby_agg( self.obj, - self.index, + self.by, arg, split_every=split_every, split_out=split_out, diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 1521ce41806..c7342818610 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -41,7 +41,7 @@ def test_series(data): sr = Series(pdsr) dsr = dgd.from_cudf(sr, npartitions=5) - np.testing.assert_equal(np.array(pdsr), dsr.compute().to_array()) + np.testing.assert_equal(np.array(pdsr), dsr.compute().values_host) @pytest.mark.parametrize("data", [data_dt_1()]) @@ -114,7 +114,7 @@ def test_categorical_basic(data): sr = Series(cat) dsr = dgd.from_cudf(sr, npartitions=2) result = dsr.compute() - np.testing.assert_array_equal(cat.codes, result.to_array()) + np.testing.assert_array_equal(cat.codes, result.cat.codes.values_host) assert dsr.dtype.to_pandas() == pdsr.dtype # Test attributes @@ -122,7 +122,9 @@ def test_categorical_basic(data): assert_eq(pdsr.cat.categories, dsr.cat.categories) - np.testing.assert_array_equal(pdsr.cat.codes.values, result.to_array()) + np.testing.assert_array_equal( + pdsr.cat.codes.values, result.cat.codes.values_host + ) string = str(result) expect_str = """ @@ -207,12 +209,12 @@ def test_categorical_compare_ordered(data): # Test equality out = dsr1 == dsr1 assert out.dtype == np.bool_ - assert np.all(out.compute().to_array()) + assert np.all(out.compute().values_host) assert np.all(pdsr1 == pdsr1) # Test inequality out = dsr1 != dsr1 - assert not np.any(out.compute().to_array()) + assert not np.any(out.compute().values_host) assert not np.any(pdsr1 != pdsr1) assert dsr1.cat.ordered @@ -220,10 +222,10 @@ def test_categorical_compare_ordered(data): # Test ordered operators np.testing.assert_array_equal( - pdsr1 < pdsr2, (dsr1 < dsr2).compute().to_array() + pdsr1 < pdsr2, (dsr1 < dsr2).compute().values_host ) np.testing.assert_array_equal( - pdsr1 > pdsr2, (dsr1 > dsr2).compute().to_array() + pdsr1 > pdsr2, (dsr1 > dsr2).compute().values_host ) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index ace9701b677..67fed62c582 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -284,7 +284,7 @@ def test_assign(): got = dgf.assign(z=newcol) dd.assert_eq(got.loc[:, ["x", "y"]], df) - np.testing.assert_array_equal(got["z"].compute().to_array(), pdcol) + 
np.testing.assert_array_equal(got["z"].compute().values_host, pdcol) @pytest.mark.parametrize("data_type", ["int8", "int16", "int32", "int64"])