diff --git a/build.sh b/build.sh index 765a1b5325f..5fb957c80a6 100755 --- a/build.sh +++ b/build.sh @@ -168,6 +168,10 @@ if hasArg clean; then rmdir ${bd} || true fi done + + # Cleaning up python artifacts + find ${REPODIR}/python/ | grep -E "(__pycache__|\.pyc|\.pyo|\.so$)" | xargs rm -rf + fi diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index a79ffa0fc47..aa57d02032b 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -239,9 +239,10 @@ fi # TEST - Run py.test, notebooks ################################################################################ -cd "$WORKSPACE/python/cudf" +cd "$WORKSPACE/python/cudf/cudf" +# It is essential to cd into $WORKSPACE/python/cudf/cudf as `pytest-xdist` + `coverage` seem to work only at this directory level. gpuci_logger "Python py.test for cuDF" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope cudf +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config="$WORKSPACE/python/cudf/.coveragerc" --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope tests cd "$WORKSPACE/python/dask_cudf" gpuci_logger "Python py.test for dask-cudf" diff --git a/codecov.yml b/codecov.yml index c0a3a2fba2b..4eaa5b066c4 100644 --- a/codecov.yml +++ b/codecov.yml @@ -2,4 +2,7 @@ coverage: status: project: off - patch: off + patch: on + +github_checks: + annotations: true diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 0145e2e4d01..24432272693 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -22,6 +22,9 @@ build: - CC - CXX - CUDAHOSTCXX + # libcudf's run_exports pinning is looser than we would like + ignore_run_exports: + - libcudf requirements: build: @@ -44,6 +47,7 @@ requirements: - numba >=0.54 - numpy - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} *cuda + - libcudf {{ version }} - fastavro >=0.22.0 - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec>=0.6.0 diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 4e20c979f6c..ebfc649c0d2 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -137,6 +137,7 @@ test: - test -f $PREFIX/include/cudf/io/orc_metadata.hpp - test -f $PREFIX/include/cudf/io/orc.hpp - test -f $PREFIX/include/cudf/io/parquet.hpp + - test -f $PREFIX/include/cudf/io/text/byte_range_info.hpp - test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp - test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp - test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp @@ -203,6 +204,7 @@ test: - test -f $PREFIX/include/cudf/strings/detail/fill.hpp - test -f $PREFIX/include/cudf/strings/detail/json.hpp - test -f $PREFIX/include/cudf/strings/detail/replace.hpp + - test -f $PREFIX/include/cudf/strings/detail/utf8.hpp - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp - test -f $PREFIX/include/cudf/strings/extract.hpp - test -f $PREFIX/include/cudf/strings/findall.hpp diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index 1eb5b13ddc3..d5864a7d68c 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -20,6 +20,9 @@ build: - 
PARALLEL_LEVEL - VERSION_SUFFIX - PROJECT_FLASH + # libcudf's run_exports pinning is looser than we would like + ignore_run_exports: + - libcudf requirements: build: @@ -27,6 +30,8 @@ requirements: host: - libcudf {{version}} - librdkafka >=1.7.0,<1.8.0a0 + run: + - libcudf {{version}} test: commands: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f4fef2a1af0..f5d1dc51217 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -310,6 +310,7 @@ add_library( src/io/parquet/writer_impl.cu src/io/statistics/orc_column_statistics.cu src/io/statistics/parquet_column_statistics.cu + src/io/text/byte_range_info.cpp src/io/text/multibyte_split.cu src/io/utilities/column_buffer.cpp src/io/utilities/config_utils.cpp @@ -361,6 +362,7 @@ add_library( src/quantiles/quantiles.cu src/reductions/all.cu src/reductions/any.cu + src/reductions/collect_ops.cu src/reductions/max.cu src/reductions/mean.cu src/reductions/min.cu @@ -372,6 +374,13 @@ add_library( src/reductions/scan/scan.cpp src/reductions/scan/scan_exclusive.cu src/reductions/scan/scan_inclusive.cu + src/reductions/segmented_all.cu + src/reductions/segmented_any.cu + src/reductions/segmented_max.cu + src/reductions/segmented_min.cu + src/reductions/segmented_product.cu + src/reductions/segmented_reductions.cpp + src/reductions/segmented_sum.cu src/reductions/std.cu src/reductions/sum.cu src/reductions/sum_of_squares.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 11eef015364..67c5ba0b229 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -14,7 +14,7 @@ find_package(Threads REQUIRED) -add_library(cudf_datagen STATIC common/generate_input.cpp) +add_library(cudf_datagen STATIC common/generate_input.cpp common/generate_nullmask.cu) target_compile_features(cudf_datagen PUBLIC cxx_std_17 cuda_std_17) target_compile_options( @@ -24,7 +24,7 @@ target_compile_options( target_link_libraries( cudf_datagen PUBLIC GTest::gmock GTest::gtest GTest::gmock_main GTest::gtest_main - benchmark::benchmark nvbench::nvbench Threads::Threads cudf + benchmark::benchmark nvbench::nvbench Threads::Threads cudf cudftestutil ) target_include_directories( @@ -175,9 +175,10 @@ ConfigureBench(TYPE_DISPATCHER_BENCH type_dispatcher/type_dispatcher.cu) # ################################################################################################## # * reduction benchmark --------------------------------------------------------------------------- ConfigureBench( - REDUCTION_BENCH reduction/anyall.cpp reduction/dictionary.cpp reduction/reduce.cpp - reduction/scan.cpp reduction/minmax.cpp + REDUCTION_BENCH reduction/anyall.cpp reduction/dictionary.cpp reduction/minmax.cpp + reduction/reduce.cpp reduction/scan.cpp ) +ConfigureNVBench(REDUCTION_NVBENCH reduction/segment_reduce.cu) # ################################################################################################## # * reduction benchmark --------------------------------------------------------------------------- @@ -276,7 +277,7 @@ ConfigureBench( # ################################################################################################## # * json benchmark ------------------------------------------------------------------- -ConfigureBench(JSON_BENCH string/json.cpp) +ConfigureBench(JSON_BENCH string/json.cu) # ################################################################################################## # * io benchmark --------------------------------------------------------------------- diff --git 
a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index c17c288a6d3..de0429f74ad 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,26 +14,18 @@ * limitations under the License. */ -#include -#include -#include +#include +#include +#include + #include #include -#include - -#include - -#include -#include -#include -#include #include #include #include -#include -#include +#include #include enum class TreeType { @@ -41,45 +33,23 @@ enum class TreeType { // child column reference }; +template class AST : public cudf::benchmark { }; template static void BM_ast_transform(benchmark::State& state) { - const cudf::size_type table_size{(cudf::size_type)state.range(0)}; - const cudf::size_type tree_levels = (cudf::size_type)state.range(1); + auto const table_size{static_cast(state.range(0))}; + auto const tree_levels{static_cast(state.range(1))}; // Create table data - auto n_cols = reuse_columns ? 1 : tree_levels + 1; - auto column_wrappers = std::vector>(n_cols); - auto columns = std::vector(n_cols); - - auto data_iterator = thrust::make_counting_iterator(0); - - if constexpr (Nullable) { - auto validities = std::vector(table_size); - std::random_device rd; - std::mt19937 gen(rd()); - - std::generate( - validities.begin(), validities.end(), [&]() { return gen() > (0.5 * gen.max()); }); - std::generate_n(column_wrappers.begin(), n_cols, [=]() { - return cudf::test::fixed_width_column_wrapper( - data_iterator, data_iterator + table_size, validities.begin()); - }); - } else { - std::generate_n(column_wrappers.begin(), n_cols, [=]() { - return cudf::test::fixed_width_column_wrapper(data_iterator, - data_iterator + table_size); - }); - } - std::transform( - column_wrappers.begin(), column_wrappers.end(), columns.begin(), [](auto const& col) { - return static_cast(col); - }); - - cudf::table_view table{columns}; + auto const n_cols = reuse_columns ? 1 : tree_levels + 1; + auto const source_table = + create_sequence_table(cycle_dtypes({cudf::type_to_id()}, n_cols), + row_count{table_size}, + Nullable ? 0.5 : -1.0); + auto table = source_table->view(); // Create column references auto column_refs = std::vector(); @@ -138,10 +108,15 @@ static void CustomRanges(benchmark::internal::Benchmark* b) } } -#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ - TEMPLATED_BENCHMARK_F(AST, BM_ast_transform, key_type, tree_type, reuse_columns, nullable) \ - ->Apply(CustomRanges) \ - ->Unit(benchmark::kMillisecond) \ +#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \ + (::benchmark::State & st) \ + { \ + BM_ast_transform(st); \ + } \ + BENCHMARK_REGISTER_F(AST, name) \ + ->Apply(CustomRanges) \ + ->Unit(benchmark::kMillisecond) \ ->UseManualTime(); AST_TRANSFORM_BENCHMARK_DEFINE( diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp index 314d657679b..e5bde94f1f9 100644 --- a/cpp/benchmarks/binaryop/binaryop.cpp +++ b/cpp/benchmarks/binaryop/binaryop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,23 +14,15 @@ * limitations under the License. */ +#include +#include +#include + #include -#include -#include #include #include -#include - -#include - -#include -#include -#include - -#include #include -#include #include // This set of benchmarks is designed to be a comparison for the AST benchmarks @@ -47,40 +39,29 @@ class BINARYOP : public cudf::benchmark { template static void BM_binaryop_transform(benchmark::State& state) { - const cudf::size_type table_size{(cudf::size_type)state.range(0)}; - const cudf::size_type tree_levels = (cudf::size_type)state.range(1); + auto const table_size{static_cast(state.range(0))}; + auto const tree_levels{static_cast(state.range(1))}; // Create table data - auto n_cols = reuse_columns ? 1 : tree_levels + 1; - auto column_wrappers = std::vector>(); - auto columns = std::vector(n_cols); - - auto data_iterator = thrust::make_counting_iterator(0); - std::generate_n(std::back_inserter(column_wrappers), n_cols, [=]() { - return cudf::test::fixed_width_column_wrapper(data_iterator, - data_iterator + table_size); - }); - std::transform( - column_wrappers.begin(), column_wrappers.end(), columns.begin(), [](auto const& col) { - return static_cast(col); - }); - - cudf::table_view table{columns}; + auto const n_cols = reuse_columns ? 1 : tree_levels + 1; + auto const source_table = create_sequence_table( + cycle_dtypes({cudf::type_to_id()}, n_cols), row_count{table_size}); + cudf::table_view table{*source_table}; // Execute benchmark for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 // Execute tree that chains additions like (((a + b) + c) + d) - auto const op = cudf::binary_operator::ADD; - auto result_data_type = cudf::data_type(cudf::type_to_id()); + auto const op = cudf::binary_operator::ADD; + auto const result_data_type = cudf::data_type(cudf::type_to_id()); if (reuse_columns) { - auto result = cudf::binary_operation(columns.at(0), columns.at(0), op, result_data_type); + auto result = cudf::binary_operation(table.column(0), table.column(0), op, result_data_type); for (cudf::size_type i = 0; i < tree_levels - 1; i++) { - result = cudf::binary_operation(result->view(), columns.at(0), op, result_data_type); + result = cudf::binary_operation(result->view(), table.column(0), op, result_data_type); } } else { - auto result = cudf::binary_operation(columns.at(0), columns.at(1), op, result_data_type); - std::for_each(std::next(columns.cbegin(), 2), columns.cend(), [&](auto const& col) { + auto result = cudf::binary_operation(table.column(0), table.column(1), op, result_data_type); + std::for_each(std::next(table.begin(), 2), table.end(), [&](auto const& col) { result = cudf::binary_operation(result->view(), col, op, result_data_type); }); } diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index f8226c7387a..4c3bf360256 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -14,30 +14,26 @@ * limitations under the License. 
*/ -#include -#include -#include - -#include +#include +#include +#include #include -#include - class COMPILED_BINARYOP : public cudf::benchmark { }; template void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + auto const column_size{static_cast(state.range(0))}; + + auto const source_table = create_random_table( + {cudf::type_to_id(), cudf::type_to_id()}, row_count{column_size}); - auto data_it = thrust::make_counting_iterator(0); - cudf::test::fixed_width_column_wrapper input1(data_it, data_it + column_size); - cudf::test::fixed_width_column_wrapper input2(data_it, data_it + column_size); + auto lhs = cudf::column_view(source_table->get_column(0)); + auto rhs = cudf::column_view(source_table->get_column(1)); - auto lhs = cudf::column_view(input1); - auto rhs = cudf::column_view(input2); auto output_dtype = cudf::data_type(cudf::type_to_id()); // Call once for hot cache. diff --git a/cpp/benchmarks/common/generate_input.cpp b/cpp/benchmarks/common/generate_input.cpp index 68eabd3f1cc..6330beda54c 100644 --- a/cpp/benchmarks/common/generate_input.cpp +++ b/cpp/benchmarks/common/generate_input.cpp @@ -19,6 +19,8 @@ #include #include +#include +#include #include #include @@ -120,7 +122,7 @@ struct random_value_fn; * @brief Creates an random timestamp/duration value */ template -struct random_value_fn()>> { +struct random_value_fn()>> { std::function seconds_gen; std::function nanoseconds_gen; @@ -162,7 +164,7 @@ struct random_value_fn()>> { * @brief Creates an random fixed_point value. Not implemented yet. */ template -struct random_value_fn()>> { +struct random_value_fn()>> { using rep = typename T::rep; rep const lower_bound; rep const upper_bound; @@ -192,9 +194,7 @@ struct random_value_fn()>> * @brief Creates an random numeric value with the given distribution. */ template -struct random_value_fn< - T, - typename std::enable_if_t && cudf::is_numeric()>> { +struct random_value_fn && cudf::is_numeric()>> { T const lower_bound; T const upper_bound; distribution_fn dist; @@ -217,7 +217,7 @@ struct random_value_fn< * @brief Creates an boolean value with given probability of returning `true`. */ template -struct random_value_fn>> { +struct random_value_fn>> { std::bernoulli_distribution b_dist; random_value_fn(distribution_params const& desc) : b_dist{desc.probability_true} {} @@ -258,7 +258,7 @@ struct stored_as { // Use `int8_t` for bools because that's how they're stored in columns template -struct stored_as>> { +struct stored_as>> { using type = int8_t; }; @@ -571,11 +571,11 @@ columns_vector create_random_columns(data_profile const& profile, } /** - * @brief Repeats the input data types in round-robin order to fill a vector of @ref num_cols + * @brief Repeats the input data types cyclically order to fill a vector of @ref num_cols * elements. 
*/ -std::vector repeat_dtypes(std::vector const& dtype_ids, - cudf::size_type num_cols) +std::vector cycle_dtypes(std::vector const& dtype_ids, + cudf::size_type num_cols) { if (dtype_ids.size() == static_cast(num_cols)) { return dtype_ids; } std::vector out_dtypes; @@ -586,29 +586,26 @@ std::vector repeat_dtypes(std::vector const& dtype } std::unique_ptr create_random_table(std::vector const& dtype_ids, - cudf::size_type num_cols, table_size_bytes table_bytes, data_profile const& profile, unsigned seed) { - auto const out_dtype_ids = repeat_dtypes(dtype_ids, num_cols); size_t const avg_row_bytes = - std::accumulate(out_dtype_ids.begin(), out_dtype_ids.end(), 0ul, [&](size_t sum, auto tid) { + std::accumulate(dtype_ids.begin(), dtype_ids.end(), 0ul, [&](size_t sum, auto tid) { return sum + avg_element_size(profile, cudf::data_type(tid)); }); cudf::size_type const num_rows = table_bytes.size / avg_row_bytes; - return create_random_table(out_dtype_ids, num_cols, row_count{num_rows}, profile, seed); + return create_random_table(dtype_ids, row_count{num_rows}, profile, seed); } std::unique_ptr create_random_table(std::vector const& dtype_ids, - cudf::size_type num_cols, row_count num_rows, data_profile const& profile, unsigned seed) { - auto const out_dtype_ids = repeat_dtypes(dtype_ids, num_cols); - auto seed_engine = deterministic_engine(seed); + cudf::size_type const num_cols = dtype_ids.size(); + auto seed_engine = deterministic_engine(seed); auto const processor_count = std::thread::hardware_concurrency(); cudf::size_type const cols_per_thread = (num_cols + processor_count - 1) / processor_count; @@ -619,8 +616,8 @@ std::unique_ptr create_random_table(std::vector cons for (unsigned int i = 0; i < processor_count && next_col < num_cols; ++i) { auto thread_engine = deterministic_engine(seed_dist(seed_engine)); auto const thread_num_cols = std::min(num_cols - next_col, cols_per_thread); - std::vector thread_types(out_dtype_ids.begin() + next_col, - out_dtype_ids.begin() + next_col + thread_num_cols); + std::vector thread_types(dtype_ids.begin() + next_col, + dtype_ids.begin() + next_col + thread_num_cols); col_futures.emplace_back(std::async(std::launch::async, create_random_columns, std::cref(profile), @@ -642,6 +639,22 @@ std::unique_ptr create_random_table(std::vector cons return std::make_unique(std::move(output_columns)); } +std::unique_ptr create_sequence_table(std::vector const& dtype_ids, + row_count num_rows, + float null_probability, + unsigned seed) +{ + auto columns = std::vector>(dtype_ids.size()); + std::transform(dtype_ids.begin(), dtype_ids.end(), columns.begin(), [&](auto dtype) mutable { + auto init = cudf::make_default_constructed_scalar(cudf::data_type{dtype}); + auto col = cudf::sequence(num_rows.count, *init); + auto [mask, count] = create_random_null_mask(num_rows.count, null_probability, seed++); + col->set_null_mask(std::move(mask), count); + return col; + }); + return std::make_unique(std::move(columns)); +} + std::vector get_type_or_group(int32_t id) { // identity transformation when passing a concrete type_id diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index 1999ccb8ec3..5246de00a73 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -19,6 +19,7 @@ #include #include +#include #include /** @@ -113,7 +114,8 @@ std::pair default_range() template ()>* = nullptr> std::pair default_range() { - return {std::numeric_limits::lowest(), std::numeric_limits::max()}; + // 
Limits need to be such that `upper - lower` does not overflow + return {std::numeric_limits::lowest() / 2, std::numeric_limits::max() / 2}; } } // namespace @@ -127,9 +129,7 @@ struct distribution_params; * @brief Numeric values are parameterized with a distribution type and bounds of the same type. */ template -struct distribution_params< - T, - typename std::enable_if_t && cudf::is_numeric()>> { +struct distribution_params && cudf::is_numeric()>> { distribution_id id; T lower_bound; T upper_bound; @@ -139,7 +139,7 @@ struct distribution_params< * @brief Booleans are parameterized with the probability of getting `true` value. */ template -struct distribution_params>> { +struct distribution_params>> { double probability_true; }; @@ -147,7 +147,7 @@ struct distribution_params> * @brief Timestamps and durations are parameterized with a distribution type and int64_t bounds. */ template -struct distribution_params()>> { +struct distribution_params()>> { distribution_id id; int64_t lower_bound; int64_t upper_bound; @@ -157,7 +157,7 @@ struct distribution_params()>> { * @brief Strings are parameterized by the distribution of their length, as an integral value. */ template -struct distribution_params>> { +struct distribution_params>> { distribution_params length_params; }; @@ -166,7 +166,7 @@ struct distribution_params -struct distribution_params>> { +struct distribution_params>> { cudf::type_id element_type; distribution_params length_params; cudf::size_type max_depth; @@ -174,7 +174,7 @@ struct distribution_params -struct distribution_params()>> { +struct distribution_params()>> { }; /** @@ -223,9 +223,8 @@ class data_profile { cudf::size_type avg_run_length = 4; public: - template < - typename T, - typename std::enable_if_t && std::is_integral_v, T>* = nullptr> + template && cuda::std::is_integral_v, T>* = nullptr> distribution_params get_distribution_params() const { auto it = int_params.find(cudf::type_to_id()); @@ -238,7 +237,7 @@ class data_profile { } } - template , T>* = nullptr> + template , T>* = nullptr> distribution_params get_distribution_params() const { auto it = float_params.find(cudf::type_to_id()); @@ -257,7 +256,7 @@ class data_profile { return distribution_params{bool_probability}; } - template ()>* = nullptr> + template ()>* = nullptr> distribution_params get_distribution_params() const { auto it = int_params.find(cudf::type_to_id()); @@ -283,7 +282,7 @@ class data_profile { return list_dist_desc; } - template ()>* = nullptr> + template ()>* = nullptr> distribution_params get_distribution_params() const { using rep = typename T::rep; @@ -306,7 +305,7 @@ class data_profile { // discrete distributions (integers, strings, lists). Otherwise the call with have no effect. template , T>* = nullptr> + std::enable_if_t, T>* = nullptr> void set_distribution_params(Type_enum type_or_group, distribution_id dist, T lower_bound, @@ -330,7 +329,7 @@ class data_profile { // have continuous distributions (floating point types). Otherwise the call with have no effect. template , T>* = nullptr> + std::enable_if_t, T>* = nullptr> void set_distribution_params(Type_enum type_or_group, distribution_id dist, T lower_bound, @@ -369,18 +368,13 @@ struct row_count { /** * @brief Deterministically generates a table filled with data with the given parameters. * - * If the number of passed types is smaller than the number of requested column, the columns types - * with be repeated in round-robin order to fill the table. 
- * * @param dtype_ids Vector of requested column types - * @param num_cols Number of columns in the output table * @param table_bytes Target size of the output table, in bytes. Some type may not produce columns * of exact size * @param data_params optional, set of data parameters describing the data profile for each type * @param seed optional, seed for the pseudo-random engine */ std::unique_ptr create_random_table(std::vector const& dtype_ids, - cudf::size_type num_cols, table_size_bytes table_bytes, data_profile const& data_params = data_profile{}, unsigned seed = 1); @@ -388,17 +382,51 @@ std::unique_ptr create_random_table(std::vector cons /** * @brief Deterministically generates a table filled with data with the given parameters. * - * If the number of passed types is smaller than the number of requested column, the columns types - * with be repeated in round-robin order to fill the table. - * * @param dtype_ids Vector of requested column types - * @param num_cols Number of columns in the output table * @param num_rows Number of rows in the output table * @param data_params optional, set of data parameters describing the data profile for each type * @param seed optional, seed for the pseudo-random engine */ std::unique_ptr create_random_table(std::vector const& dtype_ids, - cudf::size_type num_cols, row_count num_rows, data_profile const& data_params = data_profile{}, unsigned seed = 1); + +/** + * @brief Generate sequence columns starting with value 0 in first row and increasing by 1 in + * subsequent rows. + * + * @param dtype_ids Vector of requested column types + * @param num_rows Number of rows in the output table + * @param null_probability optional, probability of a null value + * <0 implies no null mask, =0 implies all valids, >=1 implies all nulls + * @param seed optional, seed for the pseudo-random engine + * @return A table with the sequence columns. + */ +std::unique_ptr create_sequence_table(std::vector const& dtype_ids, + row_count num_rows, + float null_probability = -1.0, + unsigned seed = 1); + +/** + * @brief Repeats the input data types cyclically to fill a vector of @ref num_cols + * elements. + * + * @param dtype_ids Vector of requested column types + * @param num_cols Number of types in the output vector + * @return A vector of type_ids + */ +std::vector cycle_dtypes(std::vector const& dtype_ids, + cudf::size_type num_cols); +/** + * @brief Create a random null mask object + * + * @param size number of rows + * @param null_probability probability of a null value + * <0 implies no null mask, =0 implies all valids, >=1 implies all nulls + * @param seed optional, seed for the pseudo-random engine + * @return null mask device buffer with random null mask data and null count + */ +std::pair create_random_null_mask(cudf::size_type size, + float null_probability, + unsigned seed = 1); diff --git a/cpp/benchmarks/common/generate_nullmask.cu b/cpp/benchmarks/common/generate_nullmask.cu new file mode 100644 index 00000000000..502af95a971 --- /dev/null +++ b/cpp/benchmarks/common/generate_nullmask.cu @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "generate_input.hpp" + +#include +#include + +#include + +/** + * @brief bool generator with given probability [0.0 - 1.0] of returning true. + * + */ +struct bool_generator { + thrust::minstd_rand engine; + thrust::uniform_real_distribution dist; + float probability_true; + bool_generator(unsigned seed, float probability_true) + : engine(seed), dist{0, 1}, probability_true{probability_true} + { + } + + __device__ bool operator()(size_t n) + { + engine.discard(n); + return dist(engine) < probability_true; + } +}; + +std::pair create_random_null_mask(cudf::size_type size, + float null_probability, + unsigned seed) +{ + if (null_probability < 0.0f) { + return {rmm::device_buffer{}, 0}; + } else if (null_probability == 0.0f) { + // null_probability of zero means all rows are valid + return {cudf::create_null_mask(size, cudf::mask_state::ALL_VALID), 0}; + } else if (null_probability >= 1.0f) { + // null_probability of one (or more) means all rows are null + return {cudf::create_null_mask(size, cudf::mask_state::ALL_NULL), size}; + } else { + return cudf::detail::valid_if(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(size), + bool_generator{seed, 1.0f - null_probability}); + } +}; diff --git a/cpp/benchmarks/common/random_distribution_factory.hpp b/cpp/benchmarks/common/random_distribution_factory.hpp index 3289c6f40ab..f2f3833f15d 100644 --- a/cpp/benchmarks/common/random_distribution_factory.hpp +++ b/cpp/benchmarks/common/random_distribution_factory.hpp @@ -24,7 +24,7 @@ /** * @brief Generates a normal(binomial) distribution between zero and upper_bound. */ -template , T>* = nullptr> +template , T>* = nullptr> auto make_normal_dist(T upper_bound) { using uT = typename std::make_unsigned::type; @@ -42,7 +42,7 @@ auto make_normal_dist(T upper_bound) return std::normal_distribution(mean, stddev); } -template , T>* = nullptr> +template , T>* = nullptr> auto make_uniform_dist(T range_start, T range_end) { return std::uniform_int_distribution(range_start, range_end); @@ -62,7 +62,7 @@ double geometric_dist_p(T range_size) return p ? p : std::numeric_limits::epsilon(); } -template , T>* = nullptr> +template , T>* = nullptr> auto make_geometric_dist(T range_start, T range_end) { using uT = typename std::make_unsigned::type; @@ -82,7 +82,7 @@ auto make_geometric_dist(T range_start, T range_end) template using distribution_fn = std::function; -template , T>* = nullptr> +template , T>* = nullptr> distribution_fn make_distribution(distribution_id did, T lower_bound, T upper_bound) { switch (did) { diff --git a/cpp/benchmarks/copying/copy_if_else.cpp b/cpp/benchmarks/copying/copy_if_else.cpp index 6f3ba34e373..6f094aba680 100644 --- a/cpp/benchmarks/copying/copy_if_else.cpp +++ b/cpp/benchmarks/copying/copy_if_else.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -32,7 +32,7 @@ static void BM_copy_if_else(benchmark::State& state, bool nulls) cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto input_type = cudf::type_to_id(); auto bool_type = cudf::type_id::BOOL8; - auto const input = create_random_table({input_type, input_type, bool_type}, 3, row_count{n_rows}); + auto const input = create_random_table({input_type, input_type, bool_type}, row_count{n_rows}); if (!nulls) { input->get_column(2).set_null_mask(rmm::device_buffer{}, 0); diff --git a/cpp/benchmarks/copying/scatter.cu b/cpp/benchmarks/copying/scatter.cu index a9ab376c8c3..977937beaa2 100644 --- a/cpp/benchmarks/copying/scatter.cu +++ b/cpp/benchmarks/copying/scatter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,24 +14,15 @@ * limitations under the License. */ -#include +#include +#include +#include #include - -#include -#include -// #include -#include -#include -#include - #include -#include -#include - -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" +#include +#include class Scatter : public cudf::benchmark { }; @@ -39,53 +30,33 @@ class Scatter : public cudf::benchmark { template void BM_scatter(benchmark::State& state) { - const cudf::size_type source_size{(cudf::size_type)state.range(0)}; - const auto n_cols = (cudf::size_type)state.range(1); - - // Every element is valid - auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + auto const source_size{static_cast(state.range(0))}; + auto const n_cols{static_cast(state.range(1))}; // Gather indices - std::vector host_map_data(source_size); - std::iota(host_map_data.begin(), host_map_data.end(), 0); + auto scatter_map_table = + create_sequence_table({cudf::type_to_id()}, row_count{source_size}); + auto scatter_map = scatter_map_table->get_column(0).mutable_view(); if (coalesce) { - std::reverse(host_map_data.begin(), host_map_data.end()); + thrust::reverse( + thrust::device, scatter_map.begin(), scatter_map.end()); } else { - std::random_shuffle(host_map_data.begin(), host_map_data.end()); + thrust::shuffle(thrust::device, + scatter_map.begin(), + scatter_map.end(), + thrust::default_random_engine()); } - cudf::test::fixed_width_column_wrapper scatter_map(host_map_data.begin(), - host_map_data.end()); - - std::vector> source_column_wrappers; - std::vector source_columns(n_cols); - - std::vector> target_column_wrappers; - std::vector target_columns(n_cols); - - std::generate_n(std::back_inserter(source_column_wrappers), n_cols, [=]() { - return cudf::test::fixed_width_column_wrapper(data, data + source_size); - }); - std::transform(source_column_wrappers.begin(), - source_column_wrappers.end(), - source_columns.begin(), - [](auto const& col) { return static_cast(col); }); - - std::generate_n(std::back_inserter(target_column_wrappers), n_cols, [=]() { - return cudf::test::fixed_width_column_wrapper(data, data + source_size); - }); - std::transform(target_column_wrappers.begin(), - target_column_wrappers.end(), - target_columns.begin(), - [](auto const& col) { return static_cast(col); }); - - cudf::table_view source_table{source_columns}; - cudf::table_view target_table{target_columns}; + // Every element is valid + auto source_table = create_sequence_table(cycle_dtypes({cudf::type_to_id()}, n_cols), + row_count{source_size}); + 
auto target_table = create_sequence_table(cycle_dtypes({cudf::type_to_id()}, n_cols), + row_count{source_size}); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::scatter(source_table, scatter_map, target_table); + cudf::scatter(*source_table, scatter_map, *target_table); } state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * n_cols * 2 * diff --git a/cpp/benchmarks/groupby/group_struct.cu b/cpp/benchmarks/groupby/group_struct.cu index 355c7cbab6c..34f2d1adc75 100644 --- a/cpp/benchmarks/groupby/group_struct.cu +++ b/cpp/benchmarks/groupby/group_struct.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,18 +41,11 @@ static auto create_data_table(cudf::size_type n_rows) // The first two struct members are int32 and string. // The first column is also used as keys in groupby. - auto col_ids = std::vector{cudf::type_id::INT32, cudf::type_id::STRING}; - // The subsequent struct members are int32 and string again. - for (cudf::size_type i = 3; i <= num_struct_members; ++i) { - if (i % 2) { - col_ids.push_back(cudf::type_id::INT32); - } else { - col_ids.push_back(cudf::type_id::STRING); - } - } - - return create_random_table(col_ids, num_struct_members, row_count{n_rows}, table_profile); + return create_random_table( + cycle_dtypes({cudf::type_id::INT32, cudf::type_id::STRING}, num_struct_members), + row_count{n_rows}, + table_profile); } // Max aggregation/scan technically has the same performance as min. diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index e2ad38230a2..fe22795bb6b 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,7 +30,7 @@ enum contains_nulls { no_nulls, nulls }; static void BM_hash(benchmark::State& state, cudf::hash_id hid, contains_nulls has_nulls) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const data = create_random_table({cudf::type_id::INT64}, 1, row_count{n_rows}); + auto const data = create_random_table({cudf::type_id::INT64}, row_count{n_rows}); if (has_nulls == contains_nulls::no_nulls) data->get_column(0).set_null_mask(rmm::device_buffer{}, 0); diff --git a/cpp/benchmarks/io/csv/csv_reader.cpp b/cpp/benchmarks/io/csv/csv_reader.cpp index 241ba4d5954..c50f5220200 100644 --- a/cpp/benchmarks/io/csv/csv_reader.cpp +++ b/cpp/benchmarks/io/csv/csv_reader.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,7 +38,8 @@ void BM_csv_read_varying_input(benchmark::State& state) auto const data_types = get_type_or_group(state.range(0)); auto const source_type = static_cast(state.range(1)); - auto const tbl = create_random_table(data_types, num_cols, table_size_bytes{data_size}); + auto const tbl = + create_random_table(cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}); auto const view = tbl->view(); cuio_source_sink_pair source_sink(source_type); @@ -75,7 +76,7 @@ void BM_csv_read_varying_options(benchmark::State& state) col_sel); auto const cols_to_read = select_column_indexes(data_types.size(), col_sel); - auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); + auto const tbl = create_random_table(data_types, table_size_bytes{data_size}); auto const view = tbl->view(); cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); diff --git a/cpp/benchmarks/io/csv/csv_writer.cpp b/cpp/benchmarks/io/csv/csv_writer.cpp index 413a269bcb2..65aa31c68dc 100644 --- a/cpp/benchmarks/io/csv/csv_writer.cpp +++ b/cpp/benchmarks/io/csv/csv_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,7 +38,8 @@ void BM_csv_write_varying_inout(benchmark::State& state) auto const data_types = get_type_or_group(state.range(0)); auto const sink_type = static_cast(state.range(1)); - auto const tbl = create_random_table(data_types, num_cols, table_size_bytes{data_size}); + auto const tbl = + create_random_table(cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}); auto const view = tbl->view(); cuio_source_sink_pair source_sink(sink_type); @@ -66,7 +67,7 @@ void BM_csv_write_varying_options(benchmark::State& state) int32_t(type_group_id::TIMESTAMP), int32_t(cudf::type_id::STRING)}); - auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); + auto const tbl = create_random_table(data_types, table_size_bytes{data_size}); auto const view = tbl->view(); std::string const na_per(na_per_len, '#'); diff --git a/cpp/benchmarks/io/orc/orc_reader.cpp b/cpp/benchmarks/io/orc/orc_reader.cpp index e15513275ee..29d4860a0e5 100644 --- a/cpp/benchmarks/io/orc/orc_reader.cpp +++ b/cpp/benchmarks/io/orc/orc_reader.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -45,8 +45,8 @@ void BM_orc_read_varying_input(benchmark::State& state) data_profile table_data_profile; table_data_profile.set_cardinality(cardinality); table_data_profile.set_avg_run_length(run_length); - auto const tbl = - create_random_table(data_types, num_cols, table_size_bytes{data_size}, table_data_profile); + auto const tbl = create_random_table( + cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}, table_data_profile); auto const view = tbl->view(); cuio_source_sink_pair source_sink(source_type); @@ -96,7 +96,7 @@ void BM_orc_read_varying_options(benchmark::State& state) int32_t(type_group_id::TIMESTAMP), int32_t(cudf::type_id::STRING)}), col_sel); - auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); + auto const tbl = create_random_table(data_types, table_size_bytes{data_size}); auto const view = tbl->view(); cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index 50ae76e867c..e24ca7f749d 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -46,8 +46,8 @@ void BM_orc_write_varying_inout(benchmark::State& state) data_profile table_data_profile; table_data_profile.set_cardinality(cardinality); table_data_profile.set_avg_run_length(run_length); - auto const tbl = - create_random_table(data_types, num_cols, table_size_bytes{data_size}, table_data_profile); + auto const tbl = create_random_table( + cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}, table_data_profile); auto const view = tbl->view(); cuio_source_sink_pair source_sink(sink_type); @@ -83,7 +83,7 @@ void BM_orc_write_varying_options(benchmark::State& state) int32_t(cudf::type_id::STRING), int32_t(cudf::type_id::LIST)}); - auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); + auto const tbl = create_random_table(data_types, table_size_bytes{data_size}); auto const view = tbl->view(); cuio_source_sink_pair source_sink(io_type::FILEPATH); diff --git a/cpp/benchmarks/io/parquet/parquet_reader.cpp b/cpp/benchmarks/io/parquet/parquet_reader.cpp index 09194931498..74613e50158 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -45,8 +45,8 @@ void BM_parq_read_varying_input(benchmark::State& state) data_profile table_data_profile; table_data_profile.set_cardinality(cardinality); table_data_profile.set_avg_run_length(run_length); - auto const tbl = - create_random_table(data_types, num_cols, table_size_bytes{data_size}, table_data_profile); + auto const tbl = create_random_table( + cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}, table_data_profile); auto const view = tbl->view(); cuio_source_sink_pair source_sink(source_type); @@ -96,7 +96,7 @@ void BM_parq_read_varying_options(benchmark::State& state) static_cast(type_group_id::TIMESTAMP), static_cast(cudf::type_id::STRING)}), col_sel); - auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); + auto const tbl = create_random_table(data_types, table_size_bytes{data_size}); auto const view = tbl->view(); cuio_source_sink_pair source_sink(io_type::HOST_BUFFER); diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 8287c27f804..d203f0d27c8 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,8 +45,8 @@ void BM_parq_write_varying_inout(benchmark::State& state) data_profile table_data_profile; table_data_profile.set_cardinality(cardinality); table_data_profile.set_avg_run_length(run_length); - auto const tbl = - create_random_table(data_types, num_cols, table_size_bytes{data_size}, table_data_profile); + auto const tbl = create_random_table( + cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}, table_data_profile); auto const view = tbl->view(); cuio_source_sink_pair source_sink(sink_type); @@ -77,7 +77,7 @@ void BM_parq_write_varying_options(benchmark::State& state) int32_t(cudf::type_id::STRING), int32_t(cudf::type_id::LIST)}); - auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size}); + auto const tbl = create_random_table(data_types, table_size_bytes{data_size}); auto const view = tbl->view(); cuio_source_sink_pair source_sink(io_type::FILEPATH); diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp index 98eaba213e5..30ed245ed9a 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -45,7 +45,8 @@ void PQ_write(benchmark::State& state) { cudf::size_type num_cols = state.range(0); - auto tbl = create_random_table({cudf::type_id::INT32}, num_cols, table_size_bytes{data_size}); + auto tbl = create_random_table(cycle_dtypes({cudf::type_id::INT32}, num_cols), + table_size_bytes{data_size}); cudf::table_view view = tbl->view(); auto mem_stats_logger = cudf::memory_stats_logger(); @@ -69,8 +70,8 @@ void PQ_write_chunked(benchmark::State& state) std::vector> tables; for (cudf::size_type idx = 0; idx < num_tables; idx++) { - tables.push_back(create_random_table( - {cudf::type_id::INT32}, num_cols, table_size_bytes{size_t(data_size / num_tables)})); + tables.push_back(create_random_table(cycle_dtypes({cudf::type_id::INT32}, num_cols), + table_size_bytes{size_t(data_size / num_tables)})); } auto mem_stats_logger = cudf::memory_stats_logger(); diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index b13835c15bb..8c4b10d928d 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -70,7 +70,6 @@ static cudf::string_scalar create_random_input(int32_t num_chars, auto const values_table = create_random_table( // {cudf::type_id::STRING}, - 1, row_count{num_rows}, table_profile); diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index f2b9cb1bdb9..c1957db7929 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -147,8 +147,8 @@ static void BM_join(state_type& state, Join JoinFunc) // Benchmark conditional join if constexpr (std::is_same_v and is_conditional) { // Common column references. - const auto col_ref_left_0 = cudf::ast::column_reference(0); - const auto col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto const col_ref_left_0 = cudf::ast::column_reference(0); + auto const col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); auto left_zero_eq_right_zero = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp index 05c15a4fcb5..7a0d3f9515f 100644 --- a/cpp/benchmarks/reduction/scan.cpp +++ b/cpp/benchmarks/reduction/scan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const dtype = cudf::type_to_id(); - auto const table = create_random_table({dtype}, 1, row_count{n_rows}); + auto const table = create_random_table({dtype}, row_count{n_rows}); if (!include_nulls) table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); cudf::column_view input(table->view().column(0)); diff --git a/cpp/benchmarks/reduction/segment_reduce.cu b/cpp/benchmarks/reduction/segment_reduce.cu new file mode 100644 index 00000000000..47e47943d36 --- /dev/null +++ b/cpp/benchmarks/reduction/segment_reduce.cu @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include + +namespace cudf { + +bool constexpr is_boolean_output_agg(segmented_reduce_aggregation::Kind kind) +{ + return kind == segmented_reduce_aggregation::ALL || kind == segmented_reduce_aggregation::ANY; +} + +template +std::unique_ptr make_simple_aggregation() +{ + switch (kind) { + case segmented_reduce_aggregation::SUM: + return make_sum_aggregation(); + case segmented_reduce_aggregation::PRODUCT: + return make_product_aggregation(); + case segmented_reduce_aggregation::MIN: + return make_min_aggregation(); + case segmented_reduce_aggregation::MAX: + return make_max_aggregation(); + case segmented_reduce_aggregation::ALL: + return make_all_aggregation(); + case segmented_reduce_aggregation::ANY: + return make_any_aggregation(); + default: CUDF_FAIL("Unsupported simple segmented aggregation"); + } +} + +template +std::pair, thrust::device_vector> make_test_data( + nvbench::state& state) +{ + auto const column_size{size_type(state.get_int64("column_size"))}; + auto const num_segments{size_type(state.get_int64("num_segments"))}; + + auto segment_length = column_size / num_segments; + + test::UniformRandomGenerator rand_gen(0, 100); + auto data_it = detail::make_counting_transform_iterator( + 0, [&rand_gen](auto i) { return rand_gen.generate(); }); + + auto offset_it = + detail::make_counting_transform_iterator(0, [&column_size, &segment_length](auto i) { + return column_size < i * segment_length ? column_size : i * segment_length; + }); + + test::fixed_width_column_wrapper input(data_it, data_it + column_size); + std::vector h_offsets(offset_it, offset_it + num_segments + 1); + thrust::device_vector d_offsets(h_offsets); + + return std::make_pair(input.release(), d_offsets); +} + +template +std::enable_if_t, void> +BM_Simple_Segmented_Reduction(nvbench::state& state, + nvbench::type_list>) +{ + // TODO: to be replaced by nvbench fixture once it's ready + cudf::rmm_pool_raii rmm_pool; + + auto const column_size{size_type(state.get_int64("column_size"))}; + auto [input, offsets] = make_test_data(state); + auto agg = make_simple_aggregation(); + + state.add_element_count(column_size); + state.add_global_memory_reads(column_size); + state.add_global_memory_writes(column_size); + + state.exec( + nvbench::exec_tag::sync, + [input_view = input->view(), offset_span = device_span{offsets}, &agg]( + nvbench::launch& launch) { + segmented_reduce( + input_view, offset_span, *agg, data_type{type_to_id()}, null_policy::INCLUDE); + }); +} + +template +std::enable_if_t, void> +BM_Simple_Segmented_Reduction(nvbench::state& state, + nvbench::type_list>) +{ + state.skip("Invalid combination of dtype and aggregation type."); +} + +using Types = nvbench::type_list; +// Skip benchmarking MAX/ANY since they are covered by MIN/ALL respectively. 
+using AggKinds = nvbench:: + enum_type_list; + +NVBENCH_BENCH_TYPES(BM_Simple_Segmented_Reduction, NVBENCH_TYPE_AXES(Types, Types, AggKinds)) + .set_name("segmented_reduction_simple") + .set_type_axes_names({"InputType", "OutputType", "AggregationKinds"}) + .add_int64_axis("column_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) + .add_int64_axis("num_segments", {1'000, 10'000, 100'000}); + +} // namespace cudf diff --git a/cpp/benchmarks/replace/clamp.cpp b/cpp/benchmarks/replace/clamp.cpp index dd8b06227bc..d3a7415a478 100644 --- a/cpp/benchmarks/replace/clamp.cpp +++ b/cpp/benchmarks/replace/clamp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ static void BM_clamp(benchmark::State& state, bool include_nulls) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const dtype = cudf::type_to_id(); - auto const table = create_random_table({dtype}, 1, row_count{n_rows}); + auto const table = create_random_table({dtype}, row_count{n_rows}); if (!include_nulls) { table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); } cudf::column_view input(table->view().column(0)); diff --git a/cpp/benchmarks/replace/nans.cpp b/cpp/benchmarks/replace/nans.cpp index 3faf217956b..e1b05bbc337 100644 --- a/cpp/benchmarks/replace/nans.cpp +++ b/cpp/benchmarks/replace/nans.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ static void BM_replace_nans(benchmark::State& state, bool include_nulls) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const dtype = cudf::type_to_id(); - auto const table = create_random_table({dtype}, 1, row_count{n_rows}); + auto const table = create_random_table({dtype}, row_count{n_rows}); if (!include_nulls) { table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); } cudf::column_view input(table->view().column(0)); diff --git a/cpp/benchmarks/search/search.cpp b/cpp/benchmarks/search/search.cpp index c3529c7e79c..0bccbbaff54 100644 --- a/cpp/benchmarks/search/search.cpp +++ b/cpp/benchmarks/search/search.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,63 +14,47 @@ * limitations under the License. 
*/ +#include +#include +#include + +#include #include +#include +#include #include #include #include -#include - #include -#include -#include -#include - class Search : public cudf::benchmark { }; -auto make_validity_iter() -{ - static constexpr int r_min = 1; - static constexpr int r_max = 10; - - cudf::test::UniformRandomGenerator rand_gen(r_min, r_max); - uint8_t mod_base = rand_gen.generate(); - return cudf::detail::make_counting_transform_iterator( - 0, [mod_base](auto row) { return (row % mod_base) > 0; }); -} - void BM_column(benchmark::State& state, bool nulls) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; - const cudf::size_type values_size = column_size; - - auto col_data_it = cudf::detail::make_counting_transform_iterator( - 0, [=](cudf::size_type row) { return static_cast(row); }); - auto val_data_it = cudf::detail::make_counting_transform_iterator( - 0, [=](cudf::size_type row) { return static_cast(values_size - row); }); - - auto column = [&]() { - return nulls ? cudf::test::fixed_width_column_wrapper( - col_data_it, col_data_it + column_size, make_validity_iter()) - : cudf::test::fixed_width_column_wrapper(col_data_it, - col_data_it + column_size); - }(); - auto values = [&]() { - return nulls ? cudf::test::fixed_width_column_wrapper( - val_data_it, val_data_it + values_size, make_validity_iter()) - : cudf::test::fixed_width_column_wrapper(val_data_it, - val_data_it + values_size); - }(); - - auto data_table = cudf::sort(cudf::table_view({column})); + auto const column_size{static_cast(state.range(0))}; + auto const values_size = column_size; + + auto init_data = cudf::make_fixed_width_scalar(static_cast(0)); + auto init_value = cudf::make_fixed_width_scalar(static_cast(values_size)); + auto step = cudf::make_fixed_width_scalar(static_cast(-1)); + auto column = cudf::sequence(column_size, *init_data); + auto values = cudf::sequence(values_size, *init_value, *step); + if (nulls) { + auto [column_null_mask, column_null_count] = create_random_null_mask(column->size(), 0.1, 1); + column->set_null_mask(std::move(column_null_mask), column_null_count); + auto [values_null_mask, values_null_count] = create_random_null_mask(values->size(), 0.1, 2); + values->set_null_mask(std::move(values_null_mask), values_null_count); + } + + auto data_table = cudf::sort(cudf::table_view({*column})); for (auto _ : state) { cuda_event_timer timer(state, true); auto col = cudf::upper_bound(data_table->view(), - cudf::table_view({values}), + cudf::table_view({*values}), {cudf::order::ASCENDING}, {cudf::null_order::BEFORE}); } @@ -93,9 +77,9 @@ void BM_table(benchmark::State& state) { using wrapper = cudf::test::fixed_width_column_wrapper; - const cudf::size_type num_columns{(cudf::size_type)state.range(0)}; - const cudf::size_type column_size{(cudf::size_type)state.range(1)}; - const cudf::size_type values_size = column_size; + auto const num_columns{static_cast(state.range(0))}; + auto const column_size{static_cast(state.range(1))}; + auto const values_size = column_size; auto make_table = [&](cudf::size_type col_size) { cudf::test::UniformRandomGenerator random_gen(0, 100); @@ -142,30 +126,24 @@ BENCHMARK_REGISTER_F(Search, Table) void BM_contains(benchmark::State& state, bool nulls) { - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; - const cudf::size_type values_size = column_size; - - auto col_data_it = cudf::detail::make_counting_transform_iterator( - 0, [=](cudf::size_type row) { return static_cast(row); }); - auto val_data_it = 
cudf::detail::make_counting_transform_iterator( - 0, [=](cudf::size_type row) { return static_cast(values_size - row); }); - - auto column = [&]() { - return nulls ? cudf::test::fixed_width_column_wrapper( - col_data_it, col_data_it + column_size, make_validity_iter()) - : cudf::test::fixed_width_column_wrapper(col_data_it, - col_data_it + column_size); - }(); - auto values = [&]() { - return nulls ? cudf::test::fixed_width_column_wrapper( - val_data_it, val_data_it + values_size, make_validity_iter()) - : cudf::test::fixed_width_column_wrapper(val_data_it, - val_data_it + values_size); - }(); + auto const column_size{static_cast(state.range(0))}; + auto const values_size = column_size; + + auto init_data = cudf::make_fixed_width_scalar(static_cast(0)); + auto init_value = cudf::make_fixed_width_scalar(static_cast(values_size)); + auto step = cudf::make_fixed_width_scalar(static_cast(-1)); + auto column = cudf::sequence(column_size, *init_data); + auto values = cudf::sequence(values_size, *init_value, *step); + if (nulls) { + auto [column_null_mask, column_null_count] = create_random_null_mask(column->size(), 0.1, 1); + column->set_null_mask(std::move(column_null_mask), column_null_count); + auto [values_null_mask, values_null_count] = create_random_null_mask(values->size(), 0.1, 2); + values->set_null_mask(std::move(values_null_mask), values_null_count); + } for (auto _ : state) { cuda_event_timer timer(state, true); - auto col = cudf::contains(column, values); + auto col = cudf::contains(*column, *values); } } diff --git a/cpp/benchmarks/sort/sort_strings.cpp b/cpp/benchmarks/sort/sort_strings.cpp index 8adeef21a79..30a7aee043b 100644 --- a/cpp/benchmarks/sort/sort_strings.cpp +++ b/cpp/benchmarks/sort/sort_strings.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,7 +29,7 @@ static void BM_sort(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}); + auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index 0f1653af2c6..0d74d0a6b7c 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,7 +28,7 @@ class StringCase : public cudf::benchmark { static void BM_case(benchmark::State& state) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}); + auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}); cudf::strings_column_view input(table->view().column(0)); for (auto _ : state) { diff --git a/cpp/benchmarks/string/combine.cpp b/cpp/benchmarks/string/combine.cpp index 8983646b6f1..a0cfcd15fe8 100644 --- a/cpp/benchmarks/string/combine.cpp +++ b/cpp/benchmarks/string/combine.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,8 +36,8 @@ static void BM_combine(benchmark::State& state) data_profile table_profile; table_profile.set_distribution_params( cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const table = - create_random_table({cudf::type_id::STRING}, 2, row_count{n_rows}, table_profile); + auto const table = create_random_table( + {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{n_rows}, table_profile); cudf::strings_column_view input1(table->view().column(0)); cudf::strings_column_view input2(table->view().column(1)); cudf::string_scalar separator("+"); diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index fbcfabb4532..8c536372359 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,7 +31,7 @@ enum contains_type { contains, count, findall }; static void BM_contains(benchmark::State& state, contains_type ct) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - auto const table = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}); + auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}); cudf::strings_column_view input(table->view().column(0)); for (auto _ : state) { diff --git a/cpp/benchmarks/string/convert_datetime.cpp b/cpp/benchmarks/string/convert_datetime.cpp index af51b504ee8..3782fea1e36 100644 --- a/cpp/benchmarks/string/convert_datetime.cpp +++ b/cpp/benchmarks/string/convert_datetime.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,7 +35,7 @@ void BM_convert_datetime(benchmark::State& state, direction dir) auto const n_rows = static_cast(state.range(0)); auto const data_type = cudf::data_type(cudf::type_to_id()); - auto const table = create_random_table({data_type.id()}, 1, row_count{n_rows}); + auto const table = create_random_table({data_type.id()}, row_count{n_rows}); cudf::column_view input(table->view().column(0)); auto source = dir == direction::to ? cudf::strings::from_timestamps(input, "%Y-%m-%d %H:%M:%S") diff --git a/cpp/benchmarks/string/convert_fixed_point.cpp b/cpp/benchmarks/string/convert_fixed_point.cpp index 5c050592c7b..05b87906eca 100644 --- a/cpp/benchmarks/string/convert_fixed_point.cpp +++ b/cpp/benchmarks/string/convert_fixed_point.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,7 +29,7 @@ namespace { std::unique_ptr get_strings_column(cudf::size_type rows) { std::unique_ptr result = - create_random_table({cudf::type_id::FLOAT32}, 1, row_count{static_cast(rows)}); + create_random_table({cudf::type_id::FLOAT32}, row_count{static_cast(rows)}); return cudf::strings::from_floats(result->release().front()->view()); } diff --git a/cpp/benchmarks/string/convert_numerics.cpp b/cpp/benchmarks/string/convert_numerics.cpp index 02ccb17e74a..71a23c76829 100644 --- a/cpp/benchmarks/string/convert_numerics.cpp +++ b/cpp/benchmarks/string/convert_numerics.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,7 +30,7 @@ template std::unique_ptr get_numerics_column(cudf::size_type rows) { std::unique_ptr result = - create_random_table({cudf::type_to_id()}, 1, row_count{rows}); + create_random_table({cudf::type_to_id()}, row_count{rows}); return std::move(result->release().front()); } diff --git a/cpp/benchmarks/string/copy.cu b/cpp/benchmarks/string/copy.cu index 2f064e71c44..00eb818256c 100644 --- a/cpp/benchmarks/string/copy.cu +++ b/cpp/benchmarks/string/copy.cu @@ -40,9 +40,9 @@ static void BM_copy(benchmark::State& state, copy_type ct) cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); auto const source = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); auto const target = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); // scatter indices auto index_map_col = make_numeric_column( diff --git a/cpp/benchmarks/string/factory.cu b/cpp/benchmarks/string/factory.cu index 2a88def1871..47356af129e 100644 --- a/cpp/benchmarks/string/factory.cu +++ b/cpp/benchmarks/string/factory.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,9 +53,8 @@ static void BM_factory(benchmark::State& state) data_profile table_profile; table_profile.set_distribution_params( cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const table = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); - auto d_column = cudf::column_device_view::create(table->view().column(0)); + auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); + auto d_column = cudf::column_device_view::create(table->view().column(0)); rmm::device_uvector pairs(d_column->size(), rmm::cuda_stream_default); thrust::transform(thrust::device, d_column->pair_begin(), diff --git a/cpp/benchmarks/string/filter.cpp b/cpp/benchmarks/string/filter.cpp index fb030c2ccc2..b39cf25bc91 100644 --- a/cpp/benchmarks/string/filter.cpp +++ b/cpp/benchmarks/string/filter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -41,8 +41,7 @@ static void BM_filter_chars(benchmark::State& state, FilterAPI api) data_profile table_profile; table_profile.set_distribution_params( cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const table = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); auto const types = cudf::strings::string_character_types::SPACE; diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 167e9bc1348..55eb52c9b30 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,8 +39,7 @@ static void BM_find_scalar(benchmark::State& state, FindAPI find_api) data_profile table_profile; table_profile.set_distribution_params( cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const table = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); cudf::string_scalar target("+"); cudf::test::strings_column_wrapper targets({"+", "-"}); diff --git a/cpp/benchmarks/string/json.cpp b/cpp/benchmarks/string/json.cpp deleted file mode 100644 index 1ade4d01e1e..00000000000 --- a/cpp/benchmarks/string/json.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -#include -#include - -#include -#include - -class JsonPath : public cudf::benchmark { -}; - -float frand() { return static_cast(rand()) / static_cast(RAND_MAX); } - -int rand_range(int min, int max) { return min + static_cast(frand() * (max - min)); } - -std::vector Books{ - "{\n\"category\": \"reference\",\n\"author\": \"Nigel Rees\",\n\"title\": \"Sayings of the " - "Century\",\n\"price\": 8.95\n}", - "{\n\"category\": \"fiction\",\n\"author\": \"Evelyn Waugh\",\n\"title\": \"Sword of " - "Honour\",\n\"price\": 12.99\n}", - "{\n\"category\": \"fiction\",\n\"author\": \"Herman Melville\",\n\"title\": \"Moby " - "Dick\",\n\"isbn\": \"0-553-21311-3\",\n\"price\": 8.99\n}", - "{\n\"category\": \"fiction\",\n\"author\": \"J. R. R. 
Tolkien\",\n\"title\": \"The Lord of the " - "Rings\",\n\"isbn\": \"0-395-19395-8\",\n\"price\": 22.99\n}"}; -constexpr int Approx_book_size = 110; -std::vector Bicycles{ - "{\"color\": \"red\", \"price\": 9.95}", - "{\"color\": \"green\", \"price\": 29.95}", - "{\"color\": \"blue\", \"price\": 399.95}", - "{\"color\": \"yellow\", \"price\": 99.95}", - "{\"color\": \"mauve\", \"price\": 199.95}", -}; -constexpr int Approx_bicycle_size = 33; -std::string Misc{"\n\"expensive\": 10\n"}; -std::string generate_field(std::vector const& values, int num_values) -{ - std::string res; - for (int idx = 0; idx < num_values; idx++) { - if (idx > 0) { res += std::string(",\n"); } - int vindex = std::min(static_cast(floor(frand() * values.size())), - static_cast(values.size() - 1)); - res += values[vindex]; - } - return res; -} - -std::string build_row(int desired_bytes) -{ - // always have at least 2 books and 2 bikes - int num_books = 2; - int num_bicycles = 2; - int remaining_bytes = - desired_bytes - ((num_books * Approx_book_size) + (num_bicycles * Approx_bicycle_size)); - - // divide up the remainder between books and bikes - float book_pct = frand(); - float bicycle_pct = 1.0f - book_pct; - num_books += (remaining_bytes * book_pct) / Approx_book_size; - num_bicycles += (remaining_bytes * bicycle_pct) / Approx_bicycle_size; - - std::string books = "\"book\": [\n" + generate_field(Books, num_books) + "]\n"; - std::string bicycles = "\"bicycle\": [\n" + generate_field(Bicycles, num_bicycles) + "]\n"; - - std::string store = "\"store\": {\n"; - if (frand() <= 0.5f) { - store += books + std::string(",\n") + bicycles; - } else { - store += bicycles + std::string(",\n") + books; - } - store += std::string("}\n"); - - std::string row = std::string("{\n"); - if (frand() <= 0.5f) { - row += store + std::string(",\n") + Misc; - } else { - row += Misc + std::string(",\n") + store; - } - row += std::string("}\n"); - return row; -} - -template -static void BM_case(benchmark::State& state, QueryArg&&... query_arg) -{ - srand(5236); - auto iter = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [desired_bytes = state.range(1)](int index) { return build_row(desired_bytes); }); - int num_rows = state.range(0); - cudf::test::strings_column_wrapper input(iter, iter + num_rows); - cudf::strings_column_view scv(input); - size_t num_chars = scv.chars().size(); - - std::string json_path(query_arg...); - - for (auto _ : state) { - cuda_event_timer raii(state, true); - auto result = cudf::strings::get_json_object(scv, json_path); - cudaStreamSynchronize(0); - } - - // this isn't strictly 100% accurate. a given query isn't necessarily - // going to visit every single incoming character. but in spirit it does. 
- state.SetBytesProcessed(state.iterations() * num_chars); -} - -#define JSON_BENCHMARK_DEFINE(name, query) \ - BENCHMARK_CAPTURE(BM_case, name, query) \ - ->ArgsProduct({{100, 1000, 100000, 400000}, {300, 600, 4096}}) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -JSON_BENCHMARK_DEFINE(query0, "$"); -JSON_BENCHMARK_DEFINE(query1, "$.store"); -JSON_BENCHMARK_DEFINE(query2, "$.store.book"); -JSON_BENCHMARK_DEFINE(query3, "$.store.*"); -JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]"); -JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category"); -JSON_BENCHMARK_DEFINE(query6, "$.store['bicycle']"); -JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']"); -JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]"); diff --git a/cpp/benchmarks/string/json.cu b/cpp/benchmarks/string/json.cu new file mode 100644 index 00000000000..69c42f97d7f --- /dev/null +++ b/cpp/benchmarks/string/json.cu @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include + +class JsonPath : public cudf::benchmark { +}; + +const std::vector Books{ + R"json({ +"category": "reference", +"author": "Nigel Rees", +"title": "Sayings of the Century", +"price": 8.95 +})json", + R"json({ +"category": "fiction", +"author": "Evelyn Waugh", +"title": "Sword of Honour", +"price": 12.99 +})json", + R"json({ +"category": "fiction", +"author": "Herman Melville", +"title": "Moby Dick", +"isbn": "0-553-21311-3", +"price": 8.99 +})json", + R"json({ +"category": "fiction", +"author": "J. R. R. 
Tolkien", +"title": "The Lord of the Rings", +"isbn": "0-395-19395-8", +"price": 22.99 +})json"}; +constexpr int Approx_book_size = 110; +const std::vector Bicycles{ + R"json({"color": "red", "price": 9.95})json", + R"json({"color": "green", "price": 29.95})json", + R"json({"color": "blue", "price": 399.95})json", + R"json({"color": "yellow", "price": 99.95})json", + R"json({"color": "mauve", "price": 199.95})json", +}; +constexpr int Approx_bicycle_size = 33; +std::string Misc{"\n\"expensive\": 10\n"}; + +struct json_benchmark_row_builder { + int const desired_bytes; + cudf::size_type const num_rows; + cudf::column_device_view const d_books_bicycles[2]; // Books, Bicycles strings + cudf::column_device_view const d_book_pct; // Book percentage + cudf::column_device_view const d_misc_order; // Misc-Store order + cudf::column_device_view const d_store_order; // Books-Bicycles order + int32_t* d_offsets{}; + char* d_chars{}; + thrust::minstd_rand rng{5236}; + thrust::uniform_int_distribution dist{}; + + // internal data structure for {bytes, out_ptr} with operator+= + struct bytes_and_ptr { + cudf::size_type bytes; + char* ptr; + __device__ bytes_and_ptr& operator+=(cudf::string_view const& str_append) + { + bytes += str_append.size_bytes(); + if (ptr) { ptr = cudf::strings::detail::copy_string(ptr, str_append); } + return *this; + } + }; + + __device__ inline void copy_items(int this_idx, + cudf::size_type num_items, + bytes_and_ptr& output_str) + { + using param_type = thrust::uniform_int_distribution::param_type; + dist.param(param_type{0, d_books_bicycles[this_idx].size() - 1}); + cudf::string_view comma(",\n", 2); + for (int i = 0; i < num_items; i++) { + if (i > 0) { output_str += comma; } + int idx = dist(rng); + auto item = d_books_bicycles[this_idx].element(idx); + output_str += item; + } + } + + __device__ void operator()(cudf::size_type idx) + { + int num_books = 2; + int num_bicycles = 2; + int remaining_bytes = max( + 0, desired_bytes - ((num_books * Approx_book_size) + (num_bicycles * Approx_bicycle_size))); + + // divide up the remainder between books and bikes + auto book_pct = d_book_pct.element(idx); + // {Misc, store} OR {store, Misc} + // store: {books, bicycles} OR store: {bicycles, books} + float bicycle_pct = 1.0f - book_pct; + num_books += (remaining_bytes * book_pct) / Approx_book_size; + num_bicycles += (remaining_bytes * bicycle_pct) / Approx_bicycle_size; + + char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; + bytes_and_ptr output_str{0, out_ptr}; + // + cudf::string_view comma(",\n", 2); + cudf::string_view brace1("{\n", 2); + cudf::string_view store_member_start[2]{{"\"book\": [\n", 10}, {"\"bicycle\": [\n", 13}}; + cudf::string_view store("\"store\": {\n", 11); + cudf::string_view Misc{"\"expensive\": 10", 15}; + cudf::string_view brace2("\n}", 2); + cudf::string_view square2{"\n]", 2}; + + output_str += brace1; + if (d_misc_order.element(idx)) { // Misc. first. + output_str += Misc; + output_str += comma; + } + output_str += store; + for (int store_order = 0; store_order < 2; store_order++) { + if (store_order > 0) { output_str += comma; } + int this_idx = (d_store_order.element(idx) == store_order); + auto& mem_start = store_member_start[this_idx]; + output_str += mem_start; + copy_items(this_idx, this_idx == 0 ? num_books : num_bicycles, output_str); + output_str += square2; + } + output_str += brace2; + if (!d_misc_order.element(idx)) { // Misc, if not first. 
+ output_str += comma; + output_str += Misc; + } + output_str += brace2; + if (!output_str.ptr) d_offsets[idx] = output_str.bytes; + } +}; + +auto build_json_string_column(int desired_bytes, int num_rows) +{ + data_profile profile; + profile.set_cardinality(0); + profile.set_null_frequency(-0.1); + profile.set_distribution_params( + cudf::type_id::FLOAT32, distribution_id::UNIFORM, 0.0, 1.0); + auto float_2bool_columns = + create_random_table({cudf::type_id::FLOAT32, cudf::type_id::BOOL8, cudf::type_id::BOOL8}, + row_count{num_rows}, + profile); + + cudf::test::strings_column_wrapper books(Books.begin(), Books.end()); + cudf::test::strings_column_wrapper bicycles(Bicycles.begin(), Bicycles.end()); + auto d_books = cudf::column_device_view::create(books); + auto d_bicycles = cudf::column_device_view::create(bicycles); + auto d_book_pct = cudf::column_device_view::create(float_2bool_columns->get_column(0)); + auto d_misc_order = cudf::column_device_view::create(float_2bool_columns->get_column(1)); + auto d_store_order = cudf::column_device_view::create(float_2bool_columns->get_column(2)); + json_benchmark_row_builder jb{ + desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order}; + auto children = cudf::strings::detail::make_strings_children(jb, num_rows); + return cudf::make_strings_column( + num_rows, std::move(children.first), std::move(children.second), 0, {}); +} + +void BM_case(benchmark::State& state, std::string query_arg) +{ + srand(5236); + int num_rows = state.range(0); + int desired_bytes = state.range(1); + auto input = build_json_string_column(desired_bytes, num_rows); + cudf::strings_column_view scv(input->view()); + size_t num_chars = scv.chars().size(); + + std::string json_path(query_arg); + + for (auto _ : state) { + cuda_event_timer raii(state, true); + auto result = cudf::strings::get_json_object(scv, json_path); + cudaStreamSynchronize(0); + } + + // this isn't strictly 100% accurate. a given query isn't necessarily + // going to visit every single incoming character. but in spirit it does. + state.SetBytesProcessed(state.iterations() * num_chars); +} + +#define JSON_BENCHMARK_DEFINE(name, query) \ + BENCHMARK_DEFINE_F(JsonPath, name)(::benchmark::State & state) { BM_case(state, query); } \ + BENCHMARK_REGISTER_F(JsonPath, name) \ + ->ArgsProduct({{100, 1000, 100000, 400000}, {300, 600, 4096}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +JSON_BENCHMARK_DEFINE(query0, "$"); +JSON_BENCHMARK_DEFINE(query1, "$.store"); +JSON_BENCHMARK_DEFINE(query2, "$.store.book"); +JSON_BENCHMARK_DEFINE(query3, "$.store.*"); +JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]"); +JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category"); +JSON_BENCHMARK_DEFINE(query6, "$.store['bicycle']"); +JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']"); +JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]"); diff --git a/cpp/benchmarks/string/repeat_strings.cpp b/cpp/benchmarks/string/repeat_strings.cpp index 86b8525023f..9044db18522 100644 --- a/cpp/benchmarks/string/repeat_strings.cpp +++ b/cpp/benchmarks/string/repeat_strings.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -45,7 +45,7 @@ static std::unique_ptr create_data_table(cudf::size_type n_cols, cudf::type_id::INT32, distribution_id::NORMAL, min_repeat_times, max_repeat_times); } - return create_random_table(dtype_ids, n_cols, row_count{n_rows}, table_profile); + return create_random_table(dtype_ids, row_count{n_rows}, table_profile); } static void BM_repeat_strings_scalar_times(benchmark::State& state) diff --git a/cpp/benchmarks/string/replace.cpp b/cpp/benchmarks/string/replace.cpp index 9be2e3a8627..0a3607c64f0 100644 --- a/cpp/benchmarks/string/replace.cpp +++ b/cpp/benchmarks/string/replace.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,8 +40,7 @@ static void BM_replace(benchmark::State& state, replace_type rt) data_profile table_profile; table_profile.set_distribution_params( cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const table = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); cudf::string_scalar target("+"); cudf::string_scalar repl(""); diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index c106953bf69..b9d04630837 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,8 +37,7 @@ static void BM_replace(benchmark::State& state, replace_type rt) data_profile table_profile; table_profile.set_distribution_params( cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const table = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); cudf::test::strings_column_wrapper repls({"#", ""}); diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index fc879d1d0eb..ad25cfe54de 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,8 +38,7 @@ static void BM_split(benchmark::State& state, split_type rt) data_profile table_profile; table_profile.set_distribution_params( cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const table = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); cudf::string_scalar target("+"); diff --git a/cpp/benchmarks/string/substring.cpp b/cpp/benchmarks/string/substring.cpp index 8864fffc40b..2195cc56515 100644 --- a/cpp/benchmarks/string/substring.cpp +++ b/cpp/benchmarks/string/substring.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,8 +43,7 @@ static void BM_substring(benchmark::State& state, substring_type rt) data_profile table_profile; table_profile.set_distribution_params( cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const table = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); auto starts_itr = thrust::constant_iterator(1); auto stops_itr = thrust::constant_iterator(max_str_length / 2); diff --git a/cpp/benchmarks/string/translate.cpp b/cpp/benchmarks/string/translate.cpp index 98688fa14fc..38c6ff9c701 100644 --- a/cpp/benchmarks/string/translate.cpp +++ b/cpp/benchmarks/string/translate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,8 +41,7 @@ static void BM_translate(benchmark::State& state, int entry_count) data_profile table_profile; table_profile.set_distribution_params( cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const table = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); std::vector entries(entry_count); diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index 7c39ebbb1bb..157c27ae48a 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,8 +38,7 @@ static void BM_ngrams(benchmark::State& state, ngrams_type nt) data_profile table_profile; table_profile.set_distribution_params( cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const table = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); for (auto _ : state) { diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index ac8e92b3376..2cc083f4ae8 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,8 +36,7 @@ static void BM_normalize(benchmark::State& state, bool to_lower) data_profile table_profile; table_profile.set_distribution_params( cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const table = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); for (auto _ : state) { diff --git a/cpp/benchmarks/text/normalize_spaces.cpp b/cpp/benchmarks/text/normalize_spaces.cpp index 34749b579b9..3bd636d4aa9 100644 --- a/cpp/benchmarks/text/normalize_spaces.cpp +++ b/cpp/benchmarks/text/normalize_spaces.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,8 +37,7 @@ static void BM_normalize(benchmark::State& state) data_profile table_profile; table_profile.set_distribution_params( cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const table = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); for (auto _ : state) { diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index fa3f816db59..4cb9c9e5271 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -40,8 +40,7 @@ static void BM_tokenize(benchmark::State& state, tokenize_type tt) data_profile table_profile; table_profile.set_distribution_params( cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const table = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); cudf::test::strings_column_wrapper delimiters({" ", "+", "-"}); diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index 48b31e5dae7..3be599e8c41 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -42,7 +42,7 @@ struct Functor { }; template -struct Functor>> { +struct Functor>> { static __device__ Float f(Float x) { if (ft == BANDWIDTH_BOUND) { diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 9dc8f1750ed..539a7c04106 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -170,6 +170,17 @@ class scan_aggregation : public virtual aggregation { scan_aggregation() {} }; +/** + * @brief Derived class intended for segmented reduction usage. + */ +class segmented_reduce_aggregation : public virtual aggregation { + public: + ~segmented_reduce_aggregation() override = default; + + protected: + segmented_reduce_aggregation() {} +}; + enum class udf_type : bool { CUDA, PTX }; enum class correlation_type : int32_t { PEARSON, KENDALL, SPEARMAN }; diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 4c3a0002694..eba24dd2d13 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -149,7 +149,8 @@ class sum_aggregation final : public rolling_aggregation, public groupby_aggregation, public groupby_scan_aggregation, public reduce_aggregation, - public scan_aggregation { + public scan_aggregation, + public segmented_reduce_aggregation { public: sum_aggregation() : aggregation(SUM) {} @@ -170,7 +171,8 @@ class sum_aggregation final : public rolling_aggregation, */ class product_aggregation final : public groupby_aggregation, public reduce_aggregation, - public scan_aggregation { + public scan_aggregation, + public segmented_reduce_aggregation { public: product_aggregation() : aggregation(PRODUCT) {} @@ -193,7 +195,8 @@ class min_aggregation final : public rolling_aggregation, public groupby_aggregation, public groupby_scan_aggregation, public reduce_aggregation, - public scan_aggregation { + public scan_aggregation, + public segmented_reduce_aggregation { public: min_aggregation() : aggregation(MIN) {} @@ -216,7 +219,8 @@ class max_aggregation final : public rolling_aggregation, public groupby_aggregation, public groupby_scan_aggregation, public reduce_aggregation, - public scan_aggregation { + public scan_aggregation, + public segmented_reduce_aggregation { public: max_aggregation() : aggregation(MAX) {} @@ -256,7 +260,7 @@ class count_aggregation final : public rolling_aggregation, /** * @brief Derived class for specifying an any aggregation */ -class any_aggregation final : public reduce_aggregation { +class any_aggregation final : public reduce_aggregation, public segmented_reduce_aggregation { public: any_aggregation() : aggregation(ANY) {} @@ -275,7 +279,7 @@ class any_aggregation final 
: public reduce_aggregation { /** * @brief Derived class for specifying an all aggregation */ -class all_aggregation final : public reduce_aggregation { +class all_aggregation final : public reduce_aggregation, public segmented_reduce_aggregation { public: all_aggregation() : aggregation(ALL) {} @@ -694,7 +698,9 @@ class percent_rank_aggregation final : public rolling_aggregation, /** * @brief Derived aggregation class for specifying COLLECT_LIST aggregation */ -class collect_list_aggregation final : public rolling_aggregation, public groupby_aggregation { +class collect_list_aggregation final : public rolling_aggregation, + public groupby_aggregation, + public reduce_aggregation { public: explicit collect_list_aggregation(null_policy null_handling = null_policy::INCLUDE) : aggregation{COLLECT_LIST}, _null_handling{null_handling} @@ -733,7 +739,9 @@ class collect_list_aggregation final : public rolling_aggregation, public groupb /** * @brief Derived aggregation class for specifying COLLECT_SET aggregation */ -class collect_set_aggregation final : public rolling_aggregation, public groupby_aggregation { +class collect_set_aggregation final : public rolling_aggregation, + public groupby_aggregation, + public reduce_aggregation { public: explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE, null_equality nulls_equal = null_equality::EQUAL, @@ -881,7 +889,7 @@ class udf_aggregation final : public rolling_aggregation { /** * @brief Derived aggregation class for specifying MERGE_LISTS aggregation */ -class merge_lists_aggregation final : public groupby_aggregation { +class merge_lists_aggregation final : public groupby_aggregation, public reduce_aggregation { public: explicit merge_lists_aggregation() : aggregation{MERGE_LISTS} {} @@ -900,7 +908,7 @@ class merge_lists_aggregation final : public groupby_aggregation { /** * @brief Derived aggregation class for specifying MERGE_SETS aggregation */ -class merge_sets_aggregation final : public groupby_aggregation { +class merge_sets_aggregation final : public groupby_aggregation, public reduce_aggregation { public: explicit merge_sets_aggregation(null_equality nulls_equal, nan_equality nans_equal) : aggregation{MERGE_SETS}, _nulls_equal(nulls_equal), _nans_equal(nans_equal) diff --git a/cpp/include/cudf/detail/calendrical_month_sequence.cuh b/cpp/include/cudf/detail/calendrical_month_sequence.cuh index 00742db7982..321cc3d19ef 100644 --- a/cpp/include/cudf/detail/calendrical_month_sequence.cuh +++ b/cpp/include/cudf/detail/calendrical_month_sequence.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,12 +30,12 @@ namespace cudf { namespace detail { struct calendrical_month_sequence_functor { template - typename std::enable_if_t::value, std::unique_ptr> - operator()(size_type n, - scalar const& input, - size_type months, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::enable_if_t::value, std::unique_ptr> operator()( + size_type n, + scalar const& input, + size_type months, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { // Return empty column if n = 0 if (n == 0) return cudf::make_empty_column(input.type()); @@ -59,8 +59,8 @@ struct calendrical_month_sequence_functor { } template - typename std::enable_if_t::value, std::unique_ptr> - operator()(Args&&...) + std::enable_if_t::value, std::unique_ptr> operator()( + Args&&...) { CUDF_FAIL("Cannot make a date_range of a non-datetime type"); } diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index 0fc807593fb..5854654b436 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,20 +61,41 @@ std::unique_ptr serial_murmur_hash3_32( * http://www.boost.org/LICENSE_1_0.txt) */ /** - * @brief Combines two hashed values into a single hashed value. + * @brief Combines two hash values into a single hash value. * - * Adapted from Boost hash_combine function, modified for 64-bit + * Taken from the Boost hash_combine function. * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html * - * @param lhs The first hashed value - * @param rhs The second hashed value + * @param lhs The first hash value + * @param rhs The second hash value + * @return Combined hash value + */ +constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs) +{ + return lhs ^ (rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2)); +} + +/* Copyright 2005-2014 Daniel James. + * + * Use, modification and distribution is subject to the Boost Software + * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +/** + * @brief Combines two hash values into a single hash value. + * + * Adapted from Boost hash_combine function and modified for 64-bit. + * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html + * + * @param lhs The first hash value + * @param rhs The second hash value * @return Combined hash value */ constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs) { - lhs ^= rhs + 0x9e3779b97f4a7c15 + (lhs << 6) + (lhs >> 2); - return lhs; + return lhs ^ (rhs + 0x9e3779b97f4a7c15 + (lhs << 6) + (lhs >> 2)); } + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index df06ad9e4f3..78eaa4f2448 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include #include #include @@ -279,7 +281,8 @@ rmm::device_uvector segmented_count_bits(bitmask_type const* bitmask, OffsetIterator first_bit_indices_end, OffsetIterator last_bit_indices_begin, count_bits_policy count_bits, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const num_ranges = static_cast(std::distance(first_bit_indices_begin, first_bit_indices_end)); @@ -329,14 +332,15 @@ rmm::device_uvector segmented_count_bits(bitmask_type const* bitmask, // set bits from the length of the segment. auto segments_begin = thrust::make_zip_iterator(first_bit_indices_begin, last_bit_indices_begin); - auto segments_size = thrust::transform_iterator(segments_begin, [] __device__(auto segment) { - auto const begin = thrust::get<0>(segment); - auto const end = thrust::get<1>(segment); - return end - begin; - }); + auto segment_length_iterator = + thrust::transform_iterator(segments_begin, [] __device__(auto const& segment) { + auto const begin = thrust::get<0>(segment); + auto const end = thrust::get<1>(segment); + return end - begin; + }); thrust::transform(rmm::exec_policy(stream), - segments_size, - segments_size + num_ranges, + segment_length_iterator, + segment_length_iterator + num_ranges, d_bit_counts.data(), d_bit_counts.data(), [] __device__(auto segment_size, auto segment_bit_count) { @@ -438,7 +442,8 @@ std::vector segmented_count_bits(bitmask_type const* bitmask, first_bit_indices_end, last_bit_indices_begin, count_bits, - stream); + stream, + rmm::mr::get_current_device_resource()); // Copy the results back to the host. return make_std_vector_sync(d_bit_counts, stream); @@ -501,6 +506,80 @@ std::vector segmented_null_count(bitmask_type const* bitmask, return detail::segmented_count_unset_bits(bitmask, indices_begin, indices_end, stream); } +/** + * @brief Create an output null mask whose validity is determined by the + * validity of any/all elements of segments of an input null mask. + * + * @tparam OffsetIterator Random-access input iterator type. + * @param bitmask Null mask residing in device memory whose segments will be + * reduced into a new mask. + * @param first_bit_indices_begin Random-access input iterator to the beginning + * of a sequence of indices of the first bit in each segment (inclusive). + * @param first_bit_indices_end Random-access input iterator to the end of a + * sequence of indices of the first bit in each segment (inclusive). + * @param last_bit_indices_begin Random-access input iterator to the beginning + * of a sequence of indices of the last bit in each segment (exclusive). + * @param null_handling If `null_policy::INCLUDE`, all elements in a segment + * must be valid for the reduced value to be valid. If `null_policy::EXCLUDE`, + * the reduction is valid if any element in the segment is valid. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned buffer's device memory. + * @return A pair containing the reduced null mask and number of nulls. 
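+ *
+ * A minimal usage sketch, included for illustration only; `col`, `starts`, `ends`,
+ * `stream`, and `mr` below are hypothetical names and are not defined by this header:
+ *
+ * @code{.cpp}
+ * auto [reduced_mask, null_count] = cudf::detail::segmented_null_mask_reduction(
+ *   col.null_mask(),             // input null mask (may be nullptr)
+ *   starts.begin(),              // first bit index of each segment (inclusive)
+ *   starts.end(),
+ *   ends.begin(),                // last bit index of each segment (exclusive)
+ *   cudf::null_policy::INCLUDE,  // segment result is valid only if all its elements are valid
+ *   stream,
+ *   mr);
+ * @endcode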
+ */ +template +std::pair segmented_null_mask_reduction( + bitmask_type const* bitmask, + OffsetIterator first_bit_indices_begin, + OffsetIterator first_bit_indices_end, + OffsetIterator last_bit_indices_begin, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const segments_begin = + thrust::make_zip_iterator(first_bit_indices_begin, last_bit_indices_begin); + auto const segment_length_iterator = + thrust::make_transform_iterator(segments_begin, [] __device__(auto const& segment) { + auto const begin = thrust::get<0>(segment); + auto const end = thrust::get<1>(segment); + return end - begin; + }); + + auto const num_segments = + static_cast(std::distance(first_bit_indices_begin, first_bit_indices_end)); + + if (bitmask == nullptr) { + return cudf::detail::valid_if( + segment_length_iterator, + segment_length_iterator + num_segments, + [] __device__(auto const& length) { return length > 0; }, + stream, + mr); + } + + auto const segment_valid_counts = + cudf::detail::segmented_count_bits(bitmask, + first_bit_indices_begin, + first_bit_indices_end, + last_bit_indices_begin, + cudf::detail::count_bits_policy::SET_BITS, + stream, + rmm::mr::get_current_device_resource()); + auto const length_and_valid_count = + thrust::make_zip_iterator(segment_length_iterator, segment_valid_counts.begin()); + return cudf::detail::valid_if( + length_and_valid_count, + length_and_valid_count + num_segments, + [null_handling] __device__(auto const& length_and_valid_count) { + auto const length = thrust::get<0>(length_and_valid_count); + auto const valid_count = thrust::get<1>(length_and_valid_count); + return (length > 0) and + ((null_handling == null_policy::EXCLUDE) ? valid_count > 0 : valid_count == length); + }, + stream, + mr); +} + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/reduction.cuh b/cpp/include/cudf/detail/reduction.cuh index 76825285745..76afbf7e4b8 100644 --- a/cpp/include/cudf/detail/reduction.cuh +++ b/cpp/include/cudf/detail/reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include "reduction_operators.cuh" +#include #include #include @@ -26,6 +27,7 @@ #include #include +#include #include #include @@ -40,6 +42,8 @@ namespace detail { * @param[in] num_items the number of items * @param[in] op the reduction operator * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate the returned scalar's device + * memory * @returns Output scalar in device memory * * @tparam Op the reduction operator with device binary operator @@ -49,8 +53,8 @@ namespace detail { template ::type, - typename std::enable_if_t() && - not cudf::is_fixed_point()>* = nullptr> + std::enable_if_t() && + not cudf::is_fixed_point()>* = nullptr> std::unique_ptr reduce(InputIterator d_in, cudf::size_type num_items, op::simple_op sop, @@ -92,7 +96,7 @@ std::unique_ptr reduce(InputIterator d_in, template ::type, - typename std::enable_if_t()>* = nullptr> + std::enable_if_t()>* = nullptr> std::unique_ptr reduce(InputIterator d_in, cudf::size_type num_items, op::simple_op sop, @@ -102,14 +106,13 @@ std::unique_ptr reduce(InputIterator d_in, CUDF_FAIL( "This function should never be called. 
fixed_point reduce should always go through the reduce " "for the corresponding device_storage_type_t"); - ; } // @brief string_view specialization of simple reduction template ::type, - typename std::enable_if_t>* = nullptr> + std::enable_if_t>* = nullptr> std::unique_ptr reduce(InputIterator d_in, cudf::size_type num_items, op::simple_op sop, @@ -157,6 +160,8 @@ std::unique_ptr reduce(InputIterator d_in, * @param[in] valid_count the intermediate operator argument 1 * @param[in] ddof the intermediate operator argument 2 * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate the returned scalar's device + * memory * @returns Output scalar in device memory * * The reduction operator must have `intermediate::compute_result()` method. @@ -218,6 +223,92 @@ std::unique_ptr reduce(InputIterator d_in, return std::unique_ptr(result); } +/** + * @brief Compute the specified simple reduction over each of the segments in the + * input range of elements. + * + * @tparam Op the reduction operator with device binary operator + * @tparam InputIterator the input column iterator + * @tparam OffsetIterator the offset column iterator + * @tparam OutputType the output type of reduction + * + * @param[in] d_in the begin iterator to input + * @param[in] d_offset the begin iterator to offset + * @param[in] num_segments the number of segments + * @param[in] sop the reduction operator + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate the returned column's device + * memory + * @returns Output column in device memory + * + */ +template ::type, + typename std::enable_if_t() && + not cudf::is_fixed_point()>* = nullptr> +std::unique_ptr segmented_reduce(InputIterator d_in, + OffsetIterator d_offset, + cudf::size_type num_segments, + op::simple_op sop, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto binary_op = sop.get_binary_op(); + auto identity = sop.template get_identity(); + auto dev_result = make_fixed_width_column( + data_type{type_to_id()}, num_segments, mask_state::UNALLOCATED, stream, mr); + auto dev_result_mview = dev_result->mutable_view(); + + // Allocate temporary storage + rmm::device_buffer d_temp_storage; + size_t temp_storage_bytes = 0; + cub::DeviceSegmentedReduce::Reduce(d_temp_storage.data(), + temp_storage_bytes, + d_in, + dev_result_mview.data(), + num_segments, + d_offset, + d_offset + 1, + binary_op, + identity, + stream.value()); + d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; + + // Run reduction + cub::DeviceSegmentedReduce::Reduce(d_temp_storage.data(), + temp_storage_bytes, + d_in, + dev_result_mview.data(), + num_segments, + d_offset, + d_offset + 1, + binary_op, + identity, + stream.value()); + + return dev_result; +} + +template ::type, + typename std::enable_if_t() || + is_fixed_point()>* = nullptr> +std::unique_ptr segmented_reduce(InputIterator, + OffsetIterator, + cudf::size_type, + op::simple_op, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) +{ + CUDF_FAIL( + "Unsupported data types called on segmented_reduce. 
Only numeric and chrono types are " + "supported."); +} + } // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/include/cudf/detail/reduction_functions.hpp b/cpp/include/cudf/detail/reduction_functions.hpp index 0565f332b48..ccec4bf8a6c 100644 --- a/cpp/include/cudf/detail/reduction_functions.hpp +++ b/cpp/include/cudf/detail/reduction_functions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include "cudf/lists/lists_column_view.hpp" #include namespace cudf { @@ -29,7 +30,7 @@ namespace reduction { * If all elements in input column are null, output scalar is null. * * @throw cudf::logic_error if input column type is not convertible to `output_dtype` - * @throw cudf::logic_error if `output_dtype` is not arithmetic point type + * @throw cudf::logic_error if `output_dtype` is not an arithmetic type * * @param col input column to compute sum * @param output_dtype data type of return type and typecast elements of input column @@ -127,7 +128,7 @@ std::unique_ptr all( * If all elements in input column are null, output scalar is null. * * @throw cudf::logic_error if input column type is not convertible to `output_dtype` - * @throw cudf::logic_error if `output_dtype` is not arithmetic point type + * @throw cudf::logic_error if `output_dtype` is not an arithmetic type * * @param col input column to compute product. * @param output_dtype data type of return type and typecast elements of input column @@ -147,7 +148,7 @@ std::unique_ptr product( * If all elements in input column are null, output scalar is null. * * @throw cudf::logic_error if input column type is not convertible to `output_dtype` - * @throw cudf::logic_error if `output_dtype` is not arithmetic point type + * @throw cudf::logic_error if `output_dtype` is not an arithmetic type * * @param col input column to compute sum of squares. * @param output_dtype data type of return type and typecast elements of input column @@ -244,7 +245,7 @@ std::unique_ptr standard_deviation( * @param n index of element to get * @param null_handling Indicates if null values will be counted while indexing. * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned scalar's device memory + * @param mr Device memory resource used to allocate the returned scalar's device memory. * @return nth element as scalar */ std::unique_ptr nth_element( @@ -254,5 +255,223 @@ std::unique_ptr nth_element( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Collect input column into a (list) scalar + * + * @param col input column to collect from + * @param null_handling Indicates if null values will be counted while collecting. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned scalar's device memory + * @return collected list as scalar + */ +std::unique_ptr collect_list( + column_view const& col, + null_policy null_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Merge a bunch of list scalars into single list scalar + * + * @param col input list column representing numbers of list scalars to be merged + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned scalar's device memory + * @return merged list as scalar + */ +std::unique_ptr merge_lists( + lists_column_view const& col, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Collect input column into a (list) scalar without duplicated elements + * + * @param col input column to collect from + * @param null_handling Indicates if null values will be counted while collecting. + * @param nulls_equal Indicates if null values will be considered as equal values. + * @param nans_equal Indicates if nan values will be considered as equal values. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory + * @return collected list with unique elements as scalar + */ +std::unique_ptr collect_set( + column_view const& col, + null_policy null_handling, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Merge a bunch of list scalars into single list scalar then drop duplicated elements + * + * @param col input list column representing numbers of list scalars to be merged + * @param nulls_equal Indicates if null values will be considered as equal values. + * @param nans_equal Indicates if nan values will be considered as equal values. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned scalar's device memory + * @return collected list with unique elements as scalar + */ +std::unique_ptr merge_sets( + lists_column_view const& col, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Compute sum of each segment in input column. + * + * If an input segment is empty, the segment result is null. + * + * @throw cudf::logic_error if input column type is not convertible to `output_dtype`. + * @throw cudf::logic_error if `output_dtype` is not an arithmetic type. + * + * @param col Input column to compute sum. + * @param offsets Indices to identify segment boundaries. + * @param output_dtype Data type of return type and typecast elements of input column. + * @param null_handling If `INCLUDE`, the reduction is valid if all elements in + * a segment are valid, otherwise null. If `EXCLUDE`, the reduction is valid if + * any element in the segment is valid, otherwise null. 
+ * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return Sums of segments in type `output_dtype`. + */ +std::unique_ptr segmented_sum( + column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes product of each segment in input column. + * + * If an input segment is empty, the segment result is null. + * + * @throw cudf::logic_error if input column type is not convertible to `output_dtype`. + * @throw cudf::logic_error if `output_dtype` is not an arithmetic type. + * + * @param col Input column to compute product. + * @param offsets Indices to identify segment boundaries. + * @param output_dtype data type of return type and typecast elements of input column. + * @param null_handling If `INCLUDE`, the reduction is valid if all elements in + * a segment are valid, otherwise null. If `EXCLUDE`, the reduction is valid if + * any element in the segment is valid, otherwise null. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory. + * @return Product as scalar of type `output_dtype`. + */ +std::unique_ptr segmented_product( + column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Compute minimum of each segment in input column. + * + * If an input segment is empty, the segment result is null. + * + * @throw cudf::logic_error if input column type is convertible to `output_dtype`. + * + * @param col Input column to compute minimum. + * @param offsets Indices to identify segment boundaries. + * @param output_dtype Data type of return type and typecast elements of input column. + * @param null_handling If `INCLUDE`, the reduction is valid if all elements in + * a segment are valid, otherwise null. If `EXCLUDE`, the reduction is valid if + * any element in the segment is valid, otherwise null. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory. + * @return Minimums of segments in type `output_dtype`. + */ +std::unique_ptr segmented_min( + column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Compute maximum of each segment in input column. + * + * If an input segment is empty, the segment result is null. + * + * @throw cudf::logic_error if input column type is convertible to `output_dtype`. + * + * @param col Input column to compute maximum. + * @param offsets Indices to identify segment boundaries. + * @param output_dtype Data type of return type and typecast elements of input column. + * @param null_handling If `INCLUDE`, the reduction is valid if all elements in + * a segment are valid, otherwise null. If `EXCLUDE`, the reduction is valid if + * any element in the segment is valid, otherwise null. 
+ * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory. + * @return Maximums of segments in type `output_dtype`. + */ +std::unique_ptr segmented_max( + column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Compute if any of the values in the segment are true when typecasted to bool. + * + * If an input segment is empty, the segment result is null. + * + * @throw cudf::logic_error if input column type is not convertible to bool. + * @throw cudf::logic_error if `output_dtype` is not bool8. + * + * @param col Input column to compute any_of. + * @param offsets Indices to identify segment boundaries. + * @param output_dtype Data type of return type and typecast elements of input column. + * @param null_handling If `INCLUDE`, the reduction is valid if all elements in + * a segment are valid, otherwise null. If `EXCLUDE`, the reduction is valid if + * any element in the segment is valid, otherwise null. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory. + * @return Column of bool8 for the results of the segments. + */ +std::unique_ptr segmented_any( + column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Compute if all of the values in the segment are true when typecasted to bool. + * + * If an input segment is empty, the segment result is null. + * + * @throw cudf::logic_error if input column type is not convertible to bool. + * @throw cudf::logic_error if `output_dtype` is not bool8. + * + * @param col Input column to compute all_of. + * @param offsets Indices to identify segment boundaries. + * @param output_dtype Data type of return type and typecast elements of input column. + * @param null_handling If `INCLUDE`, the reduction is valid if all elements in + * a segment are valid, otherwise null. If `EXCLUDE`, the reduction is valid if + * any element in the segment is valid, otherwise null. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory. + * @return Column of bool8 for the results of the segments. + */ +std::unique_ptr segmented_all( + column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace reduction } // namespace cudf diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp index 3aa85e87b1d..97490ee3e1c 100644 --- a/cpp/include/cudf/detail/sorting.hpp +++ b/cpp/include/cudf/detail/sorting.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
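// A sketch of calling the segmented detail reductions declared above. It assumes the segment
// offsets (e.g. {0, 3, 7, 10} for three segments) are already resident in device memory and
// viewed through a device_span; the wrapper name and INT64 output type are illustrative.
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/reduction_functions.hpp>
#include <cudf/utilities/span.hpp>

#include <memory>

std::unique_ptr<cudf::column> sum_per_segment(cudf::column_view const& values,
                                              cudf::device_span<cudf::size_type const> offsets)
{
  // One output row per segment; with INCLUDE, a segment's result is null unless every
  // element in that segment is valid.
  return cudf::reduction::segmented_sum(values,
                                        offsets,
                                        cudf::data_type{cudf::type_id::INT64},
                                        cudf::null_policy::INCLUDE);
}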
@@ -63,6 +63,19 @@ std::unique_ptr sort_by_key( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::stable_sort_by_key + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr
stable_sort_by_key( + table_view const& values, + table_view const& keys, + std::vector const& column_order = {}, + std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::segmented_sorted_order * diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index 221e90a9816..f985135064f 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -426,7 +426,7 @@ struct typesAtomicCASImpl { * @returns The old value at `address` */ template -typename std::enable_if_t(), T> __forceinline__ __device__ +std::enable_if_t(), T> __forceinline__ __device__ genericAtomicOperation(T* address, T const& update_value, BinaryOp op) { auto fun = cudf::detail::genericAtomicOperationImpl{}; @@ -435,7 +435,7 @@ genericAtomicOperation(T* address, T const& update_value, BinaryOp op) // specialization for cudf::detail::timestamp types template -typename std::enable_if_t(), T> __forceinline__ __device__ +std::enable_if_t(), T> __forceinline__ __device__ genericAtomicOperation(T* address, T const& update_value, BinaryOp op) { using R = typename T::rep; @@ -448,7 +448,7 @@ genericAtomicOperation(T* address, T const& update_value, BinaryOp op) // specialization for cudf::detail::duration types template -typename std::enable_if_t(), T> __forceinline__ __device__ +std::enable_if_t(), T> __forceinline__ __device__ genericAtomicOperation(T* address, T const& update_value, BinaryOp op) { using R = typename T::rep; @@ -616,7 +616,7 @@ __forceinline__ __device__ T atomicCAS(T* address, T compare, T val) * * @returns The old value at `address` */ -template , T>* = nullptr> +template , T>* = nullptr> __forceinline__ __device__ T atomicAnd(T* address, T val) { return cudf::genericAtomicOperation(address, val, cudf::DeviceAnd{}); @@ -637,7 +637,7 @@ __forceinline__ __device__ T atomicAnd(T* address, T val) * * @returns The old value at `address` */ -template , T>* = nullptr> +template , T>* = nullptr> __forceinline__ __device__ T atomicOr(T* address, T val) { return cudf::genericAtomicOperation(address, val, cudf::DeviceOr{}); @@ -658,7 +658,7 @@ __forceinline__ __device__ T atomicOr(T* address, T val) * * @returns The old value at `address` */ -template , T>* = nullptr> +template , T>* = nullptr> __forceinline__ __device__ T atomicXor(T* address, T val) { return cudf::genericAtomicOperation(address, val, cudf::DeviceXor{}); diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 9423cb6b998..87fef5bc187 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -61,27 +61,26 @@ CUDF_HOST_DEVICE inline auto max(LHS const& lhs, RHS const& rhs) * @brief Binary `sum` operator */ struct DeviceSum { - template ()>* = nullptr> + template ()>* = nullptr> CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs + rhs) { return lhs + rhs; } - template ()>* = nullptr> + template ()>* = nullptr> static constexpr T identity() { return T{typename T::duration{0}}; } - template < - typename T, - typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + template () && !cudf::is_fixed_point()>* = nullptr> static constexpr T identity() { return T{0}; } - template ()>* = nullptr> + 
template ()>* = nullptr> static constexpr T identity() { CUDF_FAIL("fixed_point does not yet support device operator identity"); @@ -93,13 +92,13 @@ struct DeviceSum { * @brief `count` operator - used in rolling windows */ struct DeviceCount { - template ()>* = nullptr> + template ()>* = nullptr> CUDF_HOST_DEVICE inline T operator()(const T& lhs, const T& rhs) { return T{DeviceCount{}(lhs.time_since_epoch(), rhs.time_since_epoch())}; } - template ()>* = nullptr> + template ()>* = nullptr> CUDF_HOST_DEVICE inline T operator()(const T&, const T& rhs) { return rhs + T{1}; @@ -123,10 +122,9 @@ struct DeviceMin { return numeric::detail::min(lhs, rhs); } - template < - typename T, - typename std::enable_if_t && !cudf::is_dictionary() && - !cudf::is_fixed_point()>* = nullptr> + template && !cudf::is_dictionary() && + !cudf::is_fixed_point()>* = nullptr> static constexpr T identity() { // chrono types do not have std::numeric_limits specializations and should use T::max() @@ -135,7 +133,7 @@ struct DeviceMin { return cuda::std::numeric_limits::max(); } - template ()>* = nullptr> + template ()>* = nullptr> static constexpr T identity() { CUDF_FAIL("fixed_point does not yet support DeviceMin identity"); @@ -143,13 +141,13 @@ struct DeviceMin { } // @brief identity specialized for string_view - template >* = nullptr> + template >* = nullptr> CUDF_HOST_DEVICE inline static constexpr T identity() { return string_view::max(); } - template ()>* = nullptr> + template ()>* = nullptr> static constexpr T identity() { return static_cast(T::max_value()); @@ -167,10 +165,9 @@ struct DeviceMax { return numeric::detail::max(lhs, rhs); } - template < - typename T, - typename std::enable_if_t && !cudf::is_dictionary() && - !cudf::is_fixed_point()>* = nullptr> + template && !cudf::is_dictionary() && + !cudf::is_fixed_point()>* = nullptr> static constexpr T identity() { // chrono types do not have std::numeric_limits specializations and should use T::min() @@ -179,20 +176,20 @@ struct DeviceMax { return cuda::std::numeric_limits::lowest(); } - template ()>* = nullptr> + template ()>* = nullptr> static constexpr T identity() { CUDF_FAIL("fixed_point does not yet support DeviceMax identity"); return cuda::std::numeric_limits::lowest(); } - template >* = nullptr> + template >* = nullptr> CUDF_HOST_DEVICE inline static constexpr T identity() { return string_view::min(); } - template ()>* = nullptr> + template ()>* = nullptr> static constexpr T identity() { return static_cast(T::lowest_value()); @@ -203,19 +200,19 @@ struct DeviceMax { * @brief binary `product` operator */ struct DeviceProduct { - template ()>* = nullptr> + template ()>* = nullptr> CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs * rhs) { return lhs * rhs; } - template ()>* = nullptr> + template ()>* = nullptr> static constexpr T identity() { return T{1}; } - template ()>* = nullptr> + template ()>* = nullptr> static constexpr T identity() { CUDF_FAIL("fixed_point does not yet support DeviceProduct identity"); @@ -227,7 +224,7 @@ struct DeviceProduct { * @brief binary `and` operator */ struct DeviceAnd { - template >* = nullptr> + template >* = nullptr> CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs & rhs) { return (lhs & rhs); @@ -238,7 +235,7 @@ struct DeviceAnd { * @brief binary `or` operator */ struct DeviceOr { - template >* = nullptr> + template >* = nullptr> CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs | rhs) { return (lhs | rhs); @@ 
-249,7 +246,7 @@ struct DeviceOr { * @brief binary `xor` operator */ struct DeviceXor { - template >* = nullptr> + template >* = nullptr> CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs ^ rhs) { return (lhs ^ rhs); diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 6cf1acd2f5a..7eefdc90f4b 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -90,9 +90,9 @@ struct MurmurHash3_32 { MurmurHash3_32() = default; constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {} - [[nodiscard]] __device__ inline uint32_t rotl32(uint32_t x, int8_t r) const + [[nodiscard]] __device__ inline uint32_t rotl32(uint32_t x, uint32_t r) const { - return (x << r) | (x >> (32 - r)); + return __funnelshift_l(x, x, r); // Equivalent to (x << r) | (x >> (32 - r)) } [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const @@ -114,32 +114,6 @@ struct MurmurHash3_32 { return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); } - /* Copyright 2005-2014 Daniel James. - * - * Use, modification and distribution is subject to the Boost Software - * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ - /** - * @brief Combines two hash values into a new single hash value. Called - * repeatedly to create a hash value from several variables. - * Taken from the Boost hash_combine function - * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html - * - * @param lhs The first hash value to combine - * @param rhs The second hash value to combine - * - * @returns A hash value that intelligently combines the lhs and rhs hash values - */ - constexpr result_type hash_combine(result_type lhs, result_type rhs) const - { - result_type combined{lhs}; - - combined ^= rhs + 0x9e3779b9 + (combined << 6) + (combined >> 2); - - return combined; - } - // TODO Do we need this operator() and/or compute? Probably not both. [[nodiscard]] result_type __device__ inline operator()(Key const& key) const { @@ -218,28 +192,25 @@ hash_value_type __device__ inline MurmurHash3_32::operator()(bool const& k return this->compute(static_cast(key)); } -/** - * @brief Specialization of MurmurHash3_32 operator for strings. 
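// The rotl32 change above replaces the shift-and-or rotate with CUDA's funnel-shift
// intrinsic. The self-contained kernel below spot-checks that claimed equivalence for
// r in [1, 31]; it is an illustration only, not part of the patch.
#include <cuda_runtime.h>

#include <cstdio>

__device__ unsigned int rotl32_shift(unsigned int x, unsigned int r)
{
  return (x << r) | (x >> (32 - r));  // classic rotate-left, valid for r in [1, 31]
}

__device__ unsigned int rotl32_funnel(unsigned int x, unsigned int r)
{
  return __funnelshift_l(x, x, r);  // shift the 64-bit value x:x left by r, keep the high word
}

__global__ void check_rotl(unsigned int x)
{
  for (unsigned int r = 1; r < 32; ++r) {
    if (rotl32_shift(x, r) != rotl32_funnel(x, r)) { printf("mismatch at r=%u\n", r); }
  }
}

int main()
{
  check_rotl<<<1, 1>>>(0x12345678u);
  return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}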
- */ template <> -hash_value_type __device__ inline MurmurHash3_32::operator()( - cudf::string_view const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()(float const& key) const { - auto const data = reinterpret_cast(key.data()); - auto const len = key.size_bytes(); - return this->compute_bytes(data, len); + return this->compute_floating_point(key); } template <> -hash_value_type __device__ inline MurmurHash3_32::operator()(float const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()(double const& key) const { return this->compute_floating_point(key); } template <> -hash_value_type __device__ inline MurmurHash3_32::operator()(double const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()( + cudf::string_view const& key) const { - return this->compute_floating_point(key); + auto const data = reinterpret_cast(key.data()); + auto const len = key.size_bytes(); + return this->compute_bytes(data, len); } template <> @@ -286,9 +257,9 @@ struct SparkMurmurHash3_32 { SparkMurmurHash3_32() = default; constexpr SparkMurmurHash3_32(uint32_t seed) : m_seed(seed) {} - __device__ inline uint32_t rotl32(uint32_t x, int8_t r) const + [[nodiscard]] __device__ inline uint32_t rotl32(uint32_t x, uint32_t r) const { - return (x << r) | (x >> (32 - r)); + return __funnelshift_l(x, x, r); // Equivalent to (x << r) | (x >> (32 - r)) } __device__ inline uint32_t fmix32(uint32_t h) const @@ -408,6 +379,27 @@ hash_value_type __device__ inline SparkMurmurHash3_32::operator()( return this->compute(key); } +template <> +hash_value_type __device__ inline SparkMurmurHash3_32::operator()(float const& key) const +{ + return this->compute_floating_point(key); +} + +template <> +hash_value_type __device__ inline SparkMurmurHash3_32::operator()(double const& key) const +{ + return this->compute_floating_point(key); +} + +template <> +hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + cudf::string_view const& key) const +{ + auto const data = reinterpret_cast(key.data()); + auto const len = key.size_bytes(); + return this->compute_bytes(data, len); +} + template <> hash_value_type __device__ inline SparkMurmurHash3_32::operator()( numeric::decimal32 const& key) const @@ -480,30 +472,6 @@ hash_value_type __device__ inline SparkMurmurHash3_32::operat return 0; } -/** - * @brief Specialization of MurmurHash3_32 operator for strings. - */ -template <> -hash_value_type __device__ inline SparkMurmurHash3_32::operator()( - cudf::string_view const& key) const -{ - auto const data = reinterpret_cast(key.data()); - auto const len = key.size_bytes(); - return this->compute_bytes(data, len); -} - -template <> -hash_value_type __device__ inline SparkMurmurHash3_32::operator()(float const& key) const -{ - return this->compute_floating_point(key); -} - -template <> -hash_value_type __device__ inline SparkMurmurHash3_32::operator()(double const& key) const -{ - return this->compute_floating_point(key); -} - /** * @brief This hash function simply returns the value that is asked to be hash * reinterpreted as the result_type of the functor. @@ -514,32 +482,6 @@ struct IdentityHash { IdentityHash() = default; constexpr IdentityHash(uint32_t seed) : m_seed(seed) {} - /* Copyright 2005-2014 Daniel James. - * - * Use, modification and distribution is subject to the Boost Software - * License, Version 1.0. 
(See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ - /** - * @brief Combines two hash values into a new single hash value. Called - * repeatedly to create a hash value from several variables. - * Taken from the Boost hash_combine function - * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html - * - * @param lhs The first hash value to combine - * @param rhs The second hash value to combine - * - * @returns A hash value that intelligently combines the lhs and rhs hash values - */ - constexpr result_type hash_combine(result_type lhs, result_type rhs) const - { - result_type combined{lhs}; - - combined ^= rhs + 0x9e3779b9 + (combined << 6) + (combined >> 2); - - return combined; - } - template constexpr std::enable_if_t, return_type> operator()( Key const& key) const diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index 4a7e9b89c80..ee9e4b2c687 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -64,7 +64,7 @@ __global__ void valid_if_kernel( size_type block_count = single_lane_block_sum_reduce(warp_valid_count); if (threadIdx.x == 0) { atomicAdd(valid_count, block_count); } -} // namespace detail +} /** * @brief Generate a bitmask where every bit is set for which a predicate is diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp new file mode 100644 index 00000000000..cb2d00f0d1f --- /dev/null +++ b/cpp/include/cudf/io/text/byte_range_info.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace cudf { +namespace io { +namespace text { + +/** + * @brief stores offset and size used to indicate a byte range + */ +class byte_range_info { + private: + int64_t _offset; + int64_t _size; + + public: + constexpr byte_range_info() noexcept : _offset(0), _size(0) {} + constexpr byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size) + { + CUDF_EXPECTS(offset >= 0, "offset must be non-negative"); + CUDF_EXPECTS(size >= 0, "size must be non-negative"); + } + + constexpr byte_range_info(byte_range_info const& other) noexcept = default; + constexpr byte_range_info& operator=(byte_range_info const& other) noexcept = default; + + [[nodiscard]] constexpr int64_t offset() { return _offset; } + [[nodiscard]] constexpr int64_t size() { return _size; } +}; + +/** + * @brief Create a collection of consecutive ranges between [0, total_bytes). 
+ * + * Each range wil be the same size except if `total_bytes` is not evenly divisible by + * `range_count`, in which case the last range size will be the remainder. + * + * @param total_bytes total number of bytes in all ranges + * @param range_count total number of ranges in which to divide bytes + * @return Vector of range objects + */ +std::vector create_byte_range_infos_consecutive(int64_t total_bytes, + int64_t range_count); + +/** + * @brief Create a byte_range_info which represents as much of a file as possible. Specifically, + * `[0, numeric_limit::max())`. + * + * @return `[0, numeric_limit::max())` + */ +byte_range_info create_byte_range_info_max(); + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index 5e6dda5a514..3499b86ab42 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,6 +36,7 @@ namespace text { */ class device_data_chunk { public: + virtual ~device_data_chunk() = default; [[nodiscard]] virtual char const* data() const = 0; [[nodiscard]] virtual std::size_t size() const = 0; virtual operator device_span() const = 0; @@ -52,6 +53,9 @@ class device_data_chunk { */ class data_chunk_reader { public: + virtual ~data_chunk_reader() = default; + virtual void skip_bytes(std::size_t size) = 0; + /** * @brief Get the next chunk of bytes from the data source * @@ -76,6 +80,7 @@ class data_chunk_reader { */ class data_chunk_source { public: + virtual ~data_chunk_source() = default; [[nodiscard]] virtual std::unique_ptr create_reader() const = 0; }; diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index aeb4b7fff53..ffe159b59dc 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,6 +89,8 @@ class istream_data_chunk_reader : public data_chunk_reader { } } + void skip_bytes(std::size_t size) override { _datastream->ignore(size); }; + std::unique_ptr get_next_chunk(std::size_t read_size, rmm::cuda_stream_view stream) override { @@ -143,6 +145,12 @@ class device_span_data_chunk_reader : public data_chunk_reader { public: device_span_data_chunk_reader(device_span data) : _data(data) {} + void skip_bytes(std::size_t read_size) override + { + if (read_size > _data.size() - _position) { read_size = _data.size() - _position; } + _position += read_size; + }; + std::unique_ptr get_next_chunk(std::size_t read_size, rmm::cuda_stream_view stream) override { diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index 06d15276a68..a908a9fa227 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
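// A small sketch of the byte_range_info helpers introduced above: split a 1 GiB input into
// four consecutive ranges and print each one. The file size and range count are arbitrary
// example values.
#include <cudf/io/text/byte_range_info.hpp>

#include <cstdint>
#include <cstdio>

int main()
{
  int64_t const total_bytes = int64_t{1} << 30;
  auto const ranges = cudf::io::text::create_byte_range_infos_consecutive(total_bytes, 4);
  for (auto range : ranges) {
    // Each range here is total_bytes / 4; if total_bytes were not evenly divisible, the
    // remainder would land in the last range, per the documentation above.
    std::printf("offset=%lld size=%lld\n",
                static_cast<long long>(range.offset()),
                static_cast<long long>(range.size()));
  }
  return 0;
}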
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,20 +89,6 @@ struct trie_device_view { */ constexpr uint8_t get_match_length(uint16_t idx) { return _nodes[idx].match_length; } - /** - * @brief returns the longest matching state of any state in the multistate. - */ - template - constexpr uint8_t get_match_length(multistate const& states) - { - int8_t val = 0; - for (uint8_t i = 0; i < states.size(); i++) { - auto match_length = get_match_length(states.get_tail(i)); - if (match_length > val) { val = match_length; } - } - return val; - } - private: constexpr void transition_enqueue_all( // char c, diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index d42ee9f510e..77affa95ca8 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,20 +17,65 @@ #pragma once #include +#include #include #include #include +#include namespace cudf { namespace io { namespace text { +/** + * @brief Splits the source text into a strings column using a multiple byte delimiter. + * + * Providing a byte range allows multibyte_split to read a whole file, but only return the offsets + * of delimiters which begin within the range. If thinking in terms of "records", where each + * delimiter dictates the end of a record, all records which begin within the byte range provided + * will be returned, including any record which may begin in the range but end outside of the + * range. Records which begin outside of the range will ignored, even if those records end inside + * the range. + * + * @code{.pseudo} + * Examples: + * source: "abc..def..ghi..jkl.." + * delimiter: ".." + * + * byte_range: nullopt + * return: ["abc..", "def..", "ghi..", jkl..", ""] + * + * byte_range: [0, 2) + * return: ["abc.."] + * + * byte_range: [2, 9) + * return: ["def..", "ghi.."] + * + * byte_range: [11, 2) + * return: [] + * + * byte_range: [13, 7) + * return: ["jkl..", ""] + * @endcode + * + * @param source The source string + * @param delimiter UTF-8 encoded string for which to find offsets in the source + * @param byte_range range in which to consider offsets relevant + * @param mr Memory resource to use for the device memory allocation + * @return The strings found by splitting the source by the delimiter within the relevant byte + * range. + */ std::unique_ptr multibyte_split( data_chunk_source const& source, std::string const& delimiter, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + std::optional byte_range = std::nullopt, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr multibyte_split(data_chunk_source const& source, + std::string const& delimiter, + rmm::mr::device_memory_resource* mr); } // namespace text } // namespace io diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp index ca8a33ac924..f140ba7d4a9 100644 --- a/cpp/include/cudf/reduction.hpp +++ b/cpp/include/cudf/reduction.hpp @@ -48,18 +48,21 @@ enum class scan_type : bool { INCLUSIVE, EXCLUSIVE }; * output data type. 
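// A sketch of the byte-range overload of multibyte_split documented above: parse only the
// records that begin in the first half of a file. make_source_from_file is assumed to be the
// factory from data_chunk_source_factories.hpp; the path argument is a placeholder.
#include <cudf/column/column.hpp>
#include <cudf/io/text/byte_range_info.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/multibyte_split.hpp>

#include <cstdint>
#include <memory>
#include <string>

std::unique_ptr<cudf::column> read_first_half(std::string const& path, int64_t file_size)
{
  auto const source = cudf::io::text::make_source_from_file(path);
  // Only records that begin in [0, file_size / 2) are returned; a record straddling the
  // midpoint is still returned in full, per the documentation above.
  auto const range = cudf::io::text::byte_range_info{0, file_size / 2};
  return cudf::io::text::multibyte_split(*source, "\n", range);
}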
* @throw cudf::logic_error if `min` or `max` reduction is called and the * output type does not match the input column data type. + * @throw cudf::logic_error if `any` or `all` reduction is called and the + * output type is not bool8. + * @throw cudf::logic_error if `mean`, `var`, or `std` reduction is called and + * the output type is not floating point. * * If the input column has arithmetic type, output_dtype can be any arithmetic - * type. For `mean`, `var` and `std` ops, a floating point output type must be - * specified. If the input column has non-arithmetic type - * eg.(timestamp, string...), the same type must be specified. + * type. If the input column has non-arithmetic type, e.g. timestamp or string, + * the same output type must be specified. * * If the reduction fails, the member is_valid of the output scalar * will contain `false`. * * @param col Input column view * @param agg Aggregation operator applied by the reduction - * @param output_dtype The computation and output precision. + * @param output_dtype The computation and output precision. * @param mr Device memory resource used to allocate the returned scalar's device memory * @returns Output scalar with reduce result. */ @@ -69,6 +72,56 @@ std::unique_ptr reduce( data_type output_dtype, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Compute reduction of each segment in the input column + * + * This function does not detect overflows in reductions. When given integral and + * floating point inputs, their values are promoted to `int64_t` and `double` + * respectively to compute, and casted to @p output_dtype before returning. + * + * Null values are treated as identities during reduction. + * + * If the segment is empty, the row corresponding to the result of the + * segment is null. + * + * If any index in @p offsets is out of bound of @p segmented_values , the behavior + * is undefined. + * + * @note If the input column has arithmetic type, output_dtype can be any arithmetic + * type. If the input column has non-arithmetic type, e.g. timestamp, the same + * output type must be specified. + * + * @note If input is not empty, the result is always nullable. + * + * @throw cudf::logic_error if reduction is called for non-arithmetic output + * type and operator other than `min` and `max`. + * @throw cudf::logic_error if input column data type is not convertible to + * output data type. + * @throw cudf::logic_error if `min` or `max` reduction is called and the + * output type does not match the input column data type. + * @throw cudf::logic_error if `any` or `all` reduction is called and the + * output type is not bool8. + * + * @param segmented_values Column view of segmented inputs. + * @param offsets Each segment's offset of @p segmented_values. A list of offsets + * with size `num_segments + 1`. The size of `i`th segment is `offsets[i+1] - + * offsets[i]`. + * @param agg Aggregation operator applied by the reduction. + * @param output_dtype The output precision. + * @param null_handling If `INCLUDE`, the reduction is valid if all elements in + * a segment are valid, otherwise null. If `EXCLUDE`, the reduction is valid if + * any element in the segment is valid, otherwise null. + * @param mr Device memory resource used to allocate the returned scalar's device memory + * @returns Output column with results of segmented reduction. 
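// A sketch of driving the public segmented_reduce API documented above with the new
// segmented_reduce_aggregation factories; the wrapper name and the EXCLUDE policy are just
// example choices.
#include <cudf/aggregation.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/reduction.hpp>
#include <cudf/utilities/span.hpp>

#include <memory>

std::unique_ptr<cudf::column> max_per_segment(cudf::column_view const& values,
                                              cudf::device_span<cudf::size_type const> offsets)
{
  auto const agg = cudf::make_max_aggregation<cudf::segmented_reduce_aggregation>();
  // With EXCLUDE, a segment is valid as long as it holds at least one valid element.
  return cudf::segmented_reduce(values, offsets, *agg, values.type(), cudf::null_policy::EXCLUDE);
}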
+ */ +std::unique_ptr segmented_reduce( + column_view const& segmented_values, + device_span offsets, + segmented_reduce_aggregation const& agg, + data_type output_dtype, + null_policy null_handling, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Computes the scan of a column. * diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index c17abe8267d..ff334b9ee85 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -145,6 +145,36 @@ std::unique_ptr
sort_by_key( std::vector const& null_precedence = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Performs a key-value stable sort. + * + * Creates a new table that reorders the rows of `values` according to the + * lexicographic ordering of the rows of `keys`. + * + * The order of equivalent elements is guaranteed to be preserved. + * + * @throws cudf::logic_error if `values.num_rows() != keys.num_rows()`. + * + * @param values The table to reorder + * @param keys The table that determines the ordering + * @param column_order The desired order for each column in `keys`. Size must be + * equal to `keys.num_columns()` or empty. If empty, all columns are sorted in + * ascending order. + * @param null_precedence The desired order of a null element compared to other + * elements for each column in `keys`. Size must be equal to + * `keys.num_columns()` or empty. If empty, all columns will be sorted with + * `null_order::BEFORE`. + * @param mr Device memory resource used to allocate the returned table's device memory + * @return The reordering of `values` determined by the lexicographic order of + * the rows of `keys`. + */ +std::unique_ptr
stable_sort_by_key( + table_view const& values, + table_view const& keys, + std::vector const& column_order = {}, + std::vector const& null_precedence = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Computes the ranks of input column in sorted order. * diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp new file mode 100644 index 00000000000..1b88a9dd8fd --- /dev/null +++ b/cpp/include/cudf/strings/detail/utf8.hpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +/** + * @file + * @brief Standalone string functions. + */ + +namespace cudf { + +using char_utf8 = uint32_t; ///< UTF-8 characters are 1-4 bytes + +namespace strings { +namespace detail { + +/** + * @brief This will return true if passed the first byte of a UTF-8 character. + * + * @param byte Any byte from a valid UTF-8 character + * @return true if this the first byte of the character + */ +constexpr bool is_begin_utf8_char(uint8_t byte) +{ + // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character. + return (byte & 0xC0) != 0x80; +} + +/** + * @brief Returns the number of bytes in the specified character. + * + * @param character Single character + * @return Number of bytes + */ +constexpr size_type bytes_in_char_utf8(char_utf8 character) +{ + return 1 + static_cast((character & unsigned{0x0000FF00}) > 0) + + static_cast((character & unsigned{0x00FF0000}) > 0) + + static_cast((character & unsigned{0xFF000000}) > 0); +} + +/** + * @brief Returns the number of bytes used to represent the provided byte. + * + * This could be 0 to 4 bytes. 0 is returned for intermediate bytes within a + * single character. For example, for the two-byte 0xC3A8 single character, + * the first byte would return 2 and the second byte would return 0. + * + * @param byte Byte from an encoded character. + * @return Number of bytes. + */ +constexpr size_type bytes_in_utf8_byte(uint8_t byte) +{ + return 1 + static_cast((byte & 0xF0) == 0xF0) // 4-byte character prefix + + static_cast((byte & 0xE0) == 0xE0) // 3-byte character prefix + + static_cast((byte & 0xC0) == 0xC0) // 2-byte character prefix + - static_cast((byte & 0xC0) == 0x80); // intermediate byte +} + +/** + * @brief Convert a char array into a char_utf8 value. + * + * @param str String containing encoded char bytes. + * @param[out] character Single char_utf8 value. 
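// A minimal sketch of the stable_sort_by_key API declared above: reorder a payload table by
// a key table while preserving the relative order of rows with equal keys. The wrapper name
// is illustrative.
#include <cudf/sorting.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>

std::unique_ptr<cudf::table> stable_reorder(cudf::table_view const& payload,
                                            cudf::table_view const& keys)
{
  // Empty column_order/null_precedence vectors mean all key columns sort ascending with
  // null_order::BEFORE, per the documentation above.
  return cudf::stable_sort_by_key(payload, keys);
}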
+ * @return The number of bytes in the character + */ +constexpr size_type to_char_utf8(const char* str, char_utf8& character) +{ + size_type const chr_width = bytes_in_utf8_byte(static_cast(*str)); + + character = static_cast(*str++) & 0xFF; + if (chr_width > 1) { + character = character << 8; + character |= (static_cast(*str++) & 0xFF); // << 8; + if (chr_width > 2) { + character = character << 8; + character |= (static_cast(*str++) & 0xFF); // << 16; + if (chr_width > 3) { + character = character << 8; + character |= (static_cast(*str++) & 0xFF); // << 24; + } + } + } + return chr_width; +} + +/** + * @brief Place a char_utf8 value into a char array. + * + * @param character Single character + * @param[out] str Output array. + * @return The number of bytes in the character + */ +constexpr inline size_type from_char_utf8(char_utf8 character, char* str) +{ + size_type const chr_width = bytes_in_char_utf8(character); + for (size_type idx = 0; idx < chr_width; ++idx) { + str[chr_width - idx - 1] = static_cast(character) & 0xFF; + character = character >> 8; + } + return chr_width; +} + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 24c8bfea2be..9ef361d6519 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once +#include #include #ifndef __CUDA_ARCH__ diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index f88f573ac0c..0c76f7d818d 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -333,96 +333,4 @@ class string_view { __device__ [[nodiscard]] inline size_type character_offset(size_type bytepos) const; }; -namespace strings { -namespace detail { - -/** - * @brief This will return true if passed the first byte of a UTF-8 character. - * - * @param byte Any byte from a valid UTF-8 character - * @return true if this the first byte of the character - */ -constexpr bool is_begin_utf8_char(uint8_t byte) -{ - // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character. - return (byte & 0xC0) != 0x80; -} - -/** - * @brief Returns the number of bytes in the specified character. - * - * @param character Single character - * @return Number of bytes - */ -constexpr size_type bytes_in_char_utf8(char_utf8 character) -{ - return 1 + static_cast((character & unsigned{0x0000FF00}) > 0) + - static_cast((character & unsigned{0x00FF0000}) > 0) + - static_cast((character & unsigned{0xFF000000}) > 0); -} - -/** - * @brief Returns the number of bytes used to represent the provided byte. - * - * This could be 0 to 4 bytes. 0 is returned for intermediate bytes within a - * single character. For example, for the two-byte 0xC3A8 single character, - * the first byte would return 2 and the second byte would return 0. - * - * @param byte Byte from an encoded character. - * @return Number of bytes. 
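// A round-trip sketch for the UTF-8 helpers in the new utf8.hpp header above: decode one
// multi-byte character and encode it back. Purely illustrative.
#include <cudf/strings/detail/utf8.hpp>

#include <cstdio>

int main()
{
  char const* input = "\xC3\xA8";  // the two-byte character U+00E8
  cudf::char_utf8 chr = 0;
  auto const read_bytes = cudf::strings::detail::to_char_utf8(input, chr);  // read_bytes == 2

  char buffer[4];
  auto const written_bytes = cudf::strings::detail::from_char_utf8(chr, buffer);

  std::printf("decoded 0x%04X from %d byte(s), re-encoded into %d byte(s)\n",
              chr,
              static_cast<int>(read_bytes),
              static_cast<int>(written_bytes));
  return 0;
}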
- */ -constexpr size_type bytes_in_utf8_byte(uint8_t byte) -{ - return 1 + static_cast((byte & 0xF0) == 0xF0) // 4-byte character prefix - + static_cast((byte & 0xE0) == 0xE0) // 3-byte character prefix - + static_cast((byte & 0xC0) == 0xC0) // 2-byte character prefix - - static_cast((byte & 0xC0) == 0x80); // intermediate byte -} - -/** - * @brief Convert a char array into a char_utf8 value. - * - * @param str String containing encoded char bytes. - * @param[out] character Single char_utf8 value. - * @return The number of bytes in the character - */ -CUDF_HOST_DEVICE inline size_type to_char_utf8(const char* str, char_utf8& character) -{ - size_type const chr_width = bytes_in_utf8_byte(static_cast(*str)); - - character = static_cast(*str++) & 0xFF; - if (chr_width > 1) { - character = character << 8; - character |= (static_cast(*str++) & 0xFF); // << 8; - if (chr_width > 2) { - character = character << 8; - character |= (static_cast(*str++) & 0xFF); // << 16; - if (chr_width > 3) { - character = character << 8; - character |= (static_cast(*str++) & 0xFF); // << 24; - } - } - } - return chr_width; -} - -/** - * @brief Place a char_utf8 value into a char array. - * - * @param character Single character - * @param[out] str Allocated char array with enough space to hold the encoded character. - * @return The number of bytes in the character - */ -CUDF_HOST_DEVICE inline size_type from_char_utf8(char_utf8 character, char* str) -{ - size_type const chr_width = bytes_in_char_utf8(character); - for (size_type idx = 0; idx < chr_width; ++idx) { - str[chr_width - idx - 1] = static_cast(character) & 0xFF; - character = character >> 8; - } - return chr_width; -} - -} // namespace detail -} // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index 5572c98fa58..b5d46935fe8 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -355,12 +356,13 @@ class row_lexicographic_comparator { * @brief Construct a function object for performing a lexicographic * comparison between the rows of two tables. * + * Behavior is undefined if called with incomparable column types. + * * @throws cudf::logic_error if `lhs.num_columns() != rhs.num_columns()` - * @throws cudf::logic_error if column types of `lhs` and `rhs` are not comparable. * + * @param has_nulls Indicates if either input table contains columns with nulls. * @param lhs The first table * @param rhs The second table (may be the same table as `lhs`) - * @param has_nulls Indicates if either input table contains columns with nulls. * @param column_order Optional, device array the same length as a row that * indicates the desired ascending/descending order of each column in a row. * If `nullptr`, it is assumed all columns are sorted in ascending order. 
@@ -381,8 +383,6 @@ class row_lexicographic_comparator { _null_precedence{null_precedence} { CUDF_EXPECTS(_lhs.num_columns() == _rhs.num_columns(), "Mismatched number of columns."); - CUDF_EXPECTS(detail::is_relationally_comparable(_lhs, _rhs), - "Attempted to compare elements of uncomparable types."); } /** @@ -503,18 +503,14 @@ class row_hasher { __device__ auto operator()(size_type row_index) const { - auto hash_combiner = [](hash_value_type lhs, hash_value_type rhs) { - return hash_function{}.hash_combine(lhs, rhs); - }; - // Hash the first column w/ the seed - auto const initial_hash = - hash_combiner(hash_value_type{0}, - type_dispatcher( - _table.column(0).type(), - element_hasher_with_seed{_has_nulls, _seed}, - _table.column(0), - row_index)); + auto const initial_hash = cudf::detail::hash_combine( + hash_value_type{0}, + type_dispatcher( + _table.column(0).type(), + element_hasher_with_seed{_has_nulls, _seed}, + _table.column(0), + row_index)); // Hashes an element in a column auto hasher = [=](size_type column_index) { @@ -533,7 +529,9 @@ class row_hasher { thrust::make_counting_iterator(_table.num_columns()), hasher, initial_hash, - hash_combiner); + [](hash_value_type lhs, hash_value_type rhs) { + return cudf::detail::hash_combine(lhs, rhs); + }); } private: diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index ce61e8853b6..3ed18099463 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
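// The row_hasher change above replaces each functor's private hash_combine member with a
// shared cudf::detail::hash_combine call. The standalone function below restates the
// Boost-style formula the removed members used, as a reference for what the combine step
// computes; it is not the library's implementation.
#include <cstdint>

constexpr uint32_t boost_style_hash_combine(uint32_t lhs, uint32_t rhs)
{
  // Mix rhs into lhs via the 32-bit golden-ratio constant and two shifted copies of lhs.
  lhs ^= rhs + 0x9e3779b9u + (lhs << 6) + (lhs >> 2);
  return lhs;
}

static_assert(boost_style_hash_combine(0u, 0u) == 0x9e3779b9u,
              "combining two zero hashes yields the golden-ratio constant");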
@@ -150,10 +150,4 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st return std::make_tuple(std::move(descendant_storage), d_columns); } -namespace detail { -extern template bool is_relationally_comparable(table_device_view const& lhs, - table_device_view const& rhs); -extern template bool is_relationally_comparable( - mutable_table_device_view const& lhs, mutable_table_device_view const& rhs); -} // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 9ccd4d21682..1172a5a68cd 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -159,9 +159,9 @@ struct host_span : public cudf::detail::span_base, - void>::type* = nullptr> + std::enable_if_t<(Extent == OtherExtent || Extent == dynamic_extent) && + std::is_convertible_v, + void>* = nullptr> constexpr host_span(const host_span& other) noexcept : base(other.data(), other.size()) { @@ -220,9 +220,9 @@ struct device_span : public cudf::detail::span_base, - void>::type* = nullptr> + std::enable_if_t<(Extent == OtherExtent || Extent == dynamic_extent) && + std::is_convertible_v, + void>* = nullptr> constexpr device_span(const device_span& other) noexcept : base(other.data(), other.size()) { @@ -283,9 +283,9 @@ class base_2dspan { template typename OtherRowType, - typename std::enable_if, - RowType>, - void>::type* = nullptr> + std::enable_if_t, + RowType>, + void>* = nullptr> constexpr base_2dspan(base_2dspan const& other) noexcept : _data{other.data()}, _size{other.size()} { diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index f1ad11a9030..504ec6de405 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -676,13 +676,13 @@ constexpr inline bool is_nested(data_type type) template struct is_bit_castable_to_impl { - template ()>* = nullptr> + template ()>* = nullptr> constexpr bool operator()() { return false; } - template ()>* = nullptr> + template ()>* = nullptr> constexpr bool operator()() { if (not cuda::std::is_trivially_copyable_v || @@ -696,13 +696,13 @@ struct is_bit_castable_to_impl { }; struct is_bit_castable_from_impl { - template ()>* = nullptr> + template ()>* = nullptr> constexpr bool operator()(data_type) { return false; } - template ()>* = nullptr> + template ()>* = nullptr> constexpr bool operator()(data_type to) { return cudf::type_dispatcher(to, is_bit_castable_to_impl{}); diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index aa77686fee4..cd96748f081 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -178,7 +178,7 @@ bool validate_host_masks(std::vector const& expected_mask, * @return std::pair, std::vector> first is the * `column_view`'s data, and second is the column's bitmask. */ -template ()>* = nullptr> +template ()>* = nullptr> std::pair, std::vector> to_host(column_view c) { thrust::host_vector host_data(c.size()); @@ -197,7 +197,7 @@ std::pair, std::vector> to_host(column_view * @return std::pair, std::vector> first is the * `column_view`'s data, and second is the column's bitmask. 
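// Many hunks in this patch, including the span and traits changes above, swap the verbose
// `typename std::enable_if<..., T>::type` spelling for the `std::enable_if_t<..., T>` alias.
// The two overloads below illustrate that the substitution is purely cosmetic; the names are
// made up for the example.
#include <type_traits>

// Pre-cleanup spelling: the alias target written out by hand.
template <typename T, typename std::enable_if<std::is_integral_v<T>, T>::type* = nullptr>
constexpr T twice_old_style(T v)
{
  return v + v;
}

// Post-cleanup spelling: identical overload resolution, less noise.
template <typename T, std::enable_if_t<std::is_floating_point_v<T>, T>* = nullptr>
constexpr T twice_new_style(T v)
{
  return v + v;
}

static_assert(twice_old_style(21) == 42);
static_assert(twice_new_style(1.5) == 3.0);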
*/ -template ()>* = nullptr> +template ()>* = nullptr> std::pair, std::vector> to_host(column_view c) { using namespace numeric; diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index c190105e292..4005a4f9adc 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -93,31 +93,31 @@ class column_wrapper { template struct fixed_width_type_converter { // Are the types same - simply copy elements from [begin, end) to out - template , void>::type* = nullptr> + template , void>* = nullptr> constexpr ToT operator()(FromT element) const { return element; } // Are the types convertible or can target be constructed from source? - template && - (cudf::is_convertible::value || - std::is_constructible_v), - void>::type* = nullptr> + template < + typename FromT = From, + typename ToT = To, + std::enable_if_t && (cudf::is_convertible::value || + std::is_constructible_v), + void>* = nullptr> constexpr ToT operator()(FromT element) const { return static_cast(element); } // Convert integral values to timestamps - template && cudf::is_timestamp(), - void>::type* = nullptr> + template < + typename FromT = From, + typename ToT = To, + std::enable_if_t && cudf::is_timestamp(), void>* = nullptr> constexpr ToT operator()(FromT element) const { return ToT{typename ToT::duration{element}}; @@ -137,7 +137,7 @@ struct fixed_width_type_converter { template ()>* = nullptr> + std::enable_if_t()>* = nullptr> rmm::device_buffer make_elements(InputIterator begin, InputIterator end) { static_assert(cudf::is_fixed_width(), "Unexpected non-fixed width type."); @@ -162,8 +162,8 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end) template () and - cudf::is_fixed_point()>* = nullptr> + std::enable_if_t() and + cudf::is_fixed_point()>* = nullptr> rmm::device_buffer make_elements(InputIterator begin, InputIterator end) { using RepType = typename ElementTo::rep; @@ -187,8 +187,8 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end) template () and - cudf::is_fixed_point()>* = nullptr> + std::enable_if_t() and + cudf::is_fixed_point()>* = nullptr> rmm::device_buffer make_elements(InputIterator begin, InputIterator end) { using namespace numeric; diff --git a/cpp/include/cudf_test/file_utilities.hpp b/cpp/include/cudf_test/file_utilities.hpp index 6c21d8dfad2..4df7b6a69c8 100644 --- a/cpp/include/cudf_test/file_utilities.hpp +++ b/cpp/include/cudf_test/file_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -47,6 +47,11 @@ class temp_directory { return std::remove(pathname); } + temp_directory& operator=(temp_directory const&) = delete; + temp_directory(temp_directory const&) = delete; + temp_directory& operator=(temp_directory&&) = default; + temp_directory(temp_directory&&) = default; + ~temp_directory() { // TODO: should use std::filesystem instead, once C++17 support added diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp index e84417c91d6..ac2892a0f34 100644 --- a/cpp/include/cudf_test/type_lists.hpp +++ b/cpp/include/cudf_test/type_lists.hpp @@ -80,9 +80,8 @@ constexpr auto types_to_ids() * @return Vector of TypeParam with the values specified */ template -typename std::enable_if() && - !cudf::is_timestamp_t::value, - thrust::host_vector>::type +std::enable_if_t() && !cudf::is_timestamp_t::value, + thrust::host_vector> make_type_param_vector(std::initializer_list const& init_list) { thrust::host_vector vec(init_list.size()); @@ -100,8 +99,7 @@ make_type_param_vector(std::initializer_list const& init_list) * @return Vector of TypeParam with the values specified */ template -typename std::enable_if::value, - thrust::host_vector>::type +std::enable_if_t::value, thrust::host_vector> make_type_param_vector(std::initializer_list const& init_list) { thrust::host_vector vec(init_list.size()); @@ -119,8 +117,7 @@ make_type_param_vector(std::initializer_list const& init_list) */ template -typename std::enable_if, - thrust::host_vector>::type +std::enable_if_t, thrust::host_vector> make_type_param_vector(std::initializer_list const& init_list) { thrust::host_vector vec(init_list.size()); diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 9f9b23c3057..9405b4c37ac 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -419,6 +419,8 @@ template std::unique_ptr make_sum_aggregation make_sum_aggregation(); template std::unique_ptr make_sum_aggregation(); template std::unique_ptr make_sum_aggregation(); +template std::unique_ptr +make_sum_aggregation(); /// Factory to create a PRODUCT aggregation template @@ -430,6 +432,8 @@ template std::unique_ptr make_product_aggregation(); template std::unique_ptr make_product_aggregation(); template std::unique_ptr make_product_aggregation(); template std::unique_ptr make_product_aggregation(); +template std::unique_ptr +make_product_aggregation(); /// Factory to create a MIN aggregation template @@ -443,6 +447,8 @@ template std::unique_ptr make_min_aggregation make_min_aggregation(); template std::unique_ptr make_min_aggregation(); template std::unique_ptr make_min_aggregation(); +template std::unique_ptr +make_min_aggregation(); /// Factory to create a MAX aggregation template @@ -456,6 +462,8 @@ template std::unique_ptr make_max_aggregation make_max_aggregation(); template std::unique_ptr make_max_aggregation(); template std::unique_ptr make_max_aggregation(); +template std::unique_ptr +make_max_aggregation(); /// Factory to create a COUNT aggregation template @@ -482,6 +490,8 @@ std::unique_ptr make_any_aggregation() } template std::unique_ptr make_any_aggregation(); template std::unique_ptr make_any_aggregation(); +template std::unique_ptr +make_any_aggregation(); /// Factory to create a ALL aggregation template @@ -491,6 +501,8 @@ std::unique_ptr make_all_aggregation() } template std::unique_ptr make_all_aggregation(); template std::unique_ptr make_all_aggregation(); +template std::unique_ptr +make_all_aggregation(); /// Factory to create 
a SUM_OF_SQUARES aggregation template @@ -675,6 +687,8 @@ template std::unique_ptr make_collect_list_aggregation make_collect_list_aggregation( null_policy null_handling); +template std::unique_ptr make_collect_list_aggregation( + null_policy null_handling); /// Factory to create a COLLECT_SET aggregation template @@ -690,6 +704,8 @@ template std::unique_ptr make_collect_set_aggregation make_collect_set_aggregation( null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); +template std::unique_ptr make_collect_set_aggregation( + null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); /// Factory to create a LAG aggregation template @@ -736,6 +752,7 @@ std::unique_ptr make_merge_lists_aggregation() } template std::unique_ptr make_merge_lists_aggregation(); template std::unique_ptr make_merge_lists_aggregation(); +template std::unique_ptr make_merge_lists_aggregation(); /// Factory to create a MERGE_SETS aggregation template @@ -748,6 +765,8 @@ template std::unique_ptr make_merge_sets_aggregation(n nan_equality); template std::unique_ptr make_merge_sets_aggregation( null_equality, nan_equality); +template std::unique_ptr make_merge_sets_aggregation( + null_equality, nan_equality); /// Factory to create a MERGE_M2 aggregation template diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 995c6702cf8..c4538379836 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -119,9 +119,9 @@ struct compare_functor { // This is used to compare a scalar and a column value template - __device__ inline typename std::enable_if_t && - !std::is_same_v, - OutT> + __device__ inline std::enable_if_t && + !std::is_same_v, + OutT> operator()(cudf::size_type i) const { return cfunc_(lhs_dev_view_.is_valid(i), @@ -133,9 +133,9 @@ struct compare_functor { // This is used to compare a scalar and a column value template - __device__ inline typename std::enable_if_t && - std::is_same_v, - OutT> + __device__ inline std::enable_if_t && + std::is_same_v, + OutT> operator()(cudf::size_type i) const { return cfunc_(lhs_dev_view_.is_valid(), @@ -147,9 +147,9 @@ struct compare_functor { // This is used to compare 2 column values template - __device__ inline typename std::enable_if_t && - std::is_same_v, - OutT> + __device__ inline std::enable_if_t && + std::is_same_v, + OutT> operator()(cudf::size_type i) const { return cfunc_(lhs_dev_view_.is_valid(i), diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index fefe0b3c862..118a08ab26d 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
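// The std::enable_if changes in the hunks above (and throughout this diff)
// follow one pattern: the verbose C++11 spelling
// `typename std::enable_if<Cond, T>::type` becomes the C++14 alias
// `std::enable_if_t<Cond, T>`. A minimal standalone sketch of the
// before/after shape (illustrative function, not a libcudf signature):

#include <type_traits>

// C++11 spelling, as removed by this diff:
template <typename T>
typename std::enable_if<std::is_integral<T>::value, T>::type twice_old(T v)
{
  return v + v;
}

// C++14 alias, as introduced by this diff: the same overload, minus the
// leading `typename` and the trailing `::type`.
template <typename T>
std::enable_if_t<std::is_integral_v<T>, T> twice_new(T v)
{
  return v + v;
}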
@@ -31,21 +31,20 @@ namespace cudf { namespace { struct size_of_helper { cudf::data_type type; - template ()>* = nullptr> + template ()>* = nullptr> constexpr int operator()() const { CUDF_FAIL("Invalid, non fixed-width element type."); return 0; } - template () && not is_fixed_point()>* = nullptr> + template () && not is_fixed_point()>* = nullptr> constexpr int operator()() const noexcept { return sizeof(T); } - template ()>* = nullptr> + template ()>* = nullptr> constexpr int operator()() const noexcept { // Only want the sizeof fixed_point::Rep as fixed_point::scale is stored in data_type diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 3412733f0b2..82e189b5a36 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -166,7 +166,7 @@ __global__ void fused_concatenate_kernel(column_device_view const* input_views, auto const output_size = output_view.size(); auto* output_data = output_view.data(); - size_type output_index = threadIdx.x + blockIdx.x * blockDim.x; + int64_t output_index = threadIdx.x + blockIdx.x * blockDim.x; size_type warp_valid_count = 0; unsigned active_mask; @@ -222,7 +222,7 @@ std::unique_ptr fused_concatenate(host_span views, auto const& d_offsets = std::get<2>(device_views); auto const output_size = std::get<3>(device_views); - CUDF_EXPECTS(output_size < static_cast(std::numeric_limits::max()), + CUDF_EXPECTS(output_size <= static_cast(std::numeric_limits::max()), "Total number of concatenated rows exceeds size_type range"); // Allocate output diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 122ad4a9752..4dbe9faaa47 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -278,14 +278,14 @@ struct launch_functor { launch_functor(column_view inp, mutable_column_view out) : input(inp), output(out) {} template - typename std::enable_if_t::value, void> operator()( + std::enable_if_t::value, void> operator()( rmm::cuda_stream_view stream) const { CUDF_FAIL("Cannot extract datetime component from non-timestamp column."); } template - typename std::enable_if_t::value, void> operator()( + std::enable_if_t::value, void> operator()( rmm::cuda_stream_view stream) const { thrust::transform(rmm::exec_policy(stream), @@ -326,18 +326,18 @@ std::unique_ptr apply_datetime_op(column_view const& column, struct add_calendrical_months_functor { template - typename std::enable_if_t::value, std::unique_ptr> - operator()(Args&&...) const + std::enable_if_t::value, std::unique_ptr> operator()( + Args&&...) 
const { CUDF_FAIL("Cannot extract datetime component from non-timestamp column."); } template - typename std::enable_if_t::value, std::unique_ptr> - operator()(column_view timestamp_column, - MonthIterator months_begin, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + std::enable_if_t::value, std::unique_ptr> operator()( + column_view timestamp_column, + MonthIterator months_begin, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { auto size = timestamp_column.size(); auto output_col_type = timestamp_column.type(); diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index 301338fa1a8..871a36f7d62 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -122,8 +122,7 @@ struct compute_children_offsets_fn { */ struct dispatch_compute_indices { template - typename std::enable_if_t(), - std::unique_ptr> + std::enable_if_t(), std::unique_ptr> operator()(column_view const& all_keys, column_view const& all_indices, column_view const& new_keys, @@ -184,8 +183,7 @@ struct dispatch_compute_indices { } template - typename std::enable_if_t(), - std::unique_ptr> + std::enable_if_t(), std::unique_ptr> operator()(Args&&...) { CUDF_FAIL("dictionary concatenate not supported for this column type"); diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index c1fb1fa2180..7783e5f8daf 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -50,8 +50,7 @@ namespace { */ struct dispatch_compute_indices { template - typename std::enable_if_t(), - std::unique_ptr> + std::enable_if_t(), std::unique_ptr> operator()(dictionary_column_view const& input, column_view const& new_keys, rmm::cuda_stream_view stream, @@ -100,8 +99,7 @@ struct dispatch_compute_indices { } template - typename std::enable_if_t(), - std::unique_ptr> + std::enable_if_t(), std::unique_ptr> operator()(Args&&...) { CUDF_FAIL("dictionary set_keys not supported for this column type"); diff --git a/cpp/src/filling/sequence.cu b/cpp/src/filling/sequence.cu index c49142f91f9..e5bffcf21c1 100644 --- a/cpp/src/filling/sequence.cu +++ b/cpp/src/filling/sequence.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -55,9 +55,8 @@ struct const_tabulator { * by init and step. */ struct sequence_functor { - template < - typename T, - typename std::enable_if_t() and not cudf::is_boolean()>* = nullptr> + template () and not cudf::is_boolean()>* = nullptr> std::unique_ptr operator()(size_type size, scalar const& init, scalar const& step, @@ -83,9 +82,8 @@ struct sequence_functor { return result; } - template < - typename T, - typename std::enable_if_t() and not cudf::is_boolean()>* = nullptr> + template () and not cudf::is_boolean()>* = nullptr> std::unique_ptr operator()(size_type size, scalar const& init, rmm::cuda_stream_view stream, diff --git a/cpp/src/groupby/sort/group_nunique.cu b/cpp/src/groupby/sort/group_nunique.cu index 5154c867095..37d13d5aea3 100644 --- a/cpp/src/groupby/sort/group_nunique.cu +++ b/cpp/src/groupby/sort/group_nunique.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,14 +33,14 @@ namespace detail { namespace { struct nunique_functor { template - typename std::enable_if_t(), std::unique_ptr> - operator()(column_view const& values, - cudf::device_span group_labels, - size_type const num_groups, - cudf::device_span group_offsets, - null_policy null_handling, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + std::enable_if_t(), std::unique_ptr> operator()( + column_view const& values, + cudf::device_span group_labels, + size_type const num_groups, + cudf::device_span group_offsets, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto result = make_numeric_column( data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); @@ -94,8 +94,8 @@ struct nunique_functor { } template - typename std::enable_if_t(), std::unique_ptr> - operator()(Args&&...) + std::enable_if_t(), std::unique_ptr> operator()( + Args&&...) { CUDF_FAIL("list_view group_nunique not supported yet"); } diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 13f5a57ac1f..e2e478af9ef 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -375,8 +375,8 @@ struct decode_op { * @return bool Whether the parsed value is valid. */ template and !std::is_same_v and - !cudf::is_fixed_point()>* = nullptr> + std::enable_if_t and !std::is_same_v and + !cudf::is_fixed_point()>* = nullptr> __host__ __device__ __forceinline__ bool operator()(void* out_buffer, size_t row, const data_type, @@ -402,7 +402,7 @@ struct decode_op { * * @return bool Whether the parsed value is valid. */ - template ()>* = nullptr> + template ()>* = nullptr> __host__ __device__ __forceinline__ bool operator()(void* out_buffer, size_t row, const data_type output_type, @@ -423,7 +423,7 @@ struct decode_op { /** * @brief Dispatch for boolean type types. */ - template >* = nullptr> + template >* = nullptr> __host__ __device__ __forceinline__ bool operator()(void* out_buffer, size_t row, const data_type, @@ -447,7 +447,7 @@ struct decode_op { * @brief Dispatch for floating points, which are set to NaN if the input * is not valid. In such case, the validity mask is set to zero too. */ - template >* = nullptr> + template >* = nullptr> __host__ __device__ __forceinline__ bool operator()(void* out_buffer, size_t row, const data_type, @@ -466,8 +466,8 @@ struct decode_op { * @brief Dispatch for all other types. */ template and !std::is_floating_point_v and - !cudf::is_fixed_point()>* = nullptr> + std::enable_if_t and !std::is_floating_point_v and + !cudf::is_fixed_point()>* = nullptr> __host__ __device__ __forceinline__ bool operator()(void* out_buffer, size_t row, const data_type, diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 5cf0b03a6f1..21455e3ab93 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -216,7 +216,7 @@ struct ConvertFunctor { * It is handled here rather than within convertStrToValue() as that function * is used by other types (ex. timestamp) that aren't 'booleable'. 
*/ - template >* = nullptr> + template >* = nullptr> __host__ __device__ __forceinline__ bool operator()(char const* begin, char const* end, void* output_column, @@ -240,7 +240,7 @@ struct ConvertFunctor { * @brief Dispatch for floating points, which are set to NaN if the input * is not valid. In such case, the validity mask is set to zero too. */ - template >* = nullptr> + template >* = nullptr> __host__ __device__ __forceinline__ bool operator()(char const* begin, char const* end, void* out_buffer, @@ -257,9 +257,8 @@ struct ConvertFunctor { * @brief Default template operator() dispatch specialization all data types * (including wrapper types) that is not covered by above. */ - template < - typename T, - typename std::enable_if_t and !std::is_integral_v>* = nullptr> + template and !std::is_integral_v>* = nullptr> __host__ __device__ __forceinline__ bool operator()(char const* begin, char const* end, void* output_column, diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h index 386e3d8d73a..47020023419 100644 --- a/cpp/src/io/orc/orc.h +++ b/cpp/src/io/orc/orc.h @@ -137,56 +137,51 @@ int inline constexpr encode_field_number(int field_number, ProtofType field_type } namespace { -template < - typename base_t, - typename std::enable_if_t and !std::is_enum_v>* = nullptr> +template and !std::is_enum_v>* = nullptr> int static constexpr encode_field_number_base(int field_number) noexcept { return encode_field_number(field_number, ProtofType::FIXEDLEN); } -template < - typename base_t, - typename std::enable_if_t or std::is_enum_v>* = nullptr> +template or std::is_enum_v>* = nullptr> int static constexpr encode_field_number_base(int field_number) noexcept { return encode_field_number(field_number, ProtofType::VARINT); } -template >* = nullptr> +template >* = nullptr> int static constexpr encode_field_number_base(int field_number) noexcept { return encode_field_number(field_number, ProtofType::FIXED32); } -template >* = nullptr> +template >* = nullptr> int static constexpr encode_field_number_base(int field_number) noexcept { return encode_field_number(field_number, ProtofType::FIXED64); } }; // namespace -template < - typename T, - typename std::enable_if_t or std::is_same_v>* = nullptr> +template or std::is_same_v>* = nullptr> int constexpr encode_field_number(int field_number) noexcept { return encode_field_number_base(field_number); } // containters change the field number encoding -template < - typename T, - typename std::enable_if_t>>* = nullptr> +template >>* = nullptr> int constexpr encode_field_number(int field_number) noexcept { return encode_field_number_base(field_number); } // optional fields don't change the field number encoding -template < - typename T, - typename std::enable_if_t>>* = nullptr> +template >>* = nullptr> int constexpr encode_field_number(int field_number) noexcept { return encode_field_number_base(field_number); @@ -244,19 +239,19 @@ class ProtobufReader { uint32_t read_field_size(const uint8_t* end); - template >* = nullptr> + template >* = nullptr> void read_field(T& value, const uint8_t* end) { value = get(); } - template >* = nullptr> + template >* = nullptr> void read_field(T& value, const uint8_t* end) { value = static_cast(get()); } - template >* = nullptr> + template >* = nullptr> void read_field(T& value, const uint8_t* end) { auto const size = read_field_size(end); @@ -264,8 +259,7 @@ class ProtobufReader { m_cur += size; } - template >>* = nullptr> + template >>* = nullptr> void read_field(T& value, const uint8_t* end) { auto const size = 
read_field_size(end); @@ -273,10 +267,9 @@ class ProtobufReader { m_cur += size; } - template < - typename T, - typename std::enable_if_t> and - !std::is_same_v>* = nullptr> + template > and + !std::is_same_v>* = nullptr> void read_field(T& value, const uint8_t* end) { auto const size = read_field_size(end); @@ -284,9 +277,8 @@ class ProtobufReader { read(value.back(), size); } - template < - typename T, - typename std::enable_if_t>>* = nullptr> + template >>* = nullptr> void read_field(T& value, const uint8_t* end) { typename T::value_type contained_value; @@ -301,7 +293,7 @@ class ProtobufReader { read(value, size); } - template >* = nullptr> + template >* = nullptr> void read_field(T& value, const uint8_t* end) { memcpy(&value, m_cur, sizeof(T)); diff --git a/cpp/src/io/text/byte_range_info.cpp b/cpp/src/io/text/byte_range_info.cpp new file mode 100644 index 00000000000..290e0451839 --- /dev/null +++ b/cpp/src/io/text/byte_range_info.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +namespace cudf { +namespace io { +namespace text { + +byte_range_info create_byte_range_info_max() { return {0, std::numeric_limits::max()}; } + +std::vector create_byte_range_infos_consecutive(int64_t total_bytes, + int64_t range_count) +{ + auto range_size = util::div_rounding_up_safe(total_bytes, range_count); + auto ranges = std::vector(); + + ranges.reserve(range_size); + + for (int64_t i = 0; i < range_count; i++) { + auto offset = i * range_size; + auto size = std::min(range_size, total_bytes - offset); + ranges.emplace_back(offset, size); + } + + return ranges; +} + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index d287b9f2419..51622747831 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
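// The new create_byte_range_infos_consecutive helper above splits a total
// byte count into `range_count` consecutive [offset, size) ranges: the range
// size is rounded up, and the last range is clamped to the remaining bytes.
// A standalone sketch of that arithmetic (hypothetical names, not the
// libcudf API; assumes range_count > 0):

#include <algorithm>
#include <cstdint>
#include <vector>

struct byte_span {
  std::int64_t offset;
  std::int64_t size;
};

std::vector<byte_span> split_consecutive(std::int64_t total_bytes, std::int64_t range_count)
{
  auto const range_size = (total_bytes + range_count - 1) / range_count;  // round up
  std::vector<byte_span> ranges;
  ranges.reserve(range_count);
  for (std::int64_t i = 0; i < range_count; ++i) {
    auto const offset = i * range_size;
    ranges.push_back({offset, std::min(range_size, total_bytes - offset)});
  }
  return ranges;  // e.g. 10 bytes over 3 ranges -> {0,4}, {4,4}, {8,2}
}

// These ranges feed the byte-range overload that multibyte_split gains later
// in this diff. A hedged usage sketch; it assumes the make_source_from_file
// factory and the byte_range_info(offset, size) constructor seen above:

#include <cudf/io/text/byte_range_info.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/multibyte_split.hpp>

#include <memory>
#include <string>

std::unique_ptr<cudf::column> split_second_half(std::string const& path, std::int64_t file_size)
{
  auto source = cudf::io::text::make_source_from_file(path);
  // Restrict output to the second half of the file's bytes.
  auto second_half = cudf::io::text::byte_range_info{file_size / 2, file_size - file_size / 2};
  return cudf::io::text::multibyte_split(*source, "\n", second_half);
}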
@@ -18,20 +18,29 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include +#include #include +#include +#include +#include +#include + #include #include #include +#include namespace { @@ -96,7 +105,7 @@ __global__ void multibyte_split_init_kernel( cudf::size_type base_tile_idx, cudf::size_type num_tiles, cudf::io::text::detail::scan_tile_state_view tile_multistates, - cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, cudf::io::text::detail::scan_tile_status status = cudf::io::text::detail::scan_tile_status::invalid) { @@ -110,7 +119,7 @@ __global__ void multibyte_split_init_kernel( __global__ void multibyte_split_seed_kernel( cudf::io::text::detail::scan_tile_state_view tile_multistates, - cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, multistate tile_multistate_seed, uint32_t tile_output_offset) { @@ -124,17 +133,15 @@ __global__ void multibyte_split_seed_kernel( __global__ void multibyte_split_kernel( cudf::size_type base_tile_idx, cudf::io::text::detail::scan_tile_state_view tile_multistates, - cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, cudf::io::text::detail::trie_device_view trie, - int32_t chunk_input_offset, cudf::device_span chunk_input_chars, - cudf::device_span abs_output_delimiter_offsets, - cudf::device_span abs_output_chars) + cudf::device_span abs_output_delimiter_offsets) { using InputLoad = cub::BlockLoad; - using OffsetScan = cub::BlockScan; - using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; + using OffsetScan = cub::BlockScan; + using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; __shared__ union { typename InputLoad::TempStorage input_load; @@ -166,7 +173,7 @@ __global__ void multibyte_split_kernel( // STEP 3: Flag matches - uint32_t thread_offsets[ITEMS_PER_THREAD]; + int64_t thread_offsets[ITEMS_PER_THREAD]; for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { thread_offsets[i] = i < thread_input_size and trie.is_match(thread_states[i]); @@ -182,16 +189,11 @@ __global__ void multibyte_split_kernel( // Step 5: Assign outputs from each thread using match offsets. 
- if (abs_output_chars.size() > 0) { - for (int32_t i = 0; i < ITEMS_PER_THREAD and i < thread_input_size; i++) { - abs_output_chars[chunk_input_offset + thread_input_offset + i] = thread_chars[i]; - } - } - if (abs_output_delimiter_offsets.size() > 0) { for (int32_t i = 0; i < ITEMS_PER_THREAD and i < thread_input_size; i++) { if (trie.is_match(thread_states[i])) { - auto const match_end = base_tile_idx * ITEMS_PER_TILE + thread_input_offset + i + 1; + auto const match_end = + static_cast(base_tile_idx) * ITEMS_PER_TILE + thread_input_offset + i + 1; abs_output_delimiter_offsets[thread_offsets[i]] = match_end; } } @@ -236,17 +238,16 @@ std::vector get_streams(int32_t count, rmm::cuda_stream_p return streams; } -cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_source const& source, - cudf::io::text::detail::trie const& trie, - scan_tile_state& tile_multistates, - scan_tile_state& tile_offsets, - device_span output_buffer, - device_span output_char_buffer, - rmm::cuda_stream_view stream, - std::vector const& streams) +int64_t multibyte_split_scan_full_source(cudf::io::text::data_chunk_source const& source, + cudf::io::text::detail::trie const& trie, + scan_tile_state& tile_multistates, + scan_tile_state& tile_offsets, + device_span output_buffer, + rmm::cuda_stream_view stream, + std::vector const& streams) { CUDF_FUNC_RANGE(); - cudf::size_type chunk_offset = 0; + int64_t chunk_offset = 0; multibyte_split_init_kernel<<>>( // -TILES_PER_CHUNK, @@ -298,14 +299,14 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour tile_multistates, tile_offsets, trie.view(), - chunk_offset, *chunk, - output_buffer, - output_char_buffer); + output_buffer); cudaEventRecord(last_launch_event, chunk_stream); chunk_offset += chunk->size(); + + chunk.reset(); } cudaEventDestroy(last_launch_event); @@ -317,6 +318,7 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, + byte_range_info byte_range, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr, rmm::cuda_stream_pool& stream_pool) @@ -336,7 +338,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); auto tile_multistates = scan_tile_state(num_tile_states, stream); - auto tile_offsets = scan_tile_state(num_tile_states, stream); + auto tile_offsets = scan_tile_state(num_tile_states, stream); auto streams = get_streams(concurrency, stream_pool); @@ -345,52 +347,104 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source trie, tile_multistates, tile_offsets, - cudf::device_span(static_cast(nullptr), 0), - cudf::device_span(static_cast(nullptr), 0), + cudf::device_span(static_cast(nullptr), 0), stream, streams); // allocate results - auto num_tiles = cudf::util::div_rounding_up_safe(bytes_total, ITEMS_PER_TILE); - auto num_results = tile_offsets.get_inclusive_prefix(num_tiles - 1, stream); - auto string_offsets = rmm::device_uvector(num_results + 2, stream, mr); - auto string_chars = rmm::device_uvector(bytes_total, stream, mr); + auto num_tiles = + cudf::util::div_rounding_up_safe(bytes_total, static_cast(ITEMS_PER_TILE)); + auto num_results = tile_offsets.get_inclusive_prefix(num_tiles - 1, stream); + + auto string_offsets = rmm::device_uvector(num_results + 2, 
stream); // first and last element are set manually to zero and size of input, respectively. // kernel is only responsible for determining delimiter offsets - auto string_count = static_cast(string_offsets.size() - 1); string_offsets.set_element_to_zero_async(0, stream); - string_offsets.set_element_async(string_count, bytes_total, stream); + string_offsets.set_element_async(string_offsets.size() - 1, bytes_total, stream); + + // kernel needs to find first and last relevant offset., as well as count of relevant offsets. multibyte_split_scan_full_source( source, trie, tile_multistates, tile_offsets, - cudf::device_span(string_offsets).subspan(1, num_results), - string_chars, + cudf::device_span(string_offsets).subspan(1, num_results), stream, streams); + auto relevant_offsets_begin = thrust::lower_bound(rmm::exec_policy(stream), + string_offsets.begin(), + string_offsets.end() - 1, + byte_range.offset()); + + auto relevant_offsets_end = thrust::upper_bound(rmm::exec_policy(stream), + string_offsets.begin(), + string_offsets.end() - 1, + byte_range.offset() + byte_range.size()) + + 1; + + auto string_offsets_out_size = relevant_offsets_end - relevant_offsets_begin; + + auto string_offsets_out = rmm::device_uvector(string_offsets_out_size, stream, mr); + + auto relevant_offset_first = + string_offsets.element(relevant_offsets_begin - string_offsets.begin(), stream); + auto relevant_offset_last = + string_offsets.element(relevant_offsets_end - string_offsets.begin() - 1, stream); + + auto string_chars_size = relevant_offset_last - relevant_offset_first; + auto string_chars = rmm::device_uvector(string_chars_size, stream, mr); + + // copy relevant offsets and adjust them to be zero-based. + thrust::transform(rmm::exec_policy(stream), + relevant_offsets_begin, + relevant_offsets_end, + string_offsets_out.begin(), + [relevant_offset_first] __device__(int64_t offset) { + return static_cast(offset - relevant_offset_first); + }); + + auto reader = source.create_reader(); + reader->skip_bytes(relevant_offset_first); + + auto relevant_bytes = reader->get_next_chunk(string_chars_size, stream); + + thrust::copy(rmm::exec_policy(stream), + relevant_bytes->data(), // + relevant_bytes->data() + relevant_bytes->size(), + string_chars.begin()); + + auto string_count = string_offsets_out.size() - 1; + return cudf::make_strings_column( - string_count, std::move(string_offsets), std::move(string_chars)); + string_count, std::move(string_offsets_out), std::move(string_chars)); } } // namespace detail std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, + std::optional byte_range, rmm::mr::device_memory_resource* mr) { auto stream = rmm::cuda_stream_default; auto stream_pool = rmm::cuda_stream_pool(2); - auto result = detail::multibyte_split(source, delimiter, stream, mr, stream_pool); - stream.synchronize(); + auto result = detail::multibyte_split( + source, delimiter, byte_range.value_or(create_byte_range_info_max()), stream, mr, stream_pool); return result; } +std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, + std::string const& delimiter, + rmm::mr::device_memory_resource* mr) +{ + return multibyte_split(source, delimiter, std::nullopt, mr); +} + } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index e2893a2e881..f7e250f1d3f 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ 
b/cpp/src/io/utilities/file_io_utilities.cpp @@ -88,13 +88,13 @@ class cufile_shim { void cufile_shim::modify_cufile_json() const { std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; - temp_directory tmp_config_dir{"cudf_cufile_config"}; + static temp_directory tmp_config_dir{"cudf_cufile_config"}; // Modify the config file based on the policy auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); std::ifstream user_config_file(config_file_path); // Modified config file is stored in a temporary directory - auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; + auto const cudf_config_path = tmp_config_dir.path() + "cufile.json"; std::ofstream cudf_config_file(cudf_config_path); std::string line; diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index d1b2e2862c6..74b98eff010 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -98,7 +98,7 @@ struct parse_options { * * @return uint8_t Numeric value of the character, or `0` */ -template >* = nullptr> +template >* = nullptr> constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; @@ -119,7 +119,7 @@ constexpr uint8_t decode_digit(char c, bool* valid_flag) * * @return uint8_t Numeric value of the character, or `0` */ -template >* = nullptr> +template >* = nullptr> constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp index 37b5f58da22..159681eaffc 100644 --- a/cpp/src/jit/cache.cpp +++ b/cpp/src/jit/cache.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -104,13 +104,10 @@ std::string get_program_cache_dir() #endif } -void try_parse_numeric_env_var(std::size_t& result, char const* const env_name) +std::size_t try_parse_numeric_env_var(char const* const env_name, std::size_t default_val) { - auto value = std::getenv(env_name); - - if (value != nullptr) { - result = std::stoull(value); // fails if env var contains invalid value. - } + auto const value = std::getenv(env_name); + return value != nullptr ? std::stoull(value) : default_val; } jitify2::ProgramCache<>& get_program_cache(jitify2::PreprocessedProgramData preprog) @@ -123,27 +120,19 @@ jitify2::ProgramCache<>& get_program_cache(jitify2::PreprocessedProgramData prep auto existing_cache = caches.find(preprog.name()); if (existing_cache == caches.end()) { - std::size_t kernel_limit_proc = std::numeric_limits::max(); - std::size_t kernel_limit_disk = std::numeric_limits::max(); - try_parse_numeric_env_var(kernel_limit_proc, "LIBCUDF_KERNEL_CACHE_LIMIT_PER_PROCESS"); - try_parse_numeric_env_var(kernel_limit_disk, "LIBCUDF_KERNEL_CACHE_LIMIT_DISK"); - - auto cache_dir = get_program_cache_dir(); - - if (kernel_limit_disk == 0) { - // if kernel_limit_disk is zero, jitify will assign it the value of kernel_limit_proc. - // to avoid this, we treat zero as "disable disk caching" by not providing the cache dir. 
- cache_dir = {}; - } - - auto res = caches.insert({preprog.name(), - std::make_unique>( // - kernel_limit_proc, - preprog, - nullptr, - cache_dir, - kernel_limit_disk)}); - + auto const kernel_limit_proc = + try_parse_numeric_env_var("LIBCUDF_KERNEL_CACHE_LIMIT_PER_PROCESS", 10'000); + auto const kernel_limit_disk = + try_parse_numeric_env_var("LIBCUDF_KERNEL_CACHE_LIMIT_DISK", 100'000); + + // if kernel_limit_disk is zero, jitify will assign it the value of kernel_limit_proc. + // to avoid this, we treat zero as "disable disk caching" by not providing the cache dir. + auto const cache_dir = kernel_limit_disk == 0 ? std::string{} : get_program_cache_dir(); + + auto const res = + caches.insert({preprog.name(), + std::make_unique>( + kernel_limit_proc, preprog, nullptr, cache_dir, kernel_limit_disk)}); existing_cache = res.first; } diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 9df8e42d845..0198bd11107 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -707,9 +707,8 @@ struct get_scalar_minmax { }; struct typed_group_tdigest { - template < - typename T, - typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + template () || cudf::is_fixed_point()>* = nullptr> std::unique_ptr operator()(column_view const& col, cudf::device_span group_offsets, cudf::device_span group_labels, @@ -766,10 +765,9 @@ struct typed_group_tdigest { mr); } - template < - typename T, - typename... Args, - typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + template () && !cudf::is_fixed_point()>* = nullptr> std::unique_ptr operator()(Args&&...) { CUDF_FAIL("Non-numeric type in group_tdigest"); diff --git a/cpp/src/reductions/collect_ops.cu b/cpp/src/reductions/collect_ops.cu new file mode 100644 index 00000000000..c9bd06a1171 --- /dev/null +++ b/cpp/src/reductions/collect_ops.cu @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
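// The jit/cache.cpp refactor above folds "read an env var or use a default"
// into a single helper and gives the kernel cache finite defaults (10'000
// kernels in-process, 100'000 on disk) instead of unlimited. A standalone
// sketch of that logic (parse_env_or and pick_cache_dir are hypothetical
// stand-ins, not libcudf symbols):

#include <cstddef>
#include <cstdlib>
#include <string>

std::size_t parse_env_or(char const* name, std::size_t default_val)
{
  auto const* value = std::getenv(name);
  return value != nullptr ? std::stoull(value) : default_val;
}

std::string pick_cache_dir(std::string const& probed_dir)
{
  auto const disk_limit = parse_env_or("LIBCUDF_KERNEL_CACHE_LIMIT_DISK", 100'000);
  // jitify interprets a zero disk limit as "reuse the process limit", so zero
  // is mapped to "no cache directory" to genuinely disable disk caching.
  return disk_limit == 0 ? std::string{} : probed_dir;
}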
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudf { +namespace reduction { + +std::unique_ptr drop_duplicates(list_scalar const& scalar, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto list_wrapper = lists::detail::make_lists_column_from_scalar(scalar, 1, stream, mr); + auto lcw = lists_column_view(list_wrapper->view()); + auto no_dup_wrapper = lists::drop_list_duplicates(lcw, nulls_equal, nans_equal, mr); + auto no_dup = lists_column_view(no_dup_wrapper->view()).get_sliced_child(stream); + return make_list_scalar(no_dup, stream, mr); +} + +std::unique_ptr collect_list(column_view const& col, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (null_handling == null_policy::EXCLUDE && col.has_nulls()) { + auto d_view = column_device_view::create(col, stream); + auto filter = detail::validity_accessor(*d_view); + auto null_purged_table = detail::copy_if(table_view{{col}}, filter, stream, mr); + column* null_purged_col = null_purged_table->release().front().release(); + null_purged_col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); + return std::make_unique(std::move(*null_purged_col), true, stream, mr); + } else { + return make_list_scalar(col, stream, mr); + } +} + +std::unique_ptr merge_lists(lists_column_view const& col, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto flatten_col = col.get_sliced_child(stream); + return make_list_scalar(flatten_col, stream, mr); +} + +std::unique_ptr collect_set(column_view const& col, + null_policy null_handling, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto scalar = collect_list(col, null_handling, stream, mr); + auto ls = dynamic_cast(scalar.get()); + return drop_duplicates(*ls, nulls_equal, nans_equal, stream, mr); +} + +std::unique_ptr merge_sets(lists_column_view const& col, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto flatten_col = col.get_sliced_child(stream); + auto scalar = std::make_unique(flatten_col, true, stream, mr); + return drop_duplicates(*scalar, nulls_equal, nans_equal, stream, mr); +} + +} // namespace reduction +} // namespace cudf diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 3558b5348ea..bd8c8342708 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -102,6 +102,22 @@ struct reduce_dispatch_functor { auto nth_agg = dynamic_cast(agg.get()); return reduction::nth_element(col, nth_agg->_n, nth_agg->_null_handling, stream, mr); } break; + case aggregation::COLLECT_LIST: { + auto col_agg = dynamic_cast(agg.get()); + return reduction::collect_list(col, col_agg->_null_handling, stream, mr); + } break; + case aggregation::COLLECT_SET: { + auto col_agg = dynamic_cast(agg.get()); + return reduction::collect_set( + col, col_agg->_null_handling, col_agg->_nulls_equal, col_agg->_nans_equal, stream, mr); + } break; + case aggregation::MERGE_LISTS: { + return reduction::merge_lists(col, stream, mr); + } break; + case aggregation::MERGE_SETS: { + auto col_agg = dynamic_cast(agg.get()); + return reduction::merge_sets(col, col_agg->_nulls_equal, col_agg->_nans_equal, stream, mr); + } break; default: CUDF_FAIL("Unsupported reduction operator"); 
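// Usage sketch for the COLLECT_LIST / COLLECT_SET reductions wired into
// reduce_dispatch_functor above: reducing a column now yields a list scalar
// of the collected elements. Hedged illustration; it assumes the public
// cudf::reduce entry point and the reduce_aggregation factory instantiations
// added earlier in this diff.

#include <cudf/aggregation.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/reduction.hpp>
#include <cudf/scalar/scalar.hpp>

#include <memory>

std::unique_ptr<cudf::scalar> collect_distinct(cudf::column_view const& col)
{
  auto agg = cudf::make_collect_set_aggregation<cudf::reduce_aggregation>(
    cudf::null_policy::EXCLUDE, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
  // The result is a cudf::list_scalar; LIST is the requested output type.
  return cudf::reduce(col, agg, cudf::data_type{cudf::type_id::LIST});
}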
} } diff --git a/cpp/src/reductions/scan/scan_exclusive.cu b/cpp/src/reductions/scan/scan_exclusive.cu index 0ef78cc2f91..3b8cc17c4aa 100644 --- a/cpp/src/reductions/scan/scan_exclusive.cu +++ b/cpp/src/reductions/scan/scan_exclusive.cu @@ -50,7 +50,7 @@ struct scan_dispatcher { * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column with scan results */ - template >* = nullptr> + template >* = nullptr> std::unique_ptr operator()(column_view const& input, null_policy, rmm::cuda_stream_view stream, diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu index b2fe8a9fb6b..bc2f1d47311 100644 --- a/cpp/src/reductions/scan/scan_inclusive.cu +++ b/cpp/src/reductions/scan/scan_inclusive.cu @@ -225,7 +225,7 @@ struct scan_dispatcher { * * @tparam T type of input column */ - template ()>* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(column_view const& input, null_policy, rmm::cuda_stream_view stream, diff --git a/cpp/src/reductions/segmented_all.cu b/cpp/src/reductions/segmented_all.cu new file mode 100644 index 00000000000..a04da1ac2fa --- /dev/null +++ b/cpp/src/reductions/segmented_all.cu @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "simple_segmented.cuh" + +#include + +namespace cudf { +namespace reduction { + +std::unique_ptr segmented_all(column_view const& col, + device_span offsets, + cudf::data_type const output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(output_dtype == cudf::data_type(cudf::type_id::BOOL8), + "segmented_all() operation requires output type `BOOL8`"); + + // A minimum over bool types is used to implement all() + return cudf::type_dispatcher( + col.type(), + simple::detail::bool_result_column_dispatcher{}, + col, + offsets, + null_handling, + stream, + mr); +} + +} // namespace reduction +} // namespace cudf diff --git a/cpp/src/reductions/segmented_any.cu b/cpp/src/reductions/segmented_any.cu new file mode 100644 index 00000000000..ad44289175b --- /dev/null +++ b/cpp/src/reductions/segmented_any.cu @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "simple_segmented.cuh" + +#include + +namespace cudf { +namespace reduction { + +std::unique_ptr segmented_any(column_view const& col, + device_span offsets, + cudf::data_type const output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(output_dtype == cudf::data_type(cudf::type_id::BOOL8), + "segmented_any() operation requires output type `BOOL8`"); + + // A maximum over bool types is used to implement any() + return cudf::type_dispatcher( + col.type(), + simple::detail::bool_result_column_dispatcher{}, + col, + offsets, + null_handling, + stream, + mr); +} + +} // namespace reduction +} // namespace cudf diff --git a/cpp/src/reductions/segmented_max.cu b/cpp/src/reductions/segmented_max.cu new file mode 100644 index 00000000000..19896064343 --- /dev/null +++ b/cpp/src/reductions/segmented_max.cu @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "simple_segmented.cuh" + +#include + +namespace cudf { +namespace reduction { + +std::unique_ptr segmented_max(column_view const& col, + device_span offsets, + cudf::data_type const output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(col.type() == output_dtype, + "segmented_max() operation requires matching output type"); + return cudf::type_dispatcher( + col.type(), + simple::detail::same_column_type_dispatcher{}, + col, + offsets, + null_handling, + stream, + mr); +} + +} // namespace reduction +} // namespace cudf diff --git a/cpp/src/reductions/segmented_min.cu b/cpp/src/reductions/segmented_min.cu new file mode 100644 index 00000000000..5c880f45bf8 --- /dev/null +++ b/cpp/src/reductions/segmented_min.cu @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "simple_segmented.cuh" + +#include + +namespace cudf { +namespace reduction { + +std::unique_ptr segmented_min(column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(col.type() == output_dtype, + "segmented_min() operation requires matching output type"); + return cudf::type_dispatcher( + col.type(), + simple::detail::same_column_type_dispatcher{}, + col, + offsets, + null_handling, + stream, + mr); +} + +} // namespace reduction +} // namespace cudf diff --git a/cpp/src/reductions/segmented_product.cu b/cpp/src/reductions/segmented_product.cu new file mode 100644 index 00000000000..1b852870820 --- /dev/null +++ b/cpp/src/reductions/segmented_product.cu @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "simple_segmented.cuh" + +#include + +namespace cudf { +namespace reduction { + +std::unique_ptr segmented_product(column_view const& col, + device_span offsets, + cudf::data_type const output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return cudf::type_dispatcher( + col.type(), + simple::detail::column_type_dispatcher{}, + col, + offsets, + output_dtype, + null_handling, + stream, + mr); +} + +} // namespace reduction +} // namespace cudf diff --git a/cpp/src/reductions/segmented_reductions.cpp b/cpp/src/reductions/segmented_reductions.cpp new file mode 100644 index 00000000000..415f5ae488e --- /dev/null +++ b/cpp/src/reductions/segmented_reductions.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace detail { +struct segmented_reduce_dispatch_functor { + column_view const& col; + device_span offsets; + data_type output_dtype; + null_policy null_handling; + rmm::mr::device_memory_resource* mr; + rmm::cuda_stream_view stream; + + segmented_reduce_dispatch_functor(column_view const& segmented_values, + device_span offsets, + data_type output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : col(segmented_values), + offsets(offsets), + output_dtype(output_dtype), + null_handling(null_handling), + mr(mr), + stream(stream) + { + } + + template + std::unique_ptr operator()() + { + switch (k) { + case segmented_reduce_aggregation::SUM: + return reduction::segmented_sum(col, offsets, output_dtype, null_handling, stream, mr); + case segmented_reduce_aggregation::PRODUCT: + return reduction::segmented_product(col, offsets, output_dtype, null_handling, stream, mr); + case segmented_reduce_aggregation::MIN: + return reduction::segmented_min(col, offsets, output_dtype, null_handling, stream, mr); + case segmented_reduce_aggregation::MAX: + return reduction::segmented_max(col, offsets, output_dtype, null_handling, stream, mr); + case segmented_reduce_aggregation::ANY: + return reduction::segmented_any(col, offsets, output_dtype, null_handling, stream, mr); + case segmented_reduce_aggregation::ALL: + return reduction::segmented_all(col, offsets, output_dtype, null_handling, stream, mr); + default: + CUDF_FAIL("Unsupported aggregation type."); + // TODO: Add support for compound_ops + } + } +}; + +std::unique_ptr segmented_reduce(column_view const& segmented_values, + device_span offsets, + segmented_reduce_aggregation const& agg, + data_type output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(offsets.size() > 0, "`offsets` should have at least 1 element."); + if (segmented_values.is_empty()) { return empty_like(segmented_values); } + + return aggregation_dispatcher( + agg.kind, + segmented_reduce_dispatch_functor{ + segmented_values, offsets, output_dtype, null_handling, stream, mr}); +} +} // namespace detail + +std::unique_ptr segmented_reduce(column_view const& segmented_values, + device_span offsets, + segmented_reduce_aggregation const& agg, + data_type output_dtype, + null_policy null_handling, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::segmented_reduce( + segmented_values, offsets, agg, output_dtype, null_handling, rmm::cuda_stream_default, mr); +} + +} // namespace cudf diff --git a/cpp/src/reductions/segmented_sum.cu b/cpp/src/reductions/segmented_sum.cu new file mode 100644 index 00000000000..f2deeddbcbb --- /dev/null +++ b/cpp/src/reductions/segmented_sum.cu @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
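// Usage sketch for the segmented_reduce entry point defined above: `offsets`
// delimits segments [offsets[i], offsets[i+1]) of the values column (so it
// must include both end points, e.g. {0, 3, 5} for two segments), and one
// reduced value is produced per segment. Hedged illustration; the
// device_span offset type and the segmented_reduce_aggregation factory are
// assumptions based on the surrounding hunks.

#include <cudf/aggregation.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/reduction.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

std::unique_ptr<cudf::column> sum_per_segment(cudf::column_view const& values,
                                              rmm::device_uvector<cudf::size_type> const& offsets)
{
  auto agg = cudf::make_sum_aggregation<cudf::segmented_reduce_aggregation>();
  return cudf::segmented_reduce(values,
                                cudf::device_span<cudf::size_type const>{offsets.data(), offsets.size()},
                                *agg,
                                values.type(),
                                cudf::null_policy::EXCLUDE,
                                rmm::mr::get_current_device_resource());
}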
+ */ + +#include "simple_segmented.cuh" + +#include + +namespace cudf { +namespace reduction { + +std::unique_ptr segmented_sum(column_view const& col, + device_span offsets, + cudf::data_type const output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return cudf::type_dispatcher(col.type(), + simple::detail::column_type_dispatcher{}, + col, + offsets, + output_dtype, + null_handling, + stream, + mr); +} + +} // namespace reduction +} // namespace cudf diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh index 7dc8e6cb2c4..e5303246452 100644 --- a/cpp/src/reductions/simple.cuh +++ b/cpp/src/reductions/simple.cuh @@ -260,8 +260,7 @@ struct same_element_type_dispatcher { return !(cudf::is_dictionary() || std::is_same_v); } - template ()>* = nullptr> + template ()>* = nullptr> std::unique_ptr resolve_key(column_view const& keys, scalar const& keys_index, rmm::cuda_stream_view stream, @@ -271,8 +270,7 @@ struct same_element_type_dispatcher { return cudf::detail::get_element(keys, index.value(stream), stream, mr); } - template ()>* = nullptr> + template ()>* = nullptr> std::unique_ptr resolve_key(column_view const&, scalar const&, rmm::cuda_stream_view, @@ -353,7 +351,7 @@ struct element_type_dispatcher { * @brief Specialization for reducing floating-point column types to any output type. */ template >* = nullptr> + std::enable_if_t>* = nullptr> std::unique_ptr reduce_numeric(column_view const& col, data_type const output_type, rmm::cuda_stream_view stream, @@ -375,8 +373,7 @@ struct element_type_dispatcher { /** * @brief Specialization for reducing integer column types to any output type. */ - template >* = nullptr> + template >* = nullptr> std::unique_ptr reduce_numeric(column_view const& col, data_type const output_type, rmm::cuda_stream_view stream, @@ -405,8 +402,7 @@ struct element_type_dispatcher { * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned scalar's device memory */ - template ()>* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(column_view const& col, data_type const output_type, rmm::cuda_stream_view stream, @@ -423,8 +419,7 @@ struct element_type_dispatcher { /** * @brief Specialization for reducing fixed_point column types to fixed_point number */ - template ()>* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(column_view const& col, data_type const output_type, rmm::cuda_stream_view stream, @@ -436,8 +431,8 @@ struct element_type_dispatcher { } template () and - not cudf::is_fixed_point()>* = nullptr> + std::enable_if_t() and + not cudf::is_fixed_point()>* = nullptr> std::unique_ptr operator()(column_view const&, data_type const, rmm::cuda_stream_view, diff --git a/cpp/src/reductions/simple_segmented.cuh b/cpp/src/reductions/simple_segmented.cuh new file mode 100644 index 00000000000..99837e67398 --- /dev/null +++ b/cpp/src/reductions/simple_segmented.cuh @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cudf { +namespace reduction { +namespace simple { +namespace detail { + +/** + * @brief Segment reduction for 'sum', 'product', 'min', 'max', 'sum of squares' + * which directly compute the reduction by a single step reduction call. + * + * @tparam InputType the input column data-type + * @tparam ResultType the output data-type + * @tparam Op the operator of cudf::reduction::op:: + + * @param col Input column of data to reduce. + * @param offsets Indices to segment boundaries. + * @param null_handling If `null_policy::INCLUDE`, all elements in a segment + * must be valid for the reduced value to be valid. If `null_policy::EXCLUDE`, + * the reduced value is valid if any element in the segment is valid. + * @param stream Used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Output column in device memory + */ +template +std::unique_ptr simple_segmented_reduction(column_view const& col, + device_span offsets, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // TODO: Rewrites this function to accept a pair of iterators for start/end indices + // to enable `2N` type offset input. + // reduction by iterator + auto dcol = cudf::column_device_view::create(col, stream); + auto simple_op = Op{}; + size_type num_segments = offsets.size() - 1; + + // TODO: Explore rewriting null_replacing_element_transformer/element_transformer with nullate + auto result = [&] { + if (col.has_nulls()) { + auto f = simple_op.template get_null_replacing_element_transformer(); + auto it = thrust::make_transform_iterator(dcol->pair_begin(), f); + return cudf::reduction::detail::segmented_reduce( + it, offsets.begin(), num_segments, simple_op, stream, mr); + } else { + auto f = simple_op.template get_element_transformer(); + auto it = thrust::make_transform_iterator(dcol->begin(), f); + return cudf::reduction::detail::segmented_reduce( + it, offsets.begin(), num_segments, simple_op, stream, mr); + } + }(); + + // Compute the output null mask + auto const bitmask = col.null_mask(); + auto const first_bit_indices_begin = offsets.begin(); + auto const first_bit_indices_end = offsets.end() - 1; + auto const last_bit_indices_begin = first_bit_indices_begin + 1; + auto const [output_null_mask, output_null_count] = + cudf::detail::segmented_null_mask_reduction(bitmask, + first_bit_indices_begin, + first_bit_indices_end, + last_bit_indices_begin, + null_handling, + stream, + mr); + result->set_null_mask(output_null_mask, output_null_count, stream); + + return result; +} + +/** + * @brief Call reduce and return a column of type bool. + * + * This is used by operations `any()` and `all()`. + * + * @tparam Op The reduce operation to execute on the column. 
+ */ +template +struct bool_result_column_dispatcher { + template ()>* = nullptr> + std::unique_ptr operator()(column_view const& col, + device_span offsets, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + return simple_segmented_reduction( + col, offsets, null_handling, stream, mr); + } + + template ()>* = nullptr> + std::unique_ptr operator()(column_view const&, + device_span, + null_policy, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) + { + CUDF_FAIL("Reduction operator not supported for this type"); + } +}; + +/** + * @brief Call reduce and return a column of type matching the input column. + * + * This is used by operations `min()` and `max()`. + * + * @tparam Op The reduce operation to execute on the column. + */ +template +struct same_column_type_dispatcher { + private: + template + static constexpr bool is_supported() + { + return !(cudf::is_fixed_point() || cudf::is_dictionary() || + std::is_same_v || + std::is_same_v); + } + + public: + template ()>* = nullptr> + std::unique_ptr operator()(column_view const& col, + device_span offsets, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + return simple_segmented_reduction( + col, offsets, null_handling, stream, mr); + } + + template ()>* = nullptr> + std::unique_ptr operator()(column_view const&, + device_span, + null_policy, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) + { + CUDF_FAIL("Reduction operator not supported for this type"); + } +}; + +/** + * @brief Call reduce and return a column of the type specified. + * + * This is used by operations sum(), product(), and sum_of_squares(). + * It only supports numeric types. If the output type is not the + * same as the input type, an extra cast operation may occur. + * + * @tparam Op The reduce operation to execute on the column. + */ +template +struct column_type_dispatcher { + /** + * @brief Specialization for reducing floating-point column types to any output type. + */ + template ::value>* = nullptr> + std::unique_ptr reduce_numeric(column_view const& col, + device_span offsets, + data_type const output_type, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + // TODO: per gh-9988, we should change the compute precision to `output_type`. + auto result = + simple_segmented_reduction(col, offsets, null_handling, stream, mr); + if (output_type == result->type()) { return result; } + return cudf::detail::cast(*result, output_type, stream, mr); + } + + /** + * @brief Specialization for reducing integer column types to any output type. + */ + template ::value>* = nullptr> + std::unique_ptr reduce_numeric(column_view const& col, + device_span offsets, + data_type const output_type, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + // TODO: per gh-9988, we should change the compute precision to `output_type`. + auto result = + simple_segmented_reduction(col, offsets, null_handling, stream, mr); + if (output_type == result->type()) { return result; } + return cudf::detail::cast(*result, output_type, stream, mr); + } + + /** + * @brief Called by the type-dispatcher to reduce the input column `col` using + * the `Op` operation. + * + * @tparam ElementType The input column type or key type. 
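Aside: the TODO referencing gh-9988 above concerns compute precision: reduce_numeric currently accumulates in the input element type and only casts to output_type afterwards. The standalone example below illustrates why that ordering can matter in general (plain C++ with made-up values; it is not a claim about exact libcudf numeric behavior).

#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
  // Two int8 values whose true sum (200) does not fit in int8.
  std::vector<int8_t> v = {100, 100};

  // Accumulate in the narrow input type, then cast: the sum wraps/truncates before the cast.
  int64_t narrow_then_cast =
    static_cast<int64_t>(std::accumulate(v.begin(), v.end(), int8_t{0}));

  // Accumulate directly in the requested output type: no loss.
  int64_t wide_accumulate = std::accumulate(v.begin(), v.end(), int64_t{0});

  std::printf("narrow then cast: %lld, wide accumulate: %lld\n",
              static_cast<long long>(narrow_then_cast),
              static_cast<long long>(wide_accumulate));
  return 0;
}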
+ * @param col Input column (must be numeric) + * @param offsets Indices to segment boundaries + * @param output_type Requested type of the scalar result + * @param null_handling If `null_policy::INCLUDE`, all elements in a segment + * must be valid for the reduced value to be valid. If `null_policy::EXCLUDE`, + * the reduced value is valid if any element in the segment is valid. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory + */ + template ()>* = nullptr> + std::unique_ptr operator()(column_view const& col, + device_span offsets, + data_type const output_type, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + if (output_type.id() == cudf::type_to_id()) { + return simple_segmented_reduction( + col, offsets, null_handling, stream, mr); + } + // reduce and map to output type + return reduce_numeric(col, offsets, output_type, null_handling, stream, mr); + } + + template ()>* = nullptr> + std::unique_ptr operator()(column_view const&, + device_span, + data_type const, + null_policy, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) + { + CUDF_FAIL("Reduction operator not supported for this type"); + } +}; + +} // namespace detail +} // namespace simple +} // namespace reduction +} // namespace cudf diff --git a/cpp/src/reductions/struct_minmax_util.cuh b/cpp/src/reductions/struct_minmax_util.cuh index 1de48ef482d..b0f2d50b0f5 100644 --- a/cpp/src/reductions/struct_minmax_util.cuh +++ b/cpp/src/reductions/struct_minmax_util.cuh @@ -35,15 +35,15 @@ namespace detail { struct row_arg_minmax_fn { size_type const num_rows; row_lexicographic_comparator const comp; - bool const arg_min; + bool const is_arg_min; row_arg_minmax_fn(table_device_view const& table, bool has_nulls, null_order const* null_precedence, - bool const arg_min) + bool const is_arg_min) : num_rows(table.num_rows()), comp(nullate::DYNAMIC{has_nulls}, table, table, nullptr, null_precedence), - arg_min(arg_min) + is_arg_min(is_arg_min) { } @@ -53,7 +53,7 @@ struct row_arg_minmax_fn { // `thrust::reduce_by_key` or `thrust::scan_by_key` will result in significant compile time. __attribute__((noinline)) __device__ auto operator()(size_type lhs_idx, size_type rhs_idx) const { - // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and + // The extra bounds checking is due to issue github.com/rapidsai/cudf/issues/9156 and // github.com/NVIDIA/thrust/issues/1525 // where invalid random values may be passed here by thrust::reduce_by_key if (lhs_idx < 0 || lhs_idx >= num_rows) { return rhs_idx; } @@ -62,7 +62,7 @@ struct row_arg_minmax_fn { // Return `lhs_idx` iff: // row(lhs_idx) < row(rhs_idx) and finding ArgMin, or // row(lhs_idx) >= row(rhs_idx) and finding ArgMax. - return comp(lhs_idx, rhs_idx) == arg_min ? lhs_idx : rhs_idx; + return comp(lhs_idx, rhs_idx) == is_arg_min ? lhs_idx : rhs_idx; } }; diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 93bc6cf5ae5..d41bdb6ca5a 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
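Aside: the renamed row_arg_minmax_fn above reduces row indices rather than values: given a less-than comparator, returning lhs when comp(lhs, rhs) == is_arg_min yields ARGMIN, and flipping the flag yields ARGMAX. A host-side sketch of the same trick over a plain vector (illustrative only; no table or null handling):

#include <cstdio>
#include <numeric>
#include <vector>

// Reduce indices with a "less" comparator; is_arg_min selects ARGMIN vs ARGMAX,
// mirroring the `comp(lhs, rhs) == is_arg_min ? lhs : rhs` rule in row_arg_minmax_fn.
int arg_minmax(std::vector<int> const& v, bool is_arg_min)
{
  std::vector<int> idx(v.size());
  std::iota(idx.begin(), idx.end(), 0);
  auto comp = [&](int lhs, int rhs) { return v[lhs] < v[rhs]; };
  return std::accumulate(idx.begin() + 1, idx.end(), idx.front(), [&](int lhs, int rhs) {
    return comp(lhs, rhs) == is_arg_min ? lhs : rhs;
  });
}

int main()
{
  std::vector<int> v = {7, 3, 9, 5};
  std::printf("argmin=%d argmax=%d\n", arg_minmax(v, true), arg_minmax(v, false));
  return 0;
}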
@@ -297,8 +297,7 @@ struct replace_nulls_functor { * `replace_nulls` with the appropriate data types. */ struct replace_nulls_scalar_kernel_forwarder { - template ()>* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, cudf::scalar const& replacement, rmm::cuda_stream_view stream, diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index 0e3ead3fd99..cd66cad392e 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,7 +53,7 @@ struct interleave_columns_functor { }; template -struct interleave_columns_impl>> { +struct interleave_columns_impl>> { std::unique_ptr operator()(table_view const& lists_columns, bool create_mask, rmm::cuda_stream_view stream, @@ -64,7 +64,7 @@ struct interleave_columns_impl -struct interleave_columns_impl>> { +struct interleave_columns_impl>> { std::unique_ptr operator()(table_view const& structs_columns, bool create_mask, rmm::cuda_stream_view stream, @@ -131,7 +131,7 @@ struct interleave_columns_impl -struct interleave_columns_impl>> { +struct interleave_columns_impl>> { std::unique_ptr operator()(table_view const& strings_columns, bool create_mask, rmm::cuda_stream_view stream, @@ -214,7 +214,7 @@ struct interleave_columns_impl -struct interleave_columns_impl()>> { +struct interleave_columns_impl()>> { std::unique_ptr operator()(table_view const& input, bool create_mask, rmm::cuda_stream_view stream, diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 7c52856b147..a121e247258 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,8 @@ #include "rolling/rolling_jit_detail.hpp" #include "rolling_detail.hpp" +#include + #include #include #include @@ -52,8 +54,10 @@ #include #include +#include #include #include +#include #include @@ -91,14 +95,14 @@ struct DeviceRolling { // operations we do support template - DeviceRolling(size_type _min_periods, typename std::enable_if_t()>* = nullptr) + DeviceRolling(size_type _min_periods, std::enable_if_t()>* = nullptr) : min_periods(_min_periods) { } // operations we don't support template - DeviceRolling(size_type _min_periods, typename std::enable_if_t()>* = nullptr) + DeviceRolling(size_type _min_periods, std::enable_if_t()>* = nullptr) : min_periods(_min_periods) { CUDF_FAIL("Invalid aggregation/type pair"); @@ -140,23 +144,35 @@ struct DeviceRolling { }; /** - * @brief Operator for applying an ARGMAX/ARGMIN rolling aggregation on a single window. + * @brief The base struct used for checking if the combination of input type and aggregation op is + * supported. 
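Aside: many hunks in this diff are a mechanical cleanup of `typename std::enable_if_t<...>` to `std::enable_if_t<...>` (the `_t` alias already names a type, so the extra `typename` is redundant). The supported/unsupported overload-pair pattern it applies to has roughly this shape, shown here as a reduced generic sketch rather than libcudf code:

#include <cstdio>
#include <stdexcept>
#include <string>
#include <type_traits>

// One overload is enabled for supported types; a second catches everything else and
// fails at run time -- the same shape as the rolling/reduction functors above.
struct example_dispatcher {
  template <typename T, std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
  double operator()(T value) const
  {
    return static_cast<double>(value) * 2.0;  // "supported" path
  }

  template <typename T, std::enable_if_t<!std::is_arithmetic_v<T>>* = nullptr>
  double operator()(T const&) const
  {
    throw std::logic_error("type not supported");  // "unsupported" path
  }
};

int main()
{
  example_dispatcher d;
  std::printf("%f\n", d(21));  // arithmetic overload
  try {
    d(std::string{"not a number"});  // fallback overload
  } catch (std::logic_error const& e) {
    std::printf("%s\n", e.what());
  }
  return 0;
}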
*/ template -struct DeviceRollingArgMinMax { +struct DeviceRollingArgMinMaxBase { size_type min_periods; + DeviceRollingArgMinMaxBase(size_type _min_periods) : min_periods(_min_periods) {} - // what operations do we support - template static constexpr bool is_supported() { - // strictly speaking, I think it would be ok to make this work - // for comparable types as well. but right now the only use case is - // for MIN/MAX on strings. - return std::is_same_v; + // Right now only support ARGMIN/ARGMAX of strings and structs. + auto const type_supported = + std::is_same_v || std::is_same_v; + auto const op_supported = op == aggregation::Kind::ARGMIN || op == aggregation::Kind::ARGMAX; + + return type_supported && op_supported; } +}; - DeviceRollingArgMinMax(size_type _min_periods) : min_periods(_min_periods) {} +/** + * @brief Operator for applying an ARGMAX/ARGMIN rolling aggregation on a single window for string. + */ +template +struct DeviceRollingArgMinMaxString : DeviceRollingArgMinMaxBase { + DeviceRollingArgMinMaxString(size_type _min_periods) + : DeviceRollingArgMinMaxBase(_min_periods) + { + } + using DeviceRollingArgMinMaxBase::min_periods; template bool __device__ operator()(column_device_view const& input, @@ -166,14 +182,17 @@ struct DeviceRollingArgMinMax { size_type end_index, size_type current_index) { - using AggOp = typename corresponding_operator::type; + auto constexpr default_output = (op == aggregation::ARGMIN) ? ARGMIN_SENTINEL : ARGMAX_SENTINEL; + + using InputType = cudf::string_view; + using AggOp = typename corresponding_operator::type; AggOp agg_op; // declare this as volatile to avoid some compiler optimizations that lead to incorrect results // for CUDA 10.0 and below (fixed in CUDA 10.1) volatile cudf::size_type count = 0; InputType val = AggOp::template identity(); - OutputType val_index = (op == aggregation::ARGMIN) ? ARGMIN_SENTINEL : ARGMAX_SENTINEL; + OutputType val_index = default_output; for (size_type j = start_index; j < end_index; j++) { if (!has_nulls || input.is_valid(j)) { @@ -185,9 +204,9 @@ struct DeviceRollingArgMinMax { } bool output_is_valid = (count >= min_periods); - // -1 will help identify null elements while gathering for Min and Max - // In case of count, this would be null, so doesn't matter. - output.element(current_index) = (output_is_valid) ? val_index : -1; + // Use the sentinel value (i.e., -1) for the output will help identify null elements while + // gathering for Min and Max. + output.element(current_index) = output_is_valid ? val_index : default_output; // The gather mask shouldn't contain null values, so // always return zero @@ -195,6 +214,50 @@ struct DeviceRollingArgMinMax { } }; +/** + * @brief Operator for applying an ARGMAX/ARGMIN rolling aggregation on a single window for struct. + */ +template +struct DeviceRollingArgMinMaxStruct : DeviceRollingArgMinMaxBase { + DeviceRollingArgMinMaxStruct(size_type _min_periods, Comparator const& _comp) + : DeviceRollingArgMinMaxBase(_min_periods), comp(_comp) + { + } + using DeviceRollingArgMinMaxBase::min_periods; + Comparator comp; + + template + bool __device__ operator()(column_device_view const& input, + column_device_view const&, + mutable_column_device_view& output, + size_type start_index, + size_type end_index, + size_type current_index) + { + auto constexpr default_output = (op == aggregation::ARGMIN) ? ARGMIN_SENTINEL : ARGMAX_SENTINEL; + + auto const valid_count = + has_nulls ? 
thrust::count_if(thrust::seq, + thrust::make_counting_iterator(start_index), + thrust::make_counting_iterator(end_index), + [&input](size_type idx) { return input.is_valid_nocheck(idx); }) + : end_index - start_index; + + // Use the sentinel value (i.e., -1) for the output will help identify null elements while + // gathering for Min and Max. + output.element(current_index) = + (valid_count >= min_periods) ? thrust::reduce(thrust::seq, + thrust::make_counting_iterator(start_index), + thrust::make_counting_iterator(end_index), + size_type{start_index}, + comp) + : default_output; + + // The gather mask shouldn't contain null values, so always return true. + return true; + } +}; + /** * @brief Operator for applying a COUNT_VALID rolling aggregation on a single window. */ @@ -219,8 +282,8 @@ struct DeviceRollingCountValid { size_type end_index, size_type current_index) { - // declare this as volatile to avoid some compiler optimizations that lead to incorrect results - // for CUDA 10.0 and below (fixed in CUDA 10.1) + // declare this as volatile to avoid some compiler optimizations that lead to incorrect + // results for CUDA 10.0 and below (fixed in CUDA 10.1) volatile cudf::size_type count = 0; bool output_is_valid = ((end_index - start_index) >= min_periods); @@ -441,12 +504,12 @@ struct DeviceRollingLead { return cudf::is_fixed_width(); } - template ()>* = nullptr> + template ()>* = nullptr> DeviceRollingLead(size_type _row_offset) : row_offset(_row_offset) { } - template ()>* = nullptr> + template ()>* = nullptr> DeviceRollingLead(size_type _row_offset) : row_offset(_row_offset) { CUDF_FAIL("Invalid aggregation/type pair"); @@ -497,12 +560,12 @@ struct DeviceRollingLag { return cudf::is_fixed_width(); } - template ()>* = nullptr> + template ()>* = nullptr> DeviceRollingLag(size_type _row_offset) : row_offset(_row_offset) { } - template ()>* = nullptr> + template ()>* = nullptr> DeviceRollingLag(size_type _row_offset) : row_offset(_row_offset) { CUDF_FAIL("Invalid aggregation/type pair"); @@ -553,12 +616,12 @@ struct corresponding_rolling_operator { template struct corresponding_rolling_operator { - using type = DeviceRollingArgMinMax; + using type = DeviceRollingArgMinMaxBase; }; template struct corresponding_rolling_operator { - using type = DeviceRollingArgMinMax; + using type = DeviceRollingArgMinMaxBase; }; template @@ -577,13 +640,13 @@ struct corresponding_rolling_operator { }; template -struct corresponding_rolling_operator { - using type = DeviceRollingLead; +struct corresponding_rolling_operator { + using type = DeviceRollingVariance; }; template -struct corresponding_rolling_operator { - using type = DeviceRollingVariance; +struct corresponding_rolling_operator { + using type = DeviceRollingLead; }; template @@ -594,49 +657,34 @@ struct corresponding_rolling_operator { /** * @brief Functor for creating a device rolling operator based on input type and aggregation type. 
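Aside: a compact host-side restatement of the per-window rule implemented by DeviceRollingArgMinMaxStruct above: count the valid rows in the window, then emit either the comparator-reduced index or the -1 sentinel (which later marks nulls during the gather step). Sketch only; the value/validity layout and helper names are invented for illustration, and the real comparator operates on struct rows rather than ints.

#include <cstdio>
#include <vector>

constexpr int SENTINEL = -1;  // stands in for ARGMIN_SENTINEL / ARGMAX_SENTINEL

// Argmin index of window [start, end), or SENTINEL if fewer than min_periods rows are valid.
int window_argmin(std::vector<int> const& values,
                  std::vector<bool> const& valid,
                  int start, int end, int min_periods)
{
  int valid_count = 0;
  int best        = start;
  for (int i = start; i < end; ++i) {
    if (!valid[i]) continue;
    ++valid_count;
    if (!valid[best] || values[i] < values[best]) best = i;
  }
  return valid_count >= min_periods ? best : SENTINEL;
}

int main()
{
  std::vector<int> values = {5, 2, 8, 1};
  std::vector<bool> valid = {true, true, false, true};
  // window [0,3) with min_periods=2 -> argmin index 1
  std::printf("%d\n", window_argmin(values, valid, 0, 3, 2));
  // window [2,3) with min_periods=2 -> only an invalid row -> sentinel
  std::printf("%d\n", window_argmin(values, valid, 2, 3, 2));
  return 0;
}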
*/ -template +template struct create_rolling_operator { - auto operator()(size_type min_periods, rolling_aggregation const& agg) - { - CUDF_FAIL("Invalid aggregation/type pair"); - } -}; - -template -struct create_rolling_operator< - InputType, - op, - std::enable_if_t::type::is_supported()>> { - template * = nullptr> auto operator()(size_type min_periods, rolling_aggregation const&) { - return typename corresponding_rolling_operator::type(min_periods); + return typename corresponding_rolling_operator::type(min_periods); } +}; - template * = nullptr> +template +struct create_rolling_operator { auto operator()(size_type min_periods, rolling_aggregation const& agg) { return DeviceRollingVariance{ min_periods, dynamic_cast(agg)._ddof}; } +}; - template * = nullptr> +template +struct create_rolling_operator { auto operator()(size_type, rolling_aggregation const& agg) { return DeviceRollingLead{ dynamic_cast(agg).row_offset}; } +}; - template * = nullptr> +template +struct create_rolling_operator { auto operator()(size_type, rolling_aggregation const& agg) { return DeviceRollingLag{ @@ -644,6 +692,31 @@ struct create_rolling_operator< } }; +template +struct create_rolling_operator< + InputType, + k, + typename std::enable_if_t && + (k == aggregation::Kind::ARGMIN || k == aggregation::Kind::ARGMAX)>> { + auto operator()(size_type min_periods, rolling_aggregation const&) + { + return DeviceRollingArgMinMaxString{min_periods}; + } +}; + +template +struct create_rolling_operator< + InputType, + k, + typename std::enable_if_t && + (k == aggregation::Kind::ARGMIN || k == aggregation::Kind::ARGMAX)>> { + template + auto operator()(size_type min_periods, Comparator const& comp) + { + return DeviceRollingArgMinMaxStruct{min_periods, comp}; + } +}; + /** * @brief Rolling window specific implementation of simple_aggregations_collector. * @@ -652,7 +725,7 @@ struct create_rolling_operator< * happens, the equivalent aggregation/type implementation of finalize() will perform * some postprocessing step. * - * An example of this would be applying a MIN aggregation to strings. This cannot be done + * An example of this would be applying a MIN aggregation to strings. This cannot be done * directly in the rolling operation, so instead the following happens: * * - the rolling_aggregation_preprocessor transforms the incoming MIN/string pair to @@ -662,8 +735,8 @@ struct create_rolling_operator< * - The rolling_aggregation_postprocessor then takes this gather map and performs a final * gather() on the input string data to generate the final output. * - * Another example is COLLECT_LIST. COLLECT_LIST is odd in that it doesn't go through the - * normal gpu rolling kernel at all. It has a completely custom implementation. So the + * Another example is COLLECT_LIST. COLLECT_LIST is odd in that it doesn't go through the + * normal gpu rolling kernel at all. It has a completely custom implementation. So the * following happens: * * - the rolling_aggregation_preprocessor transforms the COLLECT_LIST aggregation into nothing, @@ -687,8 +760,9 @@ class rolling_aggregation_preprocessor final : public cudf::detail::simple_aggre cudf::detail::min_aggregation const&) override { std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() - : make_min_aggregation()); + aggs.push_back(col_type.id() == type_id::STRING || col_type.id() == type_id::STRUCT + ? 
make_argmin_aggregation() + : make_min_aggregation()); return aggs; } @@ -700,8 +774,9 @@ class rolling_aggregation_preprocessor final : public cudf::detail::simple_aggre cudf::detail::max_aggregation const&) override { std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmax_aggregation() - : make_max_aggregation()); + aggs.push_back(col_type.id() == type_id::STRING || col_type.id() == type_id::STRUCT + ? make_argmax_aggregation() + : make_max_aggregation()); return aggs; } @@ -787,7 +862,7 @@ class rolling_aggregation_postprocessor final : public cudf::detail::aggregation // perform a final gather on the generated ARGMIN data void visit(cudf::detail::min_aggregation const&) override { - if (result_type.id() == type_id::STRING) { + if (result_type.id() == type_id::STRING || result_type.id() == type_id::STRUCT) { // The rows that represent null elements will have negative values in gather map, // and that's why nullify_out_of_bounds/ignore_out_of_bounds is true. auto output_table = detail::gather(table_view{{input}}, @@ -805,7 +880,7 @@ class rolling_aggregation_postprocessor final : public cudf::detail::aggregation // perform a final gather on the generated ARGMAX data void visit(cudf::detail::max_aggregation const&) override { - if (result_type.id() == type_id::STRING) { + if (result_type.id() == type_id::STRING || result_type.id() == type_id::STRUCT) { // The rows that represent null elements will have negative values in gather map, // and that's why nullify_out_of_bounds/ignore_out_of_bounds is true. auto output_table = detail::gather(table_view{{input}}, @@ -901,29 +976,24 @@ class rolling_aggregation_postprocessor final : public cudf::detail::aggregation /** * @brief Computes the rolling window function * - * @tparam InputType Datatype of `input` - * @tparam OutputType Datatype of `output` - * @tparam op The aggregation operator (enum value) + * @tparam OutputType Datatype of `output` * @tparam block_size CUDA block size for the kernel * @tparam has_nulls true if the input column has nulls * @tparam DeviceRollingOperator An operator that performs a single windowing operation * @tparam PrecedingWindowIterator iterator type (inferred) * @tparam FollowingWindowIterator iterator type (inferred) - * @param input Input column device view - * @param default_outputs A column of per-row default values to be returned instead - * of nulls for certain aggregation types. - * @param output Output column device view - * @param output_valid_count Output count of valid values - * @param device_operator The operator used to perform a single window operation + * @param[in] input Input column device view + * @param[in] default_outputs A column of per-row default values to be returned instead + * of nulls for certain aggregation types. 
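Aside: the postprocessing step above relies on detail::gather nullifying out-of-bounds indices, so the -1 sentinels produced by ARGMIN/ARGMAX become nulls in the final MIN/MAX output. A host-side sketch of that convention (hypothetical types and helper name, not the libcudf gather API):

#include <cstdio>
#include <optional>
#include <string>
#include <vector>

// Gather `values` by `gather_map`; any out-of-range index (e.g. the -1 sentinel)
// produces a null (std::nullopt), matching nullify_out_of_bounds behaviour.
std::vector<std::optional<std::string>> gather_nullify(
  std::vector<std::string> const& values, std::vector<int> const& gather_map)
{
  std::vector<std::optional<std::string>> out;
  for (int idx : gather_map) {
    if (idx < 0 || idx >= static_cast<int>(values.size())) {
      out.push_back(std::nullopt);
    } else {
      out.push_back(values[idx]);
    }
  }
  return out;
}

int main()
{
  std::vector<std::string> strings = {"apple", "banana", "cherry"};
  std::vector<int> argmin_result   = {0, -1, 2};  // -1: window had too few valid rows
  for (auto const& s : gather_nullify(strings, argmin_result)) {
    std::printf("%s\n", s ? s->c_str() : "<null>");
  }
  return 0;
}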
+ * @param[out] output Output column device view + * @param[out] output_valid_count Output count of valid values + * @param[in] device_operator The operator used to perform a single window operation * @param[in] preceding_window_begin Rolling window size iterator, accumulates from - * in_col[i-preceding_window] to in_col[i] inclusive + * in_col[i-preceding_window] to in_col[i] inclusive * @param[in] following_window_begin Rolling window size iterator in the forward - * direction, accumulates from in_col[i] to - * in_col[i+following_window] inclusive + * direction, accumulates from in_col[i] to in_col[i+following_window] inclusive */ -template {}(min_periods, agg); - - auto output = - make_fixed_width_column(output_type, input.size(), mask_state::UNINITIALIZED, stream, mr); - - cudf::mutable_column_view output_view = output->mutable_view(); - - size_type valid_count{0}; - { - using Type = device_storage_type_t; - using OutType = device_storage_type_t>; + auto const do_rolling = [&](auto const& device_op) { + auto output = make_fixed_width_column( + target_type(input.type(), op), input.size(), mask_state::UNINITIALIZED, stream, mr); - constexpr cudf::size_type block_size = 256; - cudf::detail::grid_1d grid(input.size(), block_size); + auto const d_inp_ptr = column_device_view::create(input, stream); + auto const d_default_out_ptr = column_device_view::create(default_outputs, stream); + auto const d_out_ptr = mutable_column_device_view::create(output->mutable_view(), stream); + auto d_valid_count = rmm::device_scalar{0, stream}; - auto input_device_view = column_device_view::create(input, stream); - auto output_device_view = mutable_column_device_view::create(output_view, stream); - auto default_outputs_device_view = column_device_view::create(default_outputs, stream); - - rmm::device_scalar device_valid_count{0, stream}; + auto constexpr block_size = 256; + auto const grid = cudf::detail::grid_1d(input.size(), block_size); + using OutType = device_storage_type_t>; if (input.has_nulls()) { - gpu_rolling - <<>>(*input_device_view, - *default_outputs_device_view, - *output_device_view, - device_valid_count.data(), - device_operator, + gpu_rolling + <<>>(*d_inp_ptr, + *d_default_out_ptr, + *d_out_ptr, + d_valid_count.data(), + device_op, preceding_window_begin, following_window_begin); } else { - gpu_rolling - <<>>(*input_device_view, - *default_outputs_device_view, - *output_device_view, - device_valid_count.data(), - device_operator, + gpu_rolling + <<>>(*d_inp_ptr, + *d_default_out_ptr, + *d_out_ptr, + d_valid_count.data(), + device_op, preceding_window_begin, following_window_begin); } - valid_count = device_valid_count.value(stream); - - // check the stream for debugging - CHECK_CUDA(stream.value()); + auto const valid_count = d_valid_count.value(stream); + output->set_null_count(output->size() - valid_count); + + return output; + }; // end do_rolling + + auto constexpr is_arg_minmax = + op == aggregation::Kind::ARGMIN || op == aggregation::Kind::ARGMAX; + + if constexpr (is_arg_minmax && std::is_same_v) { + // Using comp_generator to create a LESS operator for finding ARGMIN/ARGMAX of structs. 
+ auto const comp_generator = + cudf::reduction::detail::comparison_binop_generator::create(input, stream); + auto const device_op = + create_rolling_operator{}(min_periods, comp_generator.binop()); + return do_rolling(device_op); + } else { // all the remaining rolling operations + auto const device_op = create_rolling_operator{}(min_periods, agg); + return do_rolling(device_op); } - - output->set_null_count(output->size() - valid_count); - - return output; } template >* = nullptr> +template >* = nullptr> T __device__ generic_abs(T value) { return numeric::detail::abs(value); } -template >* = nullptr> +template >* = nullptr> T __device__ generic_abs(T value) { return value; } -template >* = nullptr> +template >* = nullptr> int16_t __device__ generic_sign(T value) { return value < 0 ? -1 : 1; } // this is needed to suppress warning: pointless comparison of unsigned integer with zero -template >* = nullptr> +template >* = nullptr> int16_t __device__ generic_sign(T) { return 1; @@ -83,13 +83,13 @@ constexpr inline auto is_supported_round_type() template struct half_up_zero { T n; // unused in the decimal_places = 0 case - template ()>* = nullptr> + template ()>* = nullptr> __device__ U operator()(U e) { return generic_round(e); } - template >* = nullptr> + template >* = nullptr> __device__ U operator()(U) { assert(false); // Should never get here. Just for compilation @@ -100,7 +100,7 @@ struct half_up_zero { template struct half_up_positive { T n; - template ()>* = nullptr> + template ()>* = nullptr> __device__ U operator()(U e) { T integer_part; @@ -108,7 +108,7 @@ struct half_up_positive { return integer_part + generic_round(fractional_part * n) / n; } - template >* = nullptr> + template >* = nullptr> __device__ U operator()(U) { assert(false); // Should never get here. Just for compilation @@ -119,13 +119,13 @@ struct half_up_positive { template struct half_up_negative { T n; - template ()>* = nullptr> + template ()>* = nullptr> __device__ U operator()(U e) { return generic_round(e / n) * n; } - template >* = nullptr> + template >* = nullptr> __device__ U operator()(U e) { auto const down = (e / n) * n; // result from rounding down @@ -136,13 +136,13 @@ struct half_up_negative { template struct half_even_zero { T n; // unused in the decimal_places = 0 case - template ()>* = nullptr> + template ()>* = nullptr> __device__ U operator()(U e) { return generic_round_half_even(e); } - template >* = nullptr> + template >* = nullptr> __device__ U operator()(U) { assert(false); // Should never get here. Just for compilation @@ -153,7 +153,7 @@ struct half_even_zero { template struct half_even_positive { T n; - template ()>* = nullptr> + template ()>* = nullptr> __device__ U operator()(U e) { T integer_part; @@ -161,7 +161,7 @@ struct half_even_positive { return integer_part + generic_round_half_even(fractional_part * n) / n; } - template >* = nullptr> + template >* = nullptr> __device__ U operator()(U) { assert(false); // Should never get here. 
Just for compilation @@ -172,13 +172,13 @@ struct half_even_positive { template struct half_even_negative { T n; - template ()>* = nullptr> + template ()>* = nullptr> __device__ U operator()(U e) { return generic_round_half_even(e / n) * n; } - template >* = nullptr> + template >* = nullptr> __device__ U operator()(U e) { auto const down_over_n = e / n; // use this to determine HALF_EVEN case @@ -205,7 +205,7 @@ struct half_even_fixed_point { template typename RoundFunctor, - typename std::enable_if_t()>* = nullptr> + std::enable_if_t()>* = nullptr> std::unique_ptr round_with(column_view const& input, int32_t decimal_places, rmm::cuda_stream_view stream, @@ -231,7 +231,7 @@ std::unique_ptr round_with(column_view const& input, template typename RoundFunctor, - typename std::enable_if_t()>* = nullptr> + std::enable_if_t()>* = nullptr> std::unique_ptr round_with(column_view const& input, int32_t decimal_places, rmm::cuda_stream_view stream, diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp index c18b57d220f..3a2920f8f1a 100644 --- a/cpp/src/scalar/scalar_factories.cpp +++ b/cpp/src/scalar/scalar_factories.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,8 +28,8 @@ namespace cudf { namespace { struct scalar_construction_helper { template , - typename std::enable_if_t() and not is_fixed_point()>* = nullptr> + typename ScalarType = scalar_type_t, + std::enable_if_t() and not is_fixed_point()>* = nullptr> std::unique_ptr operator()(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { @@ -39,8 +39,8 @@ struct scalar_construction_helper { } template , - typename std::enable_if_t()>* = nullptr> + typename ScalarType = scalar_type_t, + std::enable_if_t()>* = nullptr> std::unique_ptr operator()(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { @@ -49,9 +49,7 @@ struct scalar_construction_helper { return std::unique_ptr(s); } - template ()>* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(Args... args) const { CUDF_FAIL("Invalid type."); @@ -124,14 +122,14 @@ namespace { struct default_scalar_functor { data_type type; - template ()>* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return make_fixed_width_scalar(data_type(type_to_id()), stream, mr); } - template ()>* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu index 42b57bdb47a..5ce82cd3740 100644 --- a/cpp/src/sort/sort.cu +++ b/cpp/src/sort/sort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,7 +57,7 @@ std::unique_ptr
sort_by_key(table_view const& values, } struct inplace_column_sort_fn { - template ()>* = nullptr> + template ()>* = nullptr> void operator()(mutable_column_view& col, bool ascending, rmm::cuda_stream_view stream) const { CUDF_EXPECTS(!col.has_nulls(), "Nulls not supported for in-place sort"); @@ -68,7 +68,7 @@ struct inplace_column_sort_fn { } } - template ()>* = nullptr> + template ()>* = nullptr> void operator()(mutable_column_view&, bool, rmm::cuda_stream_view) const { CUDF_FAIL("Column type must be relationally comparable and fixed-width"); diff --git a/cpp/src/sort/sort_column.cu b/cpp/src/sort/sort_column.cu index 74c796e7962..7a4072cf8ae 100644 --- a/cpp/src/sort/sort_column.cu +++ b/cpp/src/sort/sort_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,7 +43,7 @@ struct column_sorted_order_fn { * @param ascending True if sort order is ascending * @param stream CUDA stream used for device memory operations and kernel launches */ - template ()>* = nullptr> + template ()>* = nullptr> void radix_sort(column_view const& input, mutable_column_view& indices, bool ascending, @@ -68,7 +68,7 @@ struct column_sorted_order_fn { thrust::greater()); } } - template ()>* = nullptr> + template ()>* = nullptr> void radix_sort(column_view const&, mutable_column_view&, bool, rmm::cuda_stream_view) { CUDF_FAIL("Only fixed-width types are suitable for faster sorting"); @@ -85,8 +85,7 @@ struct column_sorted_order_fn { * @param null_precedence How null rows are to be ordered * @param stream CUDA stream used for device memory operations and kernel launches */ - template ()>* = nullptr> + template ()>* = nullptr> void operator()(column_view const& input, mutable_column_view& indices, bool ascending, @@ -105,8 +104,7 @@ struct column_sorted_order_fn { } } - template ()>* = nullptr> + template ()>* = nullptr> void operator()(column_view const&, mutable_column_view&, bool, null_order, rmm::cuda_stream_view) { CUDF_FAIL("Column type must be relationally comparable"); diff --git a/cpp/src/sort/stable_sort.cu b/cpp/src/sort/stable_sort.cu index 75335579de2..1d3734cace5 100644 --- a/cpp/src/sort/stable_sort.cu +++ b/cpp/src/sort/stable_sort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include "sort_impl.cuh" #include +#include #include #include #include @@ -34,6 +35,26 @@ std::unique_ptr stable_sorted_order(table_view const& input, return sorted_order(input, column_order, null_precedence, stream, mr); } +std::unique_ptr
stable_sort_by_key(table_view const& values, + table_view const& keys, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(values.num_rows() == keys.num_rows(), + "Mismatch in number of rows for values and keys"); + + auto sorted_order = detail::stable_sorted_order( + keys, column_order, null_precedence, stream, rmm::mr::get_current_device_resource()); + + return detail::gather(values, + sorted_order->view(), + out_of_bounds_policy::DONT_CHECK, + detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} } // namespace detail std::unique_ptr stable_sorted_order(table_view const& input, @@ -45,4 +66,15 @@ std::unique_ptr stable_sorted_order(table_view const& input, input, column_order, null_precedence, rmm::cuda_stream_default, mr); } +std::unique_ptr
stable_sort_by_key(table_view const& values, + table_view const& keys, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::stable_sort_by_key( + values, keys, column_order, null_precedence, rmm::cuda_stream_default, mr); +} + } // namespace cudf diff --git a/cpp/src/sort/stable_sort_column.cu b/cpp/src/sort/stable_sort_column.cu index 49aecf52625..d79a691a580 100644 --- a/cpp/src/sort/stable_sort_column.cu +++ b/cpp/src/sort/stable_sort_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,7 +28,7 @@ struct column_stable_sorted_order_fn { * @param indices Output sorted indices * @param stream CUDA stream used for device memory operations and kernel launches */ - template ()>* = nullptr> + template ()>* = nullptr> void faster_stable_sort(column_view const& input, mutable_column_view& indices, rmm::cuda_stream_view stream) @@ -38,7 +38,7 @@ struct column_stable_sorted_order_fn { thrust::stable_sort_by_key( rmm::exec_policy(stream), d_col.begin(), d_col.end(), indices.begin()); } - template ()>* = nullptr> + template ()>* = nullptr> void faster_stable_sort(column_view const&, mutable_column_view&, rmm::cuda_stream_view) { CUDF_FAIL("Only fixed-width types are suitable for faster stable sorting"); @@ -55,8 +55,7 @@ struct column_stable_sorted_order_fn { * @param null_precedence How null rows are to be ordered * @param stream CUDA stream used for device memory operations and kernel launches */ - template ()>* = nullptr> + template ()>* = nullptr> void operator()(column_view const& input, mutable_column_view& indices, bool ascending, @@ -74,8 +73,7 @@ struct column_stable_sorted_order_fn { faster_stable_sort(input, indices, stream); } } - template ()>* = nullptr> + template ()>* = nullptr> void operator()(column_view const&, mutable_column_view&, bool, null_order, rmm::cuda_stream_view) { CUDF_FAIL("Column type must be relationally comparable"); diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index efdee65c1f6..23bc5cf2dfe 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,10 @@ * limitations under the License. */ +#include +#include +#include + #include #include #include @@ -23,123 +27,90 @@ #include #include #include -#include -#include #include #include +#include + namespace cudf { namespace strings { namespace detail { + namespace { /** * @brief This functor handles both contains_re and match_re to minimize the number * of regex calls to find() to be inlined greatly reducing compile time. - * - * The stack is used to keep progress on evaluating the regex instructions on each string. - * So the size of the stack is in proportion to the number of instructions in the given regex - * pattern. - * - * There are three call types based on the number of regex instructions in the given pattern. - * Small to medium instruction lengths can use the stack effectively though smaller executes faster. - * Longer patterns require global memory. 
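Aside: the stable_sort_by_key API introduced above is implemented as a stable argsort of the keys followed by a gather of the values. A host-side sketch of the same composition (plain C++, illustrative values; the real API operates on table_views):

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <string>
#include <vector>

// stable_sort_by_key == stable sorted order of the keys, then gather the values.
int main()
{
  std::vector<int> keys           = {3, 1, 3, 2};
  std::vector<std::string> values = {"a", "b", "c", "d"};

  // 1) stable sorted order of the keys (ties keep their original relative order)
  std::vector<int> order(keys.size());
  std::iota(order.begin(), order.end(), 0);
  std::stable_sort(order.begin(), order.end(), [&](int l, int r) { return keys[l] < keys[r]; });

  // 2) gather the values through that order
  for (int idx : order) { std::printf("%s ", values[idx].c_str()); }
  std::printf("\n");  // prints: b d a c
  return 0;
}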
*/ template struct contains_fn { reprog_device prog; - column_device_view d_strings; - bool bmatch{false}; // do not make this a template parameter to keep compile times down + column_device_view const d_strings; + bool const beginning_only; // do not make this a template parameter to keep compile times down __device__ bool operator()(size_type idx) { if (d_strings.is_null(idx)) return false; - string_view d_str = d_strings.element(idx); - int32_t begin = 0; - int32_t end = bmatch ? 1 // match only the beginning of the string; - : -1; // this handles empty strings too + auto const d_str = d_strings.element(idx); + int32_t begin = 0; + int32_t end = beginning_only ? 1 // match only the beginning of the string; + : -1; // match anywhere in the string return static_cast(prog.find(idx, d_str, begin, end)); } }; -// -std::unique_ptr contains_util( - strings_column_view const& strings, - std::string const& pattern, - regex_flags const flags, - bool beginning_only = false, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_count = strings.size(); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - - // compile regex into device object - auto prog = - reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); - auto d_prog = *prog; - - // create the output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); +struct contains_dispatch_fn { + reprog_device d_prog; + bool const beginning_only; - // fill the output column - int regex_insts = d_prog.insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - contains_fn{d_prog, d_column, beginning_only}); - else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - contains_fn{d_prog, d_column, beginning_only}); - else if (regex_insts <= RX_LARGE_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - contains_fn{d_prog, d_column, beginning_only}); - else + template + std::unique_ptr operator()(strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto results = make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + + auto const d_strings = column_device_view::create(input.parent(), stream); thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - contains_fn{d_prog, d_column, beginning_only}); - - results->set_null_count(strings.null_count()); - return results; -} + thrust::make_counting_iterator(input.size()), + results->mutable_view().data(), + contains_fn{d_prog, *d_strings, beginning_only}); + return results; + } +}; } // namespace std::unique_ptr contains_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, 
regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - return contains_util(strings, pattern, flags, false, stream, mr); + auto d_prog = + reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); + + return regex_dispatcher(*d_prog, contains_dispatch_fn{*d_prog, false}, input, stream, mr); } std::unique_ptr matches_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - return contains_util(strings, pattern, flags, true, stream, mr); + auto d_prog = + reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); + + return regex_dispatcher(*d_prog, contains_dispatch_fn{*d_prog, true}, input, stream, mr); } } // namespace detail @@ -172,12 +143,12 @@ namespace { template struct count_fn { reprog_device prog; - column_device_view d_strings; + column_device_view const d_strings; __device__ int32_t operator()(unsigned int idx) { if (d_strings.is_null(idx)) return 0; - string_view d_str = d_strings.element(idx); + auto const d_str = d_strings.element(idx); auto const nchars = d_str.length(); int32_t find_count = 0; int32_t begin = 0; @@ -191,62 +162,45 @@ struct count_fn { } }; +struct count_dispatch_fn { + reprog_device d_prog; + + template + std::unique_ptr operator()(strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto results = make_numeric_column(data_type{type_id::INT32}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + + auto const d_strings = column_device_view::create(input.parent(), stream); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + results->mutable_view().data(), + count_fn{d_prog, *d_strings}); + return results; + } +}; + } // namespace std::unique_ptr count_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto strings_count = strings.size(); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // compile regex into device object - auto prog = - reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); - auto d_prog = *prog; - - // create the output column - auto results = make_numeric_column(data_type{type_id::INT32}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - - // fill the output column - int regex_insts = d_prog.insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - count_fn{d_prog, d_column}); - else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - count_fn{d_prog, d_column}); - else if (regex_insts <= RX_LARGE_INSTS) - 
thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - count_fn{d_prog, d_column}); - else - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_results, - count_fn{d_prog, d_column}); + auto d_prog = + reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); - results->set_null_count(strings.null_count()); - return results; + return regex_dispatcher(*d_prog, count_dispatch_fn{*d_prog}, input, stream, mr); } } // namespace detail diff --git a/cpp/src/strings/count_matches.cu b/cpp/src/strings/count_matches.cu index d0a6825666b..ae996cafd2c 100644 --- a/cpp/src/strings/count_matches.cu +++ b/cpp/src/strings/count_matches.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -54,6 +55,27 @@ struct count_matches_fn { return count; } }; + +struct count_dispatch_fn { + reprog_device d_prog; + + template + std::unique_ptr operator()(column_device_view const& d_strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto results = make_numeric_column( + data_type{type_id::INT32}, d_strings.size() + 1, mask_state::UNALLOCATED, stream, mr); + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(d_strings.size()), + results->mutable_view().data(), + count_matches_fn{d_strings, d_prog}); + return results; + } +}; + } // namespace /** @@ -71,31 +93,7 @@ std::unique_ptr count_matches(column_device_view const& d_strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // Create output column - auto counts = make_numeric_column( - data_type{type_id::INT32}, d_strings.size() + 1, mask_state::UNALLOCATED, stream, mr); - auto d_counts = counts->mutable_view().data(); - - auto begin = thrust::make_counting_iterator(0); - auto end = thrust::make_counting_iterator(d_strings.size()); - - // Count matches - auto const regex_insts = d_prog.insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) { - count_matches_fn fn{d_strings, d_prog}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - count_matches_fn fn{d_strings, d_prog}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); - } else if (regex_insts <= RX_LARGE_INSTS) { - count_matches_fn fn{d_strings, d_prog}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); - } else { - count_matches_fn fn{d_strings, d_prog}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_counts, fn); - } - - return counts; + return regex_dispatcher(d_prog, count_dispatch_fn{d_prog}, d_strings, stream, mr); } } // namespace detail diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index a67af9442f0..7394cdac6bb 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include @@ -77,53 +78,44 @@ struct extract_fn { thrust::fill(thrust::seq, d_output.begin(), d_output.end(), string_index_pair{nullptr, 0}); } }; + +struct extract_dispatch_fn { + reprog_device d_prog; + + template + void operator()(column_device_view const& d_strings, + cudf::detail::device_2dspan& d_indices, + rmm::cuda_stream_view stream) + { + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + d_strings.size(), + extract_fn{d_prog, d_strings, d_indices}); + } +}; } // namespace // std::unique_ptr
extract( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto const strings_count = strings.size(); - auto const strings_column = column_device_view::create(strings.parent(), stream); - auto const d_strings = *strings_column; - // compile regex into device object - auto prog = - reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); - auto d_prog = *prog; - // extract should include groups - auto const groups = d_prog.group_counts(); + auto d_prog = + reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); + + auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "Group indicators not found in regex pattern"); - rmm::device_uvector indices(strings_count * groups, stream); - cudf::detail::device_2dspan d_indices(indices.data(), strings_count, groups); + auto indices = rmm::device_uvector(input.size() * groups, stream); + auto d_indices = + cudf::detail::device_2dspan(indices.data(), input.size(), groups); - auto const regex_insts = d_prog.insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) { - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - extract_fn{d_prog, d_strings, d_indices}); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - extract_fn{d_prog, d_strings, d_indices}); - } else if (regex_insts <= RX_LARGE_INSTS) { - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - extract_fn{d_prog, d_strings, d_indices}); - } else { - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - extract_fn{d_prog, d_strings, d_indices}); - } + auto const d_strings = column_device_view::create(input.parent(), stream); + regex_dispatcher(*d_prog, extract_dispatch_fn{*d_prog}, *d_strings, d_indices, stream); // build a result column for each group std::vector> results(groups); @@ -135,7 +127,7 @@ std::unique_ptr
extract( 0, [column_index, groups] __device__(size_type idx) { return (idx * groups) + column_index; })); - return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr); + return make_strings_column(indices_itr, indices_itr + input.size(), stream, mr); }; std::transform(thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index e27dccb9338..1f1474c777b 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -86,6 +87,28 @@ struct extract_fn { } } }; + +struct extract_dispatch_fn { + reprog_device d_prog; + + template + std::unique_ptr operator()(column_device_view const& d_strings, + size_type total_groups, + offset_type const* d_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + rmm::device_uvector indices(total_groups, stream); + + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + d_strings.size(), + extract_fn{d_strings, d_prog, d_offsets, indices.data()}); + + return make_strings_column(indices.begin(), indices.end(), stream, mr); + } +}; + } // namespace /** @@ -94,14 +117,14 @@ struct extract_fn { * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_all_record( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto const strings_count = strings.size(); - auto const d_strings = column_device_view::create(strings.parent(), stream); + auto const strings_count = input.size(); + auto const d_strings = column_device_view::create(input.parent(), stream); // Compile regex into device object. auto d_prog = @@ -143,29 +166,8 @@ std::unique_ptr extract_all_record( auto const total_groups = cudf::detail::get_value(offsets->view(), strings_count, stream); - // Create an indices vector with the total number of groups that will be extracted. - rmm::device_uvector indices(total_groups, stream); - auto d_indices = indices.data(); - auto begin = thrust::make_counting_iterator(0); - - // Call the extract functor to fill in the indices vector. - auto const regex_insts = d_prog->insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) { - extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } else if (regex_insts <= RX_LARGE_INSTS) { - extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } else { - extract_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } - - // Build the child strings column from the indices. - auto strings_output = make_strings_column(indices.begin(), indices.end(), stream, mr); + auto strings_output = regex_dispatcher( + *d_prog, extract_dispatch_fn{*d_prog}, *d_strings, total_groups, d_offsets, stream, mr); // Build the lists column from the offsets and the strings. 
return make_lists_column(strings_count, diff --git a/cpp/src/strings/regex/dispatcher.hpp b/cpp/src/strings/regex/dispatcher.hpp new file mode 100644 index 00000000000..9ff51d1c979 --- /dev/null +++ b/cpp/src/strings/regex/dispatcher.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * The stack is used to keep progress (state) on evaluating the regex instructions on each string. + * So the size of the stack is in proportion to the number of instructions in the given regex + * pattern. + * + * There are four call types based on the number of regex instructions in the given pattern. + * Small, medium, and large instruction counts can use the stack effectively. + * Smaller stack sizes execute faster. + * + * Patterns with instruction counts bigger than large use global memory rather than the stack + * for managing the evaluation state data. + * + * @tparam Functor The functor to invoke with stack size templated value. + * @tparam Ts Parameter types for the functor call. + */ +template +constexpr decltype(auto) regex_dispatcher(reprog_device d_prog, Functor f, Ts&&... args) +{ + auto const num_regex_insts = d_prog.insts_counts(); + if (num_regex_insts <= RX_SMALL_INSTS) { + return f.template operator()(std::forward(args)...); + } + if (num_regex_insts <= RX_MEDIUM_INSTS) { + return f.template operator()(std::forward(args)...); + } + if (num_regex_insts <= RX_LARGE_INSTS) { + return f.template operator()(std::forward(args)...); + } + + return f.template operator()(std::forward(args)...); +} + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index ff86d7aa552..27e0bd4fac9 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
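Aside: the new regex_dispatcher above centralizes the repeated small/medium/large/global-memory branching: it maps a runtime instruction count to a compile-time stack-size template parameter and forwards everything else to a functor with a templated operator(). A reduced, self-contained sketch of that dispatch shape follows; the thresholds and names here are invented for illustration, while the real limits live in the regex headers.

#include <cstdio>
#include <utility>

// Invoke f.operator()<Size>(args...) with Size chosen from a runtime count,
// mirroring the shape of strings::detail::regex_dispatcher.
template <typename Functor, typename... Ts>
decltype(auto) size_dispatcher(int count, Functor f, Ts&&... args)
{
  if (count <= 10) { return f.template operator()<10>(std::forward<Ts>(args)...); }
  if (count <= 100) { return f.template operator()<100>(std::forward<Ts>(args)...); }
  return f.template operator()<0>(std::forward<Ts>(args)...);  // 0: "use global memory"
}

struct print_stack_size_fn {
  template <int stack_size>
  void operator()(char const* label) const
  {
    std::printf("%s -> stack size template parameter %d\n", label, stack_size);
  }
};

int main()
{
  size_dispatcher(7, print_stack_size_fn{}, "small pattern");
  size_dispatcher(42, print_stack_size_fn{}, "medium pattern");
  size_dispatcher(5000, print_stack_size_fn{}, "large pattern");
  return 0;
}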
@@ -16,6 +16,7 @@ #include "backref_re.cuh" +#include #include #include @@ -95,27 +96,54 @@ std::pair> parse_backrefs(std::string con return {rtn, backrefs}; } +template +struct replace_dispatch_fn { + reprog_device d_prog; + + template + std::unique_ptr operator()(strings_column_view const& input, + string_view const& d_repl_template, + Iterator backrefs_begin, + Iterator backrefs_end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto const d_strings = column_device_view::create(input.parent(), stream); + + auto children = make_strings_children( + backrefs_fn{ + *d_strings, d_prog, d_repl_template, backrefs_begin, backrefs_end}, + input.size(), + stream, + mr); + + return make_strings_column(input.size(), + std::move(children.first), + std::move(children.second), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); + } +}; + } // namespace // std::unique_ptr replace_with_backrefs( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, std::string const& replacement, regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - if (strings.is_empty()) return make_empty_column(type_id::STRING); + if (input.is_empty()) return make_empty_column(type_id::STRING); CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); CUDF_EXPECTS(!replacement.empty(), "Parameter replacement must not be empty"); - auto d_strings = column_device_view::create(strings.parent(), stream); // compile regex into device object auto d_prog = - reprog_device::create(pattern, flags, get_character_flags_table(), strings.size(), stream); - auto const regex_insts = d_prog->insts_counts(); + reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); // parse the repl string for back-ref indicators auto const parse_result = parse_backrefs(replacement); @@ -125,45 +153,14 @@ std::unique_ptr replace_with_backrefs( string_view const d_repl_template = repl_scalar.value(); using BackRefIterator = decltype(backrefs.begin()); - - // create child columns - auto [offsets, chars] = [&] { - if (regex_insts <= RX_SMALL_INSTS) { - return make_strings_children( - backrefs_fn{ - *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, - strings.size(), - stream, - mr); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - return make_strings_children( - backrefs_fn{ - *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, - strings.size(), - stream, - mr); - } else if (regex_insts <= RX_LARGE_INSTS) { - return make_strings_children( - backrefs_fn{ - *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, - strings.size(), - stream, - mr); - } else { - return make_strings_children( - backrefs_fn{ - *d_strings, *d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, - strings.size(), - stream, - mr); - } - }(); - - return make_strings_column(strings.size(), - std::move(offsets), - std::move(chars), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); + return regex_dispatcher(*d_prog, + replace_dispatch_fn{*d_prog}, + input, + d_repl_template, + backrefs.begin(), + backrefs.end(), + stream, + mr); } } // namespace detail diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 2b5380b76dd..22f6d2cba39 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -1,5 
+1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -30,6 +31,8 @@ #include +#include + namespace cudf { namespace strings { namespace detail { @@ -40,16 +43,6 @@ using found_range = thrust::pair; /** * @brief This functor handles replacing strings by applying the compiled regex patterns * and inserting the corresponding new string within the matched range of characters. - * - * The logic includes computing the size of each string and also writing the output. - * - * The stack is used to keep progress on evaluating the regex instructions on each string. - * So the size of the stack is in proportion to the number of instructions in the given regex - * pattern. - * - * There are three call types based on the number of regex instructions in the given pattern. - * Small to medium instruction lengths can use the stack effectively though smaller executes faster. - * Longer patterns require global memory. Shorter patterns are common in data cleaning. */ template struct replace_multi_regex_fn { @@ -127,69 +120,76 @@ struct replace_multi_regex_fn { } }; +struct replace_dispatch_fn { + template + std::unique_ptr operator()(strings_column_view const& input, + device_span d_progs, + strings_column_view const& replacements, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto const d_strings = column_device_view::create(input.parent(), stream); + auto const d_repls = column_device_view::create(replacements.parent(), stream); + + auto found_ranges = rmm::device_uvector(d_progs.size() * input.size(), stream); + + auto children = make_strings_children( + replace_multi_regex_fn{*d_strings, d_progs, found_ranges.data(), *d_repls}, + input.size(), + stream, + mr); + + return make_strings_column(input.size(), + std::move(children.first), + std::move(children.second), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); + } +}; + } // namespace std::unique_ptr replace_re( - strings_column_view const& strings, + strings_column_view const& input, std::vector const& patterns, strings_column_view const& replacements, regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); - if (patterns.empty()) // no patterns; just return a copy - return std::make_unique(strings.parent(), stream, mr); + if (input.is_empty()) { return make_empty_column(type_id::STRING); } + if (patterns.empty()) { // if no patterns; just return a copy + return std::make_unique(input.parent(), stream, mr); + } CUDF_EXPECTS(!replacements.has_nulls(), "Parameter replacements must not have any nulls"); - auto d_strings = column_device_view::create(strings.parent(), stream); - auto d_repls = column_device_view::create(replacements.parent(), stream); - auto d_char_table = get_character_flags_table(); - // compile regexes into device objects - size_type regex_insts = 0; - std::vector>> h_progs; - std::vector progs; - for (auto itr = patterns.begin(); itr != patterns.end(); ++itr) { - auto prog = reprog_device::create(*itr, flags, d_char_table, strings_count, stream); - regex_insts = std::max(regex_insts, prog->insts_counts()); - 
progs.push_back(*prog); - h_progs.emplace_back(std::move(prog)); - } + auto const d_char_table = get_character_flags_table(); + auto h_progs = std::vector>>( + patterns.size()); + std::transform(patterns.begin(), + patterns.end(), + h_progs.begin(), + [flags, d_char_table, input, stream](auto const& ptn) { + return reprog_device::create(ptn, flags, d_char_table, input.size(), stream); + }); + + // get the longest regex for the dispatcher + auto const max_prog = + std::max_element(h_progs.begin(), h_progs.end(), [](auto const& lhs, auto const& rhs) { + return lhs->insts_counts() < rhs->insts_counts(); + }); // copy all the reprog_device instances to a device memory array + std::vector progs; + std::transform(h_progs.begin(), h_progs.end(), std::back_inserter(progs), [](auto const& d_prog) { + return *d_prog; + }); auto d_progs = cudf::detail::make_device_uvector_async(progs, stream); - // create working buffer for ranges pairs - rmm::device_uvector found_ranges(patterns.size() * strings_count, stream); - auto d_found_ranges = found_ranges.data(); - - // create child columns - auto children = [&] { - // Each invocation is predicated on the stack size which is dependent on the number of regex - // instructions - if (regex_insts <= RX_SMALL_INSTS) { - replace_multi_regex_fn fn{*d_strings, d_progs, d_found_ranges, *d_repls}; - return make_strings_children(fn, strings_count, stream, mr); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - replace_multi_regex_fn fn{*d_strings, d_progs, d_found_ranges, *d_repls}; - return make_strings_children(fn, strings_count, stream, mr); - } else if (regex_insts <= RX_LARGE_INSTS) { - replace_multi_regex_fn fn{*d_strings, d_progs, d_found_ranges, *d_repls}; - return make_strings_children(fn, strings_count, stream, mr); - } else { - replace_multi_regex_fn fn{*d_strings, d_progs, d_found_ranges, *d_repls}; - return make_strings_children(fn, strings_count, stream, mr); - } - }(); - - return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); + return regex_dispatcher( + **max_prog, replace_dispatch_fn{}, input, d_progs, replacements, stream, mr); } } // namespace detail diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 2c594bb86a8..d42359deeac 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -36,16 +37,6 @@ namespace { /** * @brief This functor handles replacing strings by applying the compiled regex pattern * and inserting the new string within the matched range of characters. - * - * The logic includes computing the size of each string and also writing the output. - * - * The stack is used to keep progress on evaluating the regex instructions on each string. - * So the size of the stack is in proportion to the number of instructions in the given regex - * pattern. - * - * There are three call types based on the number of regex instructions in the given pattern. - * Small to medium instruction lengths can use the stack effectively though smaller executes faster. - * Longer patterns require global memory. Shorter patterns are common in data cleaning. 
*/ template struct replace_regex_fn { @@ -108,11 +99,37 @@ struct replace_regex_fn { } }; +struct replace_dispatch_fn { + reprog_device d_prog; + + template + std::unique_ptr operator()(strings_column_view const& input, + string_view const& d_replacement, + size_type max_replace_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto const d_strings = column_device_view::create(input.parent(), stream); + + auto children = make_strings_children( + replace_regex_fn{*d_strings, d_prog, d_replacement, max_replace_count}, + input.size(), + stream, + mr); + + return make_strings_column(input.size(), + std::move(children.first), + std::move(children.second), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); + } +}; + } // namespace // std::unique_ptr replace_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, string_scalar const& replacement, std::optional max_replace_count, @@ -120,49 +137,19 @@ std::unique_ptr replace_re( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); + if (input.is_empty()) return make_empty_column(type_id::STRING); CUDF_EXPECTS(replacement.is_valid(stream), "Parameter replacement must be valid"); string_view d_repl(replacement.data(), replacement.size()); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; // compile regex into device object - auto prog = - reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); - auto d_prog = *prog; - auto const regex_insts = d_prog.insts_counts(); - - // copy null mask - auto null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - auto const null_count = strings.null_count(); - auto const maxrepl = max_replace_count.value_or(-1); - - // create child columns - auto children = [&] { - // Each invocation is predicated on the stack size which is dependent on the number of regex - // instructions - if (regex_insts <= RX_SMALL_INSTS) { - replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; - return make_strings_children(fn, strings_count, stream, mr); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; - return make_strings_children(fn, strings_count, stream, mr); - } else if (regex_insts <= RX_LARGE_INSTS) { - replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; - return make_strings_children(fn, strings_count, stream, mr); - } else { - replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; - return make_strings_children(fn, strings_count, stream, mr); - } - }(); + auto d_prog = + reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); + + auto const maxrepl = max_replace_count.value_or(-1); - return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), - null_count, - std::move(null_mask)); + return regex_dispatcher( + *d_prog, replace_dispatch_fn{*d_prog}, input, d_repl, maxrepl, stream, mr); } } // namespace detail diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 810e44cc27d..201556033ad 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -14,6 +14,11 @@ * limitations under the License. 
*/ +#include +#include +#include +#include + #include #include #include @@ -24,19 +29,16 @@ #include #include -#include -#include - #include #include -#include +#include +#include namespace cudf { namespace strings { namespace detail { using string_index_pair = thrust::pair; -using findall_result = thrust::pair; namespace { /** @@ -47,27 +49,20 @@ template struct findall_fn { column_device_view const d_strings; reprog_device prog; - size_type column_index; + size_type const column_index; size_type const* d_counts; - findall_fn(column_device_view const& d_strings, - reprog_device& prog, - size_type column_index = -1, - size_type const* d_counts = nullptr) - : d_strings(d_strings), prog(prog), column_index(column_index), d_counts(d_counts) + __device__ string_index_pair operator()(size_type idx) { - } + if (d_strings.is_null(idx) || (column_index >= d_counts[idx])) { + return string_index_pair{nullptr, 0}; + } + + auto const d_str = d_strings.element(idx); + auto const nchars = d_str.length(); + int32_t spos = 0; + auto epos = static_cast(nchars); - // this will count columns as well as locate a specific string for a column - __device__ findall_result findall(size_type idx) - { - string_index_pair result{nullptr, 0}; - if (d_strings.is_null(idx) || (d_counts && (column_index >= d_counts[idx]))) - return findall_result{0, result}; - string_view d_str = d_strings.element(idx); - auto const nchars = d_str.length(); - int32_t spos = 0; - auto epos = static_cast(nchars); size_type column_count = 0; while (spos <= nchars) { if (prog.find(idx, d_str, spos, epos) <= 0) break; // no more matches found @@ -76,36 +71,40 @@ struct findall_fn { epos = static_cast(nchars); ++column_count; } - if (spos <= epos) { - spos = d_str.byte_offset(spos); // convert - epos = d_str.byte_offset(epos); // to bytes - result = string_index_pair{d_str.data() + spos, (epos - spos)}; - } - // return the strings location and the column count - return findall_result{column_count, result}; - } - __device__ string_index_pair operator()(size_type idx) - { - // this one only cares about the string - return findall(idx).second; + auto const result = [&] { + if (spos > epos) { return string_index_pair{nullptr, 0}; } + // convert character positions to byte positions + spos = d_str.byte_offset(spos); + epos = d_str.byte_offset(epos); + return string_index_pair{d_str.data() + spos, (epos - spos)}; + }(); + + return result; } }; -template -struct findall_count_fn : public findall_fn { - findall_count_fn(column_device_view const& strings, reprog_device& prog) - : findall_fn{strings, prog} - { - } +struct findall_dispatch_fn { + reprog_device d_prog; - __device__ size_type operator()(size_type idx) + template + std::unique_ptr operator()(column_device_view const& d_strings, + size_type column_index, + size_type const* d_find_counts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - // this one only cares about the column count - return findall_fn::findall(idx).first; + rmm::device_uvector indices(d_strings.size(), stream); + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(d_strings.size()), + indices.begin(), + findall_fn{d_strings, d_prog, column_index, d_find_counts}); + + return make_strings_column(indices.begin(), indices.end(), stream, mr); } }; - } // namespace // @@ -124,38 +123,15 @@ std::unique_ptr
findall( reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); auto const regex_insts = d_prog->insts_counts(); - rmm::device_uvector find_counts(strings_count, stream); - auto d_find_counts = find_counts.data(); - - if (regex_insts <= RX_SMALL_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_find_counts, - findall_count_fn{*d_strings, *d_prog}); - else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_find_counts, - findall_count_fn{*d_strings, *d_prog}); - else if (regex_insts <= RX_LARGE_INSTS) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_find_counts, - findall_count_fn{*d_strings, *d_prog}); - else - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_find_counts, - findall_count_fn{*d_strings, *d_prog}); + auto find_counts = + count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource()); + auto d_find_counts = find_counts->mutable_view().data(); std::vector> results; size_type const columns = thrust::reduce( - rmm::exec_policy(stream), find_counts.begin(), find_counts.end(), 0, thrust::maximum{}); + rmm::exec_policy(stream), d_find_counts, d_find_counts + strings_count, 0, thrust::maximum{}); + // boundary case: if no columns, return all nulls column (issue #119) if (columns == 0) results.emplace_back(std::make_unique( @@ -166,39 +142,10 @@ std::unique_ptr
findall( strings_count)); for (int32_t column_index = 0; column_index < columns; ++column_index) { - rmm::device_uvector indices(strings_count, stream); - - if (regex_insts <= RX_SMALL_INSTS) - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - findall_fn{*d_strings, *d_prog, column_index, d_find_counts}); - else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - findall_fn{*d_strings, *d_prog, column_index, d_find_counts}); - else if (regex_insts <= RX_LARGE_INSTS) - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - findall_fn{*d_strings, *d_prog, column_index, d_find_counts}); - else - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - findall_fn{*d_strings, *d_prog, column_index, d_find_counts}); - - // - results.emplace_back(make_strings_column(indices.begin(), indices.end(), stream, mr)); + results.emplace_back(regex_dispatcher( + *d_prog, findall_dispatch_fn{*d_prog}, *d_strings, column_index, d_find_counts, stream, mr)); } + return std::make_unique
(std::move(results)); } diff --git a/cpp/src/strings/search/findall_record.cu b/cpp/src/strings/search/findall_record.cu index c93eb0c17db..95e347a7c35 100644 --- a/cpp/src/strings/search/findall_record.cu +++ b/cpp/src/strings/search/findall_record.cu @@ -15,6 +15,9 @@ */ #include +#include +#include +#include #include #include @@ -26,9 +29,6 @@ #include #include -#include -#include - #include #include @@ -75,6 +75,27 @@ struct findall_fn { } }; +struct findall_dispatch_fn { + reprog_device d_prog; + + template + std::unique_ptr operator()(column_device_view const& d_strings, + size_type total_matches, + offset_type const* d_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + rmm::device_uvector indices(total_matches, stream); + + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + d_strings.size(), + findall_fn{d_strings, d_prog, d_offsets, indices.data()}); + + return make_strings_column(indices.begin(), indices.end(), stream, mr); + } +}; + } // namespace // @@ -121,30 +142,11 @@ std::unique_ptr findall_record( rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); // Create indices vector with the total number of groups that will be extracted - auto total_matches = cudf::detail::get_value(offsets->view(), strings_count, stream); - - rmm::device_uvector indices(total_matches, stream); - auto d_indices = indices.data(); - auto begin = thrust::make_counting_iterator(0); - - // Build the string indices - auto const regex_insts = d_prog->insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) { - findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } else if (regex_insts <= RX_LARGE_INSTS) { - findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } else { - findall_fn fn{*d_strings, *d_prog, d_offsets, d_indices}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, fn); - } + auto const total_matches = + cudf::detail::get_value(offsets->view(), strings_count, stream); - // Build the child strings column from the resulting indices - auto strings_output = make_strings_column(indices.begin(), indices.end(), stream, mr); + auto strings_output = regex_dispatcher( + *d_prog, findall_dispatch_fn{*d_prog}, *d_strings, total_matches, d_offsets, stream, mr); // Build the lists column from the offsets and the strings return make_lists_column(strings_count, diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index d80148f2fe6..a8a2467dd76 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -110,6 +111,28 @@ struct token_reader_fn { } }; +struct generate_dispatch_fn { + reprog_device d_prog; + + template + rmm::device_uvector operator()(column_device_view const& d_strings, + size_type total_tokens, + split_direction direction, + offset_type const* d_offsets, + rmm::cuda_stream_view stream) + { + rmm::device_uvector tokens(total_tokens, stream); + + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + d_strings.size(), + token_reader_fn{d_strings, d_prog, direction, d_offsets, tokens.data()}); + + return 
tokens; + } +}; + /** * @brief Call regex to split each input string into tokens. * @@ -148,24 +171,8 @@ rmm::device_uvector generate_tokens(column_device_view const& // the last offset entry is the total number of tokens to be generated auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); - // generate tokens for each string - rmm::device_uvector tokens(total_tokens, stream); - auto const regex_insts = d_prog.insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) { - token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); - } else if (regex_insts <= RX_LARGE_INSTS) { - token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); - } else { - token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; - thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); - } - - return tokens; + return regex_dispatcher( + d_prog, generate_dispatch_fn{d_prog}, d_strings, total_tokens, direction, d_offsets, stream); } /** diff --git a/cpp/src/table/table_device_view.cu b/cpp/src/table/table_device_view.cu index 859a6be3bb0..5f17574bb89 100644 --- a/cpp/src/table/table_device_view.cu +++ b/cpp/src/table/table_device_view.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,8 +21,6 @@ #include -#include - namespace cudf { namespace detail { template @@ -54,45 +52,5 @@ template class table_device_view_base; // Explicit instantiation for a device table of mutable views template class table_device_view_base; -namespace { -struct is_relationally_comparable_functor { - template - constexpr bool operator()() - { - return cudf::is_relationally_comparable(); - } -}; -} // namespace - -template -bool is_relationally_comparable(TableView const& lhs, TableView const& rhs) -{ - return thrust::all_of(thrust::counting_iterator(0), - thrust::counting_iterator(lhs.num_columns()), - [lhs, rhs] __device__(auto const i) { - // Simplified this for compile time. (Ideally use double_type_dispatcher) - // TODO: possible to implement without double type dispatcher. 
- return lhs.column(i).type() == rhs.column(i).type() and - type_dispatcher(lhs.column(i).type(), - is_relationally_comparable_functor{}); - }); -} - -// Explicit extern template instantiation for a table of immutable views -extern template bool is_relationally_comparable(table_view const& lhs, - table_view const& rhs); - -// Explicit extern template instantiation for a table of mutable views -extern template bool is_relationally_comparable(mutable_table_view const& lhs, - mutable_table_view const& rhs); - -// Explicit extern template instantiation for a device table of immutable views -template bool is_relationally_comparable(table_device_view const& lhs, - table_device_view const& rhs); - -// Explicit extern template instantiation for a device table of mutable views -template bool is_relationally_comparable( - mutable_table_device_view const& lhs, mutable_table_device_view const& rhs); - } // namespace detail } // namespace cudf diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp index abd909f8cfc..365ff67263c 100644 --- a/cpp/src/table/table_view.cpp +++ b/cpp/src/table/table_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -96,4 +96,35 @@ table_view scatter_columns(table_view const& source, return table_view{updated_columns}; } +namespace detail { +namespace { +struct is_relationally_comparable_functor { + template + constexpr bool operator()() + { + return cudf::is_relationally_comparable(); + } +}; +} // namespace + +template +bool is_relationally_comparable(TableView const& lhs, TableView const& rhs) +{ + return std::all_of(thrust::counting_iterator(0), + thrust::counting_iterator(lhs.num_columns()), + [lhs, rhs](auto const i) { + return lhs.column(i).type() == rhs.column(i).type() and + type_dispatcher(lhs.column(i).type(), + is_relationally_comparable_functor{}); + }); +} + +// Explicit template instantiation for a table of immutable views +template bool is_relationally_comparable(table_view const& lhs, table_view const& rhs); + +// Explicit template instantiation for a table of mutable views +template bool is_relationally_comparable(mutable_table_view const& lhs, + mutable_table_view const& rhs); + +} // namespace detail } // namespace cudf diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 5cc4ce5f6c9..f77ab7aa3d9 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
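
(One more aside before the rest of the cast_ops.cu hunks: most of the remaining changes in this file and in the test files further down are a mechanical cleanup from the old "typename std::enable_if<...>::type" spelling to the std::enable_if_t alias. The toy functions below are hypothetical, added only to show that the two SFINAE spellings are interchangeable.)

#include <type_traits>

// Old spelling: needs the leading "typename" keyword and the trailing "::type".
template <typename T,
          typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
int as_int_old(T v)
{
  return static_cast<int>(v);
}

// New spelling used throughout this diff: enable_if_t removes the boilerplate
// without changing overload resolution.
template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
int as_int_new(T v)
{
  return static_cast<int>(v);
}

int main()
{
  // Both templates accept integral arguments and reject anything else at compile time.
  return (as_int_old(40) + as_int_new(2) == 42) ? 0 : 1;
}
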
@@ -34,19 +34,19 @@ namespace detail { namespace { // anonymous namespace template struct unary_cast { - template () && - cudf::is_numeric())>* = nullptr> + template < + typename SourceT, + typename TargetT = _TargetT, + std::enable_if_t<(cudf::is_numeric() && cudf::is_numeric())>* = nullptr> __device__ inline TargetT operator()(SourceT const element) { return static_cast(element); } - template () && - cudf::is_timestamp())>* = nullptr> + template < + typename SourceT, + typename TargetT = _TargetT, + std::enable_if_t<(cudf::is_timestamp() && cudf::is_timestamp())>* = nullptr> __device__ inline TargetT operator()(SourceT const element) { // Convert source tick counts into target tick counts without blindly truncating them @@ -55,46 +55,46 @@ struct unary_cast { return TargetT{cuda::std::chrono::floor(element.time_since_epoch())}; } - template () && - cudf::is_duration())>* = nullptr> + template < + typename SourceT, + typename TargetT = _TargetT, + std::enable_if_t<(cudf::is_duration() && cudf::is_duration())>* = nullptr> __device__ inline TargetT operator()(SourceT const element) { return TargetT{cuda::std::chrono::floor(element)}; } - template () && - cudf::is_duration()>* = nullptr> + template < + typename SourceT, + typename TargetT = _TargetT, + std::enable_if_t() && cudf::is_duration()>* = nullptr> __device__ inline TargetT operator()(SourceT const element) { return TargetT{static_cast(element)}; } - template () && - cudf::is_duration())>* = nullptr> + template < + typename SourceT, + typename TargetT = _TargetT, + std::enable_if_t<(cudf::is_timestamp() && cudf::is_duration())>* = nullptr> __device__ inline TargetT operator()(SourceT const element) { return TargetT{cuda::std::chrono::floor(element.time_since_epoch())}; } - template () && - cudf::is_numeric()>* = nullptr> + template < + typename SourceT, + typename TargetT = _TargetT, + std::enable_if_t() && cudf::is_numeric()>* = nullptr> __device__ inline TargetT operator()(SourceT const element) { return static_cast(element.count()); } - template () && - cudf::is_timestamp())>* = nullptr> + template < + typename SourceT, + typename TargetT = _TargetT, + std::enable_if_t<(cudf::is_duration() && cudf::is_timestamp())>* = nullptr> __device__ inline TargetT operator()(SourceT const element) { return TargetT{cuda::std::chrono::floor(element)}; @@ -107,20 +107,20 @@ struct fixed_point_unary_cast { using FixedPointT = std::conditional_t(), _SourceT, _TargetT>; using DeviceT = device_storage_type_t; - template () && - cudf::is_numeric())>* = nullptr> + template < + typename SourceT = _SourceT, + typename TargetT = _TargetT, + std::enable_if_t<(cudf::is_fixed_point<_SourceT>() && cudf::is_numeric())>* = nullptr> __device__ inline TargetT operator()(DeviceT const element) { auto const fp = SourceT{numeric::scaled_integer{element, scale}}; return static_cast(fp); } - template () && - cudf::is_fixed_point())>* = nullptr> + template < + typename SourceT = _SourceT, + typename TargetT = _TargetT, + std::enable_if_t<(cudf::is_numeric<_SourceT>() && cudf::is_fixed_point())>* = nullptr> __device__ inline DeviceT operator()(SourceT const element) { return TargetT{element, scale}.value(); @@ -169,7 +169,7 @@ struct device_cast { * * @return std::unique_ptr Returned column with new @p scale */ -template ()>* = nullptr> +template ()>* = nullptr> std::unique_ptr rescale(column_view input, numeric::scale_type scale, rmm::cuda_stream_view stream, @@ -207,10 +207,9 @@ struct dispatch_unary_cast_to { dispatch_unary_cast_to(column_view inp) : input(inp) {} - 
template < - typename TargetT, - typename SourceT = _SourceT, - typename std::enable_if_t()>* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(data_type type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -234,10 +233,10 @@ struct dispatch_unary_cast_to { return output; } - template () && - cudf::is_numeric()>* = nullptr> + template < + typename TargetT, + typename SourceT = _SourceT, + std::enable_if_t() && cudf::is_numeric()>* = nullptr> std::unique_ptr operator()(data_type type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -264,10 +263,10 @@ struct dispatch_unary_cast_to { return output; } - template () && - cudf::is_fixed_point()>* = nullptr> + template < + typename TargetT, + typename SourceT = _SourceT, + std::enable_if_t() && cudf::is_fixed_point()>* = nullptr> std::unique_ptr operator()(data_type type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -294,11 +293,10 @@ struct dispatch_unary_cast_to { return output; } - template < - typename TargetT, - typename SourceT = _SourceT, - typename std::enable_if_t() && cudf::is_fixed_point() && - std::is_same_v>* = nullptr> + template () && cudf::is_fixed_point() && + std::is_same_v>* = nullptr> std::unique_ptr operator()(data_type type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -310,11 +308,10 @@ struct dispatch_unary_cast_to { return detail::rescale(input, numeric::scale_type{type.scale()}, stream, mr); } - template < - typename TargetT, - typename SourceT = _SourceT, - typename std::enable_if_t() && cudf::is_fixed_point() && - not std::is_same_v>* = nullptr> + template () && cudf::is_fixed_point() && + not std::is_same_v>* = nullptr> std::unique_ptr operator()(data_type type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -356,8 +353,8 @@ struct dispatch_unary_cast_to { } template ()>* = nullptr> + typename SourceT = _SourceT, + std::enable_if_t()>* = nullptr> std::unique_ptr operator()(data_type, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) @@ -379,7 +376,7 @@ struct dispatch_unary_cast_from { dispatch_unary_cast_from(column_view inp) : input(inp) {} - template ()>* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(data_type type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index 474c7b76ddc..e92d5a1ca7e 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -348,7 +348,7 @@ std::unique_ptr transform_fn(cudf::dictionary_column_view const& i template struct MathOpDispatcher { - template >* = nullptr> + template >* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -362,7 +362,7 @@ struct MathOpDispatcher { } struct dictionary_dispatch { - template >* = nullptr> + template >* = nullptr> std::unique_ptr operator()(cudf::dictionary_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -377,9 +377,9 @@ struct MathOpDispatcher { } }; - template and - std::is_same_v>* = nullptr> + template < + typename T, + std::enable_if_t and std::is_same_v>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -401,7 +401,7 @@ struct MathOpDispatcher { template struct BitwiseOpDispatcher { - template >* = nullptr> + template >* = nullptr> std::unique_ptr 
operator()(cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -415,7 +415,7 @@ struct BitwiseOpDispatcher { } struct dictionary_dispatch { - template >* = nullptr> + template >* = nullptr> std::unique_ptr operator()(cudf::dictionary_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -431,8 +431,7 @@ struct BitwiseOpDispatcher { }; template and std::is_same_v>* = - nullptr> + std::enable_if_t and std::is_same_v>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -462,7 +461,7 @@ struct LogicalOpDispatcher { } public: - template ()>* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -477,7 +476,7 @@ struct LogicalOpDispatcher { } struct dictionary_dispatch { - template ()>* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(cudf::dictionary_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -499,9 +498,8 @@ struct LogicalOpDispatcher { } }; - template < - typename T, - typename std::enable_if_t() and std::is_same_v>* = nullptr> + template () and std::is_same_v>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f96edd3ce5a..05b90095562 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -150,7 +150,8 @@ ConfigureTest( # ################################################################################################## # * reduction tests ------------------------------------------------------------------------------- ConfigureTest( - REDUCTION_TEST reductions/rank_tests.cpp reductions/reduction_tests.cpp reductions/scan_tests.cpp + REDUCTION_TEST reductions/collect_ops_tests.cpp reductions/rank_tests.cpp + reductions/reduction_tests.cpp reductions/scan_tests.cpp reductions/segmented_reduction_tests.cpp ) # ################################################################################################## @@ -223,7 +224,10 @@ endif() # ################################################################################################## # * sort tests ------------------------------------------------------------------------------------ -ConfigureTest(SORT_TEST sort/segmented_sort_tests.cpp sort/sort_test.cpp sort/rank_test.cpp) +ConfigureTest( + SORT_TEST sort/segmented_sort_tests.cpp sort/sort_test.cpp sort/stable_sort_tests.cpp + sort/rank_test.cpp +) # ################################################################################################## # * copying tests --------------------------------------------------------------------------------- diff --git a/cpp/tests/binaryop/binop-fixture.hpp b/cpp/tests/binaryop/binop-fixture.hpp index 65243b1ae2e..2ba5561826e 100644 --- a/cpp/tests/binaryop/binop-fixture.hpp +++ b/cpp/tests/binaryop/binop-fixture.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. 
* Copyright 2018 Christian Noboa Mardini @@ -61,14 +61,14 @@ struct BinaryOperationTest : public cudf::test::BaseFixture { return cudf::test::fixed_width_column_wrapper(data_iter, data_iter + size, validity_iter); } - template >* = nullptr> + template >* = nullptr> auto make_random_wrapped_scalar() { cudf::test::UniformRandomGenerator rand_gen(r_min, r_max); return cudf::scalar_type_t(rand_gen.generate()); } - template >* = nullptr> + template >* = nullptr> auto make_random_wrapped_scalar() { cudf::test::UniformRandomGenerator rand_gen(r_min, r_max); diff --git a/cpp/tests/binaryop/util/operation.h b/cpp/tests/binaryop/util/operation.h index d78ad8938c4..93a84a7bc49 100644 --- a/cpp/tests/binaryop/util/operation.h +++ b/cpp/tests/binaryop/util/operation.h @@ -32,19 +32,19 @@ template struct Add { // Allow sum between chronos only when both input and output types // are chronos. Unsupported combinations will fail to compile - template () && cudf::is_chrono() && - cudf::is_chrono(), - void>::type* = nullptr> + template () && cudf::is_chrono() && + cudf::is_chrono(), + void>* = nullptr> OutT operator()(TypeLhs lhs, TypeRhs rhs) const { return lhs + rhs; } - template () || !cudf::is_chrono() || - !cudf::is_chrono(), - void>::type* = nullptr> + template () || !cudf::is_chrono() || + !cudf::is_chrono(), + void>* = nullptr> OutT operator()(TypeLhs lhs, TypeRhs rhs) const { using TypeCommon = typename std::common_type::type; @@ -56,19 +56,19 @@ template struct Sub { // Allow difference between chronos only when both input and output types // are chronos. Unsupported combinations will fail to compile - template () && cudf::is_chrono() && - cudf::is_chrono(), - void>::type* = nullptr> + template () && cudf::is_chrono() && + cudf::is_chrono(), + void>* = nullptr> OutT operator()(TypeLhs lhs, TypeRhs rhs) const { return lhs - rhs; } - template () || !cudf::is_chrono() || - !cudf::is_chrono(), - void>::type* = nullptr> + template () || !cudf::is_chrono() || + !cudf::is_chrono(), + void>* = nullptr> OutT operator()(TypeLhs lhs, TypeRhs rhs) const { using TypeCommon = typename std::common_type::type; @@ -78,28 +78,27 @@ struct Sub { template struct Mul { - template ::value, void>::type* = nullptr> + template ::value, void>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) const { using TypeCommon = typename std::common_type::type; return static_cast(static_cast(lhs) * static_cast(rhs)); } - template ::value, void>::type* = nullptr> + template ::value, void>* = nullptr> TypeOut operator()(TypeLhs x, TypeRhs y) const { return DurationProduct(x, y); } - template < - typename OutT, - typename LhsT, - typename RhsT, - typename std::enable_if<(cudf::is_duration_t::value && std::is_integral_v) || - (cudf::is_duration_t::value && std::is_integral_v), - void>::type* = nullptr> + template ::value && std::is_integral_v) || + (cudf::is_duration_t::value && std::is_integral_v), + void>* = nullptr> OutT DurationProduct(LhsT x, RhsT y) const { return x * y; @@ -108,26 +107,26 @@ struct Mul { template struct Div { - template ::value, void>::type* = nullptr> + template ::value, void>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) { using TypeCommon = typename std::common_type::type; return static_cast(static_cast(lhs) / static_cast(rhs)); } - template ::value, void>::type* = nullptr> + template ::value, void>* = nullptr> TypeOut operator()(TypeLhs x, TypeRhs y) const { return DurationDivide(x, y); } - template || cudf::is_duration()), - void>::type* = nullptr> + template < + typename OutT, + typename 
LhsT, + typename RhsT, + std::enable_if_t<(std::is_integral_v || cudf::is_duration()), void>* = nullptr> OutT DurationDivide(LhsT x, RhsT y) const { return x / y; @@ -185,10 +184,10 @@ struct Mod { } // Mod with duration types - duration % (integral or a duration) = duration - template ::value && - cudf::is_duration_t::value>* = nullptr> + template ::value && + cudf::is_duration_t::value>* = nullptr> TypeOut operator()(TypeLhs lhs, TypeRhs rhs) { return lhs % rhs; diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu index a306736d131..ec7fae58f98 100644 --- a/cpp/tests/copying/concatenate_tests.cu +++ b/cpp/tests/copying/concatenate_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -340,6 +340,22 @@ struct OverflowTest : public cudf::test::BaseFixture { TEST_F(OverflowTest, OverflowTest) { using namespace cudf; + // should concatenate up to size_type::max rows. + { + // 5 x size + size_last adds to size_type::max + constexpr auto size = static_cast(static_cast(250) * 1024 * 1024); + constexpr auto size_last = static_cast(836763647); + + auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, size); + auto many_chars_last = cudf::make_fixed_width_column(data_type{type_id::INT8}, size_last); + + table_view tbl({*many_chars}); + table_view tbl_last({*many_chars_last}); + std::vector table_views_to_concat({tbl, tbl, tbl, tbl, tbl, tbl_last}); + std::unique_ptr concatenated_tables = cudf::concatenate(table_views_to_concat); + EXPECT_NO_THROW(rmm::cuda_stream_default.synchronize()); + ASSERT_EQ(concatenated_tables->num_rows(), std::numeric_limits::max()); + } // primitive column { diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp index 4254794bf19..62f1300c284 100644 --- a/cpp/tests/copying/copy_tests.cpp +++ b/cpp/tests/copying/copy_tests.cpp @@ -378,18 +378,16 @@ TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarScalar) template struct create_chrono_scalar { template - typename std::enable_if_t< - std::is_same_v::type, std::true_type>, - cudf::timestamp_scalar> + std::enable_if_t::type, std::true_type>, + cudf::timestamp_scalar> operator()(Args&&... args) const { return cudf::timestamp_scalar(std::forward(args)...); } template - typename std::enable_if_t< - std::is_same_v::type, std::true_type>, - cudf::duration_scalar> + std::enable_if_t::type, std::true_type>, + cudf::duration_scalar> operator()(Args&&... 
args) const { return cudf::duration_scalar(std::forward(args)...); diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index fd065249c4e..31174d3fd72 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -51,7 +51,7 @@ constexpr inline bool is_timestamp_sum() // Disable SUM of TIMESTAMP types template ()>* = nullptr> + std::enable_if_t()>* = nullptr> __device__ T atomic_op(T* addr, T const& value, BinaryOp op) { return {}; @@ -59,7 +59,7 @@ __device__ T atomic_op(T* addr, T const& value, BinaryOp op) template ()>* = nullptr> + std::enable_if_t()>* = nullptr> __device__ T atomic_op(T* addr, T const& value, BinaryOp op) { T old_value = *addr; @@ -92,13 +92,13 @@ __global__ void gpu_atomicCAS_test(T* result, T* data, size_t size) } template -typename std::enable_if_t(), T> accumulate(cudf::host_span xs) +std::enable_if_t(), T> accumulate(cudf::host_span xs) { return std::accumulate(xs.begin(), xs.end(), T{0}); } template -typename std::enable_if_t(), T> accumulate(cudf::host_span xs) +std::enable_if_t(), T> accumulate(cudf::host_span xs) { auto ys = std::vector(xs.size()); std::transform( diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 2591f395914..b0ce22bae7c 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -60,9 +60,8 @@ struct column_max { }; struct tdigest_gen { - template < - typename T, - typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + template () || cudf::is_fixed_point()>* = nullptr> std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) { cudf::table_view t({keys}); @@ -75,9 +74,8 @@ struct tdigest_gen { return std::move(result.second[0].results[0]); } - template < - typename T, - typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + template () && !cudf::is_fixed_point()>* = nullptr> std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) { CUDF_FAIL("Invalid tdigest test type"); diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 08cdbb10359..e5e44b1aa6e 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -262,7 +262,7 @@ void check_string_column(cudf::column_view const& col_lhs, } // Helper function to compare two floating-point column contents -template >* = nullptr> +template >* = nullptr> void expect_column_data_equal(std::vector const& lhs, cudf::column_view const& rhs) { EXPECT_THAT(cudf::test::to_host(rhs).first, @@ -270,7 +270,7 @@ void expect_column_data_equal(std::vector const& lhs, cudf::column_view const } // Helper function to compare two column contents -template >* = nullptr> +template >* = nullptr> void expect_column_data_equal(std::vector const& lhs, cudf::column_view const& rhs) { EXPECT_THAT(cudf::test::to_host(rhs).first, ::testing::ElementsAreArray(lhs)); diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 27a8be95e9b..cfd1a16f19a 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -21,6 +21,8 @@ #include #include +#include +#include #include #include 
#include @@ -142,4 +144,29 @@ TEST_F(MultibyteSplitTest, HandpickedInput) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS); } +TEST_F(MultibyteSplitTest, LargeInputMultipleRange) +{ + auto host_input = std::string(); + auto host_expected = std::vector(); + + for (auto i = 0; i < 1000; i++) { + host_input += "...:|"; + } + + auto delimiter = std::string("...:|"); + auto source = cudf::io::text::make_source(host_input); + + auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); + auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); + auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); + auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + + auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); + auto out = cudf::concatenate(out_views); + + auto expected = cudf::io::text::multibyte_split(*source, delimiter); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), *out, debug_output_level::ALL_ERRORS); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/quantiles/percentile_approx_test.cu b/cpp/tests/quantiles/percentile_approx_test.cu index 2f4d5a7a604..035cd664aa2 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cu +++ b/cpp/tests/quantiles/percentile_approx_test.cu @@ -1,3 +1,18 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ #include #include @@ -21,9 +36,8 @@ using namespace cudf; using namespace cudf::tdigest; struct tdigest_gen { - template < - typename T, - typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + template () || cudf::is_fixed_point()>* = nullptr> std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) { cudf::table_view t({keys}); @@ -36,9 +50,8 @@ struct tdigest_gen { return std::move(result.second[0].results[0]); } - template < - typename T, - typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + template () && !cudf::is_fixed_point()>* = nullptr> std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) { CUDF_FAIL("Invalid tdigest test type"); @@ -89,9 +102,8 @@ std::unique_ptr arrow_percentile_approx(column_view const& _values, } struct percentile_approx_dispatch { - template < - typename T, - typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + template () || cudf::is_fixed_point()>* = nullptr> std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta, @@ -127,9 +139,8 @@ struct percentile_approx_dispatch { return result; } - template < - typename T, - typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + template () && !cudf::is_fixed_point()>* = nullptr> std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta, diff --git a/cpp/tests/reductions/collect_ops_tests.cpp b/cpp/tests/reductions/collect_ops_tests.cpp new file mode 100644 index 00000000000..688174d31c5 --- /dev/null +++ b/cpp/tests/reductions/collect_ops_tests.cpp @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include + +using namespace cudf::test::iterators; + +namespace cudf::test { + +template +struct CollectTestFixedWidth : public cudf::test::BaseFixture { +}; + +using CollectFixedWidthTypes = + Concat; +TYPED_TEST_SUITE(CollectTestFixedWidth, CollectFixedWidthTypes); + +// ------------------------------------------------------------------------ +TYPED_TEST(CollectTestFixedWidth, CollectList) +{ + using fw_wrapper = cudf::test::fixed_width_column_wrapper; + + std::vector values({5, 0, -120, -111, 0, 64, 63, 99, 123, -16}); + std::vector null_mask({1, 1, 0, 1, 1, 1, 0, 1, 0, 1}); + + // null_include without nulls + fw_wrapper col(values.begin(), values.end()); + auto const ret = cudf::reduce( + col, make_collect_list_aggregation(), data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col, dynamic_cast(ret.get())->view()); + + // null_include with nulls + fw_wrapper col_with_null(values.begin(), values.end(), null_mask.begin()); + auto const ret1 = cudf::reduce( + col_with_null, make_collect_list_aggregation(), data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_with_null, dynamic_cast(ret1.get())->view()); + + // null_exclude with nulls + fw_wrapper col_null_filtered{{5, 0, -111, 0, 64, 99, -16}}; + auto const ret2 = + cudf::reduce(col_with_null, + make_collect_list_aggregation(null_policy::EXCLUDE), + data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_null_filtered, dynamic_cast(ret2.get())->view()); +} + +TYPED_TEST(CollectTestFixedWidth, CollectSet) +{ + using fw_wrapper = cudf::test::fixed_width_column_wrapper; + + std::vector values({5, 0, 120, 0, 0, 64, 64, 99, 120, 99}); + std::vector null_mask({1, 1, 0, 1, 1, 1, 0, 1, 0, 1}); + + fw_wrapper col(values.begin(), values.end()); + fw_wrapper col_with_null(values.begin(), values.end(), null_mask.begin()); + + auto null_exclude = make_collect_set_aggregation( + null_policy::EXCLUDE, null_equality::UNEQUAL, nan_equality::ALL_EQUAL); + auto null_eq = make_collect_set_aggregation( + null_policy::INCLUDE, null_equality::EQUAL, nan_equality::ALL_EQUAL); + auto null_unequal = make_collect_set_aggregation( + null_policy::INCLUDE, null_equality::UNEQUAL, nan_equality::ALL_EQUAL); + + // test without nulls + auto const ret = cudf::reduce(col, null_eq, data_type{type_id::LIST}); + fw_wrapper expected{{0, 5, 64, 99, 120}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, dynamic_cast(ret.get())->view()); + + // null exclude + auto const ret1 = cudf::reduce(col_with_null, null_exclude, data_type{type_id::LIST}); + fw_wrapper expected1{{0, 5, 64, 99}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, dynamic_cast(ret1.get())->view()); + + // null equal + auto const ret2 = cudf::reduce(col_with_null, null_eq, data_type{type_id::LIST}); + fw_wrapper expected2{{0, 5, 64, 99, -1}, {1, 1, 1, 1, 0}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, dynamic_cast(ret2.get())->view()); + + // null unequal + auto const ret3 = cudf::reduce(col_with_null, null_unequal, data_type{type_id::LIST}); + fw_wrapper expected3{{0, 5, 64, 99, -1, -1, -1}, {1, 1, 1, 1, 0, 0, 0}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected3, dynamic_cast(ret3.get())->view()); +} + +TYPED_TEST(CollectTestFixedWidth, MergeLists) +{ + using fw_wrapper = cudf::test::fixed_width_column_wrapper; + using lists_col = cudf::test::lists_column_wrapper; + + // test without nulls + auto const lists1 = lists_col{{1, 2, 3}, {}, {}, {4}, {5, 6, 7}, {8, 9}, {}}; + auto const expected1 = fw_wrapper{{1, 2, 3, 4, 5, 6, 7, 8, 9}}; + auto 
const ret1 = cudf::reduce( + lists1, make_merge_lists_aggregation(), data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, dynamic_cast(ret1.get())->view()); + + // test with nulls + auto const lists2 = lists_col{{ + lists_col{1, 2, 3}, + lists_col{}, + lists_col{{0, 4, 0, 5}, nulls_at({0, 2})}, + lists_col{{0, 0, 0}, all_nulls()}, + lists_col{6}, + lists_col{-1, -1}, // null_list + lists_col{7, 8, 9}, + }, + null_at(5)}; + auto const expected2 = fw_wrapper{{1, 2, 3, 0, 4, 0, 5, 0, 0, 0, 6, 7, 8, 9}, + {1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1}}; + auto const ret2 = cudf::reduce( + lists2, make_merge_lists_aggregation(), data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, dynamic_cast(ret2.get())->view()); +} + +TYPED_TEST(CollectTestFixedWidth, MergeSets) +{ + using fw_wrapper = cudf::test::fixed_width_column_wrapper; + using lists_col = cudf::test::lists_column_wrapper; + + // test without nulls + auto const lists1 = lists_col{{1, 2, 3}, {}, {}, {4}, {1, 3, 4}, {0, 3, 10}, {}}; + auto const expected1 = fw_wrapper{{0, 1, 2, 3, 4, 10}}; + auto const ret1 = cudf::reduce( + lists1, make_merge_sets_aggregation(), data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, dynamic_cast(ret1.get())->view()); + + // test with null_equal + auto const lists2 = lists_col{{ + lists_col{1, 2, 3}, + lists_col{}, + lists_col{{0, 4, 0, 5}, nulls_at({0, 2})}, + lists_col{{0, 0, 0}, all_nulls()}, + lists_col{5}, + lists_col{-1, -1}, // null_list + lists_col{1, 3, 5}, + }, + null_at(5)}; + auto const expected2 = fw_wrapper{{1, 2, 3, 4, 5, 0}, {1, 1, 1, 1, 1, 0}}; + auto const ret2 = cudf::reduce( + lists2, make_merge_sets_aggregation(), data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, dynamic_cast(ret2.get())->view()); + + // test with null_unequal + auto const& lists3 = lists2; + auto const expected3 = fw_wrapper{{1, 2, 3, 4, 5, 0, 0, 0, 0, 0}, {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}}; + auto const ret3 = + cudf::reduce(lists3, + make_merge_sets_aggregation(null_equality::UNEQUAL), + data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected3, dynamic_cast(ret3.get())->view()); +} + +struct CollectTest : public cudf::test::BaseFixture { +}; + +TEST_F(CollectTest, CollectSetWithNaN) +{ + using fp_wrapper = cudf::test::fixed_width_column_wrapper; + + fp_wrapper col{{1.0f, 1.0f, -2.3e-5f, -2.3e-5f, 2.3e5f, 2.3e5f, -NAN, -NAN, NAN, NAN, 0.0f, 0.0f}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0}}; + + // nan unequal with null equal + fp_wrapper expected1{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, -NAN, NAN, NAN, 0.0f}, + {1, 1, 1, 1, 1, 1, 1, 0}}; + auto const ret1 = + cudf::reduce(col, make_collect_set_aggregation(), data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, dynamic_cast(ret1.get())->view()); + + // nan unequal with null unequal + fp_wrapper expected2{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, -NAN, NAN, NAN, 0.0f, 0.0f}, + {1, 1, 1, 1, 1, 1, 1, 0, 0}}; + auto const ret2 = cudf::reduce( + col, + make_collect_set_aggregation(null_policy::INCLUDE, null_equality::UNEQUAL), + data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, dynamic_cast(ret2.get())->view()); + + // nan equal with null equal + fp_wrapper expected3{{-2.3e-5f, 1.0f, 2.3e5f, NAN, 0.0f}, {1, 1, 1, 1, 0}}; + auto const ret3 = + cudf::reduce(col, + make_collect_set_aggregation( + null_policy::INCLUDE, null_equality::EQUAL, nan_equality::ALL_EQUAL), + data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected3, dynamic_cast(ret3.get())->view()); + + // nan 
equal with null unequal + fp_wrapper expected4{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, 0.0f, 0.0f}, {1, 1, 1, 1, 0, 0}}; + auto const ret4 = + cudf::reduce(col, + make_collect_set_aggregation( + null_policy::INCLUDE, null_equality::UNEQUAL, nan_equality::ALL_EQUAL), + data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected4, dynamic_cast(ret4.get())->view()); +} + +TEST_F(CollectTest, MergeSetsWithNaN) +{ + using fp_wrapper = cudf::test::fixed_width_column_wrapper; + using lists_col = cudf::test::lists_column_wrapper; + + auto const col = lists_col{ + lists_col{1.0f, -2.3e-5f, NAN}, + lists_col{}, + lists_col{{-2.3e-5f, 2.3e5f, NAN, 0.0f}, nulls_at({3})}, + lists_col{{0.0f, 0.0f}, all_nulls()}, + lists_col{-NAN}, + }; + + // nan unequal with null equal + fp_wrapper expected1{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, NAN, NAN, 0.0f}, {1, 1, 1, 1, 1, 1, 0}}; + auto const ret1 = + cudf::reduce(col, make_merge_sets_aggregation(), data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, dynamic_cast(ret1.get())->view()); + + // nan unequal with null unequal + fp_wrapper expected2{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, NAN, NAN, 0.0f, 0.0f, 0.0f}, + {1, 1, 1, 1, 1, 1, 0, 0, 0}}; + auto const ret2 = + cudf::reduce(col, + make_merge_sets_aggregation(null_equality::UNEQUAL), + data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, dynamic_cast(ret2.get())->view()); + + // nan equal with null equal + fp_wrapper expected3{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, 0.0f}, {1, 1, 1, 1, 0}}; + auto const ret3 = cudf::reduce( + col, + make_merge_sets_aggregation(null_equality::EQUAL, nan_equality::ALL_EQUAL), + data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected3, dynamic_cast(ret3.get())->view()); + + // nan equal with null unequal + fp_wrapper expected4{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, 0.0f, 0.0f, 0.0f}, {1, 1, 1, 1, 0, 0, 0}}; + auto const ret4 = cudf::reduce(col, + make_merge_sets_aggregation( + null_equality::UNEQUAL, nan_equality::ALL_EQUAL), + data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected4, dynamic_cast(ret4.get())->view()); +} + +TEST_F(CollectTest, CollectStrings) +{ + using str_col = cudf::test::strings_column_wrapper; + using lists_col = cudf::test::lists_column_wrapper; + + auto const s_col = + str_col{{"a", "a", "b", "b", "b", "c", "c", "d", "e", "e"}, {1, 1, 1, 0, 1, 1, 0, 1, 1, 1}}; + + // collect_list including nulls + auto const ret1 = cudf::reduce( + s_col, make_collect_list_aggregation(), data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(s_col, dynamic_cast(ret1.get())->view()); + + // collect_list excluding nulls + auto const expected2 = str_col{"a", "a", "b", "b", "c", "d", "e", "e"}; + auto const ret2 = + cudf::reduce(s_col, + make_collect_list_aggregation(null_policy::EXCLUDE), + data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, dynamic_cast(ret2.get())->view()); + + // collect_set with null_equal + auto const expected3 = str_col{{"a", "b", "c", "d", "e", ""}, null_at(5)}; + auto const ret3 = cudf::reduce( + s_col, make_collect_set_aggregation(), data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected3, dynamic_cast(ret3.get())->view()); + + // collect_set with null_unequal + auto const expected4 = str_col{{"a", "b", "c", "d", "e", "", ""}, {1, 1, 1, 1, 1, 0, 0}}; + auto const ret4 = cudf::reduce( + s_col, + make_collect_set_aggregation(null_policy::INCLUDE, null_equality::UNEQUAL), + data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected4, dynamic_cast(ret4.get())->view()); + + 
lists_col strings{{"a"}, + {}, + {"a", "b"}, + lists_col{{"b", "null", "c"}, null_at(1)}, + lists_col{{"null", "d"}, null_at(0)}, + lists_col{{"null"}, null_at(0)}, + {"e"}}; + + // merge_lists + auto const expected5 = str_col{{"a", "a", "b", "b", "null", "c", "null", "d", "null", "e"}, + {1, 1, 1, 1, 0, 1, 0, 1, 0, 1}}; + auto const ret5 = cudf::reduce( + strings, make_merge_lists_aggregation(), data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected5, dynamic_cast(ret5.get())->view()); + + // merge_sets with null_equal + auto const expected6 = str_col{{"a", "b", "c", "d", "e", "null"}, {1, 1, 1, 1, 1, 0}}; + auto const ret6 = cudf::reduce( + strings, make_merge_sets_aggregation(), data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected6, dynamic_cast(ret6.get())->view()); + + // merge_sets with null_unequal + auto const expected7 = + str_col{{"a", "b", "c", "d", "e", "null", "null", "null"}, {1, 1, 1, 1, 1, 0, 0, 0}}; + auto const ret7 = + cudf::reduce(strings, + make_merge_sets_aggregation(null_equality::UNEQUAL), + data_type{type_id::LIST}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected7, dynamic_cast(ret7.get())->view()); +} + +} // namespace cudf::test diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 568ff6d62d4..0b90c241f31 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -39,7 +39,7 @@ using aggregation = cudf::aggregation; using reduce_aggregation = cudf::reduce_aggregation; template -typename std::enable_if::value, std::vector>::type convert_values( +std::enable_if_t::value, std::vector> convert_values( std::vector const& int_values) { std::vector v(int_values.size()); @@ -51,7 +51,7 @@ typename std::enable_if::value, std::vector>::type c } template -typename std::enable_if::value, std::vector>::type convert_values( +std::enable_if_t::value, std::vector> convert_values( std::vector const& int_values) { std::vector v(int_values.size()); diff --git a/cpp/tests/reductions/scan_tests.hpp b/cpp/tests/reductions/scan_tests.hpp index 346103de85b..858697d8ef5 100644 --- a/cpp/tests/reductions/scan_tests.hpp +++ b/cpp/tests/reductions/scan_tests.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,25 +58,23 @@ struct TypeParam_to_host_type { }; template -typename std::enable_if, - thrust::host_vector>::type +std::enable_if_t, thrust::host_vector> make_vector(std::initializer_list const& init) { return cudf::test::make_type_param_vector(init); } template -typename std::enable_if(), - thrust::host_vector>::type +std::enable_if_t(), thrust::host_vector> make_vector(std::initializer_list const& init) { return cudf::test::make_type_param_vector(init); } template -typename std::enable_if || - cudf::is_fixed_point()), - thrust::host_vector>::type +std::enable_if_t || + cudf::is_fixed_point()), + thrust::host_vector> make_vector(std::initializer_list const& init) { return cudf::test::make_type_param_vector(init); diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp new file mode 100644 index 00000000000..3a432cce801 --- /dev/null +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -0,0 +1,391 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace test { + +#define XXX 0 // null placeholder + +template +struct SegmentedReductionTest : public cudf::test::BaseFixture { +}; + +struct SegmentedReductionTestUntyped : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(SegmentedReductionTest, NumericTypes); + +TYPED_TEST(SegmentedReductionTest, SumExcludeNulls) +{ + // [1, 2, 3], [1, null, 3], [1], [null], [null, null], [] + // values: {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX} + // offsets: {0, 3, 6, 7, 8, 10, 10} + // nullmask: {1, 1, 1, 1, 0, 1, 1, 0, 0, 0} + // outputs: {6, 4, 1, XXX, XXX, XXX} + // output nullmask: {1, 1, 1, 0, 0, 0} + auto input = fixed_width_column_wrapper{{1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, + {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}}; + auto offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto d_offsets = thrust::device_vector(offsets); + auto expect = fixed_width_column_wrapper{{6, 4, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 0, 0}}; + + auto res = segmented_reduce(input, + d_offsets, + *make_sum_aggregation(), + data_type{type_to_id()}, + null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect); +} + +TYPED_TEST(SegmentedReductionTest, ProductExcludeNulls) +{ + // [1, 3, 5], [null, 3, 5], [1], [null], [null, null], [] + // values: {1, 3, 5, XXX, 3, 5, 1, XXX, XXX, XXX} + // offsets: {0, 3, 6, 7, 8, 10, 10} + // nullmask: {1, 1, 1, 0, 1, 1, 1, 0, 0, 0} + // outputs: {15, 15, 1, XXX, XXX, XXX} + // output nullmask: {1, 1, 1, 0, 0, 0} + auto input = fixed_width_column_wrapper{{1, 3, 5, XXX, 3, 5, 1, XXX, XXX, XXX}, + {1, 1, 1, 0, 1, 1, 1, 0, 0, 0}}; + auto offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto d_offsets = thrust::device_vector(offsets); + auto expect = + fixed_width_column_wrapper{{15, 15, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 0, 0}}; + + auto res = segmented_reduce(input, + d_offsets, + *make_product_aggregation(), + data_type{type_to_id()}, + null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect); +} + +TYPED_TEST(SegmentedReductionTest, MaxExcludeNulls) +{ + // [1, 2, 3], [1, null, 3], [1], [null], [null, null], [] + // values: {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX} + // offsets: {0, 3, 6, 7, 8, 10, 10} + // nullmask: {1, 1, 1, 1, 0, 1, 1, 0, 0, 0} + // outputs: {3, 3, 1, XXX, XXX, XXX} + // output nullmask: {1, 1, 1, 0, 0, 0} + auto input = fixed_width_column_wrapper{{1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, + {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}}; + auto offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto d_offsets = thrust::device_vector(offsets); + auto expect = fixed_width_column_wrapper{{3, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 0, 0}}; + + auto res = segmented_reduce(input, + d_offsets, + *make_max_aggregation(), + data_type{type_to_id()}, + null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect); +} + +TYPED_TEST(SegmentedReductionTest, MinExcludeNulls) +{ + // [1, 2, 3], [1, null, 3], [1], [null], [null, null], [] 
+ // values: {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX} + // offsets: {0, 3, 6, 7, 8, 10, 10} + // nullmask: {1, 1, 1, 1, 0, 1, 1, 0, 0, 0} + // outputs: {1, 1, 1, XXX, XXX, XXX} + // output nullmask: {1, 1, 1, 0, 0, 0} + auto input = fixed_width_column_wrapper{{1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, + {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}}; + auto offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto d_offsets = thrust::device_vector(offsets); + auto expect = fixed_width_column_wrapper{{1, 1, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 0, 0}}; + + auto res = segmented_reduce(input, + d_offsets, + *make_min_aggregation(), + data_type{type_to_id()}, + null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect); +} + +TYPED_TEST(SegmentedReductionTest, AnyExcludeNulls) +{ + // [0, 0, 0], [0, null, 0], [0, 1, 0], [1, null, 0], [], [0], [1], [null], [null, null] + // values: {0, 0, 0, 0, XXX, 0, 0, 1, 0, 1, XXX, 0, 0, 1, XXX, XXX, XXX} + // offsets: {0, 3, 6, 9, 12, 12, 13, 14, 15, 17} + // nullmask:{1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0} + // outputs: {0, 0, 1, 1, XXX, 0, 1, XXX, XXX} + // output nullmask: {1, 1, 1, 1, 0, 1, 1, 0, 0} + auto input = fixed_width_column_wrapper{ + {0, 0, 0, 0, XXX, 0, 0, 1, 0, 1, XXX, 0, 0, 1, XXX, XXX, XXX}, + {1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0}}; + auto offsets = std::vector{0, 3, 6, 9, 12, 12, 13, 14, 15, 17}; + auto d_offsets = thrust::device_vector(offsets); + auto expect = fixed_width_column_wrapper{ + {false, false, true, true, bool{XXX}, false, true, bool{XXX}, bool{XXX}}, + {true, true, true, true, false, true, true, false, false}}; + + auto res = segmented_reduce(input, + d_offsets, + *make_any_aggregation(), + data_type{type_id::BOOL8}, + null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect); +} + +TYPED_TEST(SegmentedReductionTest, AllExcludeNulls) +{ + // [1, 2, 3], [1, null, 3], [], [1], [null], [null, null], [1, 0, 3], [1, null, 0], [0] + // values: {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX, 1, 0, 3, 1, XXX, 0, 0} + // offsets: {0, 3, 6, 6, 7, 8, 10, 13, 16, 17} + // nullmask: {1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1} + // outputs: {true, true, XXX, true, XXX, XXX, false, false, false} + // output nullmask: {1, 1, 0, 1, 0, 0, 1, 1, 1} + auto input = fixed_width_column_wrapper{ + {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX, 1, 0, 3, 1, XXX, 0, 0}, + {1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1}}; + auto offsets = std::vector{0, 3, 6, 6, 7, 8, 10, 13, 16, 17}; + auto d_offsets = thrust::device_vector(offsets); + auto expect = fixed_width_column_wrapper{ + {true, true, bool{XXX}, true, bool{XXX}, bool{XXX}, false, false, false}, + {true, true, false, true, false, false, true, true, true}}; + + auto res = segmented_reduce(input, + d_offsets, + *make_all_aggregation(), + data_type{type_id::BOOL8}, + null_policy::EXCLUDE); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect); +} + +TYPED_TEST(SegmentedReductionTest, SumIncludeNulls) +{ + // [1, 2, 3], [1, null, 3], [1], [null], [null, null], [] + // values: {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX} + // offsets: {0, 3, 6, 7, 8, 10, 10} + // nullmask: {1, 1, 1, 1, 0, 1, 1, 0, 0, 0} + // outputs: {6, XXX, 1, XXX, XXX, XXX} + // output nullmask: {1, 0, 1, 0, 0, 0} + auto input = fixed_width_column_wrapper{{1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, + {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}}; + auto offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto d_offsets = thrust::device_vector(offsets); + auto expect = + fixed_width_column_wrapper{{6, XXX, 1, XXX, XXX, XXX}, {1, 0, 1, 0, 0, 0}}; + + 
auto res = segmented_reduce(input, + d_offsets, + *make_sum_aggregation(), + data_type{type_to_id()}, + null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect); +} + +TYPED_TEST(SegmentedReductionTest, ProductIncludeNulls) +{ + // [1, 3, 5], [null, 3, 5], [1], [null], [null, null], [] + // values: {1, 3, 5, XXX, 3, 5, 1, XXX, XXX, XXX} + // offsets: {0, 3, 6, 7, 8, 10, 10} + // nullmask: {1, 1, 1, 0, 1, 1, 1, 0, 0, 0} + // outputs: {15, XXX, 1, XXX, XXX, XXX} + // output nullmask: {1, 0, 1, 0, 0, 0} + auto input = fixed_width_column_wrapper{{1, 3, 5, XXX, 3, 5, 1, XXX, XXX, XXX}, + {1, 1, 1, 0, 1, 1, 1, 0, 0, 0}}; + auto offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto d_offsets = thrust::device_vector(offsets); + auto expect = + fixed_width_column_wrapper{{15, XXX, 1, XXX, XXX, XXX}, {1, 0, 1, 0, 0, 0}}; + + auto res = segmented_reduce(input, + d_offsets, + *make_product_aggregation(), + data_type{type_to_id()}, + null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect); +} + +TYPED_TEST(SegmentedReductionTest, MaxIncludeNulls) +{ + // [1, 2, 3], [1, null, 3], [1], [null], [null, null], [] + // values: {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX} + // offsets: {0, 3, 6, 7, 8, 10, 10} + // nullmask: {1, 1, 1, 1, 0, 1, 1, 0, 0, 0} + // outputs: {3, XXX, 1, XXX, XXX, XXX} + // output nullmask: {1, 0, 1, 0, 0, 0} + auto input = fixed_width_column_wrapper{{1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, + {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}}; + auto offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto d_offsets = thrust::device_vector(offsets); + auto expect = + fixed_width_column_wrapper{{3, XXX, 1, XXX, XXX, XXX}, {1, 0, 1, 0, 0, 0}}; + + auto res = segmented_reduce(input, + d_offsets, + *make_max_aggregation(), + data_type{type_to_id()}, + null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect); +} + +TYPED_TEST(SegmentedReductionTest, MinIncludeNulls) +{ + // [1, 2, 3], [1, null, 3], [1], [null], [null, null], [] + // values: {1, 2, 3, 1, XXX, 3, 1, XXX, XXX} + // offsets: {0, 3, 6, 7, 8, 10, 10} + // nullmask: {1, 1, 1, 1, 0, 1, 1, 0, 0} + // outputs: {1, XXX, 1, XXX, XXX, XXX} + // output nullmask: {1, 0, 1, 0, 0, 0} + auto input = fixed_width_column_wrapper{{1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, + {1, 1, 1, 1, 0, 1, 1, 0, 0}}; + auto offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto d_offsets = thrust::device_vector(offsets); + auto expect = + fixed_width_column_wrapper{{1, XXX, 1, XXX, XXX, XXX}, {1, 0, 1, 0, 0, 0}}; + + auto res = segmented_reduce(input, + d_offsets, + *make_min_aggregation(), + data_type{type_to_id()}, + null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect); +} + +TYPED_TEST(SegmentedReductionTest, AnyIncludeNulls) +{ + // [0, 0, 0], [0, null, 0], [0, 1, 0], [1, null, 0], [], [0], [1], [null], [null, null] + // values: {0, 0, 0, 0, XXX, 0, 0, 1, 0, 1, XXX, 0, 0, 1, XXX, XXX, XXX} + // offsets: {0, 3, 6, 9, 12, 12, 13, 14, 15, 17} + // nullmask:{1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0} + // outputs: {0, XXX, 1, XXX, XXX, 0, 1, XXX, XXX} + // output nullmask: {1, 0, 1, 0, 0, 1, 1, 0, 0} + auto input = fixed_width_column_wrapper{ + {0, 0, 0, 0, XXX, 0, 0, 1, 0, 1, XXX, 0, 0, 1, XXX, XXX, XXX}, + {1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0}}; + auto offsets = std::vector{0, 3, 6, 9, 12, 12, 13, 14, 15, 17}; + auto d_offsets = thrust::device_vector(offsets); + auto expect = fixed_width_column_wrapper{ + {false, bool{XXX}, true, bool{XXX}, bool{XXX}, false, true, bool{XXX}, bool{XXX}}, + {true, false, true, 
false, false, true, true, false, false}};
+
+  auto res = segmented_reduce(input,
+                              d_offsets,
+                              *make_any_aggregation(),
+                              data_type{type_id::BOOL8},
+                              null_policy::INCLUDE);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect);
+}
+
+TYPED_TEST(SegmentedReductionTest, AllIncludeNulls)
+{
+  // [1, 2, 3], [1, null, 3], [], [1], [null], [null, null], [1, 0, 3], [1, null, 0], [0]
+  // values:   {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX, 1, 0, 3, 1, XXX, 0, 0}
+  // offsets:  {0, 3, 6, 6, 7, 8, 10, 13, 16, 17}
+  // nullmask: {1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1}
+  // outputs:  {true, XXX, XXX, true, XXX, XXX, false, XXX, false}
+  // output nullmask: {1, 0, 0, 1, 0, 0, 1, 0, 1}
+  auto input = fixed_width_column_wrapper{
+    {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX, 1, 0, 3, 1, XXX, 0, 0},
+    {1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1}};
+  auto offsets   = std::vector{0, 3, 6, 6, 7, 8, 10, 13, 16, 17};
+  auto d_offsets = thrust::device_vector(offsets);
+  auto expect = fixed_width_column_wrapper{
+    {true, bool{XXX}, bool{XXX}, true, bool{XXX}, bool{XXX}, false, bool{XXX}, false},
+    {true, false, false, true, false, false, true, false, true}};
+
+  auto res = segmented_reduce(input,
+                              d_offsets,
+                              *make_all_aggregation(),
+                              data_type{type_id::BOOL8},
+                              null_policy::INCLUDE);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect);
+}
+
+TEST_F(SegmentedReductionTestUntyped, PartialSegmentReduction)
+{
+  // Segmented reduction allows offsets to specify only part of the input column.
+  // [1], [2, 3], [4]
+  // values:   {1, 2, 3, 4, 5, 6, 7}
+  // offsets:  {0, 1, 3, 4}
+  // nullmask: {1, 1, 1, 1, 1, 1, 1}
+  // outputs:  {1, 5, 4}
+  // output nullmask: {1, 1, 1}
+
+  auto input = fixed_width_column_wrapper{{1, 2, 3, 4, 5, 6, 7},
+                                          {true, true, true, true, true, true, true}};
+  auto offsets   = std::vector{0, 1, 3, 4};
+  auto d_offsets = thrust::device_vector(offsets);
+  auto expect = fixed_width_column_wrapper{{1, 5, 4}, {true, true, true}};
+
+  auto res = segmented_reduce(input,
+                              d_offsets,
+                              *make_sum_aggregation(),
+                              data_type{type_id::INT32},
+                              null_policy::INCLUDE);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect);
+}
+
+TEST_F(SegmentedReductionTestUntyped, NonNullableInput)
+{
+  // Segmented reduction allows offsets to specify only part of the input column.
+ // [1], [], [2, 3], [4, 5, 6, 7] + // values: {1, 2, 3, 4, 5, 6, 7} + // offsets: {0, 1, 1, 3, 7} + // nullmask: nullptr + // outputs: {1, 5, 4} + // output nullmask: {1, 1, 1} + + auto input = fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7}; + auto offsets = std::vector{0, 1, 1, 3, 7}; + auto d_offsets = thrust::device_vector(offsets); + auto expect = fixed_width_column_wrapper{{1, XXX, 5, 22}, {true, false, true, true}}; + + auto res = segmented_reduce(input, + d_offsets, + *make_sum_aggregation(), + data_type{type_id::INT32}, + null_policy::INCLUDE); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect); +} + +TEST_F(SegmentedReductionTestUntyped, ReduceEmptyColumn) +{ + auto input = fixed_width_column_wrapper{}; + auto offsets = std::vector{0}; + auto d_offsets = thrust::device_vector(offsets); + auto expect = fixed_width_column_wrapper{}; + + auto res = segmented_reduce(input, + d_offsets, + *make_sum_aggregation(), + data_type{type_to_id()}, + null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect); +} + +#undef XXX + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index b335bf20f95..6a16f1fc64b 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,15 +20,14 @@ #include #include #include +#include #include #include -#include #include #include #include #include -#include #include #include #include @@ -167,6 +166,252 @@ TEST_F(RollingStringTest, ZeroWindowSize) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_count, got_count->view()); } +// ========================================================================================= +class RollingStructTest : public cudf::test::BaseFixture { +}; + +TEST_F(RollingStructTest, NoNullStructsMinMaxCount) +{ + using namespace cudf::test::iterators; + using strings_col = cudf::test::strings_column_wrapper; + using ints_col = cudf::test::fixed_width_column_wrapper; + using structs_col = cudf::test::structs_column_wrapper; + + auto const do_test = [](auto const& input) { + auto const expected_min = [] { + auto child1 = strings_col{ + "This", "This", "being", "being", "being", "being", "column", "column", "column"}; + auto child2 = ints_col{1, 1, 5, 5, 5, 5, 9, 9, 9}; + return structs_col{{child1, child2}, no_nulls()}; + }(); + + auto const expected_max = [] { + auto child1 = strings_col{ + "rolling", "test", "test", "test", "test", "string", "string", "string", "string"}; + auto child2 = ints_col{3, 4, 4, 4, 4, 8, 8, 8, 8}; + return structs_col{{child1, child2}, no_nulls()}; + }(); + + auto const expected_count = ints_col{{3, 4, 4, 4, 4, 4, 4, 3, 2}, no_nulls()}; + auto constexpr preceeding = 2; + auto constexpr following = 2; + auto constexpr min_period = 1; + + auto const result_min = + cudf::rolling_window(input, + preceeding, + following, + min_period, + *cudf::make_min_aggregation()); + auto const result_max = + cudf::rolling_window(input, + preceeding, + following, + min_period, + *cudf::make_max_aggregation()); + auto const result_count_valid = + cudf::rolling_window(input, + preceeding, + following, + min_period, + *cudf::make_count_aggregation()); + auto const result_count_all = cudf::rolling_window( + input, + preceeding, + following, + min_period, + 
*cudf::make_count_aggregation(cudf::null_policy::INCLUDE)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_min, result_min->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_max, result_max->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_count, result_count_valid->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_count, result_count_all->view()); + }; + + auto const input_no_sliced = [] { + auto child1 = + strings_col{"This", "is", "rolling", "test", "being", "operated", "on", "string", "column"}; + auto child2 = ints_col{1, 2, 3, 4, 5, 6, 7, 8, 9}; + return structs_col{{child1, child2}}; + }(); + + auto const input_before_sliced = [] { + auto constexpr dont_care{0}; + auto child1 = strings_col{"1dont_care", + "1dont_care", + "@dont_care", + "This", + "is", + "rolling", + "test", + "being", + "operated", + "on", + "string", + "column", + "1dont_care", + "1dont_care", + "@dont_care"}; + auto child2 = ints_col{ + dont_care, dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, dont_care, dont_care, dont_care}; + return structs_col{{child1, child2}}; + }(); + auto const input_sliced = cudf::slice(input_before_sliced, {3, 12})[0]; + + do_test(input_no_sliced); + do_test(input_sliced); +} + +TEST_F(RollingStructTest, NullChildrenMinMaxCount) +{ + using namespace cudf::test::iterators; + using strings_col = cudf::test::strings_column_wrapper; + using ints_col = cudf::test::fixed_width_column_wrapper; + using structs_col = cudf::test::structs_column_wrapper; + + auto const input = [] { + auto child1 = strings_col{ + {"This", "" /*NULL*/, "" /*NULL*/, "test", "" /*NULL*/, "operated", "on", "string", "column"}, + nulls_at({1, 2, 4})}; + auto child2 = ints_col{1, 2, 3, 4, 5, 6, 7, 8, 9}; + return structs_col{{child1, child2}}; + }(); + + auto const expected_min = [] { + auto child1 = strings_col{{"" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "column", + "column", + "column"}, + nulls_at({0, 1, 2, 3, 4, 5})}; + auto child2 = ints_col{2, 2, 2, 3, 5, 5, 9, 9, 9}; + return structs_col{{child1, child2}, no_nulls()}; + }(); + + auto const expected_max = [] { + auto child1 = + strings_col{"This", "test", "test", "test", "test", "string", "string", "string", "string"}; + auto child2 = ints_col{1, 4, 4, 4, 4, 8, 8, 8, 8}; + return structs_col{{child1, child2}, no_nulls()}; + }(); + + auto const expected_count = ints_col{{3, 4, 4, 4, 4, 4, 4, 3, 2}, no_nulls()}; + auto constexpr preceeding = 2; + auto constexpr following = 2; + auto constexpr min_period = 1; + + auto const result_min = + cudf::rolling_window(input, + preceeding, + following, + min_period, + *cudf::make_min_aggregation()); + + auto const result_max = + cudf::rolling_window(input, + preceeding, + following, + min_period, + *cudf::make_max_aggregation()); + + auto const result_count_valid = + cudf::rolling_window(input, + preceeding, + following, + min_period, + *cudf::make_count_aggregation()); + auto const result_count_all = cudf::rolling_window( + input, + preceeding, + following, + min_period, + *cudf::make_count_aggregation(cudf::null_policy::INCLUDE)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_min, result_min->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_max, result_max->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_count, result_count_valid->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_count, result_count_all->view()); +} + +TEST_F(RollingStructTest, NullParentMinMaxCount) +{ + using namespace cudf::test::iterators; + using strings_col = cudf::test::strings_column_wrapper; + 
using ints_col = cudf::test::fixed_width_column_wrapper; + using structs_col = cudf::test::structs_column_wrapper; + + auto constexpr null{0}; + auto const input = [] { + auto child1 = strings_col{"This", + "" /*NULL*/, + "" /*NULL*/, + "test", + "" /*NULL*/, + "operated", + "on", + "string", + "" /*NULL*/}; + auto child2 = ints_col{1, null, null, 4, null, 6, 7, 8, null}; + return structs_col{{child1, child2}, nulls_at({1, 2, 4, 8})}; + }(); + + auto const expected_min = [] { + auto child1 = strings_col{"This", "This", "test", "operated", "on", "on", "on", "on", "string"}; + auto child2 = ints_col{1, 1, 4, 6, 7, 7, 7, 7, 8}; + return structs_col{{child1, child2}, no_nulls()}; + }(); + + auto const expected_max = [] { + auto child1 = + strings_col{"This", "test", "test", "test", "test", "string", "string", "string", "string"}; + auto child2 = ints_col{1, 4, 4, 4, 4, 8, 8, 8, 8}; + return structs_col{{child1, child2}, no_nulls()}; + }(); + + auto const expected_count_valid = ints_col{{1, 2, 1, 2, 3, 3, 3, 2, 1}, no_nulls()}; + auto const expected_count_all = ints_col{{3, 4, 4, 4, 4, 4, 4, 3, 2}, no_nulls()}; + auto constexpr preceeding = 2; + auto constexpr following = 2; + auto constexpr min_period = 1; + + auto const result_min = + cudf::rolling_window(input, + preceeding, + following, + min_period, + *cudf::make_min_aggregation()); + + auto const result_max = + cudf::rolling_window(input, + preceeding, + following, + min_period, + *cudf::make_max_aggregation()); + + auto const result_count_valid = + cudf::rolling_window(input, + preceeding, + following, + min_period, + *cudf::make_count_aggregation()); + auto const result_count_all = cudf::rolling_window( + input, + preceeding, + following, + min_period, + *cudf::make_count_aggregation(cudf::null_policy::INCLUDE)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_min, result_min->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_max, result_max->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_count_valid, result_count_valid->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_count_all, result_count_all->view()); +} + +// ========================================================================================= template class RollingTest : public cudf::test::BaseFixture { protected: @@ -1102,10 +1347,10 @@ TEST_F(RollingTestUdf, DynamicWindow) thrust::make_constant_iterator(true)); auto prec = cudf::detail::make_counting_transform_iterator( - 0, [size] __device__(size_type row) { return row % 2 + 2; }); + 0, [] __device__(size_type row) { return row % 2 + 2; }); auto follow = cudf::detail::make_counting_transform_iterator( - 0, [size] __device__(size_type row) { return row % 2; }); + 0, [] __device__(size_type row) { return row % 2; }); fixed_width_column_wrapper preceding(prec, prec + size); fixed_width_column_wrapper following(follow, follow + size); @@ -1118,7 +1363,7 @@ TEST_F(RollingTestUdf, DynamicWindow) }); auto valid = cudf::detail::make_counting_transform_iterator( - 0, [size] __device__(size_type row) { return row != 0; }); + 0, [] __device__(size_type row) { return row != 0; }); fixed_width_column_wrapper expected{start, start + size, valid}; diff --git a/cpp/tests/sort/is_sorted_tests.cpp b/cpp/tests/sort/is_sorted_tests.cpp index 7d277059ef7..44fa83204ee 100644 --- a/cpp/tests/sort/is_sorted_tests.cpp +++ b/cpp/tests/sort/is_sorted_tests.cpp @@ -36,8 +36,7 @@ namespace testdata { // ----- most numerics template -typename std::enable_if && !std::is_same_v, - fixed_width_column_wrapper>::type +std::enable_if_t && !std::is_same_v, 
fixed_width_column_wrapper> ascending() { return std::is_signed_v ? fixed_width_column_wrapper({std::numeric_limits::lowest(), @@ -58,8 +57,7 @@ ascending() } template -typename std::enable_if && !std::is_same_v, - fixed_width_column_wrapper>::type +std::enable_if_t && !std::is_same_v, fixed_width_column_wrapper> descending() { return std::is_signed_v ? fixed_width_column_wrapper({std::numeric_limits::max(), @@ -100,14 +98,13 @@ auto nulls_before() // ----- bool template -typename std::enable_if, fixed_width_column_wrapper>::type ascending() +std::enable_if_t, fixed_width_column_wrapper> ascending() { return fixed_width_column_wrapper({false, false, true, true}); } template -typename std::enable_if, fixed_width_column_wrapper>::type -descending() +std::enable_if_t, fixed_width_column_wrapper> descending() { return fixed_width_column_wrapper({true, true, false, false}); } @@ -115,13 +112,13 @@ descending() // ----- chrono types template -typename std::enable_if(), fixed_width_column_wrapper>::type ascending() +std::enable_if_t(), fixed_width_column_wrapper> ascending() { return fixed_width_column_wrapper({T::min(), T::max()}); } template -typename std::enable_if(), fixed_width_column_wrapper>::type descending() +std::enable_if_t(), fixed_width_column_wrapper> descending() { return fixed_width_column_wrapper({T::max(), T::min()}); } @@ -129,15 +126,13 @@ typename std::enable_if(), fixed_width_column_wrapper>::ty // ----- string_view template -typename std::enable_if, strings_column_wrapper>::type -ascending() +std::enable_if_t, strings_column_wrapper> ascending() { return strings_column_wrapper({"A", "B"}); } template -typename std::enable_if, strings_column_wrapper>::type -descending() +std::enable_if_t, strings_column_wrapper> descending() { return strings_column_wrapper({"B", "A"}); } @@ -163,8 +158,7 @@ auto nulls_before() // ----- struct_view {"nestedInt" : {"Int" : 0 }, "float" : 1} template -typename std::enable_if, structs_column_wrapper>::type -ascending() +std::enable_if_t, structs_column_wrapper> ascending() { using T1 = int32_t; auto int_col = fixed_width_column_wrapper({std::numeric_limits::lowest(), @@ -182,8 +176,7 @@ ascending() } template -typename std::enable_if, structs_column_wrapper>::type -descending() +std::enable_if_t, structs_column_wrapper> descending() { using T1 = int32_t; auto int_col = fixed_width_column_wrapper({std::numeric_limits::max(), diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp index af13c35acfb..7f9f40e98b8 100644 --- a/cpp/tests/sort/sort_test.cpp +++ b/cpp/tests/sort/sort_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
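The type-trait changes in this hunk and the neighboring test files all apply the same mechanical refactor: the verbose `typename std::enable_if<cond, T>::type` spelling is replaced by the `std::enable_if_t<cond, T>` alias template, which resolves to exactly the same type. A minimal standalone sketch of the before/after pattern; the `zeros_old`/`zeros_new` helpers are invented purely for illustration:

#include <cstddef>
#include <type_traits>
#include <vector>

// Old spelling, as it appeared before this patch: ::type plus a leading typename.
template <typename T>
typename std::enable_if<std::is_integral<T>::value, std::vector<T>>::type
zeros_old(std::size_t n)
{
  return std::vector<T>(n, T{0});
}

// New spelling used throughout the patch: the std::enable_if_t alias template.
template <typename T>
std::enable_if_t<std::is_integral<T>::value, std::vector<T>>
zeros_new(std::size_t n)
{
  return std::vector<T>(n, T{0});
}

Both forms participate in SFINAE identically; only the syntax is shorter and no longer needs the `typename` disambiguator.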
@@ -20,14 +20,12 @@ #include #include -#include #include -#include #include #include #include -#include +#include #include namespace cudf { @@ -50,10 +48,8 @@ void run_sort_test(table_view input, CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort_by_key_table->view(), got_sort_by_key_table->view()); } -using TestTypes = cudf::test::Concat; +using TestTypes = cudf::test::Concat; // include timestamps and durations template struct Sort : public BaseFixture { @@ -555,7 +551,12 @@ TYPED_TEST(Sort, WithStructColumnCombinationsWithoutNulls) std::vector column_order{order::DESCENDING}; // desc_nulls_first - fixed_width_column_wrapper expected1{{3, 5, 6, 7, 2, 4, 1, 0}}; + auto const expected1 = []() { + if constexpr (std::is_same_v) { + return fixed_width_column_wrapper{{3, 5, 6, 7, 1, 2, 4, 0}}; + } + return fixed_width_column_wrapper{{3, 5, 6, 7, 2, 4, 1, 0}}; + }(); auto got = sorted_order(input, column_order, {null_order::AFTER}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, got->view()); // Run test for sort and sort_by_key @@ -577,30 +578,18 @@ TYPED_TEST(Sort, WithStructColumnCombinationsWithoutNulls) run_sort_test(input, expected3, column_order2, {null_order::BEFORE}); // asce_nulls_last - fixed_width_column_wrapper expected4{{0, 1, 2, 4, 7, 6, 3, 5}}; + auto const expected4 = []() { + if constexpr (std::is_same_v) { + return fixed_width_column_wrapper{{0, 2, 4, 1, 7, 6, 3, 5}}; + } + return fixed_width_column_wrapper{{0, 1, 2, 4, 7, 6, 3, 5}}; + }(); got = sorted_order(input, column_order2, {null_order::AFTER}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected4, got->view()); // Run test for sort and sort_by_key run_sort_test(input, expected4, column_order2, {null_order::AFTER}); } -TYPED_TEST(Sort, Stable) -{ - using T = TypeParam; - using R = int32_t; - - fixed_width_column_wrapper col1({0, 1, 1, 0, 0, 1, 0, 1}, {0, 1, 1, 1, 1, 1, 1, 1}); - strings_column_wrapper col2({"2", "a", "b", "x", "k", "a", "x", "a"}, {1, 1, 1, 1, 0, 1, 1, 1}); - - fixed_width_column_wrapper expected{{4, 3, 6, 1, 5, 7, 2, 0}}; - - auto got = stable_sorted_order(table_view({col1, col2}), - {order::ASCENDING, order::ASCENDING}, - {null_order::AFTER, null_order::BEFORE}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); -} - TYPED_TEST(Sort, MisMatchInColumnOrderSize) { using T = TypeParam; @@ -613,7 +602,6 @@ TYPED_TEST(Sort, MisMatchInColumnOrderSize) std::vector column_order{order::ASCENDING, order::DESCENDING}; EXPECT_THROW(sorted_order(input, column_order), logic_error); - EXPECT_THROW(stable_sorted_order(input, column_order), logic_error); EXPECT_THROW(sort(input, column_order), logic_error); EXPECT_THROW(sort_by_key(input, input, column_order), logic_error); } @@ -631,7 +619,6 @@ TYPED_TEST(Sort, MisMatchInNullPrecedenceSize) std::vector null_precedence{null_order::AFTER, null_order::BEFORE}; EXPECT_THROW(sorted_order(input, column_order, null_precedence), logic_error); - EXPECT_THROW(stable_sorted_order(input, column_order, null_precedence), logic_error); EXPECT_THROW(sort(input, column_order, null_precedence), logic_error); EXPECT_THROW(sort_by_key(input, input, column_order, null_precedence), logic_error); } diff --git a/cpp/tests/sort/stable_sort_tests.cpp b/cpp/tests/sort/stable_sort_tests.cpp new file mode 100644 index 00000000000..f80764e66a3 --- /dev/null +++ b/cpp/tests/sort/stable_sort_tests.cpp @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +namespace cudf { +namespace test { +void run_stable_sort_test(table_view input, + column_view expected_sorted_indices, + std::vector column_order = {}, + std::vector null_precedence = {}) +{ + auto got_sort_by_key_table = sort_by_key(input, input, column_order, null_precedence); + auto expected_sort_by_key_table = gather(input, expected_sorted_indices); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort_by_key_table->view(), got_sort_by_key_table->view()); +} + +using TestTypes = cudf::test::Concat; // include timestamps and durations + +template +struct StableSort : public BaseFixture { +}; + +TYPED_TEST_SUITE(StableSort, TestTypes); + +TYPED_TEST(StableSort, MixedNullOrder) +{ + using T = TypeParam; + using R = int32_t; + + fixed_width_column_wrapper col1({0, 1, 1, 0, 0, 1, 0, 1}, {0, 1, 1, 1, 1, 1, 1, 1}); + strings_column_wrapper col2({"2", "a", "b", "x", "k", "a", "x", "a"}, {1, 1, 1, 1, 0, 1, 1, 1}); + + fixed_width_column_wrapper expected{{4, 3, 6, 1, 5, 7, 2, 0}}; + + auto got = stable_sorted_order(table_view({col1, col2}), + {order::ASCENDING, order::ASCENDING}, + {null_order::AFTER, null_order::BEFORE}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); +} + +TYPED_TEST(StableSort, WithNullMax) +{ + using T = TypeParam; + + fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}, {1, 1, 0, 1, 1, 1}}; + strings_column_wrapper col2({"d", "e", "a", "d", "k", "d"}, {1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper col3{{10, 40, 70, 10, 2, 10}, {1, 1, 0, 1, 1, 1}}; + table_view input{{col1, col2, col3}}; + + fixed_width_column_wrapper expected{{1, 0, 3, 5, 4, 2}}; + std::vector column_order{order::ASCENDING, order::ASCENDING, order::DESCENDING}; + std::vector null_precedence{null_order::AFTER, null_order::AFTER, null_order::AFTER}; + + auto got = stable_sorted_order(input, column_order, null_precedence); + + if (not std::is_same_v) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); + + run_stable_sort_test(input, expected, column_order, null_precedence); + } else { + // for bools only validate that the null element landed at the back, since + // the rest of the values are equivalent and yields random sorted order. 
+ auto to_host = [](column_view const& col) { + thrust::host_vector h_data(col.size()); + CUDA_TRY(cudaMemcpy( + h_data.data(), col.data(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault)); + return h_data; + }; + thrust::host_vector h_exp = to_host(expected); + thrust::host_vector h_got = to_host(got->view()); + EXPECT_EQ(h_exp[h_exp.size() - 1], h_got[h_got.size() - 1]); + + fixed_width_column_wrapper expected_for_bool{{0, 3, 5, 1, 4, 2}}; + run_stable_sort_test(input, expected_for_bool, column_order, null_precedence); + } +} + +TYPED_TEST(StableSort, WithNullMin) +{ + using T = TypeParam; + + fixed_width_column_wrapper col1{{5, 4, 3, 5, 8}, {1, 1, 0, 1, 1}}; + strings_column_wrapper col2({"d", "e", "a", "d", "k"}, {1, 1, 0, 1, 1}); + fixed_width_column_wrapper col3{{10, 40, 70, 10, 2}, {1, 1, 0, 1, 1}}; + table_view input{{col1, col2, col3}}; + + fixed_width_column_wrapper expected{{2, 1, 0, 3, 4}}; + std::vector column_order{order::ASCENDING, order::ASCENDING, order::DESCENDING}; + + auto got = stable_sorted_order(input, column_order); + + if (!std::is_same_v) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); + + run_stable_sort_test(input, expected, column_order); + } else { + // for bools only validate that the null element landed at the front, since + // the rest of the values are equivalent and yields random sorted order. + auto to_host = [](column_view const& col) { + thrust::host_vector h_data(col.size()); + CUDA_TRY(cudaMemcpy( + h_data.data(), col.data(), h_data.size() * sizeof(int32_t), cudaMemcpyDefault)); + return h_data; + }; + thrust::host_vector h_exp = to_host(expected); + thrust::host_vector h_got = to_host(got->view()); + EXPECT_EQ(h_exp.front(), h_got.front()); + + fixed_width_column_wrapper expected_for_bool{{2, 0, 3, 1, 4}}; + run_stable_sort_test(input, expected_for_bool, column_order); + } +} + +TYPED_TEST(StableSort, WithAllValid) +{ + using T = TypeParam; + + fixed_width_column_wrapper col1{{5, 4, 3, 5, 8}}; + strings_column_wrapper col2({"d", "e", "a", "d", "k"}); + fixed_width_column_wrapper col3{{10, 40, 70, 10, 2}}; + table_view input{{col1, col2, col3}}; + + fixed_width_column_wrapper expected{{2, 1, 0, 3, 4}}; + std::vector column_order{order::ASCENDING, order::ASCENDING, order::DESCENDING}; + + auto got = stable_sorted_order(input, column_order); + + // Skip validating bools order. 
Valid true bools are all + // equivalent, and yield random order after thrust::sort + if (!std::is_same_v) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); + + run_stable_sort_test(input, expected, column_order); + } else { + fixed_width_column_wrapper expected_for_bool{{2, 0, 3, 1, 4}}; + run_stable_sort_test(input, expected_for_bool, column_order); + } +} + +TYPED_TEST(StableSort, MisMatchInColumnOrderSize) +{ + using T = TypeParam; + + fixed_width_column_wrapper col1{{5, 4, 3, 5, 8}}; + strings_column_wrapper col2({"d", "e", "a", "d", "k"}); + fixed_width_column_wrapper col3{{10, 40, 70, 5, 2}}; + table_view input{{col1, col2, col3}}; + + std::vector column_order{order::ASCENDING, order::DESCENDING}; + + EXPECT_THROW(stable_sorted_order(input, column_order), logic_error); + EXPECT_THROW(stable_sort_by_key(input, input, column_order), logic_error); +} + +TYPED_TEST(StableSort, MisMatchInNullPrecedenceSize) +{ + using T = TypeParam; + + fixed_width_column_wrapper col1{{5, 4, 3, 5, 8}}; + strings_column_wrapper col2({"d", "e", "a", "d", "k"}); + fixed_width_column_wrapper col3{{10, 40, 70, 5, 2}}; + table_view input{{col1, col2, col3}}; + + std::vector column_order{order::ASCENDING, order::DESCENDING, order::DESCENDING}; + std::vector null_precedence{null_order::AFTER, null_order::BEFORE}; + + EXPECT_THROW(stable_sorted_order(input, column_order, null_precedence), logic_error); + EXPECT_THROW(stable_sort_by_key(input, input, column_order, null_precedence), logic_error); +} + +TYPED_TEST(StableSort, ZeroSizedColumns) +{ + using T = TypeParam; + + fixed_width_column_wrapper col1{}; + table_view input{{col1}}; + + fixed_width_column_wrapper expected{}; + std::vector column_order{order::ASCENDING}; + + auto got = stable_sorted_order(input, column_order); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, got->view()); + + run_stable_sort_test(input, expected, column_order); +} + +struct StableSortByKey : public BaseFixture { +}; + +TEST_F(StableSortByKey, ValueKeysSizeMismatch) +{ + using T = int64_t; + + fixed_width_column_wrapper col1{{5, 4, 3, 5, 8}}; + strings_column_wrapper col2({"d", "e", "a", "d", "k"}); + fixed_width_column_wrapper col3{{10, 40, 70, 5, 2}}; + table_view values{{col1, col2, col3}}; + + fixed_width_column_wrapper key_col{{5, 4, 3, 5}}; + table_view keys{{key_col}}; + + EXPECT_THROW(stable_sort_by_key(values, keys), logic_error); +} + +template +struct StableSortFixedPoint : public cudf::test::BaseFixture { +}; + +template +using wrapper = cudf::test::fixed_width_column_wrapper; +TYPED_TEST_SUITE(StableSortFixedPoint, cudf::test::FixedPointTypes); + +TYPED_TEST(StableSortFixedPoint, FixedPointSortedOrderGather) +{ + using namespace numeric; + using decimalXX = TypeParam; + + auto const ZERO = decimalXX{0, scale_type{0}}; + auto const ONE = decimalXX{1, scale_type{0}}; + auto const TWO = decimalXX{2, scale_type{0}}; + auto const THREE = decimalXX{3, scale_type{0}}; + auto const FOUR = decimalXX{4, scale_type{0}}; + + auto const input_vec = std::vector{THREE, TWO, ONE, ZERO, FOUR, THREE}; + auto const index_vec = std::vector{3, 2, 1, 0, 5, 4}; + auto const sorted_vec = std::vector{ZERO, ONE, TWO, THREE, THREE, FOUR}; + + auto const input_col = wrapper(input_vec.begin(), input_vec.end()); + auto const index_col = wrapper(index_vec.begin(), index_vec.end()); + auto const sorted_col = wrapper(sorted_vec.begin(), sorted_vec.end()); + + auto const sorted_table = cudf::table_view{{sorted_col}}; + auto const input_table = cudf::table_view{{input_col}}; + + auto const indices = 
cudf::sorted_order(input_table); + auto const sorted = cudf::gather(input_table, indices->view()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(index_col, indices->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(sorted_table, sorted->view()); +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 5403d56318e..9daf70227f8 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -836,13 +836,13 @@ std::vector bitmask_to_host(cudf::column_view const& c) namespace { -template >* = nullptr> +template >* = nullptr> static auto numeric_to_string_precise(T value) { return std::to_string(value); } -template >* = nullptr> +template >* = nullptr> static auto numeric_to_string_precise(T value) { std::ostringstream o; @@ -915,7 +915,7 @@ std::string nested_offsets_to_string(NestedColumnView const& c, std::string cons } struct column_view_printer { - template ()>* = nullptr> + template ()>* = nullptr> void operator()(cudf::column_view const& col, std::vector& out, std::string const&) { auto h_data = cudf::test::to_host(col); @@ -939,7 +939,7 @@ struct column_view_printer { } } - template ()>* = nullptr> + template ()>* = nullptr> void operator()(cudf::column_view const& col, std::vector& out, std::string const& indent) @@ -965,7 +965,7 @@ struct column_view_printer { this->template operator()(*col_as_strings, out, indent); } - template ()>* = nullptr> + template ()>* = nullptr> void operator()(cudf::column_view const& col, std::vector& out, std::string const&) { auto const h_data = cudf::test::to_host(col); @@ -987,7 +987,7 @@ struct column_view_printer { } template >* = nullptr> + std::enable_if_t>* = nullptr> void operator()(cudf::column_view const& col, std::vector& out, std::string const&) { // @@ -1008,7 +1008,7 @@ struct column_view_printer { } template >* = nullptr> + std::enable_if_t>* = nullptr> void operator()(cudf::column_view const& col, std::vector& out, std::string const&) { cudf::dictionary_column_view dictionary(col); @@ -1029,7 +1029,7 @@ struct column_view_printer { } // Print the tick counts with the units - template ()>* = nullptr> + template ()>* = nullptr> void operator()(cudf::column_view const& col, std::vector& out, std::string const&) { auto h_data = cudf::test::to_host(col); @@ -1054,8 +1054,7 @@ struct column_view_printer { } } - template >* = nullptr> + template >* = nullptr> void operator()(cudf::column_view const& col, std::vector& out, std::string const& indent) @@ -1084,7 +1083,7 @@ struct column_view_printer { } template >* = nullptr> + std::enable_if_t>* = nullptr> void operator()(cudf::column_view const& col, std::vector& out, std::string const& indent) diff --git a/cpp/tests/wrappers/timestamps_test.cu b/cpp/tests/wrappers/timestamps_test.cu index 097b786aefe..48500c84942 100644 --- a/cpp/tests/wrappers/timestamps_test.cu +++ b/cpp/tests/wrappers/timestamps_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
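The new stable_sort_tests.cpp above exercises cudf::stable_sorted_order and stable_sort_by_key, whose defining guarantee is that rows with equal keys keep their original relative order; the bool-keyed cases in these tests are exactly where an unstable sort may permute ties. As a quick, library-agnostic refresher on what that guarantee means (a standard-library sketch, not the cudf API):

#include <algorithm>
#include <iostream>
#include <utility>
#include <vector>

int main()
{
  // 'a' and 'c' tie on key 1; 'b' and 'd' tie on key 0.
  std::vector<std::pair<int, char>> rows{{1, 'a'}, {0, 'b'}, {1, 'c'}, {0, 'd'}};

  // A stable sort keeps the original relative order inside each group of ties,
  // so the result is fully deterministic: (0,b) (0,d) (1,a) (1,c).
  std::stable_sort(rows.begin(), rows.end(),
                   [](auto const& x, auto const& y) { return x.first < y.first; });

  for (auto const& [key, tag] : rows) { std::cout << key << tag << ' '; }
  std::cout << '\n';  // prints: 0b 0d 1a 1c
  return 0;
}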
@@ -50,7 +50,7 @@ struct compare_chrono_elements_to_primitive_representation { { } - template ()>* = nullptr> + template ()>* = nullptr> __host__ __device__ bool operator()(const int32_t element_index) { using Primitive = typename ChronoT::rep; @@ -59,7 +59,7 @@ struct compare_chrono_elements_to_primitive_representation { return primitive == timestamp.time_since_epoch().count(); } - template ()>* = nullptr> + template ()>* = nullptr> __host__ __device__ bool operator()(const int32_t element_index) { using Primitive = typename ChronoT::rep; diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst index 2de55553c3f..7a7c9c195b2 100644 --- a/docs/cudf/source/api_docs/dataframe.rst +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -209,8 +209,8 @@ Reshaping, sorting, transposing DataFrame.T DataFrame.transpose -Combining / comparing / joining / merging / encoding -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Combining / comparing / joining / merging +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: api/ @@ -219,8 +219,6 @@ Combining / comparing / joining / merging / encoding DataFrame.join DataFrame.merge DataFrame.update - DataFrame.label_encoding - DataFrame.one_hot_encoding Numerical operations ~~~~~~~~~~~~~~~~~~~~ @@ -249,8 +247,6 @@ Serialization / IO / conversion .. autosummary:: :toctree: api/ - DataFrame.as_gpu_matrix - DataFrame.as_matrix DataFrame.from_arrow DataFrame.from_pandas DataFrame.from_records diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index d705504cc0c..b7b358e38be 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -34,7 +34,7 @@ Properties Index.shape Index.size Index.values - + Modifying and computations ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -94,7 +94,6 @@ Conversion Index.astype Index.to_arrow Index.to_list - Index.to_numpy Index.to_series Index.to_frame Index.to_pandas diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index cf5dd4a2a1d..376acf1694b 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -44,7 +44,6 @@ Conversion Series.copy Series.to_list Series.__array__ - Series.as_mask Series.scale @@ -172,9 +171,7 @@ Reindexing / selection / label manipulation Series.reindex Series.rename Series.reset_index - Series.reverse Series.sample - Series.set_mask Series.take Series.tail Series.tile @@ -210,15 +207,13 @@ Reshaping, sorting Series.repeat Series.transpose -Combining / comparing / joining / merging / encoding ----------------------------------------------------- +Combining / comparing / joining / merging +----------------------------------------- .. autosummary:: :toctree: api/ Series.append Series.update - Series.label_encoding - Series.one_hot_encoding Numerical operations ~~~~~~~~~~~~~~~~~~~~ @@ -409,12 +404,10 @@ Serialization / IO / conversion :toctree: api/ Series.to_arrow - Series.to_cupy Series.to_dlpack Series.to_frame Series.to_hdf Series.to_json - Series.to_numpy Series.to_pandas Series.to_string Series.from_arrow diff --git a/java/src/main/java/ai/rapids/cudf/AvroOptions.java b/java/src/main/java/ai/rapids/cudf/AvroOptions.java new file mode 100644 index 00000000000..973f729ab5b --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/AvroOptions.java @@ -0,0 +1,41 @@ +/* + * + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * Options for reading an Avro file + */ +public class AvroOptions extends ColumnFilterOptions { + + public static AvroOptions DEFAULT = new AvroOptions(new Builder()); + + private AvroOptions(Builder builder) { + super(builder); + } + + public static Builder builder() { + return new Builder(); + } + + public static class Builder extends ColumnFilterOptions.Builder { + public AvroOptions build() { + return new AvroOptions(this); + } + } +} diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 17e10933b65..ff966643866 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -251,6 +251,17 @@ private static native long[] readJSON(String[] columnNames, private static native long[] readParquet(String[] filterColumnNames, String filePath, long address, long length, int timeUnit) throws CudfException; + /** + * Read in Avro formatted data. + * @param filterColumnNames name of the columns to read, or an empty array if we want to read + * all of them + * @param filePath the path of the file to read, or null if no path should be read. + * @param address the address of the buffer to read from or 0 if we should not. + * @param length the length of the buffer to read from. + */ + private static native long[] readAvro(String[] filterColumnNames, String filePath, + long address, long length) throws CudfException; + /** * Setup everything to write parquet formatted data to a file. * @param columnNames names that correspond to the table columns @@ -1020,6 +1031,82 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); } + /** + * Read an Avro file using the default AvroOptions. + * @param path the local file to read. + * @return the file parsed as a table on the GPU. + */ + public static Table readAvro(File path) { + return readAvro(AvroOptions.DEFAULT, path); + } + + /** + * Read an Avro file. + * @param opts various Avro parsing options. + * @param path the local file to read. + * @return the file parsed as a table on the GPU. + */ + public static Table readAvro(AvroOptions opts, File path) { + return new Table(readAvro(opts.getIncludeColumnNames(), + path.getAbsolutePath(), 0, 0)); + } + + /** + * Read Avro formatted data. + * @param buffer raw Avro formatted bytes. + * @return the data parsed as a table on the GPU. + */ + public static Table readAvro(byte[] buffer) { + return readAvro(AvroOptions.DEFAULT, buffer, 0, buffer.length); + } + + /** + * Read Avro formatted data. + * @param opts various Avro parsing options. + * @param buffer raw Avro formatted bytes. + * @return the data parsed as a table on the GPU. + */ + public static Table readAvro(AvroOptions opts, byte[] buffer) { + return readAvro(opts, buffer, 0, buffer.length); + } + + /** + * Read Avro formatted data. 
+ * @param opts various Avro parsing options. + * @param buffer raw Avro formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. + * @return the data parsed as a table on the GPU. + */ + public static Table readAvro(AvroOptions opts, byte[] buffer, long offset, long len) { + assert offset >= 0 && offset < buffer.length; + assert len <= buffer.length - offset; + len = len > 0 ? len : buffer.length - offset; + + try (HostMemoryBuffer newBuf = HostMemoryBuffer.allocate(len)) { + newBuf.setBytes(0, buffer, offset, len); + return readAvro(opts, newBuf, 0, len); + } + } + + /** + * Read Avro formatted data. + * @param opts various Avro parsing options. + * @param buffer raw Avro formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. + * @return the data parsed as a table on the GPU. + */ + public static Table readAvro(AvroOptions opts, HostMemoryBuffer buffer, + long offset, long len) { + assert offset >= 0 && offset < buffer.length; + assert len <= buffer.length - offset; + len = len > 0 ? len : buffer.length - offset; + + return new Table(readAvro(opts.getIncludeColumnNames(), + null, buffer.getAddress() + offset, len)); + } + /** * Read a ORC file using the default ORCOptions. * @param path the local file to read. diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 769e8d2f356..ce3e6ffb285 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -356,8 +356,10 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j } else if (use_cuda_async_alloc) { // Use `limiting_resource_adaptor` to set a hard limit on the max pool size since // `cuda_async_memory_resource` only has a release threshold. + auto const alignment = 512; // Async allocator aligns to 512. 
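// Editor's sketch, not part of the patch: the template arguments in the hunk
// above were lost during extraction, so the lines below are an approximate
// reconstruction of the intended construction (the resource and adaptor names
// are assumptions, not the verbatim patch). The idea is a CUDA async pool
// wrapped in a limiting adaptor that caps the pool size and accounts for the
// 512-byte allocation alignment.
#include <cstddef>
#include <memory>
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
#include <rmm/mr/device/limiting_resource_adaptor.hpp>
#include <rmm/mr/device/owning_wrapper.hpp>

auto make_capped_async_mr(std::size_t pool_size)
{
  auto const alignment = 512;  // the async allocator aligns allocations to 512 bytes
  return rmm::mr::make_owning_wrapper<rmm::mr::limiting_resource_adaptor>(
      std::make_shared<rmm::mr::cuda_async_memory_resource>(pool_size, pool_size),
      pool_size, alignment);
}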
Initialized_resource = rmm::mr::make_owning_wrapper( - std::make_shared(pool_size, pool_size), pool_size); + std::make_shared(pool_size, pool_size), pool_size, + alignment); } else if (use_managed_mem) { Initialized_resource = std::make_shared(); } else { diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 1cf56da35da..11609155ba3 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -1496,6 +1497,44 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env, CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvro(JNIEnv *env, jclass, + jobjectArray filter_col_names, + jstring inputfilepath, jlong buffer, + jlong buffer_length, jint unit) { + + const bool read_buffer = (buffer != 0); + if (!read_buffer) { + JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL); + } else if (inputfilepath != NULL) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "cannot pass in both a buffer and an inputfilepath", NULL); + } else if (buffer_length <= 0) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "An empty buffer is not supported", + NULL); + } + + try { + cudf::jni::auto_set_device(env); + cudf::jni::native_jstring filename(env, inputfilepath); + if (!read_buffer && filename.is_empty()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "inputfilepath can't be empty", + NULL); + } + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + auto source = read_buffer ? cudf::io::source_info(reinterpret_cast(buffer), + static_cast(buffer_length)) : + cudf::io::source_info(filename.get()); + + cudf::io::avro_reader_options opts = cudf::io::avro_reader_options::builder(source) + .columns(n_filter_col_names.as_cpp_vector()) + .build(); + return convert_table_for_return(env, cudf::io::read_avro(opts).tbl); + } + CATCH_STD(env, NULL); +} + JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children, jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index f309b1ee703..269c9d7eda1 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -80,6 +80,7 @@ public class TableTest extends CudfTestBase { private static final File TEST_ORC_FILE = TestUtils.getResourceAsFile("TestOrcFile.orc"); private static final File TEST_ORC_TIMESTAMP_DATE_FILE = TestUtils.getResourceAsFile("timestamp-date-test.orc"); private static final File TEST_DECIMAL_PARQUET_FILE = TestUtils.getResourceAsFile("decimal.parquet"); + private static final File TEST_ALL_TYPES_PLAIN_AVRO_FILE = TestUtils.getResourceAsFile("alltypes_plain.avro"); private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv"); private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json"); @@ -642,6 +643,65 @@ void testReadParquetContainsDecimalData() { } } + @Test + void testReadAvro() { + AvroOptions opts = AvroOptions.builder() + .includeColumn("bool_col") + .includeColumn("int_col") + .includeColumn("timestamp_col") + .build(); + + try (Table expected = new Table.TestBuilder() + 
.column(true, false, true, false, true, false, true, false) + .column(0, 1, 0, 1, 0, 1, 0, 1) + .column(1235865600000000L, 1235865660000000L, 1238544000000000L, 1238544060000000L, + 1233446400000000L, 1233446460000000L, 1230768000000000L, 1230768060000000L) + .build(); + Table table = Table.readAvro(opts, TEST_ALL_TYPES_PLAIN_AVRO_FILE)) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testReadAvroBuffer() throws IOException{ + AvroOptions opts = AvroOptions.builder() + .includeColumn("bool_col") + .includeColumn("timestamp_col") + .build(); + + byte[] buffer = Files.readAllBytes(TEST_ALL_TYPES_PLAIN_AVRO_FILE.toPath()); + int bufferLen = buffer.length; + try (Table expected = new Table.TestBuilder() + .column(true, false, true, false, true, false, true, false) + .column(1235865600000000L, 1235865660000000L, 1238544000000000L, 1238544060000000L, + 1233446400000000L, 1233446460000000L, 1230768000000000L, 1230768060000000L) + .build(); + Table table = Table.readAvro(opts, buffer, 0, bufferLen)) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testReadAvroFull() { + try (Table expected = new Table.TestBuilder() + .column(4, 5, 6, 7, 2, 3, 0, 1) + .column(true, false, true, false, true, false, true, false) + .column(0, 1, 0, 1, 0, 1, 0, 1) + .column(0, 1, 0, 1, 0, 1, 0, 1) + .column(0, 1, 0, 1, 0, 1, 0, 1) + .column(0L, 10L, 0L, 10L, 0L, 10L, 0L, 10L) + .column(0.0f, 1.100000023841858f, 0.0f, 1.100000023841858f, 0.0f, 1.100000023841858f, 0.0f, 1.100000023841858f) + .column(0.0d, 10.1d, 0.0d, 10.1d, 0.0d, 10.1d, 0.0d, 10.1d) + .column("03/01/09", "03/01/09", "04/01/09", "04/01/09", "02/01/09", "02/01/09", "01/01/09", "01/01/09") + .column("0", "1", "0", "1", "0", "1", "0", "1") + .column(1235865600000000L, 1235865660000000L, 1238544000000000L, 1238544060000000L, + 1233446400000000L, 1233446460000000L, 1230768000000000L, 1230768060000000L) + .build(); + Table table = Table.readAvro(TEST_ALL_TYPES_PLAIN_AVRO_FILE)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadORC() { ORCOptions opts = ORCOptions.builder() diff --git a/java/src/test/resources/alltypes_plain.avro b/java/src/test/resources/alltypes_plain.avro new file mode 100644 index 00000000000..d60c628227a Binary files /dev/null and b/java/src/test/resources/alltypes_plain.avro differ diff --git a/python/cudf/.coveragerc b/python/cudf/.coveragerc index 4aba6d098df..929ab87f493 100644 --- a/python/cudf/.coveragerc +++ b/python/cudf/.coveragerc @@ -1,4 +1,3 @@ # Configuration file for Python coverage tests [run] -include = cudf/* -omit = cudf/tests/* \ No newline at end of file +source = cudf diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index d8836738adb..abf20869a15 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -38,6 +38,7 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.utils cimport ( + columns_from_table_view, columns_from_unique_ptr, data_from_table_view, data_from_unique_ptr, @@ -166,7 +167,7 @@ def copy_range(Column input_column, def gather( - columns: list, + list columns, Column gather_map, bool nullify=False ): @@ -190,60 +191,80 @@ def gather( return columns_from_unique_ptr(move(c_result)) -def scatter(object source, Column scatter_map, Column target_column, - bool bounds_check=True): - """ - Scattering input into target as per the scatter map, - input can be a list of scalars or 
can be a table - """ - - cdef column_view scatter_map_view = scatter_map.view() - cdef table_view target_table_view = table_view_from_columns( - (target_column,)) - cdef bool c_bounds_check = bounds_check +cdef scatter_scalar(list source_device_slrs, + column_view scatter_map, + table_view target_table, + bool bounds_check): + cdef vector[reference_wrapper[constscalar]] c_source + cdef DeviceScalar d_slr cdef unique_ptr[table] c_result - # Needed for the table branch - cdef table_view source_table_view + c_source.reserve(len(source_device_slrs)) + for d_slr in source_device_slrs: + c_source.push_back( + reference_wrapper[constscalar](d_slr.get_raw_ptr()[0]) + ) - # Needed for the scalar branch - cdef vector[reference_wrapper[constscalar]] source_scalars - cdef DeviceScalar slr + with nogil: + c_result = move( + cpp_copying.scatter( + c_source, + scatter_map, + target_table, + bounds_check + ) + ) - if isinstance(source, Column): - source_table_view = table_view_from_columns(( source,)) + return columns_from_unique_ptr(move(c_result)) - with nogil: - c_result = move( - cpp_copying.scatter( - source_table_view, - scatter_map_view, - target_table_view, - c_bounds_check - ) - ) - else: - slr = as_device_scalar(source, target_column.dtype) - source_scalars.push_back(reference_wrapper[constscalar]( - slr.get_raw_ptr()[0])) - with nogil: - c_result = move( - cpp_copying.scatter( - source_scalars, - scatter_map_view, - target_table_view, - c_bounds_check - ) +cdef scatter_column(list source_columns, + column_view scatter_map, + table_view target_table, + bool bounds_check): + cdef table_view c_source = table_view_from_columns(source_columns) + cdef unique_ptr[table] c_result + + with nogil: + c_result = move( + cpp_copying.scatter( + c_source, + scatter_map, + target_table, + bounds_check ) + ) + return columns_from_unique_ptr(move(c_result)) - data, _ = data_from_unique_ptr( - move(c_result), - column_names=(None,), - index_names=None - ) - return next(iter(data.values())) +def scatter(list sources, Column scatter_map, list target_columns, + bool bounds_check=True): + """ + Scattering source into target as per the scatter map. + `source` can be a list of scalars, or a list of columns. The number of + items in `sources` must equal the number of `target_columns` to scatter. + """ + # TODO: Only single column scatter is used, we should explore multi-column + # scatter for frames for performance increase. 
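# Editor's sketch, not part of the patch: illustrative use of the new
# list-based scatter, assuming the usual internal cudf imports are available.
# Scattering the Python scalar 0 into rows 0 and 2 of one int64 target column
# takes the scatter_scalar path; passing Columns in ``sources`` instead takes
# the scatter_column path.
from cudf.core.column import as_column

target = as_column([10, 20, 30, 40])
out, = scatter([0], as_column([0, 2]), [target])
# ``out`` is a new Column holding [0, 20, 0, 40]; ``target`` is unchanged.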
+ + if len(sources) != len(target_columns): + raise ValueError("Mismatched number of source and target columns.") + + if len(sources) == 0: + return [] + + cdef column_view scatter_map_view = scatter_map.view() + cdef table_view target_table_view = table_view_from_columns(target_columns) + + if isinstance(sources[0], Column): + return scatter_column( + sources, scatter_map_view, target_table_view, bounds_check + ) + else: + source_scalars = [as_device_scalar(slr) for slr in sources] + return scatter_scalar( + source_scalars, scatter_map_view, target_table_view, bounds_check + ) def column_empty_like(Column input_column): @@ -281,24 +302,14 @@ def column_allocate_like(Column input_column, size=None): return Column.from_unique_ptr(move(c_result)) -def table_empty_like(input_table, bool keep_index=True): - - cdef table_view input_table_view = table_view_from_table( - input_table, not keep_index - ) - +def columns_empty_like(list input_columns): + cdef table_view input_table_view = table_view_from_columns(input_columns) cdef unique_ptr[table] c_result with nogil: c_result = move(cpp_copying.empty_like(input_table_view)) - return data_from_unique_ptr( - move(c_result), - column_names=input_table._column_names, - index_names=( - input_table._index._column_names if keep_index is True else None - ) - ) + return columns_from_unique_ptr(move(c_result)) def column_slice(Column input_column, object indices): @@ -330,21 +341,18 @@ def column_slice(Column input_column, object indices): return result -def table_slice(input_table, object indices, bool keep_index=True): - - cdef table_view input_table_view = table_view_from_table( - input_table, not keep_index - ) - - cdef vector[size_type] c_indices - c_indices.reserve(len(indices)) +def columns_slice(list input_columns, list indices): + """ + Given a list of input columns, return columns sliced by ``indices``. + Returns a list of list of columns. The length of return is + `len(indices) / 2`. The `i`th item in return is a list of columns sliced + from ``input_columns`` with `slice(indices[i*2], indices[i*2 + 1])`. 
+ """ + cdef table_view input_table_view = table_view_from_columns(input_columns) + cdef vector[size_type] c_indices = indices cdef vector[table_view] c_result - cdef int index - for index in indices: - c_indices.push_back(index) - with nogil: c_result = move( cpp_copying.slice( @@ -352,18 +360,11 @@ def table_slice(input_table, object indices, bool keep_index=True): c_indices) ) - num_of_result_cols = c_result.size() return [ - data_from_table_view( - c_result[i], - input_table, - column_names=input_table._column_names, - index_names=( - input_table._index._column_names if ( - keep_index is True) - else None - ) - ) for i in range(num_of_result_cols)] + columns_from_table_view( + c_result[i], input_columns + ) for i in range(c_result.size()) + ] def column_split(Column input_column, object splits): @@ -397,21 +398,12 @@ def column_split(Column input_column, object splits): return result -def table_split(input_table, object splits, bool keep_index=True): - - cdef table_view input_table_view = table_view_from_table( - input_table, not keep_index - ) - - cdef vector[size_type] c_splits - c_splits.reserve(len(splits)) +def columns_split(list input_columns, object splits): + cdef table_view input_table_view = table_view_from_columns(input_columns) + cdef vector[size_type] c_splits = splits cdef vector[table_view] c_result - cdef int split - for split in splits: - c_splits.push_back(split) - with nogil: c_result = move( cpp_copying.split( @@ -419,16 +411,11 @@ def table_split(input_table, object splits, bool keep_index=True): c_splits) ) - num_of_result_cols = c_result.size() return [ - data_from_table_view( - c_result[i], - input_table, - column_names=input_table._column_names, - index_names=input_table._index_names if ( - keep_index is True) - else None - ) for i in range(num_of_result_cols)] + columns_from_table_view( + c_result[i], input_columns + ) for i in range(c_result.size()) + ] def _copy_if_else_column_column(Column lhs, Column rhs, Column boolean_mask): @@ -656,32 +643,6 @@ def get_element(Column input_column, size_type index): ) -def sample(input, size_type n, - bool replace, int64_t seed, bool keep_index=True): - cdef table_view tbl_view = table_view_from_table(input, not keep_index) - cdef cpp_copying.sample_with_replacement replacement - - if replace: - replacement = cpp_copying.sample_with_replacement.TRUE - else: - replacement = cpp_copying.sample_with_replacement.FALSE - - cdef unique_ptr[table] c_output - with nogil: - c_output = move( - cpp_copying.sample(tbl_view, n, replacement, seed) - ) - - return data_from_unique_ptr( - move(c_output), - column_names=input._column_names, - index_names=( - None if keep_index is False - else input._index_names - ) - ) - - def segmented_gather(Column source_column, Column gather_map): cdef shared_ptr[lists_column_view] source_LCV = ( make_shared[lists_column_view](source_column.view()) diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index be1b6d8069c..a1c433774b5 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
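# Editor's sketch, not part of the patch: hypothetical use of the
# ``columns_slice`` helper added above. Each consecutive pair in ``indices``
# yields one list of sliced column views, so two pairs yield two lists.
from cudf.core.column import as_column
from cudf._lib.copying import columns_slice

cols = [as_column([1, 2, 3, 4]), as_column(["a", "b", "c", "d"])]
first, second = columns_slice(cols, [0, 2, 2, 4])
# ``first`` holds both columns sliced to rows [0, 2); ``second`` to rows [2, 4).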
from libc.stdint cimport int32_t, int64_t, uint8_t from libcpp cimport bool @@ -175,10 +175,3 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: ctypedef enum sample_with_replacement: FALSE 'cudf::sample_with_replacement::FALSE', TRUE 'cudf::sample_with_replacement::TRUE', - - cdef unique_ptr[table] sample ( - table_view input, - size_type n, - sample_with_replacement replacement, - int64_t seed - ) except + diff --git a/python/cudf/cudf/_lib/cpp/io/text.pxd b/python/cudf/cudf/_lib/cpp/io/text.pxd index 9ce0c68cb08..5b110d6234c 100644 --- a/python/cudf/cudf/_lib/cpp/io/text.pxd +++ b/python/cudf/cudf/_lib/cpp/io/text.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -6,6 +6,13 @@ from libcpp.string cimport string from cudf._lib.cpp.column.column cimport column +cdef extern from "cudf/io/text/byte_range_info.hpp" \ + namespace "cudf::io::text" nogil: + + cdef cppclass byte_range_info: + byte_range_info() except + + byte_range_info(size_t offset, size_t size) except + + cdef extern from "cudf/io/text/data_chunk_source.hpp" \ namespace "cudf::io::text" nogil: @@ -25,3 +32,7 @@ cdef extern from "cudf/io/text/multibyte_split.hpp" \ unique_ptr[column] multibyte_split(data_chunk_source source, string delimiter) except + + + unique_ptr[column] multibyte_split(data_chunk_source source, + string delimiter, + byte_range_info byte_range) except + diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index ce4f183e795..127e3a612dc 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -8,6 +8,12 @@ from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector +from collections import OrderedDict + +try: + import ujson as json +except ImportError: + import json cimport cudf._lib.cpp.io.types as cudf_io_types from cudf._lib.column cimport Column @@ -123,8 +129,22 @@ cpdef read_orc(object filepaths_or_buffers, c_result = move(libcudf_read_orc(c_orc_reader_options)) names = [name.decode() for name in c_result.metadata.column_names] + actual_index_names, names, is_range_index, reset_index_name, range_idx = \ + _get_index_from_metadata(c_result.metadata.user_data, + names, + skip_rows, + num_rows) + + data, index = data_from_unique_ptr( + move(c_result.tbl), + names, + actual_index_names + ) - data, index = data_from_unique_ptr(move(c_result.tbl), names) + if is_range_index: + index = range_idx + elif reset_index_name: + index.names = [None] * len(index.names) data = { name: update_column_struct_field_names( @@ -144,6 +164,60 @@ cdef compression_type _get_comp_type(object compression): else: raise ValueError(f"Unsupported `compression` type {compression}") +cdef tuple _get_index_from_metadata( + map[string, string] user_data, + object names, + object skip_rows, + object num_rows): + json_str = user_data[b'pandas'].decode('utf-8') + meta = None + index_col = None + is_range_index = False + reset_index_name = False + range_idx = None + if json_str != "": + meta = json.loads(json_str) + + if 'index_columns' in meta and len(meta['index_columns']) > 0: + index_col = meta['index_columns'] + if isinstance(index_col[0], dict) and \ + index_col[0]['kind'] == 'range': + is_range_index = True + else: + index_col_names = OrderedDict() + for idx_col in index_col: + for c in meta['columns']: + if c['field_name'] == idx_col: + 
index_col_names[idx_col] = \ + c['name'] or c['field_name'] + if c['name'] is None: + reset_index_name = True + + actual_index_names = None + if index_col is not None and len(index_col) > 0: + if is_range_index: + range_index_meta = index_col[0] + range_idx = cudf.RangeIndex( + start=range_index_meta['start'], + stop=range_index_meta['stop'], + step=range_index_meta['step'], + name=range_index_meta['name'] + ) + if skip_rows is not None: + range_idx = range_idx[skip_rows:] + if num_rows is not None: + range_idx = range_idx[:num_rows] + else: + actual_index_names = list(index_col_names.values()) + names = names[len(actual_index_names):] + + return ( + actual_index_names, + names, + is_range_index, + reset_index_name, + range_idx + ) cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics): """ @@ -180,6 +254,10 @@ cpdef write_orc(table, cdef unique_ptr[data_sink] data_sink_c cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c) cdef unique_ptr[table_input_metadata] tbl_meta + cdef map[string, string] user_data + user_data[str.encode("pandas")] = str.encode(generate_pandas_metadata( + table, None) + ) if not isinstance(table._index, cudf.RangeIndex): tv = table_view_from_table(table) @@ -204,8 +282,9 @@ cpdef write_orc(table, cdef orc_writer_options c_orc_writer_options = move( orc_writer_options.builder( - sink_info_c, table_view_from_table(table, ignore_index=True) + sink_info_c, tv ).metadata(tbl_meta.get()) + .key_value_metadata(move(user_data)) .compression(compression_) .enable_statistics(_get_orc_stat_freq(statistics)) .build() diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index c4f885382f3..876c7145399 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -32,7 +32,7 @@ from cudf._lib.utils cimport ( ) -def drop_nulls(columns: list, how="any", keys=None, thresh=None): +def drop_nulls(list columns, how="any", keys=None, thresh=None): """ Drops null rows from cols depending on key columns. @@ -75,7 +75,7 @@ def drop_nulls(columns: list, how="any", keys=None, thresh=None): return columns_from_unique_ptr(move(c_result)) -def apply_boolean_mask(columns: list, Column boolean_mask): +def apply_boolean_mask(list columns, Column boolean_mask): """ Drops the rows which correspond to False in boolean_mask. @@ -104,7 +104,7 @@ def apply_boolean_mask(columns: list, Column boolean_mask): return columns_from_unique_ptr(move(c_result)) -def drop_duplicates(columns: list, +def drop_duplicates(list columns, object keys=None, object keep='first', bool nulls_are_equal=True): diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx index 9f33f32bdaf..daea227cc39 100644 --- a/python/cudf/cudf/_lib/text.pyx +++ b/python/cudf/cudf/_lib/text.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import cudf @@ -10,6 +10,7 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.io.text cimport ( + byte_range_info, data_chunk_source, make_source, make_source_from_file, @@ -18,7 +19,8 @@ from cudf._lib.cpp.io.text cimport ( def read_text(object filepaths_or_buffers, - object delimiter=None): + object delimiter=None, + object byte_range=None): """ Cython function to call into libcudf API, see `multibyte_split`. 
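# Editor's sketch, hedged: the ``byte_range`` threaded through here is an
# (offset, size) pair in bytes, letting callers split one large delimited file
# into independently parsed chunks. Hypothetical use of the public wrapper
# (its exact signature is not shown in this patch); the file name is an
# assumption.
import cudf

# Parse only the first 1 MiB of the file, splitting records on newlines.
head = cudf.read_text("logs.txt", delimiter="\n", byte_range=(0, 1 << 20))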
@@ -31,9 +33,25 @@ def read_text(object filepaths_or_buffers, cdef unique_ptr[data_chunk_source] datasource cdef unique_ptr[column] c_col - - with nogil: - datasource = move(make_source_from_file(filename)) - c_col = move(multibyte_split(dereference(datasource), delim)) + cdef size_t c_byte_range_offset + cdef size_t c_byte_range_size + cdef byte_range_info c_byte_range + + if (byte_range is not None): + c_byte_range_offset = byte_range[0] + c_byte_range_size = byte_range[1] + with nogil: + datasource = move(make_source_from_file(filename)) + c_byte_range = byte_range_info( + c_byte_range_offset, + c_byte_range_size) + c_col = move(multibyte_split( + dereference(datasource), + delim, + c_byte_range)) + else: + with nogil: + datasource = move(make_source_from_file(filename)) + c_col = move(multibyte_split(dereference(datasource), delim)) return {None: Column.from_unique_ptr(move(c_col))} diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index 50893ef9838..8a53b71124a 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -17,3 +17,4 @@ cdef data_from_table_view( cdef table_view table_view_from_columns(columns) except * cdef table_view table_view_from_table(tbl, ignore_index=*) except* cdef columns_from_unique_ptr(unique_ptr[table] c_tbl) +cdef columns_from_table_view(table_view tv, object owners) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 91dfea735a1..8557f430e25 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -311,6 +311,24 @@ cdef data_from_unique_ptr( } return data, index +cdef columns_from_table_view( + table_view tv, + object owners, +): + """ + Given a ``cudf::table_view``, construsts a list of columns from it, + along with referencing an ``owner`` Python object that owns the memory + lifetime. ``owner`` must be either None or a list of column. If ``owner`` + is a list of columns, the owner of the `i`th ``cudf::column_view`` in the + table view is ``owners[i]``. For more about memory ownership, + see ``Column.from_column_view``. + """ + + return [ + Column.from_column_view( + tv.column(i), owners[i] if isinstance(owners, list) else None + ) for i in range(tv.num_columns()) + ] cdef data_from_table_view( table_view tv, diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 7a9a17631a9..60f739cff8b 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -3,6 +3,7 @@ from __future__ import annotations import pickle +import warnings from functools import cached_property from typing import Any, Set @@ -18,7 +19,6 @@ from cudf._typing import DtypeObj from cudf.api.types import ( is_bool_dtype, - is_dtype_equal, is_integer, is_integer_dtype, is_list_like, @@ -33,6 +33,37 @@ numeric_normalize_types, ) +_index_astype_docstring = """\ +Create an Index with values cast to dtypes. + +The class of a new Index is determined by dtype. When conversion is +impossible, a ValueError exception is raised. + +Parameters +---------- +dtype : numpy dtype + Use a numpy.dtype to cast entire Index object to. +copy : bool, default False + By default, astype always returns a newly allocated object. 
+ If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. + +Returns +------- +Index + Index with values cast to specified dtype. + +Examples +-------- +>>> import cudf +>>> index = cudf.Index([1, 2, 3]) +>>> index +Int64Index([1, 2, 3], dtype='int64') +>>> index.astype('float64') +Float64Index([1.0, 2.0, 3.0], dtype='float64') +""" + class BaseIndex(Serializable): """Base class for all cudf Index types.""" @@ -41,14 +72,6 @@ class BaseIndex(Serializable): _accessors: Set[Any] = set() _data: ColumnAccessor - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - - if method == "__call__" and hasattr(cudf, ufunc.__name__): - func = getattr(cudf, ufunc.__name__) - return func(*inputs) - else: - return NotImplemented - @cached_property def _values(self) -> ColumnBase: raise NotImplementedError @@ -957,7 +980,9 @@ def _union(self, other, sort=None): self_df["order"] = self_df.index other_df["order"] = other_df.index res = self_df.merge(other_df, on=[0], how="outer") - res = res.sort_values(by=res.columns[1:], ignore_index=True) + res = res.sort_values( + by=res._data.to_pandas_index()[1:], ignore_index=True + ) union_result = cudf.core.index._index_from_data({0: res._data[0]}) if sort is None and len(other): @@ -1205,43 +1230,6 @@ def rename(self, name, inplace=False): out.name = name return out - def astype(self, dtype, copy=False): - """ - Create an Index with values cast to dtypes. The class of a new Index - is determined by dtype. When conversion is impossible, a ValueError - exception is raised. - - Parameters - ---------- - dtype : numpy dtype - Use a numpy.dtype to cast entire Index object to. - copy : bool, default False - By default, astype always returns a newly allocated object. - If copy is set to False and internal requirements on dtype are - satisfied, the original data is used to create a new Index - or the original Index is returned. - - Returns - ------- - Index - Index with values cast to specified dtype. - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([1, 2, 3]) - >>> index - Int64Index([1, 2, 3], dtype='int64') - >>> index.astype('float64') - Float64Index([1.0, 2.0, 3.0], dtype='float64') - """ - if is_dtype_equal(dtype, self.dtype): - return self.copy(deep=copy) - - return cudf.Index( - self.copy(deep=copy)._values.astype(dtype), name=self.name - ) - def to_series(self, index=None, name=None): """ Create a Series with both index and values equal to the index keys. @@ -1283,7 +1271,7 @@ def get_slice_bound(self, label, side, kind=None): int Index of label. 
""" - raise (NotImplementedError) + raise NotImplementedError def __array_function__(self, func, types, args, kwargs): @@ -1534,6 +1522,30 @@ def _split_columns_by_levels(self, levels): [], ) + def _split(self, splits): + raise NotImplementedError + + def sample( + self, + n=None, + frac=None, + replace=False, + weights=None, + random_state=None, + axis=None, + ignore_index=False, + ): + warnings.warn( + "Index.sample is deprecated and will be removed.", FutureWarning, + ) + return cudf.core.index._index_from_data( + self.to_frame() + .sample( + n, frac, replace, weights, random_state, axis, ignore_index + ) + ._data + ) + def _get_result_name(left_name, right_name): if left_name == right_name: diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index 2cf579ce3f1..70162c7afc6 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import pandas as pd from packaging import version @@ -9,4 +9,5 @@ PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2") PANDAS_LE_122 = PANDAS_VERSION <= version.parse("1.2.2") PANDAS_GE_130 = PANDAS_VERSION >= version.parse("1.3.0") +PANDAS_GE_134 = PANDAS_VERSION >= version.parse("1.3.4") PANDAS_LT_140 = PANDAS_VERSION < version.parse("1.4.0") diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 6c94a84fd37..8bfcad4c8f4 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import warnings from typing import Any, Optional, Tuple, Union, cast @@ -264,7 +264,7 @@ def where( ) # Setting `frame` column names to `cond` # as `cond` has no column names. - cond.columns = frame.columns + cond._set_column_names_like(frame) (source_df, others,) = _normalize_columns_and_scalars_type( frame, other diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2788ac6a600..1c1845373e1 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,7 +2,6 @@ from __future__ import annotations -import builtins import pickle import warnings from functools import cached_property @@ -69,6 +68,7 @@ ListDtype, StructDtype, ) +from cudf.core.mixins import Reducible from cudf.utils import utils from cudf.utils.dtypes import ( cudf_dtype_from_pa_type, @@ -86,7 +86,14 @@ Slice = TypeVar("Slice", bound=slice) -class ColumnBase(Column, Serializable, NotIterable): +class ColumnBase(Column, Serializable, Reducible, NotIterable): + _VALID_REDUCTIONS = { + "any", + "all", + "max", + "min", + } + def as_frame(self) -> "cudf.core.frame.Frame": """ Converts a Column to Frame @@ -585,9 +592,9 @@ def _scatter_by_column( [value], [self], key )[0]._with_type_metadata(self.dtype) else: - return libcudf.copying.scatter( - value, key, self - )._with_type_metadata(self.dtype) + return libcudf.copying.scatter([value], key, [self])[ + 0 + ]._with_type_metadata(self.dtype) except RuntimeError as e: if "out of bounds" in str(e): raise IndexError( @@ -611,10 +618,7 @@ def _check_scatter_key_length( raise ValueError(msg) def fillna( - self: T, - value: Any = None, - method: builtins.str = None, - dtype: Dtype = None, + self: T, value: Any = None, method: str = None, dtype: Dtype = None, ) -> T: """Fill null values with ``value``. 
@@ -674,16 +678,10 @@ def append(self, other: ColumnBase) -> ColumnBase: return concat_columns([self, as_column(other)]) def quantile( - self, - q: Union[float, Sequence[float]], - interpolation: builtins.str, - exact: bool, + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool ) -> ColumnBase: raise TypeError(f"cannot perform quantile with type {self.dtype}") - def median(self, skipna: bool = None) -> ScalarLike: - raise TypeError(f"cannot perform median with type {self.dtype}") - def take( self: T, indices: ColumnBase, nullify: bool = False, check_bounds=True ) -> T: @@ -807,9 +805,7 @@ def is_monotonic_decreasing(self) -> bool: ascending=[False], null_position=None ) - def get_slice_bound( - self, label: ScalarLike, side: builtins.str, kind: builtins.str - ) -> int: + def get_slice_bound(self, label: ScalarLike, side: str, kind: str) -> int: """ Calculate slice bound that corresponds to given label. Returns leftmost (one-past-the-rightmost if ``side=='right'``) position @@ -842,9 +838,7 @@ def get_slice_bound( raise ValueError(f"Invalid value for side: {side}") def sort_by_values( - self: ColumnBase, - ascending: bool = True, - na_position: builtins.str = "last", + self: ColumnBase, ascending: bool = True, na_position: str = "last", ) -> Tuple[ColumnBase, "cudf.core.column.NumericalColumn"]: col_inds = self.as_frame()._get_sorted_inds( ascending=ascending, na_position=na_position @@ -852,12 +846,7 @@ def sort_by_values( col_keys = self.take(col_inds) return col_keys, col_inds - def distinct_count( - self, method: builtins.str = "sort", dropna: bool = True - ) -> int: - if method != "sort": - msg = "non sort based distinct_count() not implemented yet" - raise NotImplementedError(msg) + def distinct_count(self, dropna: bool = True) -> int: try: return self._distinct_count[dropna] except KeyError: @@ -1011,7 +1000,7 @@ def apply_boolean_mask(self, mask) -> ColumnBase: ) def argsort( - self, ascending: bool = True, na_position: builtins.str = "last" + self, ascending: bool = True, na_position: str = "last" ) -> ColumnBase: return self.as_frame()._get_sorted_inds( @@ -1087,9 +1076,9 @@ def __ge__(self, other): def searchsorted( self, value, - side: builtins.str = "left", + side: str = "left", ascending: bool = True, - na_position: builtins.str = "last", + na_position: str = "last", ): values = as_column(value).as_frame() return self.as_frame().searchsorted( @@ -1138,13 +1127,13 @@ def deserialize(cls, header: dict, frames: list) -> ColumnBase: data=data, dtype=dtype, mask=mask, size=header.get("size", None) ) - def unary_operator(self, unaryop: builtins.str): + def unary_operator(self, unaryop: str): raise TypeError( f"Operation {unaryop} not supported for dtype {self.dtype}." 
) def binary_operator( - self, op: builtins.str, other: BinaryOperand, reflect: bool = False + self, op: str, other: BinaryOperand, reflect: bool = False ) -> ColumnBase: raise TypeError( f"Operation {op} not supported between dtypes {self.dtype} and " @@ -1162,53 +1151,23 @@ def _minmax(self, skipna: bool = None): return libcudf.reduce.minmax(result_col) return result_col - def min(self, skipna: bool = None, dtype: Dtype = None): - result_col = self._process_for_reduction(skipna=skipna) - if isinstance(result_col, ColumnBase): - return libcudf.reduce.reduce("min", result_col, dtype=dtype) - return result_col - - def max(self, skipna: bool = None, dtype: Dtype = None): - result_col = self._process_for_reduction(skipna=skipna) - if isinstance(result_col, ColumnBase): - return libcudf.reduce.reduce("max", result_col, dtype=dtype) - return result_col - - def sum( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 - ): - raise TypeError(f"cannot perform sum with type {self.dtype}") - - def product( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 - ): - raise TypeError(f"cannot perform product with type {self.dtype}") - - def mean(self, skipna: bool = None, dtype: Dtype = None): - raise TypeError(f"cannot perform mean with type {self.dtype}") - - def std(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64): - raise TypeError(f"cannot perform std with type {self.dtype}") - - def var(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64): - raise TypeError(f"cannot perform var with type {self.dtype}") + def _reduce( + self, op: str, skipna: bool = None, min_count: int = 0, *args, **kwargs + ) -> ScalarLike: + """Compute {op} of column values. - def kurtosis(self, skipna: bool = None): - raise TypeError(f"cannot perform kurtosis with type {self.dtype}") - - def skew(self, skipna: bool = None): - raise TypeError(f"cannot perform skew with type {self.dtype}") - - def cov(self, other: ColumnBase): - raise TypeError( - f"cannot perform covarience with types {self.dtype}, " - f"{other.dtype}" - ) - - def corr(self, other: ColumnBase): - raise TypeError( - f"cannot perform corr with types {self.dtype}, {other.dtype}" + skipna : bool + Whether or not na values must be skipped. + min_count : int, default 0 + The minimum number of entries for the reduction, otherwise the + reduction returns NaN. + """ + preprocessed = self._process_for_reduction( + skipna=skipna, min_count=min_count ) + if isinstance(preprocessed, ColumnBase): + return libcudf.reduce.reduce(op, preprocessed, **kwargs) + return preprocessed @property def contains_na_entries(self) -> bool: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index c72fb66addc..4ed296ceb52 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,8 +1,7 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
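# Editor's sketch, not part of the patch: with the Reducible mixin, the
# per-operation methods named in _VALID_REDUCTIONS are generated on ColumnBase
# and all funnel into the ``_reduce`` hook added in column.py above.
# Hypothetical use of the internal column API:
from cudf.core.column import as_column

col = as_column([3, 1, None, 7])
assert col.max(skipna=True) == 7             # generated method -> _reduce("max", ...)
assert col._reduce("min", skipna=True) == 1  # calling the hook directly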
from __future__ import annotations -import builtins import datetime as dt import locale import re @@ -277,7 +276,7 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": ) @property - def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]: + def __cuda_array_interface__(self) -> Mapping[str, Any]: output = { "shape": (len(self),), "strides": (self.dtype.itemsize,), @@ -346,17 +345,27 @@ def as_string_column( column.column_empty(0, dtype="object", masked=False), ) - def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: + def mean( + self, skipna=None, min_count: int = 0, dtype=np.float64 + ) -> ScalarLike: return pd.Timestamp( - self.as_numerical.mean(skipna=skipna, dtype=dtype), + self.as_numerical.mean( + skipna=skipna, min_count=min_count, dtype=dtype + ), unit=self.time_unit, ) def std( - self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + self, + skipna: bool = None, + min_count: int = 0, + dtype: Dtype = np.float64, + ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype) + self.as_numerical.std( + skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + ) * _numpy_to_pandas_conversion[self.time_unit], ) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 2c483cbd00b..4e8acbf2634 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -583,9 +583,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: s = cudf.Series(self) # TODO: replace np.inf with cudf scalar when # https://github.com/rapidsai/cudf/pull/6297 merges - non_infs = s[ - ((s == np.inf) | (s == -np.inf)).logical_not() - ] + non_infs = s[~((s == np.inf) | (s == -np.inf))] col = non_infs._column else: col = self diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 1f84cb88e37..87e1a87e68b 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. """Define an interface for columns that can perform numerical operations.""" from __future__ import annotations @@ -10,11 +10,12 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import Dtype, ScalarLike +from cudf._typing import ScalarLike from cudf.core.column import ColumnBase +from cudf.core.mixins import Scannable -class NumericalBaseColumn(ColumnBase): +class NumericalBaseColumn(ColumnBase, Scannable): """A column composed of numerical data. This class encodes a standard interface for different types of columns @@ -23,59 +24,21 @@ class NumericalBaseColumn(ColumnBase): point, should be encoded here. """ - def reduce( - self, op: str, skipna: bool = None, min_count: int = 0, **kwargs - ) -> ScalarLike: - """Perform a reduction operation. - - op : str - The operation to perform. 
- skipna : bool - Whether or not na values must be - """ - preprocessed = self._process_for_reduction( - skipna=skipna, min_count=min_count - ) - if isinstance(preprocessed, ColumnBase): - return libcudf.reduce.reduce(op, preprocessed, **kwargs) - else: - return preprocessed - - def sum( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 - ) -> ScalarLike: - return self.reduce( - "sum", skipna=skipna, dtype=dtype, min_count=min_count - ) - - def product( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 - ) -> ScalarLike: - return self.reduce( - "product", skipna=skipna, dtype=dtype, min_count=min_count - ) - - def mean( - self, skipna: bool = None, dtype: Dtype = np.float64 - ) -> ScalarLike: - return self.reduce("mean", skipna=skipna, dtype=dtype) - - def var( - self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 - ) -> ScalarLike: - return self.reduce("var", skipna=skipna, dtype=dtype, ddof=ddof) - - def std( - self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 - ) -> ScalarLike: - return self.reduce("std", skipna=skipna, dtype=dtype, ddof=ddof) - - def sum_of_squares( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 - ) -> ScalarLike: - return self.reduce( - "sum_of_squares", skipna=skipna, dtype=dtype, min_count=min_count - ) + _VALID_REDUCTIONS = { + "sum", + "product", + "sum_of_squares", + "mean", + "var", + "std", + } + + _VALID_SCANS = { + "cumsum", + "cumprod", + "cummin", + "cummax", + } def _can_return_nan(self, skipna: bool = None) -> bool: return not skipna and self.has_nulls() @@ -148,6 +111,25 @@ def quantile( ) return result + def mean(self, skipna: bool = None, min_count: int = 0, dtype=np.float64): + return self._reduce( + "mean", skipna=skipna, min_count=min_count, dtype=dtype + ) + + def var( + self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1 + ): + return self._reduce( + "var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + ) + + def std( + self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1 + ): + return self._reduce( + "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + ) + def median(self, skipna: bool = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna @@ -171,7 +153,7 @@ def _numeric_quantile( self, quant, interpolation, sorted_indices, exact ) - def cov(self, other: ColumnBase) -> float: + def cov(self, other: NumericalBaseColumn) -> float: if ( len(self) == 0 or len(other) == 0 @@ -183,7 +165,7 @@ def cov(self, other: ColumnBase) -> float: cov_sample = result.sum() / (len(self) - 1) return cov_sample - def corr(self, other: ColumnBase) -> float: + def corr(self, other: NumericalBaseColumn) -> float: if len(self) == 0 or len(other) == 0: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -200,7 +182,7 @@ def round( """Round the values in the Column to the given number of decimals.""" return libcudf.round.round(self, decimal_places=decimals, how=how) - def _apply_scan_op(self, op: str) -> ColumnBase: - return libcudf.reduce.scan(op, self, True)._with_type_metadata( - self.dtype - ) + def _scan(self, op: str) -> ColumnBase: + return libcudf.reduce.scan( + op.replace("cum", ""), self, True + )._with_type_metadata(self.dtype) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 982ac098bbf..8f017376c6a 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2,7 +2,6 @@ from __future__ 
import annotations -import builtins import pickle import re import warnings @@ -5154,7 +5153,7 @@ def to_arrow(self) -> pa.Array: return super().to_arrow() def sum( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0, ): result_col = self._process_for_reduction( skipna=skipna, min_count=min_count @@ -5398,10 +5397,7 @@ def find_and_replace( return libcudf.replace.replace(res, df._data["old"], df._data["new"]) def fillna( - self, - fill_value: Any = None, - method: builtins.str = None, - dtype: Dtype = None, + self, fill_value: Any = None, method: str = None, dtype: Dtype = None, ) -> StringColumn: if fill_value is not None: if not is_scalar(fill_value): @@ -5450,7 +5446,7 @@ def normalize_binop_value(self, other) -> "column.ColumnBase": raise TypeError(f"cannot broadcast {type(other)}") def binary_operator( - self, op: builtins.str, rhs, reflect: bool = False + self, op: str, rhs, reflect: bool = False ) -> "column.ColumnBase": lhs = self if reflect: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 6c8c904e13c..7a5f777e88e 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations @@ -385,20 +385,29 @@ def quantile( return result.astype(self.dtype) def sum( - self, skipna: bool = None, dtype: Dtype = None, min_count=0 + self, skipna: bool = None, min_count: int = 0, dtype: Dtype = None, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.sum( - skipna=skipna, dtype=dtype, min_count=min_count + # Since sum isn't overriden in Numerical[Base]Column, mypy only + # sees the signature from Reducible (which doesn't have the extra + # parameters from ColumnBase._reduce) so we have to ignore this. 
+ self.as_numerical.sum( # type: ignore + skipna=skipna, min_count=min_count, dtype=dtype ), unit=self.time_unit, ) def std( - self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + self, + skipna: bool = None, + min_count: int = 0, + dtype: Dtype = np.float64, + ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype), + self.as_numerical.std( + skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype + ), unit=self.time_unit, ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1c672aacd86..57d591dd3e7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -11,7 +11,18 @@ import warnings from collections import defaultdict from collections.abc import Iterable, Sequence -from typing import Any, MutableMapping, Optional, Set, TypeVar +from typing import ( + Any, + Dict, + List, + MutableMapping, + Optional, + Set, + Tuple, + Type, + TypeVar, + Union, +) import cupy import numpy as np @@ -26,6 +37,7 @@ import cudf import cudf.core.common from cudf import _lib as libcudf +from cudf._typing import ColumnLike from cudf.api.types import ( _is_scalar_or_zero_d_array, is_bool_dtype, @@ -44,6 +56,7 @@ from cudf.core.abc import Serializable from cudf.core.column import ( CategoricalColumn, + ColumnBase, as_column, build_categorical_column, build_column, @@ -51,7 +64,7 @@ concat_columns, ) from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame, _drop_rows_by_labels +from cudf.core.frame import Frame from cudf.core.groupby.groupby import DataFrameGroupBy from cudf.core.index import BaseIndex, Index, RangeIndex, as_index from cudf.core.indexed_frame import ( @@ -75,7 +88,11 @@ min_scalar_type, numeric_normalize_types, ) -from cudf.utils.utils import GetAttrGetItemMixin +from cudf.utils.utils import ( + GetAttrGetItemMixin, + _cudf_nvtx_annotate, + _external_only_api, +) T = TypeVar("T", bound="DataFrame") @@ -94,8 +111,9 @@ class _DataFrameIndexer(_FrameIndexer): def __getitem__(self, arg): - if isinstance(self._frame.index, MultiIndex) or isinstance( - self._frame.columns, MultiIndex + if ( + isinstance(self._frame.index, MultiIndex) + or self._frame._data.multiindex ): # This try/except block allows the use of pandas-like # tuple arguments into MultiIndex dataframes. @@ -113,7 +131,7 @@ def __setitem__(self, key, value): key = (key, slice(None)) return self._setitem_tuple_arg(key, value) - @annotate("_CAN_DOWNCAST_TO_SERIES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _can_downcast_to_series(self, df, arg): """ This method encapsulates the logic used @@ -154,7 +172,7 @@ def _can_downcast_to_series(self, df, arg): return True return False - @annotate("_DOWNCAST_TO_SERIES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _downcast_to_series(self, df, arg): """ "Downcast" from a DataFrame to a Series @@ -164,7 +182,7 @@ def _downcast_to_series(self, df, arg): # determine the axis along which the Series is taken: if nrows == 1 and ncols == 1: if is_scalar(arg[0]) and is_scalar(arg[1]): - return df[df.columns[0]].iloc[0] + return df[df._column_names[0]].iloc[0] elif not is_scalar(arg[0]): axis = 1 else: @@ -196,11 +214,11 @@ class _DataFrameLocIndexer(_DataFrameIndexer): For selection by label. 
""" - @annotate("_GETITEM_SCALAR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _getitem_scalar(self, arg): return self._frame[arg[1]].loc[arg[0]] - @annotate("LOC_GETITEM", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _getitem_tuple_arg(self, arg): from uuid import uuid4 @@ -283,10 +301,11 @@ def _getitem_tuple_arg(self, arg): return self._downcast_to_series(df, arg) return df - @annotate("LOC_SETITEM", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _setitem_tuple_arg(self, key, value): - if isinstance(self._frame.index, MultiIndex) or isinstance( - self._frame.columns, pd.MultiIndex + if ( + isinstance(self._frame.index, MultiIndex) + or self._frame._data.multiindex ): raise NotImplementedError( "Setting values using df.loc[] not supported on " @@ -349,7 +368,7 @@ class _DataFrameIlocIndexer(_DataFrameIndexer): For selection by index. """ - @annotate("ILOC_GETITEM", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _getitem_tuple_arg(self, arg): # Iloc Step 1: # Gather the columns specified by the second tuple arg @@ -401,7 +420,7 @@ def _getitem_tuple_arg(self, arg): df._index = as_index(self._frame.index[arg[0]]) return df - @annotate("ILOC_SETITEM", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _setitem_tuple_arg(self, key, value): columns = self._frame._get_columns_by_index(key[1]) @@ -514,7 +533,7 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): _loc_indexer_type = _DataFrameLocIndexer _iloc_indexer_type = _DataFrameIlocIndexer - @annotate("DATAFRAME_INIT", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def __init__( self, data=None, index=None, columns=None, dtype=None, nan_as_null=True ): @@ -546,7 +565,6 @@ def __init__( ) else: self._data = data._data - self.columns = data.columns elif isinstance(data, (cudf.Series, pd.Series)): if isinstance(data, pd.Series): data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null) @@ -595,7 +613,6 @@ def __init__( self._data = new_df._data self.index = new_df._index - self.columns = new_df.columns elif hasattr(data, "__array_interface__"): arr_interface = data.__array_interface__ if len(arr_interface["descr"]) == 1: @@ -605,7 +622,6 @@ def __init__( new_df = self.from_records(data, index=index, columns=columns) self._data = new_df._data self.index = new_df._index - self.columns = new_df.columns else: if is_list_like(data): if len(data) > 0 and is_scalar(data[0]): @@ -617,7 +633,6 @@ def __init__( self._data = new_df._data self.index = new_df._index - self.columns = new_df.columns elif len(data) > 0 and isinstance(data[0], Series): self._init_from_series_list( data=data, columns=columns, index=index @@ -638,9 +653,7 @@ def __init__( if dtype: self._data = self.astype(dtype)._data - @annotate( - "DATAFRAME_INIT_FROM_SERIES_LIST", color="blue", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _init_from_series_list(self, data, columns, index): if index is None: # When `index` is `None`, the final index of @@ -719,8 +732,9 @@ def _init_from_series_list(self, data, columns, index): else: concat_df = cudf.concat(data, axis=1) - if concat_df.columns.dtype == "object": - concat_df.columns = concat_df.columns.astype("str") + cols = concat_df._data.to_pandas_index() + if cols.dtype == "object": + concat_df.columns = cols.astype("str") transpose = concat_df.T @@ -739,9 +753,7 @@ def _init_from_series_list(self, data, columns, index): ) self._data = self._data.select_by_label(columns) - @annotate( - "DATAFRAME_INIT_FROM_LIST_LIKE", 
color="blue", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _init_from_list_like(self, data, index=None, columns=None): if index is None: index = RangeIndex(start=0, stop=len(data)) @@ -778,9 +790,7 @@ def _init_from_list_like(self, data, index=None, columns=None): self.columns = columns - @annotate( - "DATAFRAME_INIT_FROM_DICT_LIKE", color="blue", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _init_from_dict_like( self, data, index=None, columns=None, nan_as_null=None ): @@ -854,11 +864,7 @@ def _from_data( return out @staticmethod - @annotate( - "DATAFRAME_ALIGN_INPUT_SERIES_INDICES", - color="blue", - domain="cudf_python", - ) + @_cudf_nvtx_annotate def _align_input_series_indices(data, index): data = data.copy() @@ -972,7 +978,9 @@ def __dir__(self): o = set(dir(type(self))) o.update(self.__dict__) o.update( - c for c in self.columns if isinstance(c, str) and c.isidentifier() + c + for c in self._column_names + if isinstance(c, str) and c.isidentifier() ) return list(o) @@ -983,7 +991,6 @@ def __setattr__(self, key, col): # properties, and we must call object.__getattribute__ to bypass # the `__getitem__` behavior inherited from `GetAttrGetItemMixin`. object.__getattribute__(self, key) - super().__setattr__(key, col) except AttributeError: if key not in self._PROTECTED_KEYS: try: @@ -998,7 +1005,18 @@ def __setattr__(self, key, col): # Set a new attribute that is not already a column. super().__setattr__(key, col) - @annotate("DATAFRAME_GETITEM", color="blue", domain="cudf_python") + except RuntimeError as e: + # TODO: This allows setting properties that are marked as forbidden + # for internal usage. It is necesary because the __getattribute__ + # call in the try block will trigger the error. We should see if + # setting these variables can also always be disabled + if "External-only API" not in str(e): + raise + super().__setattr__(key, col) + else: + super().__setattr__(key, col) + + @_cudf_nvtx_annotate def __getitem__(self, arg): """ If *arg* is a ``str`` or ``int`` type, return the column Series. @@ -1082,7 +1100,7 @@ def __getitem__(self, arg): f"__getitem__ on type {type(arg)} is not supported" ) - @annotate("DATAFRAME_SETITEM", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def __setitem__(self, arg, value): """Add/set column by *arg or DataFrame*""" if isinstance(arg, DataFrame): @@ -1139,7 +1157,9 @@ def __setitem__(self, arg, value): allow_non_unique=True, ) if is_scalar(value): - self._data[arg][:] = value + self._data[arg] = utils.scalar_broadcast_to( + value, len(self) + ) else: value = as_column(value) self._data[arg] = value @@ -1199,12 +1219,9 @@ def __setitem__(self, arg, value): ) def __delitem__(self, name): - """ - Drop the given column by *name*. - """ self._drop_column(name) - @annotate("DATAFRAME_SLICE", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _slice(self: T, arg: slice) -> T: """ _slice : slice the frame as per the arg @@ -1260,99 +1277,87 @@ def _slice(self: T, arg: slice) -> T: ), ) - # This is just to handle RangeIndex type, stop - # it from materializing unnecessarily - keep_index = True - if self.index is not None and isinstance(self.index, RangeIndex): + # If index type is RangeIndex, slice without materializing. 
+        is_range_index = isinstance(self.index, RangeIndex)
+        if is_range_index:
             if self._num_columns == 0:
-                result = self._empty_like(keep_index)
+                result = self._empty_like(keep_index=False)
                 result._index = self.index[start:stop:stride]
                 return result
-            keep_index = False
-        # For decreasing slices, terminal at before-the-zero
-        # position is preserved.
         if start < 0:
             start = start + num_rows
+
+        # Decreasing slices that terminate at -1, such as slice(4, -1, -1),
+        # have an end index of 0. The check below makes sure -1 is not wrapped
+        # to `-1 + num_rows`.
         if stop < 0 and not (stride < 0 and stop == -1):
             stop = stop + num_rows
+        stride = 1 if stride is None else stride
-        if start > stop and (stride is None or stride == 1):
-            return self._empty_like(keep_index)
-        else:
-            start = len(self) if start > num_rows else start
-            stop = len(self) if stop > num_rows else stop
+        if (stop - start) * stride <= 0:
+            return self._empty_like(keep_index=True)
-            if stride is not None and stride != 1:
-                return self._gather(
-                    cudf.core.column.arange(
-                        start, stop=stop, step=stride, dtype=np.int32
-                    )
-                )
-            else:
-                result = self._from_data(
-                    *libcudf.copying.table_slice(
-                        self, [start, stop], keep_index
-                    )[0]
+        start = len(self) if start > num_rows else start
+        stop = len(self) if stop > num_rows else stop
+
+        if stride != 1:
+            return self._gather(
+                cudf.core.column.arange(
+                    start, stop=stop, step=stride, dtype=np.int32
                 )
+            )
-                result._copy_type_metadata(self, include_index=keep_index)
-                if self.index is not None:
-                    if keep_index:
-                        result._index.names = self.index.names
-                    else:
-                        # Adding index of type RangeIndex back to
-                        # result
-                        result.index = self.index[start:stop]
-            result.columns = self.columns
-            return result
+        columns_to_slice = [
+            *(self._index._data.columns if not is_range_index else []),
+            *self._columns,
+        ]
+        result = self._from_columns_like_self(
+            libcudf.copying.columns_slice(columns_to_slice, [start, stop])[0],
+            self._column_names,
+            None if is_range_index else self._index.names,
+        )
-    @annotate("DATAFRAME_MEMORY_USAGE", color="blue", domain="cudf_python")
+        if is_range_index:
+            result.index = self.index[start:stop]
+        return result
+
+    @_cudf_nvtx_annotate
     def memory_usage(self, index=True, deep=False):
         return Series(
             {str(k): v for k, v in super().memory_usage(index, deep).items()}
         )
-    @annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python")
+    @_cudf_nvtx_annotate
     def __array_function__(self, func, types, args, kwargs):
-
-        cudf_df_module = DataFrame
-        cudf_series_module = Series
-
-        for submodule in func.__module__.split(".")[1:]:
-            # point cudf to the correct submodule
-            if hasattr(cudf_df_module, submodule):
-                cudf_df_module = getattr(cudf_df_module, submodule)
-            else:
-                return NotImplemented
-
-        fname = func.__name__
-
-        handled_types = [cudf_df_module, cudf_series_module]
-
-        for t in types:
-            if t not in handled_types:
-                return NotImplemented
-
-        if hasattr(cudf_df_module, fname):
-            cudf_func = getattr(cudf_df_module, fname)
-            # Handle case if cudf_func is same as numpy function
-            if cudf_func is func:
-                return NotImplemented
-            # numpy returns an array from the dot product of two dataframes
-            elif (
-                func is np.dot
-                and isinstance(args[0], (DataFrame, pd.DataFrame))
-                and isinstance(args[1], (DataFrame, pd.DataFrame))
-            ):
-                return cudf_func(*args, **kwargs).values
-            else:
-                return cudf_func(*args, **kwargs)
-        else:
+        if "out" in kwargs or not all(
+            issubclass(t, (Series, DataFrame)) for t in types
+        ):
            return NotImplemented
+        try:
+            if cudf_func := 
getattr(self.__class__, func.__name__, None): + out = cudf_func(*args, **kwargs) + # The dot product of two DataFrames returns an array in pandas. + if ( + func is np.dot + and isinstance(args[0], (DataFrame, pd.DataFrame)) + and isinstance(args[1], (DataFrame, pd.DataFrame)) + ): + return out.values + return out + except Exception: + # The rare instance where a "silent" failure is preferable. Except + # in the (highly unlikely) case that some other library + # interoperates with cudf objects, the result will be that numpy + # raises a TypeError indicating that the operation is not + # implemented, which is much friendlier than an arbitrary internal + # cudf error. + pass + return NotImplemented + # The _get_numeric_data method is necessary for dask compatibility. - @annotate("DATAFRAME_GET_NUMERIC_DATA", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _get_numeric_data(self): """Return a dataframe with only numeric data types""" columns = [ @@ -1362,7 +1367,7 @@ def _get_numeric_data(self): ] return self[columns] - @annotate("DATAFRAME_ASSIGN", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def assign(self, **kwargs): """ Assign columns to DataFrame from keyword arguments. @@ -1390,7 +1395,7 @@ def assign(self, **kwargs): return new_df @classmethod - @annotate("CONCAT", color="orange", domain="cudf_python") + @_cudf_nvtx_annotate def _concat( cls, objs, axis=0, join="outer", ignore_index=False, sort=False ): @@ -1606,8 +1611,8 @@ def _concat( ) # Reassign index and column names - if isinstance(objs[0].columns, pd.MultiIndex): - out.columns = objs[0].columns + if objs[0]._data.multiindex: + out._set_column_names_like(objs[0]) else: out.columns = names if not ignore_index: @@ -1617,99 +1622,15 @@ def _concat( return out def astype(self, dtype, copy=False, errors="raise", **kwargs): - """ - Cast the DataFrame to the given dtype - - Parameters - ---------- - - dtype : data type, or dict of column name -> data type - Use a numpy.dtype or Python type to cast entire DataFrame object to - the same type. Alternatively, use ``{col: dtype, ...}``, where col - is a column label and dtype is a numpy.dtype or Python type - to cast one or more of the DataFrame's columns to - column-specific types. - copy : bool, default False - Return a deep-copy when ``copy=True``. Note by default - ``copy=False`` setting is used and hence changes to - values then may propagate to other cudf objects. - errors : {'raise', 'ignore', 'warn'}, default 'raise' - Control raising of exceptions on invalid data for provided dtype. - - - ``raise`` : allow exceptions to be raised - - ``ignore`` : suppress exceptions. On error return original - object. - - ``warn`` : prints last exceptions as warnings and - return original object. 
- **kwargs : extra arguments to pass on to the constructor - - Returns - ------- - casted : DataFrame - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [10, 20, 30], 'b': [1, 2, 3]}) - >>> df - a b - 0 10 1 - 1 20 2 - 2 30 3 - >>> df.dtypes - a int64 - b int64 - dtype: object - - Cast all columns to `int32`: - - >>> df.astype('int32').dtypes - a int32 - b int32 - dtype: object - - Cast `a` to `float32` using a dictionary: - - >>> df.astype({'a': 'float32'}).dtypes - a float32 - b int64 - dtype: object - >>> df.astype({'a': 'float32'}) - a b - 0 10.0 1 - 1 20.0 2 - 2 30.0 3 - """ - result = DataFrame(index=self.index) - if is_dict_like(dtype): - current_cols = self._data.names - if len(set(dtype.keys()) - set(current_cols)) > 0: + if len(set(dtype.keys()) - set(self._data.names)) > 0: raise KeyError( "Only a column name can be used for the " "key in a dtype mappings argument." ) - for col_name in current_cols: - if col_name in dtype: - result._data[col_name] = self._data[col_name].astype( - dtype=dtype[col_name], - errors=errors, - copy=copy, - **kwargs, - ) - else: - result._data[col_name] = ( - self._data[col_name].copy(deep=True) - if copy - else self._data[col_name] - ) else: - for col in self._data: - result._data[col] = self._data[col].astype( - dtype=dtype, **kwargs - ) - - return result + dtype = {cc: dtype for cc in self._data.names} + return super().astype(dtype, copy, errors, **kwargs) def _clean_renderable_dataframe(self, output): """ @@ -1847,12 +1768,12 @@ def _get_renderable_dataframe(self): return output - @annotate("DATAFRAME_REPR", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def __repr__(self): output = self._get_renderable_dataframe() return self._clean_renderable_dataframe(output) - @annotate("DATAFRAME_REPR_HTML", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _repr_html_(self): lines = ( self._get_renderable_dataframe() @@ -1869,13 +1790,11 @@ def _repr_html_(self): lines.append("") return "\n".join(lines) - @annotate("DATAFRAME_REPR_LATEX", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() - @annotate( - "DATAFRAME_GET_COLUMNS_BY_LABEL", color="blue", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _get_columns_by_label(self, labels, downcast=False): """ Return columns of dataframe by `labels` @@ -1898,7 +1817,7 @@ def _get_columns_by_label(self, labels, downcast=False): ) return out - def _prep_for_binop( + def _make_operands_and_index_for_binop( self, other: Any, fn: str, @@ -1907,7 +1826,13 @@ def _prep_for_binop( can_reindex: bool = False, *args, **kwargs, - ): + ) -> Tuple[ + Union[ + Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], + Type[NotImplemented], + ], + Optional[BaseIndex], + ]: lhs, rhs = self, other if _is_scalar_or_zero_d_array(rhs): @@ -1930,12 +1855,14 @@ def _prep_for_binop( not can_reindex and fn in cudf.utils.utils._EQUALITY_OPS and ( - not lhs.columns.equals(rhs.columns) + not lhs._data.to_pandas_index().equals( + rhs._data.to_pandas_index() + ) or not lhs.index.equals(rhs.index) ) ): raise ValueError( - "Can only compare identically-labeled " "DataFrame objects" + "Can only compare identically-labeled DataFrame objects" ) lhs, rhs = _align_indices(lhs, rhs) @@ -1986,29 +1913,7 @@ def _prep_for_binop( return operands, lhs._index - @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") - def _binaryop( - self, - other: Any, - fn: str, - fill_value: Any = None, - 
reflect: bool = False, - can_reindex: bool = False, - *args, - **kwargs, - ): - operands, out_index = self._prep_for_binop( - other, fn, fill_value, reflect, can_reindex - ) - if operands is NotImplemented: - return NotImplemented - - return self._from_data( - ColumnAccessor(type(self)._colwise_binop(operands, fn)), - index=out_index, - ) - - @annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def update( self, other, @@ -2074,8 +1979,9 @@ def update( if not isinstance(other, DataFrame): other = DataFrame(other) - if not self.columns.equals(other.columns): - other = other.reindex(self.columns, axis=1) + self_cols = self._data.to_pandas_index() + if not self_cols.equals(other._data.to_pandas_index()): + other = other.reindex(self_cols, axis=1) if not self.index.equals(other.index): other = other.reindex(self.index, axis=0) @@ -2102,11 +2008,11 @@ def update( self._mimic_inplace(source_df, inplace=True) - @annotate("DATAFRAME_ITER", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def __iter__(self): - return iter(self.columns) + return iter(self._column_names) - @annotate("DATAFRAME_ITERITEMS", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def iteritems(self): """Iterate over column names and series pairs""" warnings.warn( @@ -2116,13 +2022,13 @@ def iteritems(self): ) return self.items() - @annotate("DATAFRAME_ITEMS", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def items(self): """Iterate over column names and series pairs""" for k in self: yield (k, self[k]) - @annotate("DATAFRAME_EQUALS", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def equals(self, other, **kwargs): ret = super().equals(other) # If all other checks matched, validate names. @@ -2150,13 +2056,18 @@ def at(self): return self.loc @property # type: ignore - @annotate("DATAFRAME_COLUMNS_GETTER", color="yellow", domain="cudf_python") + @_external_only_api( + "Use _column_names instead, or _data.to_pandas_index() if a pandas " + "index is absolutely necessary. For checking if the columns are a " + "MultiIndex, use _data.multiindex." 
+ ) + @_cudf_nvtx_annotate def columns(self): """Returns a tuple of columns""" return self._data.to_pandas_index() @columns.setter # type: ignore - @annotate("DATAFRAME_COLUMNS_SETTER", color="yellow", domain="cudf_python") + @_cudf_nvtx_annotate def columns(self, columns): if isinstance(columns, cudf.BaseIndex): columns = columns.to_pandas() @@ -2164,9 +2075,7 @@ def columns(self, columns): columns = pd.Index(range(len(self._data.columns))) is_multiindex = isinstance(columns, pd.MultiIndex) - if isinstance( - columns, (Series, cudf.Index, cudf.core.column.ColumnBase) - ): + if isinstance(columns, (Series, cudf.Index, ColumnBase)): columns = pd.Index(columns.to_numpy(), tupleize_cols=is_multiindex) elif not isinstance(columns, pd.Index): columns = pd.Index(columns, tupleize_cols=is_multiindex) @@ -2177,15 +2086,23 @@ def columns(self, columns): f"got {len(columns)} elements" ) - data = dict(zip(columns, self._data.columns)) - if len(columns) != len(data): + self._set_column_names(columns, is_multiindex, columns.names) + + def _set_column_names(self, names, multiindex=False, level_names=None): + data = dict(zip(names, self._data.columns)) + if len(names) != len(data): raise ValueError("Duplicate column names are not allowed") self._data = ColumnAccessor( - data, multiindex=is_multiindex, level_names=columns.names, + data, multiindex=multiindex, level_names=level_names, + ) + + def _set_column_names_like(self, other): + self._set_column_names( + other._data.names, other._data.multiindex, other._data.level_names ) - @annotate("DATAFRAME_REINDEX_INTERNAL", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _reindex( self, columns, dtypes=None, deep=False, index=None, inplace=False ): @@ -2262,7 +2179,7 @@ def _reindex( return self._mimic_inplace(result, inplace=inplace) - @annotate("DATAFRAME_REINDEX", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def reindex( self, labels=None, axis=None, index=None, columns=None, copy=True ): @@ -2341,7 +2258,7 @@ def reindex( inplace=False, ) - @annotate("DATAFRAME_SET_INDEX", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def set_index( self, keys, @@ -2451,7 +2368,7 @@ def set_index( for col in keys: # Is column label if is_scalar(col) or isinstance(col, tuple): - if col in self.columns: + if col in self._column_names: columns_to_add.append(self[col]) names.append(col) if drop: @@ -2602,7 +2519,7 @@ def reset_index( inplace=inplace, ) - @annotate("DATAFRAME_INSERT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def insert(self, loc, name, value, nan_as_null=None): """Add a column to DataFrame at the index specified by loc. @@ -2626,7 +2543,7 @@ def insert(self, loc, name, value, nan_as_null=None): ignore_index=False, ) - @annotate("DATAFRAME__INSERT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): """ Same as `insert`, with additional `ignore_index` param. @@ -2747,195 +2664,14 @@ def diff(self, periods=1, axis=0): df = cudf.DataFrame._from_data( { name: column_empty(len(self), dtype=dtype, masked=True) - for name, dtype in zip(self.columns, self.dtypes) + for name, dtype in zip(self._column_names, self.dtypes) } ) return df return self - self.shift(periods=periods) - @annotate("DATAFRAME_DROP", color="green", domain="cudf_python") - def drop( - self, - labels=None, - axis=0, - index=None, - columns=None, - level=None, - inplace=False, - errors="raise", - ): - """ - Drop specified labels from rows or columns. 
- - Remove rows or columns by specifying label names and corresponding - axis, or by specifying directly index or column names. When using a - multi-index, labels on different levels can be removed by specifying - the level. - - Parameters - ---------- - labels : single label or list-like - Index or column labels to drop. - axis : {0 or 'index', 1 or 'columns'}, default 0 - Whether to drop labels from the index (0 or 'index') or - columns (1 or 'columns'). - index : single label or list-like - Alternative to specifying axis (``labels, axis=0`` - is equivalent to ``index=labels``). - columns : single label or list-like - Alternative to specifying axis (``labels, axis=1`` - is equivalent to ``columns=labels``). - level : int or level name, optional - For MultiIndex, level from which the labels will be removed. - inplace : bool, default False - If False, return a copy. Otherwise, do operation - inplace and return None. - errors : {'ignore', 'raise'}, default 'raise' - If 'ignore', suppress error and only existing labels are - dropped. - - Returns - ------- - DataFrame - DataFrame without the removed index or column labels. - - Raises - ------ - KeyError - If any of the labels is not found in the selected axis. - - See Also - -------- - DataFrame.loc : Label-location based indexer for selection by label. - DataFrame.dropna : Return DataFrame with labels on given axis omitted - where (all or any) data are missing. - DataFrame.drop_duplicates : Return DataFrame with duplicate rows - removed, optionally only considering certain columns. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"A": [1, 2, 3, 4], - ... "B": [5, 6, 7, 8], - ... "C": [10, 11, 12, 13], - ... "D": [20, 30, 40, 50]}) - >>> df - A B C D - 0 1 5 10 20 - 1 2 6 11 30 - 2 3 7 12 40 - 3 4 8 13 50 - - Drop columns - - >>> df.drop(['B', 'C'], axis=1) - A D - 0 1 20 - 1 2 30 - 2 3 40 - 3 4 50 - >>> df.drop(columns=['B', 'C']) - A D - 0 1 20 - 1 2 30 - 2 3 40 - 3 4 50 - - Drop a row by index - - >>> df.drop([0, 1]) - A B C D - 2 3 7 12 40 - 3 4 8 13 50 - - Drop columns and/or rows of MultiIndex DataFrame - - >>> midx = cudf.MultiIndex(levels=[['lama', 'cow', 'falcon'], - ... ['speed', 'weight', 'length']], - ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], - ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) - >>> df = cudf.DataFrame(index=midx, columns=['big', 'small'], - ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], - ... [250, 150], [1.5, 0.8], [320, 250], - ... 
[1, 0.8], [0.3, 0.2]]) - >>> df - big small - lama speed 45.0 30.0 - weight 200.0 100.0 - length 1.5 1.0 - cow speed 30.0 20.0 - weight 250.0 150.0 - length 1.5 0.8 - falcon speed 320.0 250.0 - weight 1.0 0.8 - length 0.3 0.2 - >>> df.drop(index='cow', columns='small') - big - lama speed 45.0 - weight 200.0 - length 1.5 - falcon speed 320.0 - weight 1.0 - length 0.3 - >>> df.drop(index='length', level=1) - big small - lama speed 45.0 30.0 - weight 200.0 100.0 - cow speed 30.0 20.0 - weight 250.0 150.0 - falcon speed 320.0 250.0 - weight 1.0 0.8 - """ - - if labels is not None: - if index is not None or columns is not None: - raise ValueError( - "Cannot specify both 'labels' and 'index'/'columns'" - ) - target = labels - elif index is not None: - target = index - axis = 0 - elif columns is not None: - target = columns - axis = 1 - else: - raise ValueError( - "Need to specify at least one of 'labels', " - "'index' or 'columns'" - ) - - if inplace: - out = self - else: - out = self.copy() - - if axis in (1, "columns"): - target = _get_host_unique(target) - - _drop_columns(out, target, errors) - elif axis in (0, "index"): - dropped = _drop_rows_by_labels(out, target, level, errors) - - if columns is not None: - columns = _get_host_unique(columns) - _drop_columns(dropped, columns, errors) - - out._data = dropped._data - out._index = dropped._index - - if not inplace: - return out - - @annotate("DATAFRAME_DROP_COLUMN", color="green", domain="cudf_python") - def _drop_column(self, name): - """Drop a column by *name*""" - if name not in self._data: - raise KeyError(f"column '{name}' does not exist") - del self._data[name] - - @annotate("DATAFRAME_DROP_DUPLICATES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def drop_duplicates( self, subset=None, keep="first", inplace=False, ignore_index=False ): @@ -3013,14 +2749,14 @@ def drop_duplicates( return self._mimic_inplace(outdf, inplace=inplace) - @annotate("DATAFRAME_POP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def pop(self, item): """Return a column and drop it from the DataFrame.""" popped = self[item] del self[item] return popped - @annotate("DATAFRAME_RENAME", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rename( self, mapper=None, @@ -3164,7 +2900,7 @@ def rename( else: return out.copy(deep=copy) - @annotate("DATAFRAME_ADD_PREFIX", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def add_prefix(self, prefix): out = self.copy(deep=True) out.columns = [ @@ -3172,7 +2908,7 @@ def add_prefix(self, prefix): ] return out - @annotate("DATAFRAME_ADD_SUFFIX", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def add_suffix(self, suffix): out = self.copy(deep=True) out.columns = [ @@ -3180,7 +2916,7 @@ def add_suffix(self, suffix): ] return out - @annotate("DATAFRAME_AGG", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. 
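A minimal usage sketch for the `agg` method whose docstring begins above; the frame and the chosen aggregations are illustrative, and the accepted argument types are a string, a list, or a dict:

import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df.agg("sum")           # one aggregation applied to every column
df.agg(["sum", "min"])  # several aggregations, one result row per aggregation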
@@ -3312,7 +3048,7 @@ def agg(self, aggs, axis=None): else: raise ValueError("argument must be a string, list or dict") - @annotate("DATAFRAME_NLARGEST", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def nlargest(self, n, columns, keep="first"): """Get the rows of the DataFrame sorted by the n largest value of *columns* @@ -3444,7 +3180,7 @@ def nsmallest(self, n, columns, keep="first"): """ return self._n_largest_or_smallest(False, n, columns, keep) - @annotate("DATAFRAME_TRANSPOSE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def transpose(self): """Transpose index and columns. @@ -3460,7 +3196,7 @@ def transpose(self): # Never transpose a MultiIndex - remove the existing columns and # replace with a RangeIndex. Afterward, reassign. columns = self.index.copy(deep=False) - index = self.columns.copy(deep=False) + index = self._data.to_pandas_index() if self._num_columns == 0 or self._num_rows == 0: return DataFrame(index=index, columns=columns) # Set the old column names as the new index @@ -3475,7 +3211,7 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) - @annotate("DATAFRAME_MELT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def melt(self, **kwargs): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -3505,7 +3241,7 @@ def melt(self, **kwargs): return melt(self, **kwargs) - @annotate("DATAFRAME_JOIN", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def merge( self, right, @@ -3645,7 +3381,7 @@ def merge( ) return gdf_result - @annotate("JOIN", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def join( self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False, ): @@ -3687,7 +3423,7 @@ def join( ) return df - @annotate("DATAFRAME_GROUPBY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @copy_docstring(DataFrameGroupBy) def groupby( self, @@ -3827,7 +3563,7 @@ def query(self, expr, local_dict=None): boolmask = queryutils.query_execute(self, expr, callenv) return self._apply_boolean_mask(boolmask) - @annotate("DATAFRAME_APPLY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def apply( self, func, axis=1, raw=False, result_type=None, args=(), **kwargs ): @@ -3976,7 +3712,7 @@ def apply( return self._apply(func, _get_row_kernel, *args, **kwargs) - @annotate("DATAFRAME_APPLY_ROWS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @applyutils.doc_apply() def apply_rows( self, @@ -4055,7 +3791,7 @@ def apply_rows( cache_key=cache_key, ) - @annotate("DATAFRAME_APPLY_CHUNKS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @applyutils.doc_applychunks() def apply_chunks( self, @@ -4123,9 +3859,7 @@ def apply_chunks( tpb=tpb, ) - @annotate( - "DATAFRAME_PARTITION_BY_HASH", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def partition_by_hash(self, columns, nparts, keep_index=True): """Partition the dataframe by the hashed value of data in *columns*. 
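A minimal sketch of `partition_by_hash`, whose docstring begins above; the frame and `nparts` value are illustrative assumptions:

import cudf

df = cudf.DataFrame({"key": [1, 2, 3, 4, 5], "val": [10, 20, 30, 40, 50]})
parts = df.partition_by_hash(["key"], nparts=3)
# parts is a list of DataFrames; rows with equal keys land in the same partition.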
@@ -4313,13 +4047,13 @@ def info( ) lines.append(index_summary) - if len(self.columns) == 0: + if len(self._data) == 0: lines.append(f"Empty {type(self).__name__}") cudf.utils.ioutils.buffer_write_lines(buf, lines) return - cols = self.columns - col_count = len(self.columns) + cols = self._column_names + col_count = len(cols) if max_cols is None: max_cols = pd.options.display.max_info_columns @@ -4337,7 +4071,7 @@ def _put_str(s, space): return str(s)[:space].ljust(space) def _verbose_repr(): - lines.append(f"Data columns (total {len(self.columns)} columns):") + lines.append(f"Data columns (total {col_count} columns):") id_head = " # " column_head = "Column" @@ -4357,10 +4091,10 @@ def _verbose_repr(): ) if show_counts: counts = self.count().to_pandas().tolist() - if len(cols) != len(counts): + if col_count != len(counts): raise AssertionError( f"Columns must equal " - f"counts ({len(cols)} != {len(counts)})" + f"counts ({col_count} != {len(counts)})" ) count_header = "Non-Null Count" len_count = len(count_header) @@ -4393,7 +4127,7 @@ def _verbose_repr(): + _put_str("-" * len_dtype, space_dtype).rstrip() ) - for i, col in enumerate(self.columns): + for i, col in enumerate(self._column_names): dtype = self.dtypes.iloc[i] col = pprint_thing(col) @@ -4410,13 +4144,11 @@ def _verbose_repr(): ) def _non_verbose_repr(): - if len(self.columns) > 0: - entries_summary = f", {self.columns[0]} to {self.columns[-1]}" + if col_count > 0: + entries_summary = f", {cols[0]} to {cols[-1]}" else: entries_summary = "" - columns_summary = ( - f"Columns: {len(self.columns)} entries{entries_summary}" - ) + columns_summary = f"Columns: {col_count} entries{entries_summary}" lines.append(columns_summary) def _sizeof_fmt(num, size_qualifier): @@ -4463,7 +4195,7 @@ def _sizeof_fmt(num, size_qualifier): cudf.utils.ioutils.buffer_write_lines(buf, lines) - @annotate("DATAFRAME_DESCRIBE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @docutils.doc_describe() def describe( self, @@ -4479,7 +4211,7 @@ def describe( if datetime_is_numeric: default_include.append("datetime") data_to_describe = self.select_dtypes(include=default_include) - if len(data_to_describe.columns) == 0: + if data_to_describe._num_columns == 0: data_to_describe = self elif include == "all": @@ -4497,7 +4229,7 @@ def describe( describe_series_list = [ data_to_describe[col].describe(percentiles=percentiles) - for col in data_to_describe.columns + for col in data_to_describe._column_names ] if len(describe_series_list) == 1: return describe_series_list[0].to_frame() @@ -4523,7 +4255,7 @@ def describe( sort=False, ) - @annotate("DATAFRAME_TO_PANDAS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_pandas(self, nullable=False, **kwargs): """ Convert to a Pandas DataFrame. 
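A small round-trip sketch for `to_pandas` (whose docstring begins above) and its counterpart `from_pandas`; the frame is an illustrative assumption, and simple numeric data is used so dtypes and the default index survive the conversion:

import cudf
import pandas as pd

pdf = pd.DataFrame({"a": [0, 1, 2], "b": [0.1, 0.2, 0.3]})
gdf = cudf.DataFrame.from_pandas(pdf)  # copy host data to the GPU
assert gdf.to_pandas().equals(pdf)     # and back again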
@@ -4587,30 +4319,18 @@ def to_pandas(self, nullable=False, **kwargs): out_data = {} out_index = self.index.to_pandas() - if not isinstance(self.columns, pd.Index): - out_columns = self.columns.to_pandas() - else: - out_columns = self.columns - for i, col_key in enumerate(self._data): out_data[i] = self._data[col_key].to_pandas( index=out_index, nullable=nullable ) - if isinstance(self.columns, BaseIndex): - out_columns = self.columns.to_pandas() - if isinstance(self.columns, MultiIndex): - if self.columns.names is not None: - out_columns.names = self.columns.names - else: - out_columns.name = self.columns.name - out_df = pd.DataFrame(out_data, index=out_index) - out_df.columns = out_columns + out_df.columns = self._data.to_pandas_index() + return out_df @classmethod - @annotate("DATAFRAME_FROM_PANDAS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def from_pandas(cls, dataframe, nan_as_null=None): """ Convert from a Pandas DataFrame. @@ -4680,7 +4400,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): return result @classmethod - @annotate("DATAFRAME_FROM_ARROW", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def from_arrow(cls, table): """ Convert from PyArrow Table to DataFrame. @@ -4736,7 +4456,7 @@ def from_arrow(cls, table): return out - @annotate("DATAFRAME_TO_ARROW", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_arrow(self, preserve_index=True): """ Convert to a PyArrow Table. @@ -4818,7 +4538,7 @@ def to_arrow(self, preserve_index=True): return out.replace_schema_metadata(metadata) - @annotate("DATAFRAME_TO_RECORDS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_records(self, index=True): """Convert to a numpy recarray @@ -4842,7 +4562,7 @@ def to_records(self, index=True): return ret @classmethod - @annotate("DATAFRAME_FROM_RECORDS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def from_records(cls, data, index=None, columns=None, nan_as_null=False): """ Convert structured or record ndarray to DataFrame. @@ -4904,9 +4624,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): return df @classmethod - @annotate( - "DATAFRAME_FROM_ARRAYS_INTERNAL", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): """Convert a numpy/cupy array to DataFrame. @@ -4966,7 +4684,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): df._index = as_index(index) return df - @annotate("DATAFRAME_INTERPOLATE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def interpolate( self, method="linear", @@ -4997,7 +4715,7 @@ def interpolate( **kwargs, ) - @annotate("DATAFRAME_QUANTILE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def quantile( self, q=0.5, @@ -5105,7 +4823,7 @@ def quantile( if isinstance(q, numbers.Number) and numeric_only: result = result.fillna(np.nan) result = result.iloc[0] - result.index = as_index(data_df.columns) + result.index = data_df._data.to_pandas_index() result.name = q return result else: @@ -5113,7 +4831,7 @@ def quantile( result.index = q return result - @annotate("DATAFRAME_QUANTILES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def quantiles(self, q=0.5, interpolation="nearest"): """ Return values at the given quantile. 
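As a usage sketch next to the quantile-related hunks, with an illustrative frame; a scalar `q` should yield one value per column and a list of `q` values one row per requested quantile:

import cudf

df = cudf.DataFrame({"a": [0, 1, 2, 3, 4], "b": [10, 20, 30, 40, 50]})
df.quantile(0.5)           # Series: the median of each column
df.quantile([0.25, 0.75])  # DataFrame: one row per requested quantile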
@@ -5153,7 +4871,7 @@ def quantiles(self, q=0.5, interpolation="nearest"): result.index = as_index(q) return result - @annotate("DATAFRAME_ISIN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def isin(self, values): """ Whether each element in the DataFrame is contained in values. @@ -5285,14 +5003,13 @@ def make_false_column_like_self(): f"'{type(values).__name__}'" ) + # TODO: Update this logic to properly preserve MultiIndex columns. return DataFrame._from_data(result, self.index) # # Stats # - @annotate( - "DATAFRAME_PREPARE_FOR_ROWWISE_OP", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _prepare_for_rowwise_op(self, method, skipna): """Prepare a DataFrame for CuPy-based row-wise operations.""" @@ -5342,7 +5059,7 @@ def _prepare_for_rowwise_op(self, method, skipna): coerced = coerced.astype("int64", copy=False) return coerced, mask, common_dtype - @annotate("DATAFRAME_COUNT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def count(self, axis=0, level=None, numeric_only=False, **kwargs): """ Count ``non-NA`` cells for each column or row. @@ -5389,7 +5106,7 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): "columns": 1, } - @annotate("DATAFRAME_REDUCE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _reduce( self, op, axis=None, level=None, numeric_only=None, **kwargs, ): @@ -5403,10 +5120,13 @@ def _reduce( axis = self._get_axis_from_axis_arg(axis) if axis == 0: - result = [ - getattr(self._data[col], op)(**kwargs) - for col in self._data.names - ] + try: + result = [ + getattr(self._data[col], op)(**kwargs) + for col in self._data.names + ] + except AttributeError: + raise TypeError(f"cannot perform {op} with type {self.dtype}") return Series._from_data( {None: result}, as_index(self._data.names) @@ -5414,7 +5134,7 @@ def _reduce( elif axis == 1: return self._apply_cupy_method_axis_1(op, **kwargs) - @annotate("DATAFRAME_SCAN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _scan( self, op, axis=None, *args, **kwargs, ): @@ -5423,9 +5143,9 @@ def _scan( if axis == 0: return super()._scan(op, axis=axis, *args, **kwargs) elif axis == 1: - return self._apply_cupy_method_axis_1(f"cum{op}", **kwargs) + return self._apply_cupy_method_axis_1(op, **kwargs) - @annotate("DATAFRAME_MODE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def mode(self, axis=0, numeric_only=False, dropna=True): """ Get the mode(s) of each element along the selected axis. 
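A small sketch of the reduction dispatch visible in `_reduce` above, using an illustrative frame: axis=0 reduces each column on its own, while axis=1 is routed through the CuPy-based row-wise path:

import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
df.sum()        # Series indexed by column name (per-column reductions)
df.sum(axis=1)  # Series aligned with df (row-wise, via _apply_cupy_method_axis_1)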
@@ -5521,11 +5241,11 @@ def mode(self, axis=0, numeric_only=False, dropna=True): if isinstance(df, Series): df = df.to_frame() - df.columns = data_df.columns + df._set_column_names_like(data_df) return df - @annotate("DATAFRAME_KURTOSIS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -5534,7 +5254,7 @@ def kurtosis( axis, skipna, level, numeric_only, **kwargs ) - @annotate("DATAFRAME_SKEW", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -5543,17 +5263,17 @@ def skew( axis, skipna, level, numeric_only, **kwargs ) - @annotate("DATAFRAME_ALL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).all(axis, skipna, level, **kwargs) - @annotate("DATAFRAME_ANY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).any(axis, skipna, level, **kwargs) - @annotate("DATAFRAME_APPLY_CUPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _apply_cupy_method_axis_1(self, method, *args, **kwargs): # This method uses cupy to perform scans and reductions along rows of a # DataFrame. Since cuDF is designed around columnar storage and @@ -5651,10 +5371,10 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): return Series(result, index=self.index, dtype=result_dtype,) else: result_df = DataFrame(result).set_index(self.index) - result_df.columns = prepared.columns + result_df._set_column_names_like(prepared) return result_df - @annotate("DATAFRAME_COLUMNS_VIEW", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _columns_view(self, columns): """ Return a subset of the DataFrame's columns as a view. @@ -5663,7 +5383,7 @@ def _columns_view(self, columns): {col: self._data[col] for col in columns}, index=self.index ) - @annotate("DATAFRAME_SELECT_DTYPES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def select_dtypes(self, include=None, exclude=None): """Return a subset of the DataFrame’s columns based on the column dtypes. @@ -5850,7 +5570,7 @@ def to_orc(self, fname, compression=None, *args, **kwargs): orc.to_orc(self, fname, compression, *args, **kwargs) - @annotate("DATAFRAME_STACK", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def stack(self, level=-1, dropna=True): """Stack the prescribed level(s) from columns to index @@ -5912,7 +5632,7 @@ def stack(self, level=-1, dropna=True): else: return result - @annotate("DATAFRAME_COV", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def cov(self, **kwargs): """Compute the covariance matrix of a DataFrame. 
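As a usage sketch beside the `cov` and `corr` hunks; the frame is illustrative, and both results are square frames labelled by the original column names on both axes:

import cudf

df = cudf.DataFrame({"x": [1.0, 2.0, 3.0, 4.0], "y": [2.0, 4.0, 7.0, 8.0]})
df.cov()   # 2x2 covariance matrix
df.corr()  # 2x2 correlation matrix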
@@ -5926,19 +5646,21 @@ def cov(self, **kwargs): cov : DataFrame """ cov = cupy.cov(self.values, rowvar=False) - df = DataFrame(cupy.asfortranarray(cov)).set_index(self.columns) - df.columns = self.columns + cols = self._data.to_pandas_index() + df = DataFrame(cupy.asfortranarray(cov)).set_index(cols) + df._set_column_names_like(self) return df - @annotate("DATAFRAME_CORR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def corr(self): """Compute the correlation matrix of a DataFrame.""" corr = cupy.corrcoef(self.values, rowvar=False) - df = DataFrame(cupy.asfortranarray(corr)).set_index(self.columns) - df.columns = self.columns + cols = self._data.to_pandas_index() + df = DataFrame(cupy.asfortranarray(corr)).set_index(cols) + df._set_column_names_like(self) return df - @annotate("DATAFRAME_TO_STRUCT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_struct(self, name=None): """ Return a struct Series composed of the columns of the DataFrame. @@ -5971,7 +5693,7 @@ def to_struct(self, name=None): name=name, ) - @annotate("DATAFRAME_KEYS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def keys(self): """ Get the columns. @@ -6001,7 +5723,7 @@ def keys(self): >>> df.keys() Int64Index([0, 1, 2, 3], dtype='int64') """ - return self.columns + return self._data.to_pandas_index() def itertuples(self, index=True, name="Pandas"): raise TypeError( @@ -6019,7 +5741,7 @@ def iterrows(self): "if you wish to iterate over each row." ) - @annotate("DATAFRAME_APPEND", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def append( self, other, ignore_index=False, verify_integrity=False, sort=False ): @@ -6114,19 +5836,10 @@ def append( 3 3 4 4 """ - if verify_integrity not in (None, False): - raise NotImplementedError( - "verify_integrity parameter is not supported yet." 
- ) - if isinstance(other, dict): if not ignore_index: raise TypeError("Can only append a dict if ignore_index=True") other = DataFrame(other) - result = cudf.concat( - [self, other], ignore_index=ignore_index, sort=sort - ) - return result elif isinstance(other, Series): if other.name is None and not ignore_index: raise TypeError( @@ -6134,7 +5847,7 @@ def append( "or if the Series has a name" ) - current_cols = self.columns + current_cols = self._data.to_pandas_index() combined_columns = other.index.to_pandas() if len(current_cols): @@ -6157,22 +5870,21 @@ def append( other = other.reindex(combined_columns, copy=False).to_frame().T if not current_cols.equals(combined_columns): self = self.reindex(columns=combined_columns) - elif isinstance(other, list): - if not other: - pass - elif not isinstance(other[0], DataFrame): - other = DataFrame(other) - if (self.columns.get_indexer(other.columns) >= 0).all(): - other = other.reindex(columns=self.columns) - - if is_list_like(other): - to_concat = [self, *other] - else: - to_concat = [self, other] + elif ( + isinstance(other, list) + and other + and not isinstance(other[0], DataFrame) + ): + other = DataFrame(other) + cols = self._data.to_pandas_index() + if (cols.get_indexer(other._data.to_pandas_index()) >= 0).all(): + other = other.reindex(columns=cols) - return cudf.concat(to_concat, ignore_index=ignore_index, sort=sort) + return super(DataFrame, self)._append( + other, ignore_index, verify_integrity, sort + ) - @annotate("DATAFRAME_PIVOT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @copy_docstring(reshape.pivot) def pivot(self, index, columns, values=None): @@ -6180,14 +5892,14 @@ def pivot(self, index, columns, values=None): self, index=index, columns=columns, values=values ) - @annotate("DATAFRAME_UNSTACK", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @copy_docstring(reshape.unstack) def unstack(self, level=-1, fill_value=None): return cudf.core.reshape.unstack( self, level=level, fill_value=fill_value ) - @annotate("DATAFRAME_EXPLODE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def explode(self, column, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -6195,7 +5907,7 @@ def explode(self, column, ignore_index=False): Parameters ---------- - column : str or tuple + column : str Column to explode. ignore_index : bool, default False If True, the resulting index will be labeled 0, 1, …, n - 1. @@ -6230,11 +5942,6 @@ def explode(self, column, ignore_index=False): if column not in self._column_names: raise KeyError(column) - if not is_list_dtype(self._data[column].dtype): - data = self._data.copy(deep=True) - idx = None if ignore_index else self._index.copy(deep=True) - return self.__class__._from_data(data, index=idx) - return super()._explode(column, ignore_index) def pct_change( @@ -6313,7 +6020,45 @@ def nunique(self, axis=0, dropna=True): if axis != 0: raise NotImplementedError("axis parameter is not supported yet.") - return cudf.Series(super().nunique(method="sort", dropna=dropna)) + return cudf.Series(super().nunique(dropna=dropna)) + + def _sample_axis_1( + self, + n: int, + weights: Optional[ColumnLike], + replace: bool, + random_state: np.random.RandomState, + ignore_index: bool, + ): + if replace: + # Since cuDF does not support multiple columns with same name, + # sample with replace=True at axis 1 is unsupported. + raise NotImplementedError( + "Sample is not supported for axis 1/`columns` when" + "`replace=True`." 
+ ) + + sampled_column_labels = random_state.choice( + self._column_names, size=n, replace=False, p=weights + ) + + result = self._get_columns_by_label(sampled_column_labels) + if ignore_index: + result.reset_index(drop=True) + + return result + + def _from_columns_like_self( + self, + columns: List[ColumnBase], + column_names: Iterable[str], + index_names: Optional[List[str]] = None, + ) -> DataFrame: + result = super()._from_columns_like_self( + columns, column_names, index_names + ) + result._set_column_names_like(self) + return result def from_dataframe(df, allow_copy=False): @@ -6433,7 +6178,7 @@ def func(left, right, output): ) -@annotate("CUDF_FROM_PANDAS", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def from_pandas(obj, nan_as_null=None): """ Convert certain Pandas objects into the cudf equivalent. @@ -6554,7 +6299,7 @@ def from_pandas(obj, nan_as_null=None): ) -@annotate("CUDF_MERGE", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def merge(left, right, *args, **kwargs): return left.merge(right, *args, **kwargs) @@ -6587,10 +6332,10 @@ def _align_indices(lhs, rhs): df = df.sort_index() lhs_out = DataFrame(index=df.index) rhs_out = DataFrame(index=df.index) - common = set(lhs.columns) & set(rhs.columns) + common = set(lhs._column_names) & set(rhs._column_names) common_x = {f"{x}_x" for x in common} common_y = {f"{x}_y" for x in common} - for col in df.columns: + for col in df._column_names: if col in common_x: lhs_out[col[:-2]] = df[col] elif col in common_y: @@ -6607,7 +6352,7 @@ def _setitem_with_dataframe( input_df: DataFrame, replace_df: DataFrame, input_cols: Any = None, - mask: Optional[cudf.core.column.ColumnBase] = None, + mask: Optional[ColumnBase] = None, ignore_index: bool = False, ): """ @@ -6620,9 +6365,9 @@ def _setitem_with_dataframe( """ if input_cols is None: - input_cols = input_df.columns + input_cols = input_df._column_names - if len(input_cols) != len(replace_df.columns): + if len(input_cols) != len(replace_df._column_names): raise ValueError( "Number of Input Columns must be same replacement Dataframe" ) @@ -6634,8 +6379,8 @@ def _setitem_with_dataframe( ): replace_df = replace_df.reindex(input_df.index) - for col_1, col_2 in zip(input_cols, replace_df.columns): - if col_1 in input_df.columns: + for col_1, col_2 in zip(input_cols, replace_df._column_names): + if col_1 in input_df._column_names: if mask is not None: input_df._data[col_1][mask] = column.as_column( replace_df[col_2] @@ -6697,28 +6442,6 @@ def _get_union_of_series_names(series_list): return names_list -def _get_host_unique(array): - if isinstance( - array, (cudf.Series, cudf.Index, cudf.core.column.ColumnBase) - ): - return array.unique.to_pandas() - elif isinstance(array, (str, numbers.Number)): - return [array] - else: - return set(array) - - -def _drop_columns(df: DataFrame, columns: Iterable, errors: str): - for c in columns: - try: - df._drop_column(c) - except KeyError as e: - if errors == "ignore": - pass - else: - raise e - - # Create a dictionary of the common, non-null columns def _get_non_null_cols_and_dtypes(col_idxs, list_of_columns): # A mapping of {idx: np.dtype} diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 0f0cce283f3..8f00289afcb 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -537,7 +537,7 @@ def metadata(self): return {"cudf.index": self._df.index} def num_columns(self) -> int: - return len(self._df.columns) + return len(self._df._column_names) def 
num_rows(self) -> int: return len(self._df) @@ -546,7 +546,7 @@ def num_chunks(self) -> int: return 1 def column_names(self) -> Iterable[str]: - return self._df.columns.tolist() + return self._df._column_names def get_column(self, i: int) -> _CuDFColumn: return _CuDFColumn( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9a2841cb402..07cc3ea71cd 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2,7 +2,6 @@ from __future__ import annotations -import builtins import copy import pickle import warnings @@ -24,7 +23,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from nvtx import annotate import cudf from cudf import _lib as libcudf @@ -33,6 +31,7 @@ _is_non_decimal_numeric_dtype, is_decimal_dtype, is_dict_like, + is_dtype_equal, is_scalar, issubdtype, ) @@ -46,15 +45,60 @@ ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import Merge, MergeSemi +from cudf.core.mixins import BinaryOperand, Scannable from cudf.core.window import Rolling from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import find_common_type, is_column_like +from cudf.utils.dtypes import find_common_type +from cudf.utils.utils import _cudf_nvtx_annotate T = TypeVar("T", bound="Frame") -class Frame: +# Mapping from ufuncs to the corresponding binary operators. +_ufunc_binary_operations = { + # Arithmetic binary operations. + "add": "add", + "subtract": "sub", + "multiply": "mul", + "matmul": "matmul", + "divide": "truediv", + "true_divide": "truediv", + "floor_divide": "floordiv", + "power": "pow", + "float_power": "pow", + "remainder": "mod", + "mod": "mod", + "fmod": "mod", + # Bitwise binary operations. + "bitwise_and": "and", + "bitwise_or": "or", + "bitwise_xor": "xor", + # Comparison binary operators + "greater": "gt", + "greater_equal": "ge", + "less": "lt", + "less_equal": "le", + "not_equal": "ne", + "equal": "eq", +} + +# These operators need to be mapped to their inverses when performing a +# reflected ufunc operation because no reflected version of the operators +# themselves exist. When these operators are invoked directly (not via +# __array_ufunc__) Python takes care of calling the inverse operation. +_ops_without_reflection = { + "gt": "lt", + "ge": "le", + "lt": "gt", + "le": "ge", + # ne and eq are symmetric, so they are their own inverse op + "ne": "ne", + "eq": "eq", +} + + +class Frame(BinaryOperand, Scannable): """A collection of Column objects with an optional index. Parameters @@ -71,6 +115,23 @@ class Frame: _index: Optional[cudf.core.index.BaseIndex] _names: Optional[List] + _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS + + _VALID_SCANS = { + "cumsum", + "cumprod", + "cummin", + "cummax", + } + + # Necessary because the function names don't directly map to the docs. 
+ _SCAN_DOCSTRINGS = { + "cumsum": {"op_name": "cumulative sum"}, + "cumprod": {"op_name": "cumulative product"}, + "cummin": {"op_name": "cumulative min"}, + "cummax": {"op_name": "cumulative max"}, + } + def __init__(self, data=None, index=None): if data is None: data = {} @@ -123,7 +184,7 @@ def deserialize(cls, header, frames): return cls_deserialize._from_data(dict(zip(column_names, columns))) @classmethod - @annotate("FRAME_FROM_DATA", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _from_data( cls, data: MutableMapping, @@ -134,11 +195,11 @@ def _from_data( return obj @classmethod - @annotate("FRAME_FROM_COLUMNS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _from_columns( cls, columns: List[ColumnBase], - column_names: List[str], + column_names: abc.Iterable[str], index_names: Optional[List[str]] = None, ): """Construct a `Frame` object from a list of columns. @@ -165,13 +226,11 @@ def _from_columns( return cls._from_data(data, index) - @annotate( - "FRAME_FROM_COLUMNS_LIKE_SELF", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _from_columns_like_self( self, columns: List[ColumnBase], - column_names: List[str], + column_names: abc.Iterable[str], index_names: Optional[List[str]] = None, ): """Construct a `Frame` from a list of columns with metadata from self. @@ -365,7 +424,7 @@ def memory_usage(self, deep=False): def __len__(self): return self._num_rows - @annotate("FRAME_COPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def copy(self: T, deep: bool = True) -> T: """ Make a copy of this object's indices and data. @@ -451,7 +510,19 @@ def copy(self: T, deep: bool = True) -> T: return new_frame - @annotate("FRAME_EQUALS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate + def astype(self, dtype, copy=False, **kwargs): + result = {} + for col_name, col in self._data.items(): + dt = dtype.get(col_name, col.dtype) + if not is_dtype_equal(dt, col.dtype): + result[col_name] = col.astype(dt, copy=copy, **kwargs) + else: + result[col_name] = col.copy() if copy else col + + return result + + @_cudf_nvtx_annotate def equals(self, other, **kwargs): """ Test whether two objects contain the same elements. @@ -534,33 +605,7 @@ def equals(self, other, **kwargs): else: return self._index.equals(other._index) - @annotate("FRAME_EXPLODE", color="green", domain="cudf_python") - def _explode(self, explode_column: Any, ignore_index: bool): - """Helper function for `explode` in `Series` and `Dataframe`, explodes - a specified nested column. Other columns' corresponding rows are - duplicated. If ignore_index is set, the original index is not exploded - and will be replaced with a `RangeIndex`. 
- """ - explode_column_num = self._column_names.index(explode_column) - if not ignore_index and self._index is not None: - explode_column_num += self._index.nlevels - - res = self.__class__._from_data( # type: ignore - *libcudf.lists.explode_outer( - self, explode_column_num, ignore_index - ) - ) - - res._data.multiindex = self._data.multiindex - res._data._level_names = self._data._level_names - - if not ignore_index and self._index is not None: - res.index.names = self._index.names - return res - - @annotate( - "FRAME_GET_COLUMNS_BY_LABEL", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _get_columns_by_label(self, labels, downcast=False): """ Returns columns of the Frame specified by `labels` @@ -568,9 +613,7 @@ def _get_columns_by_label(self, labels, downcast=False): """ return self._data.select_by_label(labels) - @annotate( - "FRAME_GET_COLUMNS_BY_INDEX", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _get_columns_by_index(self, indices): """ Returns columns of the Frame specified by `labels` @@ -594,15 +637,6 @@ def _as_column(self): return self._data[None].copy(deep=False) - @annotate("FRAME_EMPTY_LIKE", color="green", domain="cudf_python") - def _empty_like(self, keep_index=True): - result = self.__class__._from_data( - *libcudf.copying.table_empty_like(self, keep_index) - ) - - result._copy_type_metadata(self, include_index=keep_index) - return result - @property def values(self): """ @@ -690,7 +724,7 @@ def get_column_values_na(col): # particular, we need to benchmark how much of the overhead is coming from # (potentially unavoidable) local copies in to_cupy and how much comes from # inefficiencies in the implementation. - @annotate("FRAME_TO_CUPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_cupy( self, dtype: Union[Dtype, None] = None, @@ -725,7 +759,7 @@ def to_cupy( na_value, ) - @annotate("FRAME_TO_NUMPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_numpy( self, dtype: Union[Dtype, None] = None, @@ -760,7 +794,7 @@ def to_numpy( (lambda col: col.values_host), np.empty, dtype, na_value ) - @annotate("FRAME_CLIP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def clip(self, lower=None, upper=None, inplace=False, axis=1): """ Trim values at input threshold(s). @@ -888,7 +922,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): return self._mimic_inplace(output, inplace=inplace) - @annotate("FRAME_WHERE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def where(self, cond, other=None, inplace=False): """ Replace values where the condition is False. @@ -947,7 +981,7 @@ def where(self, cond, other=None, inplace=False): frame=self, cond=cond, other=other, inplace=inplace ) - @annotate("FRAME_MASK", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def mask(self, cond, other=None, inplace=False): """ Replace values where the condition is True. @@ -1009,7 +1043,7 @@ def mask(self, cond, other=None, inplace=False): return self.where(cond=~cond, other=other, inplace=inplace) - @annotate("FRAME_PIPE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def pipe(self, func, *args, **kwargs): """ Apply ``func(self, *args, **kwargs)``. 
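A minimal sketch of `pipe`, whose docstring begins above; the helper function and the constant are illustrative assumptions:

import cudf

def add_constant(frame, k=0):
    return frame + k

df = cudf.DataFrame({"a": [1, 2, 3]})
df.pipe(add_constant, k=10)  # equivalent to add_constant(df, k=10)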
@@ -1057,7 +1091,7 @@ def pipe(self, func, *args, **kwargs): """ return cudf.core.common.pipe(self, func, *args, **kwargs) - @annotate("SCATTER_BY_MAP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def scatter_by_map( self, map_index, map_size=None, keep_index=True, **kwargs ): @@ -1140,7 +1174,7 @@ def scatter_by_map( return result - @annotate("FRAME_FILLNA", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1295,7 +1329,14 @@ def fillna( inplace=inplace, ) - @annotate("FRAME_DROPNA_COLUMNS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate + def _drop_column(self, name): + """Drop a column by *name*""" + if name not in self._data: + raise KeyError(f"column '{name}' does not exist") + del self._data[name] + + @_cudf_nvtx_annotate def _drop_na_columns(self, how="any", subset=None, thresh=None): """ Drop columns containing nulls @@ -1327,7 +1368,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): return self[out_cols] - @annotate("FRAME_INTERPOLATE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def interpolate( self, method="linear", @@ -1397,7 +1438,7 @@ def interpolate( else result._gather(perm_sort.argsort()) ) - @annotate("FRAME_QUANTILES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _quantiles( self, q, @@ -1430,7 +1471,7 @@ def _quantiles( result._copy_type_metadata(self) return result - @annotate("FRAME_RANK", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rank( self, axis=0, @@ -1507,7 +1548,7 @@ def rank( return self._from_data(data, index).astype(np.float64) - @annotate("FRAME_REPEAT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def repeat(self, repeats, axis=None): """Repeats elements consecutively. @@ -1597,7 +1638,7 @@ def repeat(self, repeats, axis=None): result._copy_type_metadata(self) return result - @annotate("FRAME_SHIFT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" axis = self._get_axis_from_axis_arg(axis) @@ -1613,201 +1654,8 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): zip(self._column_names, data_columns), self._index ) - @annotate("FRAME_SAMPLE", color="orange", domain="cudf_python") - def sample( - self, - n=None, - frac=None, - replace=False, - weights=None, - random_state=None, - axis=None, - keep_index=True, - ): - """Return a random sample of items from an axis of object. - - You can use random_state for reproducibility. - - Parameters - ---------- - n : int, optional - Number of items from axis to return. Cannot be used with frac. - Default = 1 if frac = None. - frac : float, optional - Fraction of axis items to return. Cannot be used with n. - replace : bool, default False - Allow or disallow sampling of the same row more than once. - replace == True is not yet supported for axis = 1/"columns" - weights : str or ndarray-like, optional - Only supported for axis=1/"columns" - random_state : int, numpy RandomState or None, default None - Seed for the random number generator (if int), or None. - If None, a random seed will be chosen. - if RandomState, seed will be extracted from current state. - axis : {0 or ‘index’, 1 or ‘columns’, None}, default None - Axis to sample. Accepts axis number or name. - Default is stat axis for given data type - (0 for Series and DataFrames). Series and Index doesn't - support axis=1. 
- - Returns - ------- - Series or DataFrame or Index - A new object of same type as caller containing n items - randomly sampled from the caller object. - - Examples - -------- - >>> import cudf as cudf - >>> df = cudf.DataFrame({"a":{1, 2, 3, 4, 5}}) - >>> df.sample(3) - a - 1 2 - 3 4 - 0 1 - - >>> sr = cudf.Series([1, 2, 3, 4, 5]) - >>> sr.sample(10, replace=True) - 1 4 - 3 1 - 2 4 - 0 5 - 0 1 - 4 5 - 4 1 - 0 2 - 0 3 - 3 2 - dtype: int64 - - >>> df = cudf.DataFrame( - ... {"a":[1, 2], "b":[2, 3], "c":[3, 4], "d":[4, 5]}) - >>> df.sample(2, axis=1) - a c - 0 1 3 - 1 2 4 - """ - - if frac is not None and frac > 1 and not replace: - raise ValueError( - "Replace has to be set to `True` " - "when upsampling the population `frac` > 1." - ) - elif frac is not None and n is not None: - raise ValueError( - "Please enter a value for `frac` OR `n`, not both" - ) - - if frac is None and n is None: - n = 1 - elif frac is not None: - if axis is None or axis == 0 or axis == "index": - n = int(round(self.shape[0] * frac)) - else: - n = int(round(self.shape[1] * frac)) - - if axis is None or axis == 0 or axis == "index": - if n > 0 and self.shape[0] == 0: - raise ValueError( - "Cannot take a sample larger than 0 when axis is empty" - ) - - if not replace and n > self.shape[0]: - raise ValueError( - "Cannot take a larger sample than population " - "when 'replace=False'" - ) - - if weights is not None: - raise NotImplementedError( - "weights is not yet supported for axis=0/index" - ) - - if random_state is None: - seed = np.random.randint( - np.iinfo(np.int64).max, dtype=np.int64 - ) - elif isinstance(random_state, np.random.mtrand.RandomState): - _, keys, pos, _, _ = random_state.get_state() - seed = 0 if pos >= len(keys) else pos - else: - seed = np.int64(random_state) - - result = self.__class__._from_data( - *libcudf.copying.sample( - self, - n=n, - replace=replace, - seed=seed, - keep_index=keep_index, - ) - ) - result._copy_type_metadata(self) - - return result - else: - if len(self.shape) != 2: - raise ValueError( - f"No axis named {axis} for " - f"object type {self.__class__}" - ) - - if replace: - raise NotImplementedError( - "Sample is not supported for " - f"axis {axis} when 'replace=True'" - ) - - if n > 0 and self.shape[1] == 0: - raise ValueError( - "Cannot take a sample larger than 0 when axis is empty" - ) - - columns = np.asarray(self._data.names) - if not replace and n > columns.size: - raise ValueError( - "Cannot take a larger sample " - "than population when 'replace=False'" - ) - - if weights is not None: - if is_column_like(weights): - weights = np.asarray(weights) - else: - raise ValueError( - "Strings can only be passed to weights " - "when sampling from rows on a DataFrame" - ) - - if columns.size != len(weights): - raise ValueError( - "Weights and axis to be sampled must be of same length" - ) - - total_weight = weights.sum() - if total_weight != 1: - if not isinstance(weights.dtype, float): - weights = weights.astype("float64") - weights = weights / total_weight - - np.random.seed(random_state) - gather_map = np.random.choice( - columns, size=n, replace=replace, p=weights - ) - - if isinstance(self, cudf.MultiIndex): - # TODO: Need to update this once MultiIndex is refactored, - # should be able to treat it similar to other Frame object - result = cudf.Index(self.to_frame(index=False)[gather_map]) - else: - result = self[gather_map] - if not keep_index: - result.index = None - - return result - @classmethod - @annotate("FRAME_FROM_ARROW", color="orange", domain="cudf_python") + 
@_cudf_nvtx_annotate def from_arrow(cls, data): """Convert from PyArrow Table to Frame @@ -1947,7 +1795,7 @@ def from_arrow(cls, data): return cls._from_data({name: result[name] for name in column_names}) - @annotate("FRAME_TO_ARROW", color="orange", domain="cudf_python") + @_cudf_nvtx_annotate def to_arrow(self): """ Convert to arrow Table @@ -1983,7 +1831,7 @@ def _positions_from_column_names(self, column_names): if name in set(column_names) ] - @annotate("FRAME_REPLACE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def replace( self, to_replace=None, @@ -2270,7 +2118,7 @@ def _copy_type_metadata( return self - @annotate("FRAME_ISNULL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def isnull(self): """ Identify missing values. @@ -2352,7 +2200,7 @@ def isnull(self): # Alias for isnull isna = isnull - @annotate("FRAME_NOTNULL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def notnull(self): """ Identify non-missing values. @@ -2434,7 +2282,7 @@ def notnull(self): # Alias for notnull notna = notnull - @annotate("FRAME_INTERLEAVE_COLUMNS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def interleave_columns(self): """ Interleave Series columns of a table into a single column. @@ -2474,7 +2322,7 @@ def interleave_columns(self): return result - @annotate("FRAME_TILE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def tile(self, count): """ Repeats the rows from `self` DataFrame `count` times to form a @@ -2504,7 +2352,7 @@ def tile(self, count): result._copy_type_metadata(self) return result - @annotate("FRAME_SEARCHSORTED", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def searchsorted( self, values, side="left", ascending=True, na_position="last" ): @@ -2589,7 +2437,7 @@ def searchsorted( else: return result - @annotate("FRAME_ARGSORT", color="yellow", domain="cudf_python") + @_cudf_nvtx_annotate def argsort( self, by=None, @@ -2692,7 +2540,7 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): return libcudf.sort.order_by(to_sort, ascending, na_position) - @annotate("FRAME_SIN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def sin(self): """ Get Trigonometric sine, element-wise. @@ -2752,9 +2600,14 @@ def sin(self): 0.8011526357338306, 0.8939966636005579], dtype='float64') """ + warnings.warn( + "sin is deprecated and will be removed. Use numpy.sin instead", + FutureWarning, + ) + return self._unaryop("sin") - @annotate("FRAME_COS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def cos(self): """ Get Trigonometric cosine, element-wise. @@ -2814,9 +2667,14 @@ def cos(self): -0.5984600690578581, -0.4480736161291701], dtype='float64') """ + warnings.warn( + "cos is deprecated and will be removed. Use numpy.cos instead", + FutureWarning, + ) + return self._unaryop("cos") - @annotate("FRAME_TAN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def tan(self): """ Get Trigonometric tangent, element-wise. @@ -2876,9 +2734,14 @@ def tan(self): -1.3386902103511544, -1.995200412208242], dtype='float64') """ + warnings.warn( + "tan is deprecated and will be removed. Use numpy.tan instead", + FutureWarning, + ) + return self._unaryop("tan") - @annotate("FRAME_ASIN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def asin(self): """ Get Trigonometric inverse sine, element-wise. 
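The FutureWarnings added to sin, cos and tan above (and to the remaining element-wise math methods below) point callers at the NumPy ufuncs, which dispatch back to cuDF through the __array_ufunc__ protocol introduced later in this diff. A small sketch of the migration the warnings suggest:

    import numpy as np
    import cudf

    ser = cudf.Series([0.0, 0.5, 1.0])
    np.sin(ser)   # preferred: dispatches via __array_ufunc__ and stays a cudf.Series
    ser.sin()     # still works for now, but emits a FutureWarning
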
@@ -2927,9 +2790,14 @@ def asin(self): 1.5707963267948966, 0.3046926540153975], dtype='float64') """ + warnings.warn( + "asin is deprecated and will be removed in the future", + FutureWarning, + ) + return self._unaryop("asin") - @annotate("FRAME_ACOS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def acos(self): """ Get Trigonometric inverse cosine, element-wise. @@ -2978,6 +2846,11 @@ def acos(self): 1.5707963267948966, 1.266103672779499], dtype='float64') """ + warnings.warn( + "acos is deprecated and will be removed. Use numpy.acos instead", + FutureWarning, + ) + result = self.copy(deep=False) for col in result._data: min_float_dtype = cudf.utils.dtypes.get_min_float_dtype( @@ -2988,7 +2861,7 @@ def acos(self): result = result.mask((result < 0) | (result > np.pi + 1)) return result - @annotate("FRAME_ATAN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def atan(self): """ Get Trigonometric inverse tangent, element-wise. @@ -3047,9 +2920,14 @@ def atan(self): 0.2914567944778671], dtype='float64') """ + warnings.warn( + "atan is deprecated and will be removed. Use numpy.atan instead", + FutureWarning, + ) + return self._unaryop("atan") - @annotate("FRAME_EXP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def exp(self): """ Get the exponential of all elements, element-wise. @@ -3110,9 +2988,14 @@ def exp(self): 2.718281828459045, 1.0, 1.3498588075760032], dtype='float64') """ + warnings.warn( + "exp is deprecated and will be removed. Use numpy.exp instead", + FutureWarning, + ) + return self._unaryop("exp") - @annotate("FRAME_LOG", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def log(self): """ Get the natural logarithm of all elements, element-wise. @@ -3172,9 +3055,14 @@ def log(self): Float64Index([2.302585092994046, 2.3978952727983707, 6.214608098422191], dtype='float64') """ + warnings.warn( + "log is deprecated and will be removed. Use numpy.log instead", + FutureWarning, + ) + return self._unaryop("log") - @annotate("FRAME_SQRT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def sqrt(self): """ Get the non-negative square-root of all elements, element-wise. @@ -3228,9 +3116,14 @@ def sqrt(self): >>> index.sqrt() Float64Index([nan, 10.0, 25.0], dtype='float64') """ + warnings.warn( + "sqrt is deprecated and will be removed. Use numpy.sqrt instead", + FutureWarning, + ) + return self._unaryop("sqrt") - @annotate("FRAME_ABS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def abs(self): """ Return a Series/DataFrame with absolute numeric value of each element. @@ -3257,7 +3150,7 @@ def abs(self): return self._unaryop("abs") # Rounding - @annotate("FRAME_CEIL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def ceil(self): """ Rounds each value upward to the smallest integral value not less @@ -3294,7 +3187,7 @@ def ceil(self): return self._unaryop("ceil") - @annotate("FRAME_FLOOR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def floor(self): """Rounds each value downward to the largest integral value not greater than the original. 
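Until call sites are migrated, the new FutureWarnings can be silenced locally with the standard library's warnings filters; this is plain Python, not a cuDF API:

    import warnings
    import cudf

    idx = cudf.Index([100.0, 200.0, 300.0])
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        result = idx.sqrt()  # no deprecation warning inside this block
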
@@ -3334,7 +3227,7 @@ def floor(self): return self._unaryop("floor") - @annotate("FRAME_SCALE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def scale(self): """ Scale values to [0, 1] in float64 @@ -3369,7 +3262,7 @@ def scale(self): scaled._index = self._index.copy(deep=False) return scaled - @annotate("FRAME_INTERNAL_MERGE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _merge( self, right, @@ -3413,7 +3306,7 @@ def _merge( suffixes=suffixes, ).perform_merge() - @annotate("FRAME_IS_SORTED", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _is_sorted(self, ascending=None, null_position=None): """ Returns a boolean indicating whether the data of the Frame are sorted @@ -3444,14 +3337,22 @@ def _is_sorted(self, ascending=None, null_position=None): self, ascending=ascending, null_position=null_position ) - @annotate("FRAME_SPLIT", color="green", domain="cudf_python") - def _split(self, splits, keep_index=True): - results = libcudf.copying.table_split( - self, splits, keep_index=keep_index - ) - return [self.__class__._from_data(*result) for result in results] + @_cudf_nvtx_annotate + def _split(self, splits): + """Split a frame with split points in ``splits``. Returns a list of + Frames of length `len(splits) + 1`. + """ + return [ + self._from_columns_like_self( + libcudf.copying.columns_split([*self._data.columns], splits)[ + split_idx + ], + self._column_names, + ) + for split_idx in range(len(splits) + 1) + ] - @annotate("FRAME_ENCODE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _encode(self): data, index, indices = libcudf.transform.table_encode(self) for name, col in data.items(): @@ -3459,7 +3360,7 @@ def _encode(self): keys = self.__class__._from_data(data, index) return keys, indices - @annotate("FRAME_UNARYOP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) return self.__class__._from_data( @@ -3467,13 +3368,7 @@ def _unaryop(self, op): ) def _binaryop( - self, - other: T, - fn: str, - fill_value: Any = None, - reflect: bool = False, - *args, - **kwargs, + self, other: T, op: str, fill_value: Any = None, *args, **kwargs, ) -> Frame: """Perform a binary operation between two frames. @@ -3481,25 +3376,23 @@ def _binaryop( ---------- other : Frame The second operand. - fn : str + op : str The operation to perform. fill_value : Any, default None The value to replace null values with. If ``None``, nulls are not filled before the operation. - reflect : bool, default False - If ``True``, swap the order of the operands. See - https://docs.python.org/3/reference/datamodel.html#object.__ror__ - for more information on when this is necessary. Returns ------- Frame A new instance containing the result of the operation. """ - raise NotImplementedError + raise NotImplementedError( + f"Binary operations are not supported for {self.__class__}" + ) @classmethod - @annotate("FRAME_COLWISE_BINOP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _colwise_binop( cls, operands: Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], @@ -3527,6 +3420,7 @@ def _colwise_binop( A dict of columns constructed from the result of performing the requested operation on the operands. """ + fn = fn[2:-2] # Now actually perform the binop on the columns in left and right. 
output = {} @@ -3658,7 +3552,85 @@ def _colwise_binop( return output - @annotate("FRAME_DOT", color="green", domain="cudf_python") + # For more detail on this function and how it should work, see + # https://numpy.org/doc/stable/reference/ufuncs.html + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # We don't currently support reduction, accumulation, etc. We also + # don't support any special kwargs or higher arity ufuncs than binary. + if method != "__call__" or kwargs or ufunc.nin > 2: + return NotImplemented + + fname = ufunc.__name__ + if fname in _ufunc_binary_operations: + reflect = self is not inputs[0] + other = inputs[0] if reflect else inputs[1] + + op = _ufunc_binary_operations[fname] + if reflect and op in _ops_without_reflection: + op = _ops_without_reflection[op] + reflect = False + op = f"__{'r' if reflect else ''}{op}__" + + # Float_power returns float irrespective of the input type. + if fname == "float_power": + return getattr(self, op)(other).astype(float) + return getattr(self, op)(other) + + # Special handling for various unary operations. + if fname == "negative": + return self * -1 + if fname == "positive": + return self.copy(deep=True) + if fname == "invert": + return ~self + if fname == "absolute": + return self.abs() + if fname == "fabs": + return self.abs().astype(np.float64) + + # None is a sentinel used by subclasses to trigger cupy dispatch. + return None + + def _apply_cupy_ufunc_to_operands( + self, ufunc, cupy_func, operands, **kwargs + ): + # Note: There are some operations that may be supported by libcudf but + # are not supported by pandas APIs. In particular, libcudf binary + # operations support logical and/or operations as well as + # trigonometric, but those operations are not defined on + # pd.Series/DataFrame. For now those operations will dispatch to cupy, + # but if ufuncs are ever a bottleneck we could add special handling to + # dispatch those (or any other) functions that we could implement + # without cupy. + + mask = None + data = [{} for _ in range(ufunc.nout)] + for name, (left, right, _, _) in operands.items(): + cupy_inputs = [] + for inp in (left, right) if ufunc.nin == 2 else (left,): + if isinstance(inp, ColumnBase) and inp.has_nulls(): + new_mask = as_column(inp.nullmask) + + # TODO: This is a hackish way to perform a bitwise and + # of bitmasks. Once we expose + # cudf::detail::bitwise_and, then we can use that + # instead. + mask = new_mask if mask is None else (mask & new_mask) + + # Arbitrarily fill with zeros. For ufuncs, we assume + # that the end result propagates nulls via a bitwise + # and, so these elements are irrelevant. + inp = inp.fillna(0) + cupy_inputs.append(cupy.asarray(inp)) + + cp_output = cupy_func(*cupy_inputs, **kwargs) + if ufunc.nout == 1: + cp_output = (cp_output,) + for i, out in enumerate(cp_output): + data[i][name] = as_column(out).set_mask(mask) + return data + + @_cudf_nvtx_annotate def dot(self, other, reflect=False): """ Get dot product of frame and other, (binary operator `dot`). @@ -3731,83 +3703,12 @@ def dot(self, other, reflect=False): return cudf.DataFrame(result) return result.item() - # Binary arithmetic operations. 
- def __add__(self, other): - return self._binaryop(other, "add") - - def __radd__(self, other): - return self._binaryop(other, "add", reflect=True) - - def __sub__(self, other): - return self._binaryop(other, "sub") - - def __rsub__(self, other): - return self._binaryop(other, "sub", reflect=True) - def __matmul__(self, other): return self.dot(other) def __rmatmul__(self, other): return self.dot(other, reflect=True) - def __mul__(self, other): - return self._binaryop(other, "mul") - - def __rmul__(self, other): - return self._binaryop(other, "mul", reflect=True) - - def __mod__(self, other): - return self._binaryop(other, "mod") - - def __rmod__(self, other): - return self._binaryop(other, "mod", reflect=True) - - def __pow__(self, other): - return self._binaryop(other, "pow") - - def __rpow__(self, other): - return self._binaryop(other, "pow", reflect=True) - - def __floordiv__(self, other): - return self._binaryop(other, "floordiv") - - def __rfloordiv__(self, other): - return self._binaryop(other, "floordiv", reflect=True) - - def __truediv__(self, other): - return self._binaryop(other, "truediv") - - def __rtruediv__(self, other): - return self._binaryop(other, "truediv", reflect=True) - - def __and__(self, other): - return self._binaryop(other, "and") - - def __or__(self, other): - return self._binaryop(other, "or") - - def __xor__(self, other): - return self._binaryop(other, "xor") - - # Binary rich comparison operations. - def __eq__(self, other): - return self._binaryop(other, "eq") - - def __ne__(self, other): - return self._binaryop(other, "ne") - - def __lt__(self, other): - return self._binaryop(other, "lt") - - def __le__(self, other): - return self._binaryop(other, "le") - - def __gt__(self, other): - return self._binaryop(other, "gt") - - def __ge__(self, other): - return self._binaryop(other, "ge") - # Unary logical operators def __neg__(self): return -1 * self @@ -3831,7 +3732,7 @@ def _reduce(self, *args, **kwargs): f"Reductions are not supported for objects of type {type(self)}." ) - @annotate("FRAME_MIN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def min( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): @@ -3877,7 +3778,7 @@ def min( **kwargs, ) - @annotate("FRAME_MAX", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def max( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): @@ -3923,7 +3824,7 @@ def max( **kwargs, ) - @annotate("FRAME_SUM", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def sum( self, axis=None, @@ -3982,7 +3883,7 @@ def sum( **kwargs, ) - @annotate("FRAME_PRODUCT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def product( self, axis=None, @@ -4047,7 +3948,7 @@ def product( # Alias for pandas compatibility. 
prod = product - @annotate("FRAME_MEAN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def mean( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4092,7 +3993,7 @@ def mean( **kwargs, ) - @annotate("FRAME_STD", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def std( self, axis=None, @@ -4149,7 +4050,7 @@ def std( **kwargs, ) - @annotate("FRAME_VAR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def var( self, axis=None, @@ -4205,7 +4106,7 @@ def var( **kwargs, ) - @annotate("FRAME_KURTOSIS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4274,7 +4175,7 @@ def kurt( **kwargs, ) - @annotate("FRAME_SKEW", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4332,7 +4233,7 @@ def skew( **kwargs, ) - @annotate("FRAME_ALL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def all(self, axis=0, skipna=True, level=None, **kwargs): """ Return whether all elements are True in DataFrame. @@ -4368,7 +4269,7 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): "all", axis=axis, skipna=skipna, level=level, **kwargs, ) - @annotate("FRAME_ANY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def any(self, axis=0, skipna=True, level=None, **kwargs): """ Return whether any elements is True in DataFrame. @@ -4404,7 +4305,7 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): "any", axis=axis, skipna=skipna, level=level, **kwargs, ) - @annotate("FRAME_SUM_OF_SQUARES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def sum_of_squares(self, dtype=None): """Return the sum of squares of values. @@ -4428,7 +4329,7 @@ def sum_of_squares(self, dtype=None): """ return self._reduce("sum_of_squares", dtype=dtype) - @annotate("FRAME_MEDIAN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def median( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4474,152 +4375,24 @@ def median( ) # Scans - @annotate("FRAME_SCAN", color="green", domain="cudf_python") - def _scan(self, op, axis=None, skipna=True, cast_to_int=False): - skipna = True if skipna is None else skipna - - results = {} - for name, col in self._data.items(): - if skipna: - try: - result_col = col.nans_to_nulls() - except AttributeError: - result_col = col - else: - if col.has_nulls(include_nan=True): - # Workaround as find_first_value doesn't seem to work - # incase of bools. - first_index = int( - col.isnull().astype("int8").find_first_value(1) - ) - result_col = col.copy() - result_col[first_index:] = None - else: - result_col = col - - if ( - cast_to_int - and not is_decimal_dtype(result_col.dtype) - and ( - np.issubdtype(result_col.dtype, np.integer) - or np.issubdtype(result_col.dtype, np.bool_) - ) - ): - # For reductions that accumulate a value (e.g. sum, not max) - # pandas returns an int64 dtype for all int or bool dtypes. - result_col = result_col.astype(np.int64) - results[name] = result_col._apply_scan_op(op) - # TODO: This will work for Index because it's passing self._index - # (which is None), but eventually we may want to remove that parameter - # for Index._from_data and simplify. 
- return self._from_data(results, index=self._index) - - @annotate("FRAME_CUMMIN", color="green", domain="cudf_python") - def cummin(self, axis=None, skipna=True, *args, **kwargs): + @_cudf_nvtx_annotate + def _scan(self, op, axis=None, skipna=True): """ - Return cumulative minimum of the Series or DataFrame. + Return {op_name} of the {cls}. Parameters ---------- - axis: {index (0), columns(1)} + axis: {{index (0), columns(1)}} Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. - Returns - ------- - Series or DataFrame - - Examples - -------- - **Series** - - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cummin() - 0 1 - 1 1 - 2 1 - 3 1 - 4 1 - - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.cummin() - a b - 0 1 7 - 1 1 7 - 2 1 7 - 3 1 7 - """ - return self._scan("min", axis=axis, skipna=skipna, *args, **kwargs) - - @annotate("FRAME_CUMMAX", color="green", domain="cudf_python") - def cummax(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative maximum of the Series or DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. Returns ------- - Series or DataFrame - - Examples - -------- - **Series** - - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cummax() - 0 1 - 1 5 - 2 5 - 3 5 - 4 5 - - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.cummax() - a b - 0 1 7 - 1 2 8 - 2 3 9 - 3 4 10 - """ - return self._scan("max", axis=axis, skipna=skipna, *args, **kwargs) - - @annotate("FRAME_CUMSUM", color="green", domain="cudf_python") - def cumsum(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative sum of the Series or DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - - Returns - ------- - Series or DataFrame + {cls} Examples -------- @@ -4637,7 +4410,7 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): **DataFrame** >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df = cudf.DataFrame({{'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}}) >>> s.cumsum() a b 0 1 7 @@ -4645,57 +4418,46 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): 2 6 24 3 10 34 """ - return self._scan( - "sum", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs - ) - - @annotate("FRAME_CUMPROD", color="green", domain="cudf_python") - def cumprod(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative product of the Series or DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. 
- - Returns - ------- - Series or DataFrame - - Examples - -------- - **Series** - - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cumprod() - 0 1 - 1 5 - 2 10 - 3 40 - 4 120 + cast_to_int = op in ("cumsum", "cumprod") + skipna = True if skipna is None else skipna - **DataFrame** + results = {} + for name, col in self._data.items(): + if skipna: + try: + result_col = col.nans_to_nulls() + except AttributeError: + result_col = col + else: + if col.has_nulls(include_nan=True): + # Workaround as find_first_value doesn't seem to work + # incase of bools. + first_index = int( + col.isnull().astype("int8").find_first_value(1) + ) + result_col = col.copy() + result_col[first_index:] = None + else: + result_col = col - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> s.cumprod() - a b - 0 1 7 - 1 2 56 - 2 6 504 - 3 24 5040 - """ - return self._scan( - "prod", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs - ) + if ( + cast_to_int + and not is_decimal_dtype(result_col.dtype) + and ( + np.issubdtype(result_col.dtype, np.integer) + or np.issubdtype(result_col.dtype, np.bool_) + ) + ): + # For reductions that accumulate a value (e.g. sum, not max) + # pandas returns an int64 dtype for all int or bool dtypes. + result_col = result_col.astype(np.int64) + results[name] = getattr(result_col, op)() + # TODO: This will work for Index because it's passing self._index + # (which is None), but eventually we may want to remove that parameter + # for Index._from_data and simplify. + return self._from_data(results, index=self._index) - @annotate("FRAME_TO_JSON", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @ioutils.doc_to_json() def to_json(self, path_or_buf=None, *args, **kwargs): """{docstring}""" @@ -4704,21 +4466,21 @@ def to_json(self, path_or_buf=None, *args, **kwargs): self, path_or_buf=path_or_buf, *args, **kwargs ) - @annotate("FRAME_TO_HDF", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @ioutils.doc_to_hdf() def to_hdf(self, path_or_buf, key, *args, **kwargs): """{docstring}""" cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) - @annotate("FRAME_TO_DLPACK", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @ioutils.doc_to_dlpack() def to_dlpack(self): """{docstring}""" return cudf.io.dlpack.to_dlpack(self) - @annotate("FRAME_TO_STRING", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_string(self): """ Convert to string @@ -4744,15 +4506,15 @@ def to_string(self): def __str__(self): return self.to_string() - @annotate("FRAME_DEEP_COPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __deepcopy__(self, memo): return self.copy(deep=True) - @annotate("FRAME_COPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __copy__(self): return self.copy(deep=False) - @annotate("FRAME_HEAD", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def head(self, n=5): """ Return the first `n` rows. 
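The consolidated _scan above keeps the dtype rule from the old per-method implementations: accumulating scans (cumsum, cumprod) upcast integer and boolean columns to int64 to match pandas, while min/max scans preserve the input dtype. For example:

    import cudf

    s = cudf.Series([1, 2, 3], dtype="int8")
    s.cumsum().dtype   # int64: cumsum/cumprod set cast_to_int
    s.cummax().dtype   # int8: min/max scans keep the original dtype
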
@@ -4836,7 +4598,7 @@ def head(self, n=5): """ return self.iloc[:n] - @annotate("FRAME_TAIL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def tail(self, n=5): """ Returns the last n rows as a new DataFrame or Series @@ -4868,7 +4630,7 @@ def tail(self, n=5): return self.iloc[-n:] - @annotate("FRAME_ROLLING", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @copy_docstring(Rolling) def rolling( self, window, min_periods=None, center=False, axis=0, win_type=None @@ -4882,7 +4644,7 @@ def rolling( win_type=win_type, ) - @annotate("FRAME_NANS_TO_NULLS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def nans_to_nulls(self): """ Convert nans (if any) to nulls @@ -4937,7 +4699,7 @@ def nans_to_nulls(self): result_data[name] = col.copy() return self._from_data(result_data, self._index) - @annotate("FRAME_INVERT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data( @@ -4948,7 +4710,7 @@ def __invert__(self): self._index, ) - @annotate("FRAME_ADD", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def add(self, other, axis, level=None, fill_value=None): """ Get Addition of dataframe or series and other, element-wise (binary @@ -5017,9 +4779,9 @@ def add(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "add", fill_value) + return self._binaryop(other, "__add__", fill_value) - @annotate("FRAME_RADD", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def radd(self, other, axis, level=None, fill_value=None): """ Get Addition of dataframe or series and other, element-wise (binary @@ -5097,9 +4859,9 @@ def radd(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "add", fill_value, reflect=True) + return self._binaryop(other, "__radd__", fill_value) - @annotate("FRAME_SUBTRACT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def subtract(self, other, axis, level=None, fill_value=None): """ Get Subtraction of dataframe or series and other, element-wise (binary @@ -5178,11 +4940,11 @@ def subtract(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "sub", fill_value) + return self._binaryop(other, "__sub__", fill_value) sub = subtract - @annotate("FRAME_RSUB", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rsub(self, other, axis, level=None, fill_value=None): """ Get Subtraction of dataframe or series and other, element-wise (binary @@ -5264,9 +5026,9 @@ def rsub(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "sub", fill_value, reflect=True) + return self._binaryop(other, "__rsub__", fill_value) - @annotate("FRAME_MULTIPLY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def multiply(self, other, axis, level=None, fill_value=None): """ Get Multiplication of dataframe or series and other, element-wise @@ -5347,11 +5109,11 @@ def multiply(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "mul", 
fill_value) + return self._binaryop(other, "__mul__", fill_value) mul = multiply - @annotate("FRAME_RMUL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rmul(self, other, axis, level=None, fill_value=None): """ Get Multiplication of dataframe or series and other, element-wise @@ -5434,9 +5196,9 @@ def rmul(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "mul", fill_value, reflect=True) + return self._binaryop(other, "__rmul__", fill_value) - @annotate("FRAME_MOD", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def mod(self, other, axis, level=None, fill_value=None): """ Get Modulo division of dataframe or series and other, element-wise @@ -5505,9 +5267,9 @@ def mod(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "mod", fill_value) + return self._binaryop(other, "__mod__", fill_value) - @annotate("FRAME_RMOD", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rmod(self, other, axis, level=None, fill_value=None): """ Get Modulo division of dataframe or series and other, element-wise @@ -5588,9 +5350,9 @@ def rmod(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "mod", fill_value, reflect=True) + return self._binaryop(other, "__rmod__", fill_value) - @annotate("FRAME_POW", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def pow(self, other, axis, level=None, fill_value=None): """ Get Exponential power of dataframe series and other, element-wise @@ -5668,9 +5430,9 @@ def pow(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "pow", fill_value) + return self._binaryop(other, "__pow__", fill_value) - @annotate("FRAME_RPOW", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rpow(self, other, axis, level=None, fill_value=None): """ Get Exponential power of dataframe or series and other, element-wise @@ -5748,9 +5510,9 @@ def rpow(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "pow", fill_value, reflect=True) + return self._binaryop(other, "__rpow__", fill_value) - @annotate("FRAME_FLOORDIV", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def floordiv(self, other, axis, level=None, fill_value=None): """ Get Integer division of dataframe or series and other, element-wise @@ -5828,9 +5590,9 @@ def floordiv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "floordiv", fill_value) + return self._binaryop(other, "__floordiv__", fill_value) - @annotate("FRAME_RFLOORDIV", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rfloordiv(self, other, axis, level=None, fill_value=None): """ Get Integer division of dataframe or series and other, element-wise @@ -5925,9 +5687,9 @@ def rfloordiv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "floordiv", fill_value, reflect=True) + return 
self._binaryop(other, "__rfloordiv__", fill_value) - @annotate("FRAME_TRUEDIV", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def truediv(self, other, axis, level=None, fill_value=None): """ Get Floating division of dataframe or series and other, element-wise @@ -6010,13 +5772,13 @@ def truediv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "truediv", fill_value) + return self._binaryop(other, "__truediv__", fill_value) # Alias for truediv div = truediv divide = truediv - @annotate("FRAME_RTRUEDIV", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rtruediv(self, other, axis, level=None, fill_value=None): """ Get Floating division of dataframe or series and other, element-wise @@ -6104,12 +5866,12 @@ def rtruediv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") - return self._binaryop(other, "truediv", fill_value, reflect=True) + return self._binaryop(other, "__rtruediv__", fill_value) # Alias for rtruediv rdiv = rtruediv - @annotate("FRAME_EQ", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def eq(self, other, axis="columns", level=None, fill_value=None): """Equal to, element-wise (binary operator eq). @@ -6182,10 +5944,10 @@ def eq(self, other, axis="columns", level=None, fill_value=None): dtype: bool """ return self._binaryop( - other=other, fn="eq", fill_value=fill_value, can_reindex=True + other=other, op="__eq__", fill_value=fill_value, can_reindex=True ) - @annotate("FRAME_NE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def ne(self, other, axis="columns", level=None, fill_value=None): """Not equal to, element-wise (binary operator ne). @@ -6258,10 +6020,10 @@ def ne(self, other, axis="columns", level=None, fill_value=None): dtype: bool """ # noqa: E501 return self._binaryop( - other=other, fn="ne", fill_value=fill_value, can_reindex=True + other=other, op="__ne__", fill_value=fill_value, can_reindex=True ) - @annotate("FRAME_LT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def lt(self, other, axis="columns", level=None, fill_value=None): """Less than, element-wise (binary operator lt). @@ -6334,10 +6096,10 @@ def lt(self, other, axis="columns", level=None, fill_value=None): dtype: bool """ # noqa: E501 return self._binaryop( - other=other, fn="lt", fill_value=fill_value, can_reindex=True + other=other, op="__lt__", fill_value=fill_value, can_reindex=True ) - @annotate("FRAME_LE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def le(self, other, axis="columns", level=None, fill_value=None): """Less than or equal, element-wise (binary operator le). @@ -6410,10 +6172,10 @@ def le(self, other, axis="columns", level=None, fill_value=None): dtype: bool """ # noqa: E501 return self._binaryop( - other=other, fn="le", fill_value=fill_value, can_reindex=True + other=other, op="__le__", fill_value=fill_value, can_reindex=True ) - @annotate("FRAME_GT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def gt(self, other, axis="columns", level=None, fill_value=None): """Greater than, element-wise (binary operator gt). 
@@ -6486,10 +6248,10 @@ def gt(self, other, axis="columns", level=None, fill_value=None): dtype: bool """ # noqa: E501 return self._binaryop( - other=other, fn="gt", fill_value=fill_value, can_reindex=True + other=other, op="__gt__", fill_value=fill_value, can_reindex=True ) - @annotate("FRAME_GE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def ge(self, other, axis="columns", level=None, fill_value=None): """Greater than or equal, element-wise (binary operator ge). @@ -6562,18 +6324,16 @@ def ge(self, other, axis="columns", level=None, fill_value=None): dtype: bool """ # noqa: E501 return self._binaryop( - other=other, fn="ge", fill_value=fill_value, can_reindex=True + other=other, op="__ge__", fill_value=fill_value, can_reindex=True ) - def nunique(self, method: builtins.str = "sort", dropna: bool = True): + def nunique(self, dropna: bool = True): """ Returns a per column mapping with counts of unique values for each column. Parameters ---------- - method : builtins.str, default "sort" - Method used by cpp_distinct_count dropna : bool, default True Don't include NaN in the counts. @@ -6583,16 +6343,12 @@ def nunique(self, method: builtins.str = "sort", dropna: bool = True): Name and unique value counts of each column in frame. """ return { - name: col.distinct_count(method=method, dropna=dropna) + name: col.distinct_count(dropna=dropna) for name, col in self._data.items() } -@annotate( - "FRAME_GET_REPLACEMENT_VALUES_FOR_COLUMNS", - color="green", - domain="cudf_python", -) +@_cudf_nvtx_annotate def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] ) -> Tuple[Dict[Any, bool], Dict[Any, Any], Dict[Any, Any]]: @@ -6757,7 +6513,7 @@ def _is_series(obj): return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None -@annotate("FRAME_DROP_ROWS_BY_LABELS", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _drop_rows_by_labels( obj: DataFrameOrSeries, labels: Union[ColumnLike, abc.Iterable, str], @@ -6822,7 +6578,7 @@ def _drop_rows_by_labels( return obj.__class__._from_data( join_res.iloc[:, idx_nlv:]._data, index=midx, - columns=obj.columns, + columns=obj._data.to_pandas_index(), ) else: @@ -6830,12 +6586,12 @@ def _drop_rows_by_labels( raise KeyError("One or more values not found in axis") key_df = cudf.DataFrame(index=labels) - if isinstance(obj, cudf.Series): + if isinstance(obj, cudf.DataFrame): + return obj.join(key_df, how="leftanti") + else: res = obj.to_frame(name="tmp").join(key_df, how="leftanti")["tmp"] res.name = obj.name return res - else: - return obj.join(key_df, how="leftanti") def _apply_inverse_column(col: ColumnBase) -> ColumnBase: diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b90f857ce84..264f0ea5df6 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -7,8 +7,6 @@ from functools import cached_property import numpy as np -import pandas as pd -from nvtx import annotate import cudf from cudf._lib import groupby as libgroupby @@ -17,8 +15,9 @@ from cudf.api.types import is_list_like from cudf.core.abc import Serializable from cudf.core.column.column import arange, as_column +from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex -from cudf.utils.utils import GetAttrGetItemMixin +from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate # The three functions below return the quantiles [25%, 50%, 75%] @@ -36,7 +35,36 @@ def 
_quantile_75(x): return x.quantile(0.75) -class GroupBy(Serializable): +class GroupBy(Serializable, Reducible, Scannable): + + _VALID_REDUCTIONS = { + "sum", + "prod", + "idxmin", + "idxmax", + "min", + "max", + "mean", + "median", + "nunique", + "first", + "last", + "var", + "std", + } + + _VALID_SCANS = { + "cumsum", + "cummin", + "cummax", + } + + # Necessary because the function names don't directly map to the docs. + _SCAN_DOCSTRINGS = { + "cumsum": {"op_name": "Cumulative sum"}, + "cummin": {"op_name": "Cumulative min"}, + "cummax": {"op_name": "Cumulative max"}, + } _MAX_GROUPS_BEFORE_WARN = 100 @@ -178,7 +206,7 @@ def cumcount(self): def _groupby(self): return libgroupby.GroupBy(self.grouping.keys, dropna=self._dropna) - @annotate("GROUPBY_AGG", domain="cudf_python") + @_cudf_nvtx_annotate def agg(self, func): """ Apply aggregation(s) to the groups. @@ -297,6 +325,50 @@ def agg(self, func): return result + def _reduce( + self, + op: str, + numeric_only: bool = False, + min_count: int = 0, + *args, + **kwargs, + ): + """Compute {op} of group values. + + Parameters + ---------- + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. + min_count : int, default 0 + The required number of valid values to perform the operation. If + fewer than ``min_count`` non-NA values are present the result will + be NA. + + Returns + ------- + Series or DataFrame + Computed {op} of values within each group. + + Notes + ----- + Difference from pandas: + * Not supporting: numeric_only, min_count + """ + if numeric_only: + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) + if min_count != 0: + raise NotImplementedError( + "min_count parameter is not implemented yet" + ) + return self.agg(op) + + def _scan(self, op: str, *args, **kwargs): + """{op_name} for each group.""" + return self.agg(op) + aggregate = agg def nth(self, n): @@ -813,38 +885,6 @@ def describe(self, include=None, exclude=None): ) return res - def sum(self): - """Compute the column-wise sum of the values in each group.""" - return self.agg("sum") - - def prod(self): - """Compute the column-wise product of the values in each group.""" - return self.agg("prod") - - def idxmin(self): - """Get the column-wise index of the minimum value in each group.""" - return self.agg("idxmin") - - def idxmax(self): - """Get the column-wise index of the maximum value in each group.""" - return self.agg("idxmax") - - def min(self): - """Get the column-wise minimum value in each group.""" - return self.agg("min") - - def max(self): - """Get the column-wise maximum value in each group.""" - return self.agg("max") - - def mean(self): - """Compute the column-wise mean of the values in each group.""" - return self.agg("mean") - - def median(self): - """Get the column-wise median of the values in each group.""" - return self.agg("median") - def corr(self, method="pearson", min_periods=1): """ Compute pairwise correlation of columns, excluding NA/null values. 
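The per-operation wrappers deleted above (sum, prod, idxmin, ..., last) are now supplied through the new Reducible mixin and _VALID_REDUCTIONS, with both the generated methods and _reduce ultimately calling self.agg(op), so existing call sites keep working. A quick sketch:

    import cudf

    df = cudf.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
    df.groupby("key").sum()       # still available, now provided by the mixin
    df.groupby("key").agg("sum")  # equivalent spelling
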
@@ -905,7 +945,7 @@ def corr(self, method="pearson", min_periods=1): # create expanded dataframe consisting all combinations of the # struct columns-pairs to be correlated # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) - _cols = self.grouping.values.columns.tolist() + _cols = self.grouping.values._data.to_pandas_index().tolist() len_cols = len(_cols) new_df_data = {} @@ -1048,7 +1088,7 @@ def cov(self, min_periods=0, ddof=1): # create expanded dataframe consisting all combinations of the # struct columns-pairs used in the covariance calculation # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) - column_names = self.grouping.values.columns.tolist() + column_names = self.grouping.values._column_names num_cols = len(column_names) column_pair_structs = {} @@ -1178,10 +1218,6 @@ def func(x): return self.agg(func) - def nunique(self): - """Compute the number of unique values in each column in each group.""" - return self.agg("nunique") - def collect(self): """Get a list of all the values for each column in each group.""" return self.agg("collect") @@ -1190,27 +1226,6 @@ def unique(self): """Get a list of the unique values for each column in each group.""" return self.agg("unique") - def cumsum(self): - """Compute the column-wise cumulative sum of the values in - each group.""" - return self.agg("cumsum") - - def cummin(self): - """Get the column-wise cumulative minimum value in each group.""" - return self.agg("cummin") - - def cummax(self): - """Get the column-wise cumulative maximum value in each group.""" - return self.agg("cummax") - - def first(self): - """Get the first non-null value in each group.""" - return self.agg("first") - - def last(self): - """Get the last non-null value in each group.""" - return self.agg("last") - def diff(self, periods=1, axis=0): """Get the difference between the values in each group. 
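Similarly, the cumulative methods removed above are now covered by the Scannable mixin and _VALID_SCANS, with _scan delegating to self.agg(op):

    import cudf

    df = cudf.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
    df.groupby("key").cumsum()       # per-group cumulative sum, as before
    df.groupby("key").agg("cummax")  # scans remain reachable through agg as well
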
@@ -1540,12 +1555,6 @@ def __getitem__(self, key): by=self.grouping.keys, dropna=self._dropna, sort=self._sort ) - def nunique(self): - """ - Return the number of unique values per group - """ - return self.agg("nunique") - class SeriesGroupBy(GroupBy): """ @@ -1616,11 +1625,8 @@ def agg(self, func): return result.iloc[:, 0] # drop the first level if we have a multiindex - if ( - isinstance(result.columns, pd.MultiIndex) - and result.columns.nlevels > 1 - ): - result.columns = result.columns.droplevel(0) + if result._data.nlevels > 1: + result.columns = result._data.to_pandas_index().droplevel(0) return result diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 5b60e8dbd1c..1c68289898f 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -22,7 +22,6 @@ import cupy import numpy as np import pandas as pd -from nvtx import annotate from pandas._config import get_option import cudf @@ -33,10 +32,11 @@ _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, is_categorical_dtype, + is_dtype_equal, is_interval_dtype, is_string_dtype, ) -from cudf.core._base_index import BaseIndex +from cudf.core._base_index import BaseIndex, _index_astype_docstring from cudf.core.column import ( CategoricalColumn, ColumnBase, @@ -52,10 +52,11 @@ from cudf.core.column.string import StringMethods as StringMethods from cudf.core.dtypes import IntervalDtype from cudf.core.frame import Frame +from cudf.core.mixins import BinaryOperand from cudf.core.single_column_frame import SingleColumnFrame -from cudf.utils.docutils import copy_docstring +from cudf.utils.docutils import copy_docstring, doc_apply from cudf.utils.dtypes import find_common_type -from cudf.utils.utils import search_range +from cudf.utils.utils import _cudf_nvtx_annotate, search_range T = TypeVar("T", bound="Frame") @@ -122,7 +123,7 @@ def _index_from_columns( return _index_from_data(dict(zip(range(len(columns)), columns)), name=name) -class RangeIndex(BaseIndex): +class RangeIndex(BaseIndex, BinaryOperand): """ Immutable Index implementing a monotonic integer range. @@ -155,8 +156,11 @@ class RangeIndex(BaseIndex): RangeIndex(start=1, stop=10, step=1, name='a') """ + _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS + _range: range + @_cudf_nvtx_annotate def __init__( self, start, stop=None, step=1, dtype=None, copy=False, name=None ): @@ -187,43 +191,50 @@ def _copy_type_metadata( # have an underlying column. return self - @property + @property # type: ignore + @_cudf_nvtx_annotate def name(self): """ Returns the name of the Index. """ return self._name - @name.setter + @name.setter # type: ignore + @_cudf_nvtx_annotate def name(self, value): self._name = value - @property + @property # type: ignore + @_cudf_nvtx_annotate def start(self): """ The value of the `start` parameter (0 if this was not supplied). """ return self._start - @property + @property # type: ignore + @_cudf_nvtx_annotate def stop(self): """ The value of the stop parameter. """ return self._stop - @property + @property # type: ignore + @_cudf_nvtx_annotate def step(self): """ The value of the step parameter. 
""" return self._step - @property + @property # type: ignore + @_cudf_nvtx_annotate def _num_rows(self): return len(self) @cached_property + @_cudf_nvtx_annotate def _values(self): if len(self) > 0: return column.arange( @@ -253,12 +264,14 @@ def is_categorical(self): def is_interval(self): return False - @property + @property # type: ignore + @_cudf_nvtx_annotate def _data(self): return cudf.core.column_accessor.ColumnAccessor( {self.name: self._values} ) + @_cudf_nvtx_annotate def __contains__(self, item): if not isinstance( item, tuple(np.sctypes["int"] + np.sctypes["float"] + [int, float]) @@ -268,6 +281,7 @@ def __contains__(self, item): return False return item in range(self._start, self._stop, self._step) + @_cudf_nvtx_annotate def copy(self, name=None, deep=False, dtype=None, names=None): """ Make a copy of this object. @@ -298,9 +312,18 @@ def copy(self, name=None, deep=False, dtype=None, names=None): start=self._start, stop=self._stop, step=self._step, name=name ) + @_cudf_nvtx_annotate + @doc_apply(_index_astype_docstring) + def astype(self, dtype, copy: bool = True): + if is_dtype_equal(dtype, np.int64): + return self + return self._as_int64().astype(dtype, copy=copy) + + @_cudf_nvtx_annotate def drop_duplicates(self, keep="first"): return self + @_cudf_nvtx_annotate def __repr__(self): return ( f"{self.__class__.__name__}(start={self._start}, stop={self._stop}" @@ -313,9 +336,11 @@ def __repr__(self): + ")" ) + @_cudf_nvtx_annotate def __len__(self): return len(range(self._start, self._stop, self._step)) + @_cudf_nvtx_annotate def __getitem__(self, index): len_self = len(self) if isinstance(index, slice): @@ -341,6 +366,7 @@ def __getitem__(self, index): return as_index(self._values[index], name=self.name) + @_cudf_nvtx_annotate def equals(self, other): if isinstance(other, RangeIndex): if (self._start, self._stop, self._step) == ( @@ -351,6 +377,7 @@ def equals(self, other): return True return Int64Index._from_data(self._data).equals(other) + @_cudf_nvtx_annotate def serialize(self): header = {} header["index_column"] = {} @@ -371,6 +398,7 @@ def serialize(self): return header, frames @classmethod + @_cudf_nvtx_annotate def deserialize(cls, header, frames): h = header["index_column"] name = pickle.loads(header["name"]) @@ -379,13 +407,15 @@ def deserialize(cls, header, frames): step = h.get("step", 1) return RangeIndex(start=start, stop=stop, step=step, name=name) - @property + @property # type: ignore + @_cudf_nvtx_annotate def dtype(self): """ `dtype` of the range of values in RangeIndex. """ return cudf.dtype(np.int64) + @_cudf_nvtx_annotate def find_label_range(self, first=None, last=None): """Find subrange in the ``RangeIndex``, marked by their positions, that starts greater or equal to ``first`` and ends less or equal to ``last`` @@ -425,6 +455,7 @@ def find_label_range(self, first=None, last=None): return begin, end + @_cudf_nvtx_annotate def to_pandas(self): return pd.RangeIndex( start=self._start, @@ -441,14 +472,17 @@ def is_unique(self): """ return True - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_monotonic_increasing(self): return self._step > 0 or len(self) <= 1 - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_monotonic_decreasing(self): return self._step < 0 or len(self) <= 1 + @_cudf_nvtx_annotate def get_slice_bound(self, label, side, kind=None): """ Calculate slice bound that corresponds to given label. 
@@ -483,6 +517,7 @@ def get_slice_bound(self, label, side, kind=None): pos = search_range(start, stop, label, step, side=side) return pos + @_cudf_nvtx_annotate def memory_usage(self, deep=False): if deep: warnings.warn( @@ -495,6 +530,7 @@ def unique(self): # RangeIndex always has unique values return self + @_cudf_nvtx_annotate def __mul__(self, other): # Multiplication by raw ints must return a RangeIndex to match pandas. if isinstance(other, cudf.Scalar) and other.dtype.kind in "iu": @@ -511,15 +547,24 @@ def __mul__(self, other): ) return self._as_int64().__mul__(other) + @_cudf_nvtx_annotate def __rmul__(self, other): # Multiplication is commutative. return self.__mul__(other) + @_cudf_nvtx_annotate def _as_int64(self): # Convert self to an Int64Index. This method is used to perform ops # that are not defined directly on RangeIndex. return Int64Index._from_data(self._data) + @_cudf_nvtx_annotate + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + return self._as_int64().__array_ufunc__( + ufunc, method, *inputs, **kwargs + ) + + @_cudf_nvtx_annotate def __getattr__(self, key): # For methods that are not defined for RangeIndex we attempt to operate # on the corresponding integer index if possible. @@ -530,6 +575,7 @@ def __getattr__(self, key): f"'{type(self)}' object has no attribute {key}" ) + @_cudf_nvtx_annotate def get_loc(self, key, method=None, tolerance=None): # Given an actual integer, idx = (key - self._start) / self._step @@ -563,6 +609,7 @@ def get_loc(self, key, method=None, tolerance=None): raise KeyError(key) return np.clip(round_method(idx), 0, idx_int_upper_bound, dtype=int) + @_cudf_nvtx_annotate def _union(self, other, sort=None): if isinstance(other, RangeIndex): # Variable suffixes are of the @@ -637,6 +684,7 @@ def _union(self, other, sort=None): # then perform `union`. return Int64Index(self._values)._union(other, sort=sort) + @_cudf_nvtx_annotate def _intersection(self, other, sort=False): if not isinstance(other, RangeIndex): return super()._intersection(other, sort=sort) @@ -682,54 +730,34 @@ def _intersection(self, other, sort=False): return new_index + @_cudf_nvtx_annotate def _gather(self, gather_map, nullify=False, check_bounds=True): gather_map = cudf.core.column.as_column(gather_map) return Int64Index._from_columns( [self._values.take(gather_map, nullify, check_bounds)], [self.name] ) + @_cudf_nvtx_annotate def _apply_boolean_mask(self, boolean_mask): return Int64Index._from_columns( [self._values.apply_boolean_mask(boolean_mask)], [self.name] ) + def _split(self, splits): + return Int64Index._from_columns( + [self._values.columns_split(splits)], [self.name] + ) -# Patch in all binops and unary ops, which bypass __getattr__ on the instance -# and prevent the above overload from working. -for binop in ( - "__add__", - "__radd__", - "__sub__", - "__rsub__", - "__mod__", - "__rmod__", - "__pow__", - "__rpow__", - "__floordiv__", - "__rfloordiv__", - "__truediv__", - "__rtruediv__", - "__and__", - "__or__", - "__xor__", - "__eq__", - "__ne__", - "__lt__", - "__le__", - "__gt__", - "__ge__", -): - setattr( - RangeIndex, - binop, - lambda self, other, op=binop: getattr(self._as_int64(), op)(other), - ) + def _binaryop(self, other, op: str): + return self._as_int64()._binaryop(other, op=op) +# Patch in all binops and unary ops, which bypass __getattr__ on the instance +# and prevent the above overload from working. 
for unaop in ("__neg__", "__pos__", "__abs__"): setattr( RangeIndex, - binop, + unaop, lambda self, op=unaop: getattr(self._as_int64(), op)(), ) @@ -753,6 +781,7 @@ class GenericIndex(SingleColumnFrame, BaseIndex): Column's, the data Column will be cloned to adopt this name. """ + @_cudf_nvtx_annotate def __init__(self, data, **kwargs): kwargs = _setdefault_name(data, **kwargs) @@ -773,23 +802,62 @@ def __init__(self, data, **kwargs): name = kwargs.get("name") super().__init__({name: data}) + @_cudf_nvtx_annotate + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + ret = super().__array_ufunc__(ufunc, method, *inputs, **kwargs) + + if ret is not None: + return ret + + # Attempt to dispatch all other functions to cupy. + cupy_func = getattr(cupy, ufunc.__name__) + if cupy_func: + if ufunc.nin == 2: + other = inputs[self is inputs[0]] + inputs = self._make_operands_for_binop(other) + else: + inputs = { + name: (col, None, False, None) + for name, col in self._data.items() + } + + data = self._apply_cupy_ufunc_to_operands( + ufunc, cupy_func, inputs, **kwargs + ) + + out = [_index_from_data(out) for out in data] + + # pandas returns numpy arrays when the outputs are boolean. + for i, o in enumerate(out): + # We explicitly _do not_ use isinstance here: we want only + # boolean GenericIndexes, not dtype-specific subclasses. + if type(o) is GenericIndex and o.dtype.kind == "b": + out[i] = o.values + + return out[0] if ufunc.nout == 1 else tuple(out) + + return NotImplemented + + @_cudf_nvtx_annotate def _binaryop( - self, - other: T, - fn: str, - fill_value: Any = None, - reflect: bool = False, - *args, - **kwargs, + self, other: T, op: str, fill_value: Any = None, *args, **kwargs, ) -> SingleColumnFrame: - # Specialize binops to generate the appropriate output index type. + reflect = self._is_reflected_op(op) + if reflect: + op = op[:2] + op[3:] operands = self._make_operands_for_binop(other, fill_value, reflect) - return ( - _index_from_data(data=self._colwise_binop(operands, fn),) - if operands is not NotImplemented - else NotImplemented - ) - + if operands is NotImplemented: + return NotImplemented + ret = _index_from_data(self._colwise_binop(operands, op)) + + # pandas returns numpy arrays when the outputs are boolean. We + # explicitly _do not_ use isinstance here: we want only boolean + # GenericIndexes, not dtype-specific subclasses. + if type(ret) is GenericIndex and ret.dtype.kind == "b": + return ret.values + return ret + + @_cudf_nvtx_annotate def _copy_type_metadata( self, other: Frame, include_index: bool = True ) -> GenericIndex: @@ -806,11 +874,13 @@ def _copy_type_metadata( ) return self - @property + @property # type: ignore + @_cudf_nvtx_annotate def _values(self): return self._column @classmethod + @_cudf_nvtx_annotate def _concat(cls, objs): if all(isinstance(obj, RangeIndex) for obj in objs): result = _concat_range_index(objs) @@ -827,10 +897,11 @@ def _concat(cls, objs): result.name = name return result + @_cudf_nvtx_annotate def memory_usage(self, deep=False): return sum(super().memory_usage(deep=deep).values()) - @annotate("INDEX_EQUALS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def equals(self, other, **kwargs): """ Determine if two Index objects contain the same elements. @@ -860,6 +931,7 @@ def equals(self, other, **kwargs): except TypeError: return False + @_cudf_nvtx_annotate def copy(self, name=None, deep=False, dtype=None, names=None): """ Make a copy of this object. 
@@ -887,6 +959,12 @@ def copy(self, name=None, deep=False, dtype=None, names=None): col = self._values.astype(dtype) return _index_from_data({name: col.copy(True) if deep else col}) + @_cudf_nvtx_annotate + @doc_apply(_index_astype_docstring) + def astype(self, dtype, copy: bool = True): + return _index_from_data(super().astype({self.name: dtype}, copy)) + + @_cudf_nvtx_annotate def get_loc(self, key, method=None, tolerance=None): """Get integer location, slice or boolean mask for requested label. @@ -1005,6 +1083,7 @@ def get_loc(self, key, method=None, tolerance=None): mask[true_inds] = True return mask + @_cudf_nvtx_annotate def __repr__(self): max_seq_items = get_option("max_seq_items") or len(self) mr = 0 @@ -1081,6 +1160,7 @@ def __repr__(self): return "\n".join(lines) + @_cudf_nvtx_annotate def __getitem__(self, index): if type(self) == IntervalIndex: raise NotImplementedError( @@ -1092,13 +1172,15 @@ def __getitem__(self, index): res.name = self.name return res - @property + @property # type: ignore + @_cudf_nvtx_annotate def dtype(self): """ `dtype` of the underlying values in GenericIndex. """ return self._values.dtype + @_cudf_nvtx_annotate def find_label_range(self, first, last): """Find range that starts with *first* and ends with *last*, inclusively. @@ -1118,6 +1200,7 @@ def find_label_range(self, first, last): end += 1 return begin, end + @_cudf_nvtx_annotate def get_slice_bound(self, label, side, kind=None): return self._values.get_slice_bound(label, side, kind) @@ -1142,6 +1225,7 @@ def is_categorical(self): def is_interval(self): return False + @_cudf_nvtx_annotate def argsort( self, axis=0, @@ -1203,6 +1287,7 @@ class NumericIndex(GenericIndex): # Subclasses must define the dtype they are associated with. _dtype: Union[None, Type[np.number]] = None + @_cudf_nvtx_annotate def __init__(self, data=None, dtype=None, copy=False, name=None): dtype = type(self)._dtype @@ -1540,6 +1625,7 @@ class DatetimeIndex(GenericIndex): dtype='datetime64[ns]', name='a') """ + @_cudf_nvtx_annotate def __init__( self, data=None, @@ -1594,7 +1680,8 @@ def __init__( data = column.as_column(np.array(data, dtype=dtype)) super().__init__(data, **kwargs) - @property + @property # type: ignore + @_cudf_nvtx_annotate def year(self): """ The year of the datetime. @@ -1612,7 +1699,8 @@ def year(self): """ # noqa: E501 return self._get_dt_field("year") - @property + @property # type: ignore + @_cudf_nvtx_annotate def month(self): """ The month as January=1, December=12. @@ -1630,7 +1718,8 @@ def month(self): """ # noqa: E501 return self._get_dt_field("month") - @property + @property # type: ignore + @_cudf_nvtx_annotate def day(self): """ The day of the datetime. @@ -1648,7 +1737,8 @@ def day(self): """ # noqa: E501 return self._get_dt_field("day") - @property + @property # type: ignore + @_cudf_nvtx_annotate def hour(self): """ The hours of the datetime. @@ -1668,7 +1758,8 @@ def hour(self): """ return self._get_dt_field("hour") - @property + @property # type: ignore + @_cudf_nvtx_annotate def minute(self): """ The minutes of the datetime. @@ -1688,7 +1779,8 @@ def minute(self): """ return self._get_dt_field("minute") - @property + @property # type: ignore + @_cudf_nvtx_annotate def second(self): """ The seconds of the datetime. @@ -1708,7 +1800,8 @@ def second(self): """ return self._get_dt_field("second") - @property + @property # type: ignore + @_cudf_nvtx_annotate def weekday(self): """ The day of the week with Monday=0, Sunday=6. 
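For orientation, a short usage sketch of the two `astype` overrides introduced in this file: the `RangeIndex` one earlier in the diff short-circuits when the target dtype is already int64, while the `GenericIndex` one above goes through `super().astype` and rewraps the result with `_index_from_data`. This assumes a cudf build containing this patch:

```python
import cudf

ri = cudf.RangeIndex(0, 5)

# Already int64: RangeIndex.astype returns self and keeps the lazy range.
print(ri.astype("int64"))

# Any other dtype materializes via _as_int64() and lands in the
# GenericIndex.astype path shown above.
print(ri.astype("float64"))
```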
@@ -1729,7 +1822,8 @@ def weekday(self): """ return self._get_dt_field("weekday") - @property + @property # type: ignore + @_cudf_nvtx_annotate def dayofweek(self): """ The day of the week with Monday=0, Sunday=6. @@ -1750,7 +1844,8 @@ def dayofweek(self): """ return self._get_dt_field("weekday") - @property + @property # type: ignore + @_cudf_nvtx_annotate def dayofyear(self): """ The day of the year, from 1-365 in non-leap years and @@ -1772,7 +1867,8 @@ def dayofyear(self): """ return self._get_dt_field("day_of_year") - @property + @property # type: ignore + @_cudf_nvtx_annotate def day_of_year(self): """ The day of the year, from 1-365 in non-leap years and @@ -1794,7 +1890,8 @@ def day_of_year(self): """ return self._get_dt_field("day_of_year") - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_leap_year(self): """ Boolean indicator if the date belongs to a leap year. @@ -1812,7 +1909,8 @@ def is_leap_year(self): res = is_leap_year(self._values).fillna(False) return cupy.asarray(res) - @property + @property # type: ignore + @_cudf_nvtx_annotate def quarter(self): """ Integer indicator for which quarter of the year the date belongs in. @@ -1837,6 +1935,7 @@ def quarter(self): res = extract_quarter(self._values) return Int8Index(res, dtype="int8") + @_cudf_nvtx_annotate def isocalendar(self): """ Returns a DataFrame with the year, week, and day @@ -1858,10 +1957,12 @@ def isocalendar(self): """ return cudf.core.tools.datetimes._to_iso_calendar(self) + @_cudf_nvtx_annotate def to_pandas(self): nanos = self._values.astype("datetime64[ns]") return pd.DatetimeIndex(nanos.to_pandas(), name=self.name) + @_cudf_nvtx_annotate def _get_dt_field(self, field): out_column = self._values.get_dt_field(field) # column.column_empty_like always returns a Column object @@ -1878,6 +1979,7 @@ def _get_dt_field(self, field): def is_boolean(self): return False + @_cudf_nvtx_annotate def ceil(self, freq): """ Perform ceil operation on the data to the specified freq. @@ -1910,6 +2012,7 @@ def ceil(self, freq): return self.__class__._from_data({self.name: out_column}) + @_cudf_nvtx_annotate def floor(self, freq): """ Perform floor operation on the data to the specified freq. @@ -1942,6 +2045,7 @@ def floor(self, freq): return self.__class__._from_data({self.name: out_column}) + @_cudf_nvtx_annotate def round(self, freq): """ Perform round operation on the data to the specified freq. @@ -2024,6 +2128,7 @@ class TimedeltaIndex(GenericIndex): dtype='timedelta64[s]', name='delta-index') """ + @_cudf_nvtx_annotate def __init__( self, data=None, @@ -2055,6 +2160,7 @@ def __init__( data = column.as_column(np.array(data, dtype=dtype)) super().__init__(data, **kwargs) + @_cudf_nvtx_annotate def to_pandas(self): return pd.TimedeltaIndex( self._values.to_pandas(), @@ -2062,28 +2168,32 @@ def to_pandas(self): unit=self._values.time_unit, ) - @property + @property # type: ignore + @_cudf_nvtx_annotate def days(self): """ Number of days for each element. """ return as_index(arbitrary=self._values.days, name=self.name) - @property + @property # type: ignore + @_cudf_nvtx_annotate def seconds(self): """ Number of seconds (>= 0 and less than 1 day) for each element. """ return as_index(arbitrary=self._values.seconds, name=self.name) - @property + @property # type: ignore + @_cudf_nvtx_annotate def microseconds(self): """ Number of microseconds (>= 0 and less than 1 second) for each element. 
""" return as_index(arbitrary=self._values.microseconds, name=self.name) - @property + @property # type: ignore + @_cudf_nvtx_annotate def nanoseconds(self): """ Number of nanoseconds (>= 0 and less than 1 microsecond) for each @@ -2091,7 +2201,8 @@ def nanoseconds(self): """ return as_index(arbitrary=self._values.nanoseconds, name=self.name) - @property + @property # type: ignore + @_cudf_nvtx_annotate def components(self): """ Return a dataframe of the components (days, hours, minutes, @@ -2157,6 +2268,7 @@ class CategoricalIndex(GenericIndex): CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, dtype='category', name='a') """ # noqa: E501 + @_cudf_nvtx_annotate def __init__( self, data=None, @@ -2211,14 +2323,16 @@ def __init__( super().__init__(data, **kwargs) - @property + @property # type: ignore + @_cudf_nvtx_annotate def codes(self): """ The category codes of this categorical. """ return as_index(self._values.codes) - @property + @property # type: ignore + @_cudf_nvtx_annotate def categories(self): """ The categories of this categorical. @@ -2232,6 +2346,7 @@ def is_categorical(self): return True +@_cudf_nvtx_annotate def interval_range( start=None, end=None, periods=None, freq=None, name=None, closed="right", ) -> "IntervalIndex": @@ -2394,6 +2509,7 @@ class IntervalIndex(GenericIndex): IntervalIndex """ + @_cudf_nvtx_annotate def __init__( self, data, closed=None, dtype=None, copy=False, name=None, ): @@ -2418,6 +2534,7 @@ def __init__( self.closed = closed super().__init__(data, **kwargs) + @_cudf_nvtx_annotate def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): """ Construct an IntervalIndex from an array of splits. @@ -2474,6 +2591,7 @@ class StringIndex(GenericIndex): name: A string """ + @_cudf_nvtx_annotate def __init__(self, values, copy=False, **kwargs): kwargs = _setdefault_name(values, **kwargs) if isinstance(values, StringColumn): @@ -2489,11 +2607,13 @@ def __init__(self, values, copy=False, **kwargs): super().__init__(values, **kwargs) + @_cudf_nvtx_annotate def to_pandas(self): return pd.Index( self.to_numpy(na_value=None), name=self.name, dtype="object" ) + @_cudf_nvtx_annotate def __repr__(self): return ( f"{self.__class__.__name__}({self._values.values_host}," @@ -2508,6 +2628,7 @@ def __repr__(self): @copy_docstring(StringMethods) # type: ignore @property + @_cudf_nvtx_annotate def str(self): return StringMethods(parent=self) @@ -2528,6 +2649,7 @@ def is_object(self): return True +@_cudf_nvtx_annotate def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: """Create an Index from an arbitrary object @@ -2656,6 +2778,7 @@ class Index(BaseIndex, metaclass=IndexMeta): names=['a', 'b']) """ + @_cudf_nvtx_annotate def __new__( cls, data=None, @@ -2684,6 +2807,7 @@ def __new__( ) @classmethod + @_cudf_nvtx_annotate def from_arrow(cls, obj): try: return cls(ColumnBase.from_arrow(obj)) @@ -2692,6 +2816,7 @@ def from_arrow(cls, obj): return cudf.MultiIndex.from_arrow(obj) +@_cudf_nvtx_annotate def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: """ An internal Utility function to concat RangeIndex objects. 
@@ -2732,6 +2857,7 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: return RangeIndex(start, stop, step) +@_cudf_nvtx_annotate def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]: """ Extended Euclidean algorithms to solve Bezout's identity: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index bc7337d0a42..3fa951241f7 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3,17 +3,17 @@ from __future__ import annotations +import numbers import operator import warnings from collections import Counter, abc from functools import cached_property -from typing import Callable, Type, TypeVar +from typing import Any, Callable, Dict, Optional, Tuple, Type, TypeVar, Union from uuid import uuid4 import cupy as cp import numpy as np import pandas as pd -from nvtx import annotate import cudf import cudf._lib as libcudf @@ -23,13 +23,16 @@ is_bool_dtype, is_categorical_dtype, is_integer_dtype, + is_list_dtype, is_list_like, ) +from cudf.core.column import ColumnBase from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame +from cudf.core.frame import Frame, _drop_rows_by_labels from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame +from cudf.utils.utils import _cudf_nvtx_annotate doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. @@ -57,6 +60,26 @@ """ +def _get_host_unique(array): + if isinstance(array, (cudf.Series, cudf.Index, ColumnBase)): + return array.unique.to_pandas() + elif isinstance(array, (str, numbers.Number)): + return [array] + else: + return set(array) + + +def _drop_columns(f: Frame, columns: abc.Iterable, errors: str): + for c in columns: + try: + f._drop_column(c) + except KeyError as e: + if errors == "ignore": + pass + else: + raise e + + def _indices_from_labels(obj, labels): if not isinstance(labels, cudf.MultiIndex): @@ -317,7 +340,7 @@ def iloc(self): """ return self._iloc_indexer_type(self) - @annotate("SORT_INDEX", color="red", domain="cudf_python") + @_cudf_nvtx_annotate def sort_index( self, axis=0, @@ -444,12 +467,11 @@ def sort_index( out = self._gather(inds) # TODO: frame factory function should handle multilevel column # names - if isinstance( - self, cudf.core.dataframe.DataFrame - ) and isinstance( - self.columns, pd.core.indexes.multi.MultiIndex + if ( + isinstance(self, cudf.core.dataframe.DataFrame) + and self._data.multiindex ): - out.columns = self.columns + out._set_column_names_like(self) elif (ascending and idx.is_monotonic_increasing) or ( not ascending and idx.is_monotonic_decreasing ): @@ -459,12 +481,11 @@ def sort_index( ascending=ascending, na_position=na_position ) out = self._gather(inds) - if isinstance( - self, cudf.core.dataframe.DataFrame - ) and isinstance( - self.columns, pd.core.indexes.multi.MultiIndex + if ( + isinstance(self, cudf.core.dataframe.DataFrame) + and self._data.multiindex ): - out.columns = self.columns + out._set_column_names_like(self) else: labels = sorted(self._data.names, reverse=not ascending) out = self[labels] @@ -701,6 +722,37 @@ def drop_duplicates( self._index.names if not ignore_index else None, ) + @_cudf_nvtx_annotate + def _empty_like(self, keep_index=True): + return self._from_columns_like_self( + libcudf.copying.columns_empty_like( + [ + *(self._index._data.columns if keep_index else ()), + *self._columns, + ] 
+ ), + self._column_names, + self._index.names if keep_index else None, + ) + + def _split(self, splits, keep_index=True): + columns_split = libcudf.copying.columns_split( + [ + *(self._index._data.columns if keep_index else []), + *self._columns, + ], + splits, + ) + + return [ + self._from_columns_like_self( + columns_split[i], + self._column_names, + self._index.names if keep_index else None, + ) + for i in range(len(splits) + 1) + ] + def add_prefix(self, prefix): """ Prefix labels with string `prefix`. @@ -819,7 +871,7 @@ def add_suffix(self, suffix): Use `Series.add_suffix` or `DataFrame.add_suffix`" ) - @annotate("APPLY", color="purple", domain="cudf_python") + @_cudf_nvtx_annotate def _apply(self, func, kernel_getter, *args, **kwargs): """Apply `func` across the rows of the frame.""" if kwargs: @@ -938,10 +990,11 @@ def sort_values( ), keep_index=not ignore_index, ) - if isinstance(self, cudf.core.dataframe.DataFrame) and isinstance( - self.columns, pd.core.indexes.multi.MultiIndex + if ( + isinstance(self, cudf.core.dataframe.DataFrame) + and self._data.multiindex ): - out.columns = self.columns + out.columns = self._data.to_pandas_index() return out def _n_largest_or_smallest(self, largest, n, columns, keep): @@ -1695,154 +1748,682 @@ def last(self, offset): slice_func=lambda i: self.iloc[i:], ) - # For more detail on this function and how it should work, see - # https://numpy.org/doc/stable/reference/ufuncs.html - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - # We don't currently support reduction, accumulation, etc. We also - # don't support any special kwargs or higher arity ufuncs than binary. - if method != "__call__" or kwargs or ufunc.nin > 2: + @_cudf_nvtx_annotate + def sample( + self, + n=None, + frac=None, + replace=False, + weights=None, + random_state=None, + axis=None, + ignore_index=False, + ): + """Return a random sample of items from an axis of object. + + If reproducible results are required, a random number generator may be + provided via the `random_state` parameter. This function will always + produce the same sample given an identical `random_state`. + + Notes + ----- + When sampling from ``axis=0/'index'``, ``random_state`` can be either + a numpy random state (``numpy.random.RandomState``) or a cupy random + state (``cupy.random.RandomState``). When a numpy random state is + used, the output is guaranteed to match the output of the corresponding + pandas method call, but generating the sample may be slow. If exact + pandas equivalence is not required, using a cupy random state will + achieve better performance, especially when sampling large number of + items. It's advised to use the matching `ndarray` type to the random + state for the `weights` array. + + Parameters + ---------- + n : int, optional + Number of items from axis to return. Cannot be used with `frac`. + Default = 1 if frac = None. + frac : float, optional + Fraction of axis items to return. Cannot be used with n. + replace : bool, default False + Allow or disallow sampling of the same row more than once. + `replace == True` is not supported for axis = 1/"columns". + `replace == False` is not supported for axis = 0/"index" given + `random_state` is `None` or a cupy random state, and `weights` is + specified. + weights : ndarray-like, optional + Default `None` for uniform probability distribution over rows to + sample from. If `ndarray` is passed, the length of `weights` should + equal to the number of rows to sample from, and will be normalized + to have a sum of 1. 
Unlike pandas, index alignment is not currently + not performed. + random_state : int, numpy/cupy RandomState, or None, default None + If None, default cupy random state is chosen. + If int, the seed for the default cupy random state. + If RandomState, rows-to-sample are generated from the RandomState. + axis : {0 or `index`, 1 or `columns`, None}, default None + Axis to sample. Accepts axis number or name. + Default is stat axis for given data type + (0 for Series and DataFrames). Series doesn't support axis=1. + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns + ------- + Series or DataFrame + A new object of same type as caller containing n items + randomly sampled from the caller object. + + Examples + -------- + >>> import cudf as cudf + >>> df = cudf.DataFrame({"a":{1, 2, 3, 4, 5}}) + >>> df.sample(3) + a + 1 2 + 3 4 + 0 1 + + >>> sr = cudf.Series([1, 2, 3, 4, 5]) + >>> sr.sample(10, replace=True) + 1 4 + 3 1 + 2 4 + 0 5 + 0 1 + 4 5 + 4 1 + 0 2 + 0 3 + 3 2 + dtype: int64 + + >>> df = cudf.DataFrame( + ... {"a": [1, 2], "b": [2, 3], "c": [3, 4], "d": [4, 5]} + ... ) + >>> df.sample(2, axis=1) + a c + 0 1 3 + 1 2 4 + """ + axis = self._get_axis_from_axis_arg(axis) + size = self.shape[axis] + + # Compute `n` from parameter `frac`. + if frac is None: + n = 1 if n is None else n + else: + if frac > 1 and not replace: + raise ValueError( + "Replace has to be set to `True` when upsampling the " + "population `frac` > 1." + ) + if n is not None: + raise ValueError( + "Please enter a value for `frac` OR `n`, not both." + ) + n = int(round(size * frac)) + + if n > 0 and size == 0: + raise ValueError( + "Cannot take a sample larger than 0 when axis is empty." + ) + + if isinstance(random_state, cp.random.RandomState): + lib = cp + elif isinstance(random_state, np.random.RandomState): + lib = np + else: + # Construct random state if `random_state` parameter is None or a + # seed. By default, cupy random state is used to sample rows + # and numpy is used to sample columns. This is because row data + # is stored on device, and the column objects are stored on host. + lib = cp if axis == 0 else np + random_state = lib.random.RandomState(seed=random_state) + + # Normalize `weights` array. + if weights is not None: + if isinstance(weights, str): + raise NotImplementedError( + "Weights specified by string is unsupported yet." + ) + + if size != len(weights): + raise ValueError( + "Weights and axis to be sampled must be of same length." + ) + + weights = lib.asarray(weights) + weights = weights / weights.sum() + + if axis == 0: + return self._sample_axis_0( + n, weights, replace, random_state, ignore_index + ) + else: + if isinstance(random_state, cp.random.RandomState): + raise ValueError( + "Sampling from `axis=1`/`columns` with cupy random state" + "isn't supported." + ) + return self._sample_axis_1( + n, weights, replace, random_state, ignore_index + ) + + def _sample_axis_0( + self, + n: int, + weights: Optional[ColumnLike], + replace: bool, + random_state: Union[np.random.RandomState, cp.random.RandomState], + ignore_index: bool, + ): + try: + gather_map_array = random_state.choice( + len(self), size=n, replace=replace, p=weights + ) + except NotImplementedError as e: + raise NotImplementedError( + "Random sampling with cupy does not support these inputs." 
+ ) from e + + return self._gather( + cudf.core.column.as_column(gather_map_array), + keep_index=not ignore_index, + check_bounds=False, + ) + + def _sample_axis_1( + self, + n: int, + weights: Optional[ColumnLike], + replace: bool, + random_state: np.random.RandomState, + ignore_index: bool, + ): + raise NotImplementedError( + f"Sampling from axis 1 is not implemented for {self.__class__}." + ) + + def _binaryop( + self, + other: Any, + op: str, + fill_value: Any = None, + can_reindex: bool = False, + *args, + **kwargs, + ): + reflect = self._is_reflected_op(op) + if reflect: + op = op[:2] + op[3:] + operands, out_index = self._make_operands_and_index_for_binop( + other, op, fill_value, reflect, can_reindex + ) + if operands is NotImplemented: return NotImplemented - # Binary operations - binary_operations = { - # Arithmetic binary operations. - "add": "add", - "subtract": "sub", - "multiply": "mul", - "matmul": "matmul", - "divide": "truediv", - "true_divide": "truediv", - "floor_divide": "floordiv", - "power": "pow", - "float_power": "pow", - "remainder": "mod", - "mod": "mod", - "fmod": "mod", - # Bitwise binary operations. - "bitwise_and": "and", - "bitwise_or": "or", - "bitwise_xor": "xor", - # Comparison binary operators - "greater": "gt", - "greater_equal": "ge", - "less": "lt", - "less_equal": "le", - "not_equal": "ne", - "equal": "eq", - } + return self._from_data( + ColumnAccessor(type(self)._colwise_binop(operands, op)), + index=out_index, + ) + + def _make_operands_and_index_for_binop( + self, + other: Any, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ) -> Tuple[ + Union[ + Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], + Type[NotImplemented], + ], + Optional[cudf.BaseIndex], + ]: + raise NotImplementedError( + "Binary operations are not supported for {self.__class__}" + ) - # First look for methods of the class. + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + ret = super().__array_ufunc__(ufunc, method, *inputs, **kwargs) fname = ufunc.__name__ - if fname in binary_operations: - reflect = self is not inputs[0] - other = inputs[0] if reflect else inputs[1] - - # These operators need to be mapped to their inverses when - # performing a reflected operation because no reflected version of - # the operators themselves exist. - ops_without_reflection = { - "gt": "lt", - "ge": "le", - "lt": "gt", - "le": "ge", - # ne and eq are symmetric, so they are their own inverse op - "ne": "ne", - "eq": "eq", - } - - op = binary_operations[fname] - if reflect and op in ops_without_reflection: - op = ops_without_reflection[op] - reflect = False - op = f"__{'r' if reflect else ''}{op}__" + if ret is not None: # pandas bitwise operations return bools if indexes are misaligned. - if ( - "bitwise" in fname - and isinstance(other, IndexedFrame) - and not self.index.equals(other.index) - ): - return getattr(self, op)(other).astype(bool) - # Float_power returns float irrespective of the input type. - if fname == "float_power": - return getattr(self, op)(other).astype(float) - return getattr(self, op)(other) - - # Special handling for unary operations. - if fname == "negative": - return self * -1 - if fname == "positive": - return self.copy(deep=True) - if fname == "invert": - return ~self - if fname == "absolute": - return self.abs() - if fname == "fabs": - return self.abs().astype(np.float64) - - # Note: There are some operations that may be supported by libcudf but - # are not supported by pandas APIs. 
In particular, libcudf binary - # operations support logical and/or operations, but those operations - # are not defined on pd.Series/DataFrame. For now those operations will - # dispatch to cupy, but if ufuncs are ever a bottleneck we could add - # special handling to dispatch those (or any other) functions that we - # could implement without cupy. + if "bitwise" in fname: + reflect = self is not inputs[0] + other = inputs[0] if reflect else inputs[1] + if isinstance(other, self.__class__) and not self.index.equals( + other.index + ): + ret = ret.astype(bool) + return ret # Attempt to dispatch all other functions to cupy. cupy_func = getattr(cp, fname) if cupy_func: - # Indices must be aligned before converting to arrays. if ufunc.nin == 2: other = inputs[self is inputs[0]] - inputs, index = self._prep_for_binop(other, fname) + inputs, index = self._make_operands_and_index_for_binop( + other, fname + ) else: + # This works for Index too inputs = { name: (col, None, False, None) for name, col in self._data.items() } index = self._index - mask = None - data = [{} for _ in range(ufunc.nout)] - for name, (left, right, _, _) in inputs.items(): - cupy_inputs = [] - # TODO: I'm jumping through multiple hoops to get the unary - # behavior to match up with the binary. I should see if there - # are better patterns to employ here. - for inp in (left, right) if ufunc.nin == 2 else (left,): - if ( - isinstance(inp, cudf.core.column.ColumnBase) - and inp.has_nulls() - ): - new_mask = cudf.core.column.as_column(inp.nullmask) - - # TODO: This is a hackish way to perform a bitwise and - # of bitmasks. Once we expose - # cudf::detail::bitwise_and, then we can use that - # instead. - mask = new_mask if mask is None else (mask & new_mask) - - # Arbitrarily fill with zeros. For ufuncs, we assume - # that the end result propagates nulls via a bitwise - # and, so these elements are irrelevant. - inp = inp.fillna(0) - cupy_inputs.append(cp.asarray(inp)) - - cp_output = cupy_func(*cupy_inputs, **kwargs) - if ufunc.nout == 1: - cp_output = (cp_output,) - for i, out in enumerate(cp_output): - data[i][name] = cudf.core.column.as_column(out).set_mask( - mask - ) - - out = tuple( - self.__class__._from_data(out, index=index) for out in data + data = self._apply_cupy_ufunc_to_operands( + ufunc, cupy_func, inputs, **kwargs ) + + out = tuple(self._from_data(out, index=index) for out in data) return out[0] if ufunc.nout == 1 else out return NotImplemented + def _append( + self, other, ignore_index=False, verify_integrity=False, sort=None + ): + warnings.warn( + "append is deprecated and will be removed in a future version. " + "Use concat instead.", + FutureWarning, + ) + if verify_integrity not in (None, False): + raise NotImplementedError( + "verify_integrity parameter is not supported yet." + ) + + if is_list_like(other): + to_concat = [self, *other] + else: + to_concat = [self, other] + + return cudf.concat(to_concat, ignore_index=ignore_index, sort=sort) + + def astype(self, dtype, copy=False, errors="raise", **kwargs): + """Cast the object to the given dtype. + + Parameters + ---------- + dtype : data type, or dict of column name -> data type + Use a numpy.dtype or Python type to cast entire DataFrame object to + the same type. Alternatively, use ``{col: dtype, ...}``, where col + is a column label and dtype is a numpy.dtype or Python type + to cast one or more of the DataFrame's columns to + column-specific types. + copy : bool, default False + Return a deep-copy when ``copy=True``. 
Note by default + ``copy=False`` setting is used and hence changes to + values then may propagate to other cudf objects. + errors : {'raise', 'ignore', 'warn'}, default 'raise' + Control raising of exceptions on invalid data for provided dtype. + + - ``raise`` : allow exceptions to be raised + - ``ignore`` : suppress exceptions. On error return original + object. + - ``warn`` : prints last exceptions as warnings and + return original object. + **kwargs : extra arguments to pass on to the constructor + + Returns + ------- + DataFrame/Series + + Examples + -------- + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [10, 20, 30], 'b': [1, 2, 3]}) + >>> df + a b + 0 10 1 + 1 20 2 + 2 30 3 + >>> df.dtypes + a int64 + b int64 + dtype: object + + Cast all columns to `int32`: + + >>> df.astype('int32').dtypes + a int32 + b int32 + dtype: object + + Cast `a` to `float32` using a dictionary: + + >>> df.astype({'a': 'float32'}).dtypes + a float32 + b int64 + dtype: object + >>> df.astype({'a': 'float32'}) + a b + 0 10.0 1 + 1 20.0 2 + 2 30.0 3 + + **Series** + + >>> import cudf + >>> series = cudf.Series([1, 2], dtype='int32') + >>> series + 0 1 + 1 2 + dtype: int32 + >>> series.astype('int64') + 0 1 + 1 2 + dtype: int64 + + Convert to categorical type: + + >>> series.astype('category') + 0 1 + 1 2 + dtype: category + Categories (2, int64): [1, 2] + + Convert to ordered categorical type with custom ordering: + + >>> cat_dtype = cudf.CategoricalDtype(categories=[2, 1], ordered=True) + >>> series.astype(cat_dtype) + 0 1 + 1 2 + dtype: category + Categories (2, int64): [2 < 1] + + Note that using ``copy=False`` (enabled by default) + and changing data on a new Series will + propagate changes: + + >>> s1 = cudf.Series([1, 2]) + >>> s1 + 0 1 + 1 2 + dtype: int64 + >>> s2 = s1.astype('int64', copy=False) + >>> s2[0] = 10 + >>> s1 + 0 10 + 1 2 + dtype: int64 + """ + if errors not in ("ignore", "warn", "raise"): + raise ValueError("invalid error value specified") + elif errors == "warn": + warnings.warn( + "Specifying errors='warn' is deprecated and will be removed " + "in a future release.", + FutureWarning, + ) + + try: + data = super().astype(dtype, copy, **kwargs) + except Exception as e: + if errors == "raise": + raise e + elif errors == "warn": + import traceback + + tb = traceback.format_exc() + warnings.warn(tb) + return self + + return self._from_data(data, index=self._index) + + @_cudf_nvtx_annotate + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ): + """Drop specified labels from rows or columns. + + Remove rows or columns by specifying label names and corresponding + axis, or by specifying directly index or column names. When using a + multi-index, labels on different levels can be removed by specifying + the level. + + Parameters + ---------- + labels : single label or list-like + Index or column labels to drop. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Whether to drop labels from the index (0 or 'index') or + columns (1 or 'columns'). + index : single label or list-like + Alternative to specifying axis (``labels, axis=0`` + is equivalent to ``index=labels``). + columns : single label or list-like + Alternative to specifying axis (``labels, axis=1`` + is equivalent to ``columns=labels``). + level : int or level name, optional + For MultiIndex, level from which the labels will be removed. + inplace : bool, default False + If False, return a copy. 
Otherwise, do operation + inplace and return None. + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and only existing labels are + dropped. + + Returns + ------- + DataFrame or Series + DataFrame or Series without the removed index or column labels. + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis. + + See Also + -------- + DataFrame.loc : Label-location based indexer for selection by label. + DataFrame.dropna : Return DataFrame with labels on given axis omitted + where (all or any) data are missing. + DataFrame.drop_duplicates : Return DataFrame with duplicate rows + removed, optionally only considering certain columns. + Series.reindex + Return only specified index labels of Series + Series.dropna + Return series without null values + Series.drop_duplicates + Return series with duplicate values removed + + Examples + -------- + **Series** + + >>> s = cudf.Series([1,2,3], index=['x', 'y', 'z']) + >>> s + x 1 + y 2 + z 3 + dtype: int64 + + Drop labels x and z + + >>> s.drop(labels=['x', 'z']) + y 2 + dtype: int64 + + Drop a label from the second level in MultiIndex Series. + + >>> midx = cudf.MultiIndex.from_product([[0, 1, 2], ['x', 'y']]) + >>> s = cudf.Series(range(6), index=midx) + >>> s + 0 x 0 + y 1 + 1 x 2 + y 3 + 2 x 4 + y 5 + dtype: int64 + >>> s.drop(labels='y', level=1) + 0 x 0 + 1 x 2 + 2 x 4 + Name: 2, dtype: int64 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({"A": [1, 2, 3, 4], + ... "B": [5, 6, 7, 8], + ... "C": [10, 11, 12, 13], + ... "D": [20, 30, 40, 50]}) + >>> df + A B C D + 0 1 5 10 20 + 1 2 6 11 30 + 2 3 7 12 40 + 3 4 8 13 50 + + Drop columns + + >>> df.drop(['B', 'C'], axis=1) + A D + 0 1 20 + 1 2 30 + 2 3 40 + 3 4 50 + >>> df.drop(columns=['B', 'C']) + A D + 0 1 20 + 1 2 30 + 2 3 40 + 3 4 50 + + Drop a row by index + + >>> df.drop([0, 1]) + A B C D + 2 3 7 12 40 + 3 4 8 13 50 + + Drop columns and/or rows of MultiIndex DataFrame + + >>> midx = cudf.MultiIndex(levels=[['lama', 'cow', 'falcon'], + ... ['speed', 'weight', 'length']], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + >>> df = cudf.DataFrame(index=midx, columns=['big', 'small'], + ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], + ... [250, 150], [1.5, 0.8], [320, 250], + ... 
[1, 0.8], [0.3, 0.2]]) + >>> df + big small + lama speed 45.0 30.0 + weight 200.0 100.0 + length 1.5 1.0 + cow speed 30.0 20.0 + weight 250.0 150.0 + length 1.5 0.8 + falcon speed 320.0 250.0 + weight 1.0 0.8 + length 0.3 0.2 + >>> df.drop(index='cow', columns='small') + big + lama speed 45.0 + weight 200.0 + length 1.5 + falcon speed 320.0 + weight 1.0 + length 0.3 + >>> df.drop(index='length', level=1) + big small + lama speed 45.0 30.0 + weight 200.0 100.0 + cow speed 30.0 20.0 + weight 250.0 150.0 + falcon speed 320.0 250.0 + weight 1.0 0.8 + """ + if labels is not None: + if index is not None or columns is not None: + raise ValueError( + "Cannot specify both 'labels' and 'index'/'columns'" + ) + target = labels + elif index is not None: + target = index + axis = 0 + elif columns is not None: + target = columns + axis = 1 + else: + raise ValueError( + "Need to specify at least one of 'labels', " + "'index' or 'columns'" + ) + + if inplace: + out = self + else: + out = self.copy() + + if axis in (1, "columns"): + target = _get_host_unique(target) + + _drop_columns(out, target, errors) + elif axis in (0, "index"): + dropped = _drop_rows_by_labels(out, target, level, errors) + + if columns is not None: + columns = _get_host_unique(columns) + _drop_columns(dropped, columns, errors) + + out._data = dropped._data + out._index = dropped._index + + if not inplace: + return out + + @_cudf_nvtx_annotate + def _explode(self, explode_column: Any, ignore_index: bool): + # Helper function for `explode` in `Series` and `Dataframe`, explodes a + # specified nested column. Other columns' corresponding rows are + # duplicated. If ignore_index is set, the original index is not + # exploded and will be replaced with a `RangeIndex`. + if not is_list_dtype(self._data[explode_column].dtype): + data = self._data.copy(deep=True) + idx = None if ignore_index else self._index.copy(deep=True) + return self.__class__._from_data(data, index=idx) + + explode_column_num = self._column_names.index(explode_column) + if not ignore_index and self._index is not None: + explode_column_num += self._index.nlevels + + data, index = libcudf.lists.explode_outer( + self, explode_column_num, ignore_index + ) + res = self.__class__._from_data( + ColumnAccessor( + data, + multiindex=self._data.multiindex, + level_names=self._data._level_names, + ), + index=index, + ) + + if not ignore_index and self._index is not None: + res.index.names = self._index.names + return res + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" diff --git a/python/cudf/cudf/core/mixins/__init__.py b/python/cudf/cudf/core/mixins/__init__.py new file mode 100644 index 00000000000..8306f3f11b3 --- /dev/null +++ b/python/cudf/cudf/core/mixins/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from .binops import BinaryOperand +from .reductions import Reducible +from .scans import Scannable + +__all__ = ["BinaryOperand", "Reducible", "Scannable"] diff --git a/python/cudf/cudf/core/mixins/binops.py b/python/cudf/cudf/core/mixins/binops.py new file mode 100644 index 00000000000..773b47b62b2 --- /dev/null +++ b/python/cudf/cudf/core/mixins/binops.py @@ -0,0 +1,56 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from .mixin_factory import _create_delegating_mixin + +BinaryOperand = _create_delegating_mixin( + "BinaryOperand", + "Mixin encapsulating binary operations.", + "BINARY_OPERATION", + "_binaryop", + { + # Numeric operations. 
+ "__add__", + "__sub__", + "__mul__", + "__matmul__", + "__truediv__", + "__floordiv__", + "__mod__", + # "__divmod__", # Not yet implemented + "__pow__", + # "__lshift__", # Not yet implemented + # "__rshift__", # Not yet implemented + "__and__", + "__xor__", + "__or__", + # Reflected numeric operations. + "__radd__", + "__rsub__", + "__rmul__", + "__rmatmul__", + "__rtruediv__", + "__rfloordiv__", + "__rmod__", + # "__rdivmod__", # Not yet implemented + "__rpow__", + # "__rlshift__", # Not yet implemented + # "__rrshift__", # Not yet implemented + "__rand__", + "__rxor__", + "__ror__", + # Rich comparison operations. + "__lt__", + "__le__", + "__eq__", + "__ne__", + "__gt__", + "__ge__", + }, +) + + +def _is_reflected_op(op): + return op[2] == "r" and op != "__rshift__" + + +BinaryOperand._is_reflected_op = staticmethod(_is_reflected_op) diff --git a/python/cudf/cudf/core/mixins/binops.pyi b/python/cudf/cudf/core/mixins/binops.pyi new file mode 100644 index 00000000000..45093cd04d4 --- /dev/null +++ b/python/cudf/cudf/core/mixins/binops.pyi @@ -0,0 +1,88 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from typing import Set + +class BinaryOperand: + _SUPPORTED_BINARY_OPERATIONS: Set + + def __add__(self, other): + ... + + def __sub__(self, other): + ... + + def __mul__(self, other): + ... + + def __truediv__(self, other): + ... + + def __floordiv__(self, other): + ... + + def __mod__(self, other): + ... + + def __pow__(self, other): + ... + + def __and__(self, other): + ... + + def __xor__(self, other): + ... + + def __or__(self, other): + ... + + def __radd__(self, other): + ... + + def __rsub__(self, other): + ... + + def __rmul__(self, other): + ... + + def __rtruediv__(self, other): + ... + + def __rfloordiv__(self, other): + ... + + def __rmod__(self, other): + ... + + def __rpow__(self, other): + ... + + def __rand__(self, other): + ... + + def __rxor__(self, other): + ... + + def __ror__(self, other): + ... + + def __lt__(self, other): + ... + + def __le__(self, other): + ... + + def __eq__(self, other): + ... + + def __ne__(self, other): + ... + + def __gt__(self, other): + ... + + def __ge__(self, other): + ... + + @staticmethod + def _is_reflected_op(op) -> bool: + ... diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py new file mode 100644 index 00000000000..7bbb299d643 --- /dev/null +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -0,0 +1,263 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +import inspect + + +# `functools.partialmethod` does not allow setting attributes such as +# __doc__ on the resulting method. So we use a simple alternative to +# it here: +def _partialmethod(method, *args1, **kwargs1): + def wrapper(self, *args2, **kwargs2): + return method(self, *args1, *args2, **kwargs1, **kwargs2) + + return wrapper + + +class Operation: + """Descriptor used to define operations for delegating mixins. + + This class is designed to be assigned to the attributes (the delegating + methods) defined by the OperationMixin. This class will create the method + and mimic all the expected attributes for that method to appear as though + it was originally designed on the class. The use of the descriptor pattern + ensures that the method is only created the first time it is invoked, after + which all further calls use the callable generated on the first invocation. + + Parameters + ---------- + name : str + The name of the operation. 
+ docstring_format_args : str + The attribute of the owning class from which to pull format parameters + for this operation's docstring. + base_operation : str + The underlying operation function to be invoked when operation `name` + is called on the owning class. + """ + + def __init__(self, name, docstring_format_args, base_operation): + self._name = name + self._docstring_format_args = docstring_format_args + self._base_operation = base_operation + + def __get__(self, obj, owner=None): + retfunc = _partialmethod(self._base_operation, op=self._name) + + # Required attributes that will exist. + retfunc.__name__ = self._name + retfunc.__qualname__ = ".".join([owner.__name__, self._name]) + retfunc.__module__ = self._base_operation.__module__ + + if self._base_operation.__doc__ is not None: + retfunc.__doc__ = self._base_operation.__doc__.format( + cls=owner.__name__, + op=self._name, + **self._docstring_format_args, + ) + + retfunc.__annotations__ = self._base_operation.__annotations__.copy() + retfunc.__annotations__.pop("op", None) + retfunc_params = [ + v + for k, v in inspect.signature( + self._base_operation + ).parameters.items() + if k != "op" + ] + retfunc.__signature__ = inspect.Signature(retfunc_params) + + setattr(owner, self._name, retfunc) + + if obj is None: + return getattr(owner, self._name) + else: + return getattr(obj, self._name) + + +def _should_define_operation(cls, operation, base_operation_name): + if operation not in dir(cls): + return True + + # If the class doesn't override the base operation we stick to whatever + # parent implementation exists. + if base_operation_name not in cls.__dict__: + return False + + # At this point we know that the class has the operation defined but it + # also overrides the base operation. Since this function is called before + # the operation is defined on the current class, we know that it inherited + # the operation from a parent. We therefore have three possibilities: + # 1. A parent class manually defined the operation. That override takes + # precedence even if the current class defined the base operation. + # 2. A parent class has an auto-generated operation, i.e. it is of type + # Operation and was created by OperationMixin.__init_subclass__. The + # current class must override it so that its base operation is used + # rather than the parent's base operation. + # 3. The method is defined for all classes, i.e. it is a method of object. + for base_cls in cls.__mro__: + # We always override methods defined for object. + if base_cls is object: + return True + # The first attribute in the MRO is the one that will be used. + if operation in base_cls.__dict__: + return isinstance(base_cls.__dict__[operation], Operation) + + # This line should be unreachable since we know the attribute exists + # somewhere in the MRO if the for loop was entered. + assert False, "Operation attribute not found in hierarchy." + + +def _create_delegating_mixin( + mixin_name, + docstring, + category_name, + base_operation_name, + supported_operations, +): + """Factory for mixins defining collections of delegated operations. + + This function generates mixins based on two common paradigms in cuDF: + + 1. libcudf groups many operations into categories using a common API. These + APIs usually accept an enum to delineate the specific operation to + perform, e.g. binary operations use the `binary_operator` enum when + calling the `binary_operation` function. 
cuDF Python mimics this + structure by having operations within a category delegate to a common + internal function (e.g. DataFrame.__add__ calls DataFrame._binaryop). + 2. Many cuDF classes implement similar operations (e.g. `sum`) via + delegation to lower-level APIs before reaching a libcudf C++ function + call. As a result, many API function calls actually involve multiple + delegations to lower-level APIs that can look essentially identical. An + example of such a sequence would be DataFrame.sum -> DataFrame._reduce + -> Column.sum -> Column._reduce -> libcudf. + + This factory creates mixins for a category of operations implemented by via + this delegator pattern. The resulting mixins make it easy to share common + functions across various classes while also providing a common entrypoint + for implementing the centralized logic for a given category of operations. + Its usage is best demonstrated by example below. + + Parameters + ---------- + mixin_name : str + The name of the class. This argument should be the same as the object + that this function's output is assigned to, e.g. + :code:`Baz = _create_delegating_mixin("Baz", ...)`. + docstring : str + The documentation string for the mixin class. + category_name : str + The category of operations for which a mixin is being created. This + name will be used to define or access the following attributes as shown + in the example below: + - f'_{category_name}_DOCSTRINGS' + - f'_VALID_{category_name}S' # The subset of ops a subclass allows + - f'_SUPPORTED_{category_name}S' # The ops supported by the mixin + base_operation_name : str + The name given to the core function implementing this category of + operations. The corresponding function is the entrypoint for child + classes. + supported_ops : List[str] + The list of valid operations that subclasses of the resulting mixin may + request to be implemented. + + Examples + -------- + >>> # The class below: + >>> class Person: + ... def _greet(self, op): + ... print(op) + ... + ... def hello(self): + ... self._greet("hello") + ... + ... def goodbye(self): + ... self._greet("goodbye") + >>> # can be rewritten using a delegating mixin as follows: + >>> Greeter = _create_delegating_mixin( + ... "Greeter", "", "GREETING", "_greet", {"hello", "goodbye", "hey"} + ... ) + >>> # The `hello` and `goodbye` methods will now be automatically generated + >>> # for the Person class below. + >>> class Person(Greeter): + ... _VALID_GREETINGS = {"hello", "goodbye"} + ... + ... def _greet(self, op: str): + ... '''Say {op}.''' + ... print(op) + >>> mom = Person() + >>> mom.hello() + hello + >>> # The Greeter class could also enable the `hey` method, but Person did + >>> # not include it in the _VALID_GREETINGS set so it will not exist. + >>> mom.hey() + Traceback (most recent call last): + ... + AttributeError: 'Person' object has no attribute 'hey' + >>> # The docstrings for each method are generated by formatting the _greet + >>> # docstring with the operation name as well as any additional keys + >>> # provided via the _GREETING_DOCSTRINGS parameter. + >>> print(mom.hello.__doc__) + Say hello. + """ + # The first two attributes may be defined on subclasses of the generated + # OperationMixin to indicate valid attributes and parameters to use when + # formatting docstrings. The supported_attr will be defined on the + # OperationMixin itself to indicate what operations its subclass may + # inherit from it. 
+ validity_attr = f"_VALID_{category_name}S" + docstring_attr = f"_{category_name}_DOCSTRINGS" + supported_attr = f"_SUPPORTED_{category_name}S" + + class OperationMixin: + @classmethod + def __init_subclass__(cls): + # Support composition of various OperationMixins. Note that since + # this __init_subclass__ is defined on mixins, it does not prohibit + # classes that inherit it from implementing this method as well as + # long as those implementations also include this super call. + super().__init_subclass__() + + # Only add the valid set of operations for a particular class. + valid_operations = set() + for base_cls in cls.__mro__: + # Check for sentinel indicating that all operations are valid. + valid_operations |= getattr(base_cls, validity_attr, set()) + + invalid_operations = valid_operations - supported_operations + assert ( + len(invalid_operations) == 0 + ), f"Invalid requested operations: {invalid_operations}" + + base_operation = getattr(cls, base_operation_name) + for operation in valid_operations: + if _should_define_operation( + cls, operation, base_operation_name + ): + docstring_format_args = getattr( + cls, docstring_attr, {} + ).get(operation, {}) + op_attr = Operation( + operation, docstring_format_args, base_operation + ) + setattr(cls, operation, op_attr) + + OperationMixin.__name__ = mixin_name + OperationMixin.__qualname__ = mixin_name + OperationMixin.__doc__ = docstring + + def _operation(self, op: str, *args, **kwargs): + raise NotImplementedError + + _operation.__name__ = base_operation_name + _operation.__qualname__ = ".".join([mixin_name, base_operation_name]) + _operation.__doc__ = ( + f"The core {category_name.lower()} function. Must be overridden by " + "subclasses, the default implementation raises a NotImplementedError." + ) + + setattr(OperationMixin, base_operation_name, _operation) + # Making this attribute available makes it easy for subclasses to indicate + # that all supported operations for this mixin are valid. + setattr(OperationMixin, supported_attr, supported_operations) + + return OperationMixin diff --git a/python/cudf/cudf/core/mixins/reductions.py b/python/cudf/cudf/core/mixins/reductions.py new file mode 100644 index 00000000000..f73f0e8fbc6 --- /dev/null +++ b/python/cudf/cudf/core/mixins/reductions.py @@ -0,0 +1,35 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from .mixin_factory import _create_delegating_mixin + +Reducible = _create_delegating_mixin( + "Reducible", + "Mixin encapsulating reduction operations.", + "REDUCTION", + "_reduce", + { + "sum", + "product", + "min", + "max", + "count", + "any", + "all", + "sum_of_squares", + "mean", + "var", + "std", + "median", + "argmax", + "argmin", + "nunique", + "nth", + "collect", + "unique", + "prod", + "idxmin", + "idxmax", + "first", + "last", + }, +) diff --git a/python/cudf/cudf/core/mixins/reductions.pyi b/python/cudf/cudf/core/mixins/reductions.pyi new file mode 100644 index 00000000000..3769b7c360e --- /dev/null +++ b/python/cudf/cudf/core/mixins/reductions.pyi @@ -0,0 +1,72 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from typing import Set + +class Reducible: + _SUPPORTED_REDUCTIONS: Set + + def sum(self): + ... + + def product(self): + ... + + def min(self): + ... + + def max(self): + ... + + def count(self): + ... + + def any(self): + ... + + def all(self): + ... + + def sum_of_squares(self): + ... + + def mean(self): + ... + + def var(self): + ... + + def std(self): + ... + + def median(self): + ... + + def argmax(self): + ... + + def argmin(self): + ... 
+ + def nunique(self): + ... + + def nth(self): + ... + + def collect(self): + ... + + def prod(self): + ... + + def idxmin(self): + ... + + def idxmax(self): + ... + + def first(self): + ... + + def last(self): + ... diff --git a/python/cudf/cudf/core/mixins/scans.py b/python/cudf/cudf/core/mixins/scans.py new file mode 100644 index 00000000000..723fc758b13 --- /dev/null +++ b/python/cudf/cudf/core/mixins/scans.py @@ -0,0 +1,11 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from .mixin_factory import _create_delegating_mixin + +Scannable = _create_delegating_mixin( + "Scannable", + "Mixin encapsulating scan operations.", + "SCAN", + "_scan", + {"cumsum", "cumprod", "cummin", "cummax",}, # noqa: E231 +) diff --git a/python/cudf/cudf/core/mixins/scans.pyi b/python/cudf/cudf/core/mixins/scans.pyi new file mode 100644 index 00000000000..38cb9af284f --- /dev/null +++ b/python/cudf/cudf/core/mixins/scans.pyi @@ -0,0 +1,18 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from typing import Set + +class Scannable: + _SUPPORTED_SCANS: Set + + def cumsum(self): + ... + + def cumprod(self): + ... + + def cummin(self): + ... + + def cummax(self): + ... diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index b934c6e7038..c9036db05fa 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -18,12 +18,22 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries -from cudf.api.types import is_integer, is_list_like +from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column from cudf.core._compat import PANDAS_GE_120 from cudf.core.frame import Frame -from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index -from cudf.utils.utils import NotIterable, _maybe_indices_to_slice +from cudf.core.index import ( + BaseIndex, + _index_astype_docstring, + _lexsorted_equal_range, + as_index, +) +from cudf.utils.docutils import doc_apply +from cudf.utils.utils import ( + NotIterable, + _cudf_nvtx_annotate, + _maybe_indices_to_slice, +) class MultiIndex(Frame, BaseIndex, NotIterable): @@ -63,6 +73,7 @@ class MultiIndex(Frame, BaseIndex, NotIterable): ) """ + @_cudf_nvtx_annotate def __init__( self, levels=None, @@ -110,7 +121,7 @@ def __init__( levels = [cudf.Series(level) for level in levels] - if len(levels) != len(codes.columns): + if len(levels) != len(codes._data): raise ValueError( "MultiIndex has unequal number of levels and " "codes and is inconsistent!" 
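Stepping back to the delegating-mixin machinery added above under `python/cudf/cudf/core/mixins/`: the factory docstring's `Greeter` example covers the mechanics, and the sketch below shows how a concrete class would opt into `Reducible`. The `Bag` class and its values are hypothetical; only `Reducible`, `_VALID_REDUCTIONS`, and the `_reduce` entry point come from the patch:

```python
# Hypothetical example class; assumes a cudf checkout containing this patch.
from cudf.core.mixins import Reducible


class Bag(Reducible):
    # Only the reductions listed here are generated on the subclass; they
    # must be a subset of Reducible._SUPPORTED_REDUCTIONS.
    _VALID_REDUCTIONS = {"sum", "max"}

    def __init__(self, values):
        self._values = list(values)

    def _reduce(self, op: str, *args, **kwargs):
        """Compute the {op} of the bag."""
        return sum(self._values) if op == "sum" else max(self._values)


b = Bag([3, 1, 2])
print(b.sum(), b.max())  # -> 6 3; both methods were generated by the mixin
```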
@@ -147,11 +158,13 @@ def __init__( self._name = None self.names = names - @property + @property # type: ignore + @_cudf_nvtx_annotate def names(self): return self._names - @names.setter + @names.setter # type: ignore + @_cudf_nvtx_annotate def names(self, value): value = [None] * self.nlevels if value is None else value @@ -169,6 +182,17 @@ def names(self, value): ) self._names = pd.core.indexes.frozen.FrozenList(value) + @_cudf_nvtx_annotate + @doc_apply(_index_astype_docstring) + def astype(self, dtype, copy: bool = True): + if not is_object_dtype(dtype): + raise TypeError( + "Setting a MultiIndex dtype to anything other than object is " + "not supported" + ) + return self + + @_cudf_nvtx_annotate def rename(self, names, inplace=False): """ Alter MultiIndex level names @@ -215,6 +239,7 @@ def rename(self, names, inplace=False): """ return self.set_names(names, level=None, inplace=inplace) + @_cudf_nvtx_annotate def set_names(self, names, level=None, inplace=False): names_is_list_like = is_list_like(names) level_is_list_like = is_list_like(level) @@ -252,6 +277,7 @@ def set_names(self, names, level=None, inplace=False): return self._set_names(names=names, inplace=inplace) @classmethod + @_cudf_nvtx_annotate def _from_data( cls, data: MutableMapping, @@ -264,14 +290,17 @@ def _from_data( obj.name = name return obj - @property + @property # type: ignore + @_cudf_nvtx_annotate def name(self): return self._name - @name.setter + @name.setter # type: ignore + @_cudf_nvtx_annotate def name(self, value): self._name = value + @_cudf_nvtx_annotate def copy( self, names=None, @@ -367,6 +396,7 @@ def copy( return mi + @_cudf_nvtx_annotate def __repr__(self): max_seq_items = get_option("display.max_seq_items") or len(self) @@ -443,7 +473,8 @@ def __repr__(self): data_output = "\n".join(lines) return output_prefix + data_output - @property + @property # type: ignore + @_cudf_nvtx_annotate def codes(self): """ Returns the codes of the underlying MultiIndex. @@ -473,12 +504,14 @@ def codes(self): self._compute_levels_and_codes() return self._codes - @property + @property # type: ignore + @_cudf_nvtx_annotate def nlevels(self): """Integer number of levels in this MultiIndex.""" return len(self._data) - @property + @property # type: ignore + @_cudf_nvtx_annotate def levels(self): """ Returns list of levels in the MultiIndex @@ -515,11 +548,13 @@ def levels(self): self._compute_levels_and_codes() return self._levels - @property + @property # type: ignore + @_cudf_nvtx_annotate def ndim(self): """Dimension of the data. For MultiIndex ndim is always 2.""" return 2 + @_cudf_nvtx_annotate def _get_level_label(self, level): """Get name of the level. @@ -536,6 +571,7 @@ def _get_level_label(self, level): else: return self._data.names[level] + @_cudf_nvtx_annotate def isin(self, values, level=None): """Return a boolean array where the index values are in values. 
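The MultiIndex.astype added above is a validation shim rather than a real cast: anything other than the object dtype is rejected, and the index itself is returned unchanged. Roughly (illustrative, not part of the patch):

import cudf

midx = cudf.MultiIndex.from_tuples([(1, "a"), (2, "b")])
midx.astype(object)   # accepted: returns the same MultiIndex
midx.astype("int64")  # raises TypeError: only the object dtype is supported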
@@ -640,6 +676,7 @@ def where(self, cond, other=None, inplace=False): ".where is not supported for MultiIndex operations" ) + @_cudf_nvtx_annotate def _compute_levels_and_codes(self): levels = [] @@ -652,6 +689,7 @@ def _compute_levels_and_codes(self): self._levels = levels self._codes = cudf.DataFrame._from_data(codes) + @_cudf_nvtx_annotate def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" lookup = cudf.DataFrame() @@ -680,6 +718,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): raise KeyError(row) return result + @_cudf_nvtx_annotate def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): # Instructions for Slicing # if tuple, get first and last elements of tuple @@ -707,6 +746,7 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): return row_tuple return self._compute_validity_mask(index, row_tuple, max_length) + @_cudf_nvtx_annotate def _index_and_downcast(self, result, index, index_key): if isinstance(index_key, (numbers.Number, slice)): @@ -775,6 +815,7 @@ def _index_and_downcast(self, result, index, index_key): result.index = index return result + @_cudf_nvtx_annotate def _get_row_major( self, df: DataFrameOrSeries, @@ -800,6 +841,7 @@ def _get_row_major( final = self._index_and_downcast(result, result.index, row_tuple) return final + @_cudf_nvtx_annotate def _validate_indexer( self, indexer: Union[ @@ -826,6 +868,7 @@ def _validate_indexer( for i in indexer: self._validate_indexer(i) + @_cudf_nvtx_annotate def __eq__(self, other): if isinstance(other, MultiIndex): for self_col, other_col in zip( @@ -836,11 +879,13 @@ def __eq__(self, other): return self.names == other.names return NotImplemented - @property + @property # type: ignore + @_cudf_nvtx_annotate def size(self): # The size of a MultiIndex is only dependent on the number of rows. return self._num_rows + @_cudf_nvtx_annotate def take(self, indices): if isinstance(indices, cudf.Series) and indices.has_nulls: raise ValueError("Column must have no nulls.") @@ -848,6 +893,7 @@ def take(self, indices): obj.names = self.names return obj + @_cudf_nvtx_annotate def serialize(self): header, frames = super().serialize() # Overwrite the names in _data with the true names. @@ -855,6 +901,7 @@ def serialize(self): return header, frames @classmethod + @_cudf_nvtx_annotate def deserialize(cls, header, frames): # Spoof the column names to construct the frame, then set manually. column_names = pickle.loads(header["column_names"]) @@ -862,6 +909,7 @@ def deserialize(cls, header, frames): obj = super().deserialize(header, frames) return obj._set_names(column_names) + @_cudf_nvtx_annotate def __getitem__(self, index): flatten = isinstance(index, int) @@ -884,6 +932,7 @@ def __getitem__(self, index): result.names = self.names return result + @_cudf_nvtx_annotate def to_frame(self, index=True, name=None): # TODO: Currently this function makes a shallow copy, which is # incorrect. We want to make a deep copy, otherwise further @@ -900,6 +949,7 @@ def to_frame(self, index=True, name=None): df.columns = name return df + @_cudf_nvtx_annotate def get_level_values(self, level): """ Return the values at the requested level @@ -953,6 +1003,7 @@ def is_interval(self): return False @classmethod + @_cudf_nvtx_annotate def _concat(cls, objs): source_data = [o.to_frame(index=False) for o in objs] @@ -960,12 +1011,12 @@ def _concat(cls, objs): # TODO: Verify if this is really necessary or if we can rely on # DataFrame._concat. 
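A pattern repeated throughout this file and the rest of the diff: the NVTX profiling decorator is stacked underneath @property, and the trailing # type: ignore is needed because mypy does not support decorated properties. The sketch below is illustrative only; the decorator body is a stand-in, not cudf's actual _cudf_nvtx_annotate.

import functools

def _cudf_nvtx_annotate(func):  # placeholder for the real cudf decorator
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # The real decorator pushes/pops an NVTX range around the call.
        return func(*args, **kwargs)
    return wrapper

class Example:
    @property  # type: ignore
    @_cudf_nvtx_annotate
    def size(self):
        return 42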
if len(source_data) > 1: - colnames = source_data[0].columns + colnames = source_data[0]._data.to_pandas_index() for obj in source_data[1:]: obj.columns = colnames source_data = cudf.DataFrame._concat(source_data) - names = [None for x in source_data.columns] + names = [None] * source_data._num_columns objs = list(filter(lambda o: o.names is not None, objs)) for o in range(len(objs)): for i, name in enumerate(objs[o].names): @@ -973,6 +1024,7 @@ def _concat(cls, objs): return cudf.MultiIndex.from_frame(source_data, names=names) @classmethod + @_cudf_nvtx_annotate def from_tuples(cls, tuples, names=None): """ Convert list of tuples to MultiIndex. @@ -1009,7 +1061,8 @@ def from_tuples(cls, tuples, names=None): pdi = pd.MultiIndex.from_tuples(tuples, names=names) return cls.from_pandas(pdi) - @property + @property # type: ignore + @_cudf_nvtx_annotate def values_host(self): """ Return a numpy representation of the MultiIndex. @@ -1036,7 +1089,8 @@ def values_host(self): """ return self.to_pandas().values - @property + @property # type: ignore + @_cudf_nvtx_annotate def values(self): """ Return a CuPy representation of the MultiIndex. @@ -1068,6 +1122,7 @@ def values(self): return self.to_frame(index=False).values @classmethod + @_cudf_nvtx_annotate def from_frame(cls, df, names=None): """ Make a MultiIndex from a DataFrame. @@ -1141,6 +1196,7 @@ def from_frame(cls, df, names=None): return obj @classmethod + @_cudf_nvtx_annotate def from_product(cls, arrays, names=None): """ Make a MultiIndex from the cartesian product of multiple iterables. @@ -1181,6 +1237,7 @@ def from_product(cls, arrays, names=None): pdi = pd.MultiIndex.from_product(arrays, names=names) return cls.from_pandas(pdi) + @_cudf_nvtx_annotate def _poplevels(self, level): """ Remove and return the specified levels from self. @@ -1231,6 +1288,7 @@ def _poplevels(self, level): return popped + @_cudf_nvtx_annotate def droplevel(self, level=-1): """ Removes the specified levels from the MultiIndex. @@ -1293,11 +1351,13 @@ def droplevel(self, level=-1): else: return mi + @_cudf_nvtx_annotate def to_pandas(self, nullable=False, **kwargs): result = self.to_frame(index=False).to_pandas(nullable=nullable) return pd.MultiIndex.from_frame(result, names=self.names) @classmethod + @_cudf_nvtx_annotate def from_pandas(cls, multiindex, nan_as_null=None): """ Convert from a Pandas MultiIndex @@ -1334,10 +1394,12 @@ def from_pandas(cls, multiindex, nan_as_null=None): return cls.from_frame(df, names=multiindex.names) @cached_property + @_cudf_nvtx_annotate def is_unique(self): return len(self) == len(self.unique()) - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_monotonic_increasing(self): """ Return if the index is monotonic increasing @@ -1345,7 +1407,8 @@ def is_monotonic_increasing(self): """ return self._is_sorted(ascending=None, null_position=None) - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_monotonic_decreasing(self): """ Return if the index is monotonic decreasing @@ -1355,6 +1418,7 @@ def is_monotonic_decreasing(self): ascending=[False] * len(self.levels), null_position=None ) + @_cudf_nvtx_annotate def fillna(self, value): """ Fill null values with the specified value. 
@@ -1395,6 +1459,7 @@ def fillna(self, value): return super().fillna(value=value) + @_cudf_nvtx_annotate def unique(self): return self.drop_duplicates(keep="first") @@ -1408,6 +1473,7 @@ def _clean_nulls_from_index(self): index_df._clean_nulls_from_dataframe(index_df), names=self.names ) + @_cudf_nvtx_annotate def memory_usage(self, deep=False): usage = sum(super().memory_usage(deep=deep).values()) if self.levels: @@ -1418,11 +1484,13 @@ def memory_usage(self, deep=False): usage += col.memory_usage return usage + @_cudf_nvtx_annotate def difference(self, other, sort=None): if hasattr(other, "to_pandas"): other = other.to_pandas() return self.to_pandas().difference(other, sort) + @_cudf_nvtx_annotate def append(self, other): """ Append a collection of MultiIndex objects together @@ -1485,6 +1553,7 @@ def append(self, other): return MultiIndex._concat(to_concat) + @_cudf_nvtx_annotate def __array_function__(self, func, types, args, kwargs): cudf_df_module = MultiIndex @@ -1531,6 +1600,7 @@ def _level_index_from_level(self, level): ) from None return level + @_cudf_nvtx_annotate def get_loc(self, key, method=None, tolerance=None): """ Get location for a label or a tuple of labels. @@ -1667,6 +1737,7 @@ def _maybe_match_names(self, other): for self_name, other_name in zip(self.names, other.names) ] + @_cudf_nvtx_annotate def _union(self, other, sort=None): # TODO: When to_frame is refactored to return a # deep copy in future, we should push most of the common @@ -1682,7 +1753,8 @@ def _union(self, other, sort=None): result_df = self_df.merge(other_df, on=col_names, how="outer") result_df = result_df.sort_values( - by=result_df.columns[self.nlevels :], ignore_index=True + by=result_df._data.to_pandas_index()[self.nlevels :], + ignore_index=True, ) midx = MultiIndex.from_frame(result_df.iloc[:, : self.nlevels]) @@ -1691,6 +1763,7 @@ def _union(self, other, sort=None): return midx.sort_values() return midx + @_cudf_nvtx_annotate def _intersection(self, other, sort=None): if self.names != other.names: deep = True @@ -1713,6 +1786,7 @@ def _intersection(self, other, sort=None): return midx.sort_values() return midx + @_cudf_nvtx_annotate def _copy_type_metadata( self, other: Frame, include_index: bool = True ) -> Frame: @@ -1720,6 +1794,7 @@ def _copy_type_metadata( res._names = other._names return res + @_cudf_nvtx_annotate def _split_columns_by_levels(self, levels): # This function assumes that for levels with duplicate names, they are # specified by indices, not name by ``levels``. E.g. [None, None] can diff --git a/python/cudf/cudf/core/ops.py b/python/cudf/cudf/core/ops.py index fe9e012f406..c2a8c0e72fb 100644 --- a/python/cudf/cudf/core/ops.py +++ b/python/cudf/cudf/core/ops.py @@ -1,4 +1,5 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+import warnings from numbers import Number import numpy as np @@ -10,6 +11,10 @@ def sin(arbitrary): + warnings.warn( + "sin is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(arbitrary, Number): return np.sin(arbitrary) else: @@ -17,6 +22,10 @@ def sin(arbitrary): def cos(arbitrary): + warnings.warn( + "cos is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(arbitrary, Number): return np.cos(arbitrary) else: @@ -24,6 +33,10 @@ def cos(arbitrary): def tan(arbitrary): + warnings.warn( + "tan is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(arbitrary, Number): return np.tan(arbitrary) else: @@ -31,6 +44,11 @@ def tan(arbitrary): def arcsin(arbitrary): + warnings.warn( + "arcsin is deprecated and will be removed in the future", + FutureWarning, + ) + if isinstance(arbitrary, Number): return np.arcsin(arbitrary) else: @@ -38,6 +56,11 @@ def arcsin(arbitrary): def arccos(arbitrary): + warnings.warn( + "arccos is deprecated and will be removed in the future", + FutureWarning, + ) + if isinstance(arbitrary, Number): return np.arccos(arbitrary) else: @@ -45,6 +68,11 @@ def arccos(arbitrary): def arctan(arbitrary): + warnings.warn( + "arctan is deprecated and will be removed in the future", + FutureWarning, + ) + if isinstance(arbitrary, Number): return np.arctan(arbitrary) else: @@ -52,6 +80,10 @@ def arctan(arbitrary): def exp(arbitrary): + warnings.warn( + "exp is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(arbitrary, Number): return np.exp(arbitrary) else: @@ -59,6 +91,10 @@ def exp(arbitrary): def log(arbitrary): + warnings.warn( + "log is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(arbitrary, Number): return np.log(arbitrary) else: @@ -66,6 +102,10 @@ def log(arbitrary): def sqrt(arbitrary): + warnings.warn( + "sqrt is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(arbitrary, Number): return np.sqrt(arbitrary) else: @@ -73,6 +113,11 @@ def sqrt(arbitrary): def logical_not(arbitrary): + warnings.warn( + "logical_not is deprecated and will be removed in the future", + FutureWarning, + ) + if isinstance(arbitrary, Number): return np.logical_not(arbitrary) else: @@ -80,6 +125,11 @@ def logical_not(arbitrary): def logical_and(lhs, rhs): + warnings.warn( + "logical_and is deprecated and will be removed in the future", + FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.logical_and(lhs, rhs) else: @@ -87,6 +137,11 @@ def logical_and(lhs, rhs): def logical_or(lhs, rhs): + warnings.warn( + "logical_or is deprecated and will be removed in the future", + FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.logical_or(lhs, rhs) else: @@ -94,6 +149,11 @@ def logical_or(lhs, rhs): def remainder(lhs, rhs): + warnings.warn( + "remainder is deprecated and will be removed in the future", + FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.mod(lhs, rhs) elif isinstance(lhs, Frame): @@ -103,6 +163,10 @@ def remainder(lhs, rhs): def floor_divide(lhs, rhs): + warnings.warn( + "floor_divide is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.floor_divide(lhs, rhs) elif isinstance(lhs, Frame): @@ -112,6 +176,10 @@ def floor_divide(lhs, rhs): def subtract(lhs, rhs): + warnings.warn( + "subtract is deprecated and will be removed in 
the future", FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.subtract(lhs, rhs) elif isinstance(lhs, Frame): @@ -121,6 +189,10 @@ def subtract(lhs, rhs): def add(lhs, rhs): + warnings.warn( + "sin is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.add(lhs, rhs) elif isinstance(rhs, Frame): @@ -130,6 +202,10 @@ def add(lhs, rhs): def true_divide(lhs, rhs): + warnings.warn( + "sin is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.true_divide(lhs, rhs) elif isinstance(rhs, Frame): @@ -139,6 +215,10 @@ def true_divide(lhs, rhs): def multiply(lhs, rhs): + warnings.warn( + "sin is deprecated and will be removed in the future", FutureWarning, + ) + if isinstance(lhs, Number) and isinstance(rhs, Number): return np.multiply(lhs, rhs) elif isinstance(rhs, Frame): diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 43c7d0343fd..5aa7f616e35 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. import itertools from typing import Dict, Optional @@ -334,8 +334,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): else: df[name] = col - result_columns = objs[0].columns.append( - [obj.columns for obj in objs[1:]] + result_columns = ( + objs[0] + ._data.to_pandas_index() + .append([obj._data.to_pandas_index() for obj in objs[1:]]) ) if ignore_index: @@ -492,7 +494,7 @@ def melt( if not isinstance(id_vars, collections.abc.Sequence): id_vars = [id_vars] id_vars = list(id_vars) - missing = set(id_vars) - set(frame.columns) + missing = set(id_vars) - set(frame._column_names) if not len(missing) == 0: raise KeyError( f"The following 'id_vars' are not present" @@ -506,7 +508,7 @@ def melt( if not isinstance(value_vars, collections.abc.Sequence): value_vars = [value_vars] value_vars = list(value_vars) - missing = set(value_vars) - set(frame.columns) + missing = set(value_vars) - set(frame._column_names) if not len(missing) == 0: raise KeyError( f"The following 'value_vars' are not present" @@ -514,8 +516,7 @@ def melt( ) else: # then all remaining columns in frame - value_vars = frame.columns.drop(id_vars) - value_vars = list(value_vars) + value_vars = list(set(frame._column_names) - set(id_vars)) # Error for unimplemented support for datatype dtypes = [frame[col].dtype for col in id_vars + value_vars] @@ -689,7 +690,9 @@ def get_dummies( encode_fallback_dtypes = ["object", "category"] if columns is None or len(columns) == 0: - columns = df.select_dtypes(include=encode_fallback_dtypes).columns + columns = df.select_dtypes( + include=encode_fallback_dtypes + )._column_names _length_check_params(prefix, columns, "prefix") _length_check_params(prefix_sep, columns, "prefix_sep") @@ -1060,7 +1063,9 @@ def unstack(df, level, fill_value=None): ) res = df.T.stack(dropna=False) # Result's index is a multiindex - res.index.names = tuple(df.columns.names) + df.index.names + res.index.names = ( + tuple(df._data.to_pandas_index().names) + df.index.names + ) return res else: columns = df.index._poplevels(level) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 134b94bf0f2..1c81803ed98 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -10,6 +10,7 @@ 
from cudf.core.column.column import ColumnBase from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.index import BaseIndex +from cudf.core.mixins import BinaryOperand from cudf.core.series import Series from cudf.utils.dtypes import ( get_allowed_combinations_for_operator, @@ -17,7 +18,7 @@ ) -class Scalar: +class Scalar(BinaryOperand): """ A GPU-backed scalar object with NumPy scalar like properties May be used in binary operations against other scalars, cuDF @@ -57,6 +58,8 @@ class Scalar: The data type """ + _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS + def __init__(self, value, dtype=None): self._host_value = None @@ -211,69 +214,8 @@ def __float__(self): def __bool__(self): return bool(self.value) - # Scalar Binary Operations - def __add__(self, other): - return self._scalar_binop(other, "__add__") - - def __radd__(self, other): - return self._scalar_binop(other, "__radd__") - - def __sub__(self, other): - return self._scalar_binop(other, "__sub__") - - def __rsub__(self, other): - return self._scalar_binop(other, "__rsub__") - - def __mul__(self, other): - return self._scalar_binop(other, "__mul__") - - def __rmul__(self, other): - return self._scalar_binop(other, "__rmul__") - - def __truediv__(self, other): - return self._scalar_binop(other, "__truediv__") - - def __floordiv__(self, other): - return self._scalar_binop(other, "__floordiv__") - - def __rtruediv__(self, other): - return self._scalar_binop(other, "__rtruediv__") - - def __mod__(self, other): - return self._scalar_binop(other, "__mod__") - - def __divmod__(self, other): - return self._scalar_binop(other, "__divmod__") - - def __and__(self, other): - return self._scalar_binop(other, "__and__") - - def __xor__(self, other): - return self._scalar_binop(other, "__or__") - - def __pow__(self, other): - return self._scalar_binop(other, "__pow__") - - def __gt__(self, other): - return self._scalar_binop(other, "__gt__") - - def __lt__(self, other): - return self._scalar_binop(other, "__lt__") - - def __ge__(self, other): - return self._scalar_binop(other, "__ge__") - - def __le__(self, other): - return self._scalar_binop(other, "__le__") - - def __eq__(self, other): - return self._scalar_binop(other, "__eq__") - - def __ne__(self, other): - return self._scalar_binop(other, "__ne__") - def __round__(self, n): - return self._scalar_binop(n, "__round__") + return self._binaryop(n, "__round__") # Scalar Unary Operations def __abs__(self): @@ -330,7 +272,7 @@ def _binop_result_dtype_or_error(self, other, op): return cudf.dtype(out_dtype) - def _scalar_binop(self, other, op): + def _binaryop(self, other, op: str): if isinstance(other, (ColumnBase, Series, BaseIndex, np.ndarray)): # dispatch to column implementation return NotImplemented diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8574a152c44..b3b73b8961c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -9,7 +9,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size -from typing import Any, MutableMapping, Optional, Set, Union +from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Type, Union import cupy import numpy as np @@ -28,17 +28,16 @@ is_categorical_dtype, is_decimal_dtype, is_dict_like, - is_dtype_equal, is_integer, is_integer_dtype, is_interval_dtype, is_list_dtype, - is_list_like, is_scalar, is_struct_dtype, ) from cudf.core.abc import Serializable from cudf.core.column import ( + ColumnBase, 
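With the hand-written dunders deleted above, Scalar's binary operators now come from the BinaryOperand mixin; the user-facing behaviour is expected to be unchanged. Illustrative only, not part of the patch:

import cudf

x = cudf.Scalar(5)
y = cudf.Scalar(2)
(x + y).value  # 7, via the mixin-generated __add__
(x > y).value  # True
round(cudf.Scalar(3.14159), 2).value  # expected 3.14, via the retained __round__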
DatetimeColumn, TimeDeltaColumn, arange, @@ -55,7 +54,7 @@ from cudf.core.column.string import StringMethods from cudf.core.column.struct import StructMethods from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame, _drop_rows_by_labels +from cudf.core.frame import Frame from cudf.core.groupby.groupby import SeriesGroupBy from cudf.core.index import BaseIndex, RangeIndex, as_index from cudf.core.indexed_frame import ( @@ -75,11 +74,7 @@ is_mixed_with_object_dtype, min_scalar_type, ) -from cudf.utils.utils import ( - get_appropriate_dispatched_func, - get_relevant_submodule, - to_cudf_compatible_scalar, -) +from cudf.utils.utils import _cudf_nvtx_annotate, to_cudf_compatible_scalar def _append_new_row_inplace(col: ColumnLike, value: ScalarLike): @@ -98,6 +93,7 @@ class _SeriesIlocIndexer(_FrameIndexer): For integer-location based selection. """ + @_cudf_nvtx_annotate def __getitem__(self, arg): if isinstance(arg, tuple): arg = list(arg) @@ -113,6 +109,7 @@ def __getitem__(self, arg): {self._frame.name: data}, index=cudf.Index(self._frame.index[arg]), ) + @_cudf_nvtx_annotate def __setitem__(self, key, value): from cudf.core.column import column @@ -156,6 +153,7 @@ class _SeriesLocIndexer(_FrameIndexer): Label-based selection """ + @_cudf_nvtx_annotate def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: if isinstance(arg, pd.MultiIndex): arg = cudf.from_pandas(arg) @@ -178,6 +176,7 @@ def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: return self._frame.iloc[arg] + @_cudf_nvtx_annotate def __setitem__(self, key, value): try: key = self._loc_to_iloc(key) @@ -301,6 +300,7 @@ def _constructor_expanddim(self): return cudf.DataFrame @classmethod + @_cudf_nvtx_annotate def from_categorical(cls, categorical, codes=None): """Creates from a pandas.Categorical @@ -341,6 +341,7 @@ def from_categorical(cls, categorical, codes=None): return Series(data=col) @classmethod + @_cudf_nvtx_annotate def from_masked_array(cls, data, mask, null_count=None): """Create a Series with null-mask. This is equivalent to: @@ -389,6 +390,7 @@ def from_masked_array(cls, data, mask, null_count=None): col = column.as_column(data).set_mask(mask) return cls(data=col) + @_cudf_nvtx_annotate def __init__( self, data=None, index=None, dtype=None, name=None, nan_as_null=True, ): @@ -435,7 +437,7 @@ def __init__( else: data = {} - if not isinstance(data, column.ColumnBase): + if not isinstance(data, ColumnBase): data = column.as_column(data, nan_as_null=nan_as_null, dtype=dtype) else: if dtype is not None: @@ -444,12 +446,13 @@ def __init__( if index is not None and not isinstance(index, BaseIndex): index = as_index(index) - assert isinstance(data, column.ColumnBase) + assert isinstance(data, ColumnBase) super().__init__({name: data}) self._index = RangeIndex(len(data)) if index is None else index @classmethod + @_cudf_nvtx_annotate def _from_data( cls, data: MutableMapping, @@ -464,10 +467,12 @@ def _from_data( out._index = RangeIndex(out._data.nrows) return out + @_cudf_nvtx_annotate def __contains__(self, item): return item in self._index @classmethod + @_cudf_nvtx_annotate def from_pandas(cls, s, nan_as_null=None): """ Convert from a Pandas Series. @@ -508,7 +513,8 @@ def from_pandas(cls, s, nan_as_null=None): """ return cls(s, nan_as_null=nan_as_null) - @property + @property # type: ignore + @_cudf_nvtx_annotate def dt(self): """ Accessor object for datetime-like properties of the Series values. 
@@ -549,6 +555,7 @@ def dt(self): "Can only use .dt accessor with datetimelike values" ) + @_cudf_nvtx_annotate def serialize(self): header, frames = super().serialize() @@ -561,6 +568,7 @@ def serialize(self): return header, frames @classmethod + @_cudf_nvtx_annotate def deserialize(cls, header, frames): index_nframes = header["index_frame_count"] obj = super().deserialize( @@ -587,6 +595,7 @@ def _get_columns_by_label(self, labels, downcast=False): else self.__class__(dtype=self.dtype, name=self.name) ) + @_cudf_nvtx_annotate def drop( self, labels=None, @@ -597,120 +606,16 @@ def drop( inplace=False, errors="raise", ): - """ - Return Series with specified index labels removed. - - Remove elements of a Series based on specifying the index labels. - When using a multi-index, labels on different levels can be removed by - specifying the level. - - Parameters - ---------- - labels : single label or list-like - Index labels to drop. - axis : 0, default 0 - Redundant for application on Series. - index : single label or list-like - Redundant for application on Series. But ``index`` can be used - instead of ``labels`` - columns : single label or list-like - This parameter is ignored. Use ``index`` or ``labels`` to specify. - level : int or level name, optional - For MultiIndex, level from which the labels will be removed. - inplace : bool, default False - If False, return a copy. Otherwise, do operation - inplace and return None. - errors : {'ignore', 'raise'}, default 'raise' - If 'ignore', suppress error and only existing labels are - dropped. - - Returns - ------- - Series or None - Series with specified index labels removed or None if - ``inplace=True`` - - Raises - ------ - KeyError - If any of the labels is not found in the selected axis and - ``error='raise'`` - - See Also - -------- - Series.reindex - Return only specified index labels of Series - Series.dropna - Return series without null values - Series.drop_duplicates - Return series with duplicate values removed - cudf.DataFrame.drop - Drop specified labels from rows or columns in dataframe - - Examples - -------- - >>> s = cudf.Series([1,2,3], index=['x', 'y', 'z']) - >>> s - x 1 - y 2 - z 3 - dtype: int64 - - Drop labels x and z - - >>> s.drop(labels=['x', 'z']) - y 2 - dtype: int64 - - Drop a label from the second level in MultiIndex Series. 
- - >>> midx = cudf.MultiIndex.from_product([[0, 1, 2], ['x', 'y']]) - >>> s = cudf.Series(range(6), index=midx) - >>> s - 0 x 0 - y 1 - 1 x 2 - y 3 - 2 x 4 - y 5 - dtype: int64 - >>> s.drop(labels='y', level=1) - 0 x 0 - 1 x 2 - 2 x 4 - Name: 2, dtype: int64 - """ - if labels is not None: - if index is not None or columns is not None: - raise ValueError( - "Cannot specify both 'labels' and 'index'/'columns'" - ) - if axis == 1: - raise ValueError("No axis named 1 for object type Series") - target = labels - elif index is not None: - target = index - elif columns is not None: - target = [] # Ignore parameter columns - else: - raise ValueError( - "Need to specify at least one of 'labels', " - "'index' or 'columns'" - ) - - if inplace: - out = self - else: - out = self.copy() - - dropped = _drop_rows_by_labels(out, target, level, errors) - - out._data = dropped._data - out._index = dropped._index - - if not inplace: - return out + if axis == 1: + raise ValueError("No axis named 1 for object type Series") + # Ignore columns for Series + if columns is not None: + columns = [] + return super().drop( + labels, axis, index, columns, level, inplace, errors + ) + @_cudf_nvtx_annotate def append(self, to_append, ignore_index=False, verify_integrity=False): """Append values from another ``Series`` or array-like object. If ``ignore_index=True``, the index is reset. @@ -784,19 +689,9 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): 5 6 dtype: int64 """ - if verify_integrity not in (None, False): - raise NotImplementedError( - "verify_integrity parameter is not supported yet." - ) - - if is_list_like(to_append): - to_concat = [self] - to_concat.extend(to_append) - else: - to_concat = [self, to_append] - - return cudf.concat(to_concat, ignore_index=ignore_index) + return super()._append(to_append, ignore_index, verify_integrity) + @_cudf_nvtx_annotate def reindex(self, index=None, copy=True): """Return a Series that conforms to a new index @@ -832,6 +727,7 @@ def reindex(self, index=None, copy=True): series.name = self.name return series + @_cudf_nvtx_annotate @docutils.doc_apply( doc_reset_index_template.format( klass="Series", @@ -913,6 +809,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): inplace=inplace, ) + @_cudf_nvtx_annotate def to_frame(self, name=None): """Convert Series into a DataFrame @@ -955,28 +852,67 @@ def to_frame(self, name=None): return cudf.DataFrame({col: self._column}, index=self.index) + @_cudf_nvtx_annotate def memory_usage(self, index=True, deep=False): return sum(super().memory_usage(index, deep).values()) + @_cudf_nvtx_annotate def __array_function__(self, func, types, args, kwargs): - handled_types = [cudf.Series] - for t in types: - if t not in handled_types: - return NotImplemented + if "out" in kwargs or not all(issubclass(t, Series) for t in types): + return NotImplemented - cudf_submodule = get_relevant_submodule(func, cudf) - cudf_ser_submodule = get_relevant_submodule(func, cudf.Series) - cupy_submodule = get_relevant_submodule(func, cupy) - - return get_appropriate_dispatched_func( - cudf_submodule, - cudf_ser_submodule, - cupy_submodule, - func, - args, - kwargs, - ) + try: + # Apply a Series method if one exists. + if cudf_func := getattr(Series, func.__name__, None): + return cudf_func(*args, **kwargs) + + # Assume that cupy subpackages match numpy and search the + # corresponding cupy submodule based on the func's __module__. 
+ numpy_submodule = func.__module__.split(".")[1:] + cupy_func = cupy + for name in (*numpy_submodule, func.__name__): + cupy_func = getattr(cupy_func, name, None) + + # Handle case if cupy does not implement the function or just + # aliases the numpy function. + if not cupy_func or cupy_func is func: + return NotImplemented + # For now just fail on cases with mismatched indices. There is + # almost certainly no general solution for all array functions. + index = args[0].index + if not all(s.index.equals(index) for s in args): + return NotImplemented + out = cupy_func(*(s.values for s in args), **kwargs) + + # Return (host) scalar values immediately. + if not isinstance(out, cupy.ndarray): + return out + + # 0D array (scalar) + if out.ndim == 0: + return to_cudf_compatible_scalar(out) + # 1D array + elif ( + # Only allow 1D arrays + ((out.ndim == 1) or (out.ndim == 2 and out.shape[1] == 1)) + # If we have an index, it must be the same length as the + # output for cupy dispatching to be well-defined. + and len(index) == len(out) + ): + return Series(out, index=index) + except Exception: + # The rare instance where a "silent" failure is preferable. Except + # in the (highly unlikely) case that some other library + # interoperates with cudf objects, the result will be that numpy + # raises a TypeError indicating that the operation is not + # implemented, which is much friendlier than an arbitrary internal + # cudf error. + pass + + return NotImplemented + + @_cudf_nvtx_annotate def map(self, arg, na_action=None) -> "Series": """ Map values of Series according to input correspondence. @@ -1078,6 +1014,7 @@ def map(self, arg, na_action=None) -> "Series": result = self.applymap(arg) return result + @_cudf_nvtx_annotate def __getitem__(self, arg): if isinstance(arg, slice): return self.iloc[arg] @@ -1088,6 +1025,7 @@ def __getitem__(self, arg): items = SingleColumnFrame.__iter__ + @_cudf_nvtx_annotate def __setitem__(self, key, value): if isinstance(key, slice): self.iloc[key] = value @@ -1206,7 +1144,7 @@ def __repr__(self): lines.append(category_memory) return "\n".join(lines) - def _prep_for_binop( + def _make_operands_and_index_for_binop( self, other: Any, fn: str, @@ -1215,22 +1153,19 @@ def _prep_for_binop( can_reindex: bool = False, *args, **kwargs, - ): + ) -> Tuple[ + Union[ + Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], + Type[NotImplemented], + ], + Optional[BaseIndex], + ]: # Specialize binops to align indices. - if isinstance(other, SingleColumnFrame): + if isinstance(other, Series): if ( - # TODO: The can_reindex logic also needs to be applied for - # DataFrame (the methods that need it just don't exist yet). not can_reindex and fn in cudf.utils.utils._EQUALITY_OPS - and ( - isinstance(other, Series) - # TODO: mypy doesn't like this line because the index - # property is not defined on SingleColumnFrame (or Index, - # for that matter). Ignoring is the easy solution for now, - # a cleaner fix requires reworking the type hierarchy. 
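In practice the rewritten Series.__array_function__ above means a NumPy API call on a Series either resolves to the cudf method of the same name, falls through to the matching CuPy routine on the GPU values, or returns NotImplemented so NumPy raises its usual TypeError. Illustrative only, not part of the patch:

import numpy as np
import cudf

s = cudf.Series([1, 2, 3])
np.sum(s)   # dispatches to cudf.Series.sum -> 6
np.mean(s)  # dispatches to cudf.Series.mean -> 2.0
# A function with no Series method and no usable CuPy counterpart returns
# NotImplemented, and NumPy then raises TypeError as usual.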
- and not self.index.equals(other.index) # type: ignore - ) + and not self.index.equals(other.index) ): raise ValueError( "Can only compare identically-labeled Series objects" @@ -1242,48 +1177,31 @@ def _prep_for_binop( operands = lhs._make_operands_for_binop(other, fill_value, reflect) return operands, lhs._index - def _binaryop( - self, - other: Frame, - fn: str, - fill_value: Any = None, - reflect: bool = False, - can_reindex: bool = False, - *args, - **kwargs, - ): - operands, out_index = self._prep_for_binop( - other, fn, fill_value, reflect, can_reindex - ) - return ( - self._from_data( - data=self._colwise_binop(operands, fn), index=out_index, - ) - if operands is not NotImplemented - else NotImplemented - ) - + @_cudf_nvtx_annotate def logical_and(self, other): warnings.warn( "Series.logical_and is deprecated and will be removed.", FutureWarning, ) - return self._binaryop(other, "l_and").astype(np.bool_) + return self._binaryop(other, "__l_and__").astype(np.bool_) + @_cudf_nvtx_annotate def remainder(self, other): warnings.warn( "Series.remainder is deprecated and will be removed.", FutureWarning, ) - return self._binaryop(other, "mod") + return self._binaryop(other, "__mod__") + @_cudf_nvtx_annotate def logical_or(self, other): warnings.warn( "Series.logical_or is deprecated and will be removed.", FutureWarning, ) - return self._binaryop(other, "l_or").astype(np.bool_) + return self._binaryop(other, "__l_or__").astype(np.bool_) + @_cudf_nvtx_annotate def logical_not(self): warnings.warn( "Series.logical_not is deprecated and will be removed.", @@ -1293,30 +1211,36 @@ def logical_not(self): @copy_docstring(CategoricalAccessor) # type: ignore @property + @_cudf_nvtx_annotate def cat(self): return CategoricalAccessor(parent=self) @copy_docstring(StringMethods) # type: ignore @property + @_cudf_nvtx_annotate def str(self): return StringMethods(parent=self) @copy_docstring(ListMethods) # type: ignore @property + @_cudf_nvtx_annotate def list(self): return ListMethods(parent=self) @copy_docstring(StructMethods) # type: ignore @property + @_cudf_nvtx_annotate def struct(self): return StructMethods(parent=self) - @property + @property # type: ignore + @_cudf_nvtx_annotate def dtype(self): """dtype of the Series""" return self._column.dtype @classmethod + @_cudf_nvtx_annotate def _concat(cls, objs, axis=0, index=True): # Concatenate index if not provided if index is True: @@ -1386,22 +1310,26 @@ def _concat(cls, objs, axis=0, index=True): return cls(data=col, index=index, name=name) - @property + @property # type: ignore + @_cudf_nvtx_annotate def valid_count(self): """Number of non-null values""" return self._column.valid_count - @property + @property # type: ignore + @_cudf_nvtx_annotate def null_count(self): """Number of null values""" return self._column.null_count - @property + @property # type: ignore + @_cudf_nvtx_annotate def nullable(self): """A boolean indicating whether a null-mask is needed""" return self._column.nullable - @property + @property # type: ignore + @_cudf_nvtx_annotate def has_nulls(self): """ Indicator whether Series contains null values. @@ -1430,13 +1358,14 @@ def has_nulls(self): """ return self._column.has_nulls() + @_cudf_nvtx_annotate def dropna(self, axis=0, inplace=False, how=None): """ Return a Series with null values removed. Parameters ---------- - axis : {0 or ‘index’}, default 0 + axis : {0 or 'index'}, default 0 There is only one axis to drop values from. inplace : bool, default False If True, do operation inplace and return None. 
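One user-visible consequence of the binop path above: equality comparisons still refuse to align Series with mismatched indexes. For example (illustrative, not part of the patch):

import cudf

a = cudf.Series([1, 2, 3], index=[0, 1, 2])
b = cudf.Series([1, 2, 3], index=[1, 2, 3])
a == b  # raises ValueError: Can only compare identically-labeled Series objects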
@@ -1509,6 +1438,7 @@ def dropna(self, axis=0, inplace=False, how=None): return self._mimic_inplace(result, inplace=inplace) + @_cudf_nvtx_annotate def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): """ Return Series with duplicate values removed. @@ -1544,9 +1474,9 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): Name: animal, dtype: object With the `keep` parameter, the selection behaviour of duplicated - values can be changed. The value ‘first’ keeps the first + values can be changed. The value 'first' keeps the first occurrence for each set of duplicated entries. - The default value of keep is ‘first’. Note that order of + The default value of keep is 'first'. Note that order of the rows being returned is not guaranteed to be sorted. @@ -1557,7 +1487,7 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): 0 lama Name: animal, dtype: object - The value ‘last’ for parameter `keep` keeps the last occurrence + The value 'last' for parameter `keep` keeps the last occurrence for each set of duplicated entries. >>> s.drop_duplicates(keep='last') @@ -1582,6 +1512,7 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): return self._mimic_inplace(result, inplace=inplace) + @_cudf_nvtx_annotate def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1605,6 +1536,7 @@ def fillna( value=value, method=method, axis=axis, inplace=inplace, limit=limit ) + @_cudf_nvtx_annotate def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -1612,6 +1544,7 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): ) return super().all(axis, skipna, level, **kwargs) + @_cudf_nvtx_annotate def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -1619,6 +1552,7 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): ) return super().any(axis, skipna, level, **kwargs) + @_cudf_nvtx_annotate def to_pandas(self, index=True, nullable=False, **kwargs): """ Convert to a Pandas Series. @@ -1682,7 +1616,8 @@ def to_pandas(self, index=True, nullable=False, **kwargs): s.name = self.name return s - @property + @property # type: ignore + @_cudf_nvtx_annotate def data(self): """The gpu buffer for the data @@ -1708,122 +1643,31 @@ def data(self): """ # noqa: E501 return self._column.data - @property + @property # type: ignore + @_cudf_nvtx_annotate def nullmask(self): """The gpu buffer for the null-mask""" return cudf.Series(self._column.nullmask) - def astype(self, dtype, copy=False, errors="raise"): - """ - Cast the Series to the given dtype - - Parameters - ---------- - - dtype : data type, or dict of column name -> data type - Use a numpy.dtype or Python type to cast Series object to - the same type. Alternatively, use {col: dtype, ...}, where col is a - series name and dtype is a numpy.dtype or Python type to cast to. - copy : bool, default False - Return a deep-copy when ``copy=True``. Note by default - ``copy=False`` setting is used and hence changes to - values then may propagate to other cudf objects. - errors : {'raise', 'ignore', 'warn'}, default 'raise' - Control raising of exceptions on invalid data for provided dtype. - - - ``raise`` : allow exceptions to be raised - - ``ignore`` : suppress exceptions. On error return original - object. 
- - ``warn`` : prints last exceptions as warnings and - return original object. - - Returns - ------- - out : Series - Returns ``self.copy(deep=copy)`` if ``dtype`` is the same - as ``self.dtype``. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([1, 2], dtype='int32') - >>> series - 0 1 - 1 2 - dtype: int32 - >>> series.astype('int64') - 0 1 - 1 2 - dtype: int64 - - Convert to categorical type: - - >>> series.astype('category') - 0 1 - 1 2 - dtype: category - Categories (2, int64): [1, 2] - - Convert to ordered categorical type with custom ordering: - - >>> cat_dtype = cudf.CategoricalDtype(categories=[2, 1], ordered=True) - >>> series.astype(cat_dtype) - 0 1 - 1 2 - dtype: category - Categories (2, int64): [2 < 1] - - Note that using ``copy=False`` (enabled by default) - and changing data on a new Series will - propagate changes: - - >>> s1 = cudf.Series([1, 2]) - >>> s1 - 0 1 - 1 2 - dtype: int64 - >>> s2 = s1.astype('int64', copy=False) - >>> s2[0] = 10 - >>> s1 - 0 10 - 1 2 - dtype: int64 - """ - if errors not in ("ignore", "raise", "warn"): - raise ValueError("invalid error value specified") - + @_cudf_nvtx_annotate + def astype(self, dtype, copy=False, errors="raise", **kwargs): if is_dict_like(dtype): if len(dtype) > 1 or self.name not in dtype: raise KeyError( - "Only the Series name can be used for " - "the key in Series dtype mappings." + "Only the Series name can be used for the key in Series " + "dtype mappings." ) - dtype = dtype[self.name] - - if is_dtype_equal(dtype, self.dtype): - return self.copy(deep=copy) - try: - data = self._column.astype(dtype) - - return self._from_data({self.name: data}, index=self._index) - - except Exception as e: - if errors == "raise": - raise e - elif errors == "warn": - import traceback - - tb = traceback.format_exc() - warnings.warn(tb) - elif errors == "ignore": - pass - return self + else: + dtype = {self.name: dtype} + return super().astype(dtype, copy, errors, **kwargs) + @_cudf_nvtx_annotate def sort_index(self, axis=0, *args, **kwargs): if axis not in (0, "index"): raise ValueError("Only axis=0 is valid for Series.") return super().sort_index(axis=axis, *args, **kwargs) + @_cudf_nvtx_annotate def sort_values( self, axis=0, @@ -1878,6 +1722,7 @@ def sort_values( ignore_index=ignore_index, ) + @_cudf_nvtx_annotate def nlargest(self, n=5, keep="first"): """Returns a new Series of the *n* largest element. @@ -1940,6 +1785,7 @@ def nlargest(self, n=5, keep="first"): """ return self._n_largest_or_smallest(True, n, [self.name], keep) + @_cudf_nvtx_annotate def nsmallest(self, n=5, keep="first"): """ Returns a new Series of the *n* smallest element. @@ -2015,6 +1861,7 @@ def nsmallest(self, n=5, keep="first"): """ return self._n_largest_or_smallest(False, n, [self.name], keep) + @_cudf_nvtx_annotate def argsort( self, axis=0, @@ -2037,6 +1884,7 @@ def argsort( obj.name = self.name return obj + @_cudf_nvtx_annotate def replace(self, to_replace=None, value=None, *args, **kwargs): if is_dict_like(to_replace) and value is not None: raise ValueError( @@ -2046,6 +1894,7 @@ def replace(self, to_replace=None, value=None, *args, **kwargs): return super().replace(to_replace, value, *args, **kwargs) + @_cudf_nvtx_annotate def update(self, other): """ Modify Series in place using values from passed Series. 
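The slimmed-down Series.astype above normalizes the dtype argument into a {name: dtype} mapping and defers to the shared implementation in the parent class; the user-facing contract should be unchanged. Illustrative only, not part of the patch:

import cudf

s = cudf.Series([1, 2], name="a", dtype="int32")
s.astype("int64")          # works as before
s.astype({"a": "int64"})   # dict form keyed by the Series name also works
s.astype({"b": "int64"})   # raises KeyError: only the Series name may be used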
@@ -2150,6 +1999,7 @@ def update(self, other): self.mask(mask, other, inplace=True) + @_cudf_nvtx_annotate def _label_encoding(self, cats, dtype=None, na_sentinel=-1): # Private implementation of deprecated public label_encoding method def _return_sentinel_series(): @@ -2193,6 +2043,7 @@ def _return_sentinel_series(): return codes # UDF related + @_cudf_nvtx_annotate def apply(self, func, convert_dtype=True, args=(), **kwargs): """ Apply a scalar function to the values of a Series. @@ -2281,6 +2132,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): raise ValueError("Series.apply only supports convert_dtype=True") return self._apply(func, _get_scalar_kernel, *args, **kwargs) + @_cudf_nvtx_annotate def applymap(self, udf, out_dtype=None): """Apply an elementwise function to transform the values in the Column. @@ -2396,6 +2248,7 @@ def applymap(self, udf, out_dtype=None): # # Stats # + @_cudf_nvtx_annotate def count(self, level=None, **kwargs): """ Return number of non-NA/null observations in the Series @@ -2422,6 +2275,7 @@ def count(self, level=None, **kwargs): return self.valid_count + @_cudf_nvtx_annotate def mode(self, dropna=True): """ Return the mode(s) of the dataset. @@ -2490,6 +2344,7 @@ def mode(self, dropna=True): return Series(val_counts.index.sort_values(), name=self.name) + @_cudf_nvtx_annotate def round(self, decimals=0, how="half_even"): if not is_integer(decimals): raise ValueError( @@ -2498,6 +2353,7 @@ def round(self, decimals=0, how="half_even"): decimals = int(decimals) return super().round(decimals, how) + @_cudf_nvtx_annotate def cov(self, other, min_periods=None): """ Compute covariance with Series, excluding missing values. @@ -2539,8 +2395,15 @@ def cov(self, other, min_periods=None): lhs, rhs = _align_indices([lhs, rhs], how="inner") - return lhs._column.cov(rhs._column) + try: + return lhs._column.cov(rhs._column) + except AttributeError: + raise TypeError( + f"cannot perform covariance with types {self.dtype}, " + f"{other.dtype}" + ) + @_cudf_nvtx_annotate def transpose(self): """Return the transpose, which is by definition self. """ @@ -2549,6 +2412,7 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) + @_cudf_nvtx_annotate def corr(self, other, method="pearson", min_periods=None): """Calculates the sample correlation between two Series, excluding missing values. @@ -2575,8 +2439,14 @@ def corr(self, other, method="pearson", min_periods=None): rhs = other.nans_to_nulls().dropna() lhs, rhs = _align_indices([lhs, rhs], how="inner") - return lhs._column.corr(rhs._column) + try: + return lhs._column.corr(rhs._column) + except AttributeError: + raise TypeError( + f"cannot perform corr with types {self.dtype}, {other.dtype}" + ) + @_cudf_nvtx_annotate def autocorr(self, lag=1): """Compute the lag-N autocorrelation. This method computes the Pearson correlation between the Series and its shifted self. @@ -2602,6 +2472,7 @@ def autocorr(self, lag=1): """ return self.corr(self.shift(lag)) + @_cudf_nvtx_annotate def isin(self, values): """Check whether values are contained in Series. @@ -2671,6 +2542,7 @@ def isin(self, values): {self.name: self._column.isin(values)}, index=self.index ) + @_cudf_nvtx_annotate def unique(self): """ Returns unique values of this Series. 
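With the try/except added above, cov and corr on column types that do not implement them now surface a TypeError instead of a bare AttributeError; something like (illustrative, not part of the patch):

import cudf

a = cudf.Series(["x", "y", "z"])
b = cudf.Series(["p", "q", "r"])
a.corr(b)  # raises TypeError: cannot perform corr with types object, object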
@@ -2703,6 +2575,7 @@ def unique(self): res = self._column.unique() return Series(res, name=self.name) + @_cudf_nvtx_annotate def value_counts( self, normalize=False, @@ -2825,6 +2698,7 @@ def value_counts( res = res / float(res._column.sum()) return res + @_cudf_nvtx_annotate def quantile( self, q=0.5, interpolation="linear", exact=True, quant_index=True ): @@ -2889,6 +2763,7 @@ def quantile( return Series(result, index=index, name=self.name) @docutils.doc_describe() + @_cudf_nvtx_annotate def describe( self, percentiles=None, @@ -3044,6 +2919,7 @@ def _describe_timestamp(self): else: return _describe_categorical(self) + @_cudf_nvtx_annotate def digitize(self, bins, right=False): """Return the indices of the bins to which each value in series belongs. @@ -3079,6 +2955,7 @@ def digitize(self, bins, right=False): cudf.core.column.numerical.digitize(self._column, bins, right) ) + @_cudf_nvtx_annotate def diff(self, periods=1): """Calculate the difference between values at positions i and i - N in an array and store the output in a new array. @@ -3167,6 +3044,7 @@ def diff(self, periods=1): return Series(output_col, name=self.name, index=self.index) @copy_docstring(SeriesGroupBy) + @_cudf_nvtx_annotate def groupby( self, by=None, @@ -3212,6 +3090,7 @@ def groupby( ) ) + @_cudf_nvtx_annotate def rename(self, index=None, copy=True): """ Alter Series name @@ -3257,6 +3136,7 @@ def rename(self, index=None, copy=True): out_data = self._data.copy(deep=copy) return Series._from_data(out_data, self.index, name=index) + @_cudf_nvtx_annotate def merge( self, other, @@ -3308,18 +3188,21 @@ def merge( return result + @_cudf_nvtx_annotate def add_prefix(self, prefix): return Series._from_data( data=self._data.copy(deep=True), index=prefix + self.index.astype(str), ) + @_cudf_nvtx_annotate def add_suffix(self, suffix): return Series._from_data( data=self._data.copy(deep=True), index=self.index.astype(str) + suffix, ) + @_cudf_nvtx_annotate def keys(self): """ Return alias for index. @@ -3363,6 +3246,7 @@ def keys(self): """ return self.index + @_cudf_nvtx_annotate def explode(self, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -3375,7 +3259,7 @@ def explode(self, ignore_index=False): Returns ------- - DataFrame + Series Examples -------- @@ -3397,13 +3281,9 @@ def explode(self, ignore_index=False): 3 5 dtype: int64 """ - if not is_list_dtype(self._column.dtype): - data = self._data.copy(deep=True) - idx = None if ignore_index else self._index.copy(deep=True) - return self.__class__._from_data(data, index=idx) - - return super()._explode(self._column_names[0], ignore_index) + return super()._explode(self.name, ignore_index) + @_cudf_nvtx_annotate def pct_change( self, periods=1, fill_method="ffill", limit=None, freq=None ): @@ -3555,7 +3435,8 @@ class DatetimeProperties: def __init__(self, series): self.series = series - @property + @property # type: ignore + @_cudf_nvtx_annotate def year(self): """ The year of the datetime. @@ -3579,7 +3460,8 @@ def year(self): """ return self._get_dt_field("year") - @property + @property # type: ignore + @_cudf_nvtx_annotate def month(self): """ The month as January=1, December=12. @@ -3603,7 +3485,8 @@ def month(self): """ return self._get_dt_field("month") - @property + @property # type: ignore + @_cudf_nvtx_annotate def day(self): """ The day of the datetime. 
@@ -3627,7 +3510,8 @@ def day(self): """ return self._get_dt_field("day") - @property + @property # type: ignore + @_cudf_nvtx_annotate def hour(self): """ The hours of the datetime. @@ -3651,7 +3535,8 @@ def hour(self): """ return self._get_dt_field("hour") - @property + @property # type: ignore + @_cudf_nvtx_annotate def minute(self): """ The minutes of the datetime. @@ -3675,7 +3560,8 @@ def minute(self): """ return self._get_dt_field("minute") - @property + @property # type: ignore + @_cudf_nvtx_annotate def second(self): """ The seconds of the datetime. @@ -3699,7 +3585,8 @@ def second(self): """ return self._get_dt_field("second") - @property + @property # type: ignore + @_cudf_nvtx_annotate def weekday(self): """ The day of the week with Monday=0, Sunday=6. @@ -3735,7 +3622,8 @@ def weekday(self): """ return self._get_dt_field("weekday") - @property + @property # type: ignore + @_cudf_nvtx_annotate def dayofweek(self): """ The day of the week with Monday=0, Sunday=6. @@ -3771,7 +3659,8 @@ def dayofweek(self): """ return self._get_dt_field("weekday") - @property + @property # type: ignore + @_cudf_nvtx_annotate def dayofyear(self): """ The day of the year, from 1-365 in non-leap years and @@ -3808,7 +3697,8 @@ def dayofyear(self): """ return self._get_dt_field("day_of_year") - @property + @property # type: ignore + @_cudf_nvtx_annotate def day_of_year(self): """ The day of the year, from 1-365 in non-leap years and @@ -3845,7 +3735,8 @@ def day_of_year(self): """ return self._get_dt_field("day_of_year") - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_leap_year(self): """ Boolean indicator if the date belongs to a leap year. @@ -3903,7 +3794,8 @@ def is_leap_year(self): name=self.series.name, ) - @property + @property # type: ignore + @_cudf_nvtx_annotate def quarter(self): """ Integer indicator for which quarter of the year the date belongs in. @@ -3934,6 +3826,7 @@ def quarter(self): {None: res}, index=self.series._index, name=self.series.name, ) + @_cudf_nvtx_annotate def isocalendar(self): """ Returns a DataFrame with the year, week, and day @@ -3977,14 +3870,16 @@ def isocalendar(self): """ return cudf.core.tools.datetimes._to_iso_calendar(self) - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_month_start(self): """ Booleans indicating if dates are the first day of the month. """ return (self.day == 1).fillna(False) - @property + @property # type: ignore + @_cudf_nvtx_annotate def days_in_month(self): """ Get the total number of days in the month that the date falls on. @@ -4035,7 +3930,8 @@ def days_in_month(self): name=self.series.name, ) - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_month_end(self): """ Boolean indicator if the date is the last day of the month. @@ -4081,7 +3977,8 @@ def is_month_end(self): ) return (self.day == last_day.dt.day).fillna(False) - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_quarter_start(self): """ Boolean indicator if the date is the first day of a quarter. @@ -4127,7 +4024,8 @@ def is_quarter_start(self): {None: result}, index=self.series._index, name=self.series.name, ) - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_quarter_end(self): """ Boolean indicator if the date is the last day of a quarter. 
@@ -4175,7 +4073,8 @@ def is_quarter_end(self): {None: result}, index=self.series._index, name=self.series.name, ) - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_year_start(self): """ Boolean indicator if the date is the first day of the year. @@ -4209,7 +4108,8 @@ def is_year_start(self): name=self.series.name, ) - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_year_end(self): """ Boolean indicator if the date is the last day of the year. @@ -4245,12 +4145,14 @@ def is_year_end(self): {None: result}, index=self.series._index, name=self.series.name, ) + @_cudf_nvtx_annotate def _get_dt_field(self, field): out_column = self.series._column.get_dt_field(field) return Series( data=out_column, index=self.series._index, name=self.series.name ) + @_cudf_nvtx_annotate def ceil(self, freq): """ Perform ceil operation on the data to the specified freq. @@ -4287,6 +4189,7 @@ def ceil(self, freq): data={self.series.name: out_column}, index=self.series._index ) + @_cudf_nvtx_annotate def floor(self, freq): """ Perform floor operation on the data to the specified freq. @@ -4323,6 +4226,7 @@ def floor(self, freq): data={self.series.name: out_column}, index=self.series._index ) + @_cudf_nvtx_annotate def round(self, freq): """ Perform round operation on the data to the specified freq. @@ -4362,6 +4266,7 @@ def round(self, freq): data={self.series.name: out_column}, index=self.series._index ) + @_cudf_nvtx_annotate def strftime(self, date_format, *args, **kwargs): """ Convert to Series using specified ``date_format``. @@ -4515,7 +4420,8 @@ class TimedeltaProperties: def __init__(self, series): self.series = series - @property + @property # type: ignore + @_cudf_nvtx_annotate def days(self): """ Number of days. @@ -4546,7 +4452,8 @@ def days(self): """ return self._get_td_field("days") - @property + @property # type: ignore + @_cudf_nvtx_annotate def seconds(self): """ Number of seconds (>= 0 and less than 1 day). @@ -4584,7 +4491,8 @@ def seconds(self): """ return self._get_td_field("seconds") - @property + @property # type: ignore + @_cudf_nvtx_annotate def microseconds(self): """ Number of microseconds (>= 0 and less than 1 second). @@ -4615,7 +4523,8 @@ def microseconds(self): """ return self._get_td_field("microseconds") - @property + @property # type: ignore + @_cudf_nvtx_annotate def nanoseconds(self): """ Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. @@ -4646,7 +4555,8 @@ def nanoseconds(self): """ return self._get_td_field("nanoseconds") - @property + @property # type: ignore + @_cudf_nvtx_annotate def components(self): """ Return a Dataframe of the components of the Timedeltas. @@ -4675,6 +4585,7 @@ def components(self): """ # noqa: E501 return self.series._column.components(index=self.series._index) + @_cudf_nvtx_annotate def _get_td_field(self, field): out_column = getattr(self.series._column, field) return Series( @@ -4682,6 +4593,7 @@ def _get_td_field(self, field): ) +@_cudf_nvtx_annotate def _align_indices(series_list, how="outer", allow_non_unique=False): """ Internal util to align the indices of a list of Series objects @@ -4749,6 +4661,7 @@ def _align_indices(series_list, how="outer", allow_non_unique=False): return result +@_cudf_nvtx_annotate def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): """Returns a boolean array where two arrays are equal within a tolerance. 
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 50b206d3388..b35d653e28f 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,8 +3,16 @@ from __future__ import annotations -import builtins -from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union +from typing import ( + Any, + Dict, + MutableMapping, + Optional, + Tuple, + Type, + TypeVar, + Union, +) import cupy import numpy as np @@ -15,7 +23,7 @@ from cudf.api.types import _is_scalar_or_zero_d_array from cudf.core.column import ColumnBase, as_column from cudf.core.frame import Frame -from cudf.utils.utils import NotIterable +from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate T = TypeVar("T", bound="Frame") @@ -33,6 +41,7 @@ class SingleColumnFrame(Frame, NotIterable): "index": 0, } + @_cudf_nvtx_annotate def _reduce( self, op, axis=None, level=None, numeric_only=None, **kwargs, ): @@ -46,8 +55,12 @@ def _reduce( raise NotImplementedError( "numeric_only parameter is not implemented yet" ) - return getattr(self._column, op)(**kwargs) + try: + return getattr(self._column, op)(**kwargs) + except AttributeError: + raise TypeError(f"cannot perform {op} with type {self.dtype}") + @_cudf_nvtx_annotate def _scan(self, op, axis=None, *args, **kwargs): if axis not in (None, 0): raise NotImplementedError("axis parameter is not implemented yet") @@ -55,6 +68,7 @@ def _scan(self, op, axis=None, *args, **kwargs): return super()._scan(op, axis=axis, *args, **kwargs) @classmethod + @_cudf_nvtx_annotate def _from_data( cls, data: MutableMapping, @@ -67,21 +81,25 @@ def _from_data( out.name = name return out - @property + @property # type: ignore + @_cudf_nvtx_annotate def name(self): """Get the name of this object.""" return next(iter(self._data.names)) - @name.setter + @name.setter # type: ignore + @_cudf_nvtx_annotate def name(self, value): self._data[value] = self._data.pop(self.name) - @property + @property # type: ignore + @_cudf_nvtx_annotate def ndim(self): """Get the dimensionality (always 1 for single-columned frames).""" return 1 - @property + @property # type: ignore + @_cudf_nvtx_annotate def shape(self): """Get a tuple representing the dimensionality of the Index.""" return (len(self),) @@ -92,26 +110,32 @@ def __bool__(self): "a.empty, a.bool(), a.item(), a.any() or a.all()." ) - @property + @property # type: ignore + @_cudf_nvtx_annotate def _num_columns(self): return 1 - @property + @property # type: ignore + @_cudf_nvtx_annotate def _column(self): return self._data[self.name] - @_column.setter + @_column.setter # type: ignore + @_cudf_nvtx_annotate def _column(self, value): self._data[self.name] = value - @property + @property # type: ignore + @_cudf_nvtx_annotate def values(self): # noqa: D102 return self._column.values - @property + @property # type: ignore + @_cudf_nvtx_annotate def values_host(self): # noqa: D102 return self._column.values_host + @_cudf_nvtx_annotate def to_cupy( self, dtype: Union[Dtype, None] = None, @@ -120,6 +144,7 @@ def to_cupy( ) -> cupy.ndarray: # noqa: D102 return super().to_cupy(dtype, copy, na_value).flatten() + @_cudf_nvtx_annotate def to_numpy( self, dtype: Union[Dtype, None] = None, @@ -139,6 +164,7 @@ def tolist(self): # noqa: D102 to_list = tolist @classmethod + @_cudf_nvtx_annotate def from_arrow(cls, array): """Create from PyArrow Array/ChunkedArray. 
@@ -169,6 +195,7 @@ def from_arrow(cls, array): """ return cls(ColumnBase.from_arrow(array)) + @_cudf_nvtx_annotate def to_arrow(self): """ Convert to a PyArrow Array. @@ -199,7 +226,8 @@ def to_arrow(self): """ return self._column.to_arrow() - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_unique(self): """Return boolean if values in the object are unique. @@ -209,7 +237,8 @@ def is_unique(self): """ return self._column.is_unique - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_monotonic(self): """Return boolean if values in the object are monotonically increasing. @@ -221,7 +250,8 @@ def is_monotonic(self): """ return self.is_monotonic_increasing - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_monotonic_increasing(self): """Return boolean if values in the object are monotonically increasing. @@ -231,7 +261,8 @@ def is_monotonic_increasing(self): """ return self._column.is_monotonic_increasing - @property + @property # type: ignore + @_cudf_nvtx_annotate def is_monotonic_decreasing(self): """Return boolean if values in the object are monotonically decreasing. @@ -241,10 +272,12 @@ def is_monotonic_decreasing(self): """ return self._column.is_monotonic_decreasing - @property + @property # type: ignore + @_cudf_nvtx_annotate def __cuda_array_interface__(self): return self._column.__cuda_array_interface__ + @_cudf_nvtx_annotate def factorize(self, na_sentinel=-1): """Encode the input values as integer labels. @@ -272,6 +305,7 @@ def factorize(self, na_sentinel=-1): """ return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel) + @_cudf_nvtx_annotate def _make_operands_for_binop( self, other: Any, @@ -279,7 +313,10 @@ def _make_operands_for_binop( reflect: bool = False, *args, **kwargs, - ) -> Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]]: + ) -> Union[ + Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], + Type[NotImplemented], + ]: """Generate the dictionary of operands used for a binary operation. Parameters @@ -322,14 +359,13 @@ def _make_operands_for_binop( return {result_name: (self._column, other, reflect, fill_value)} - def nunique(self, method: builtins.str = "sort", dropna: bool = True): + @_cudf_nvtx_annotate + def nunique(self, dropna: bool = True): """ Return count of unique values for the column. Parameters ---------- - method : builtins.str, default "sort" - Method used by cpp_distinct_count dropna : bool, default True Don't include NaN in the counts. @@ -340,4 +376,4 @@ def nunique(self, method: builtins.str = "sort", dropna: bool = True): """ if self._column.null_count == len(self): return 0 - return self._column.distinct_count(method=method, dropna=dropna) + return self._column.distinct_count(dropna=dropna) diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index a98ee40274e..f5c270a3705 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -1,3 +1,5 @@ +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+ from typing import Callable import cachetools @@ -6,7 +8,6 @@ from numba.core.errors import TypingError from numba.np import numpy_support from numba.types import Poison, Tuple, boolean, int64, void -from nvtx import annotate from cudf.core.dtypes import CategoricalDtype from cudf.core.udf.typing import MaskedType @@ -17,6 +18,7 @@ NUMERIC_TYPES, TIMEDELTA_TYPES, ) +from cudf.utils.utils import _cudf_nvtx_annotate JIT_SUPPORTED_TYPES = ( NUMERIC_TYPES | BOOL_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES @@ -28,7 +30,7 @@ precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) -@annotate("NUMBA JIT", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _get_udf_return_type(argty, func: Callable, args=()): """ Get the return type of a masked UDF for a given set of argument dtypes. It @@ -165,7 +167,7 @@ def _generate_cache_key(frame, func: Callable): ) -@annotate("UDF COMPILATION", color="darkgreen", domain="cudf_python") +@_cudf_nvtx_annotate def _compile_or_get(frame, func, args, kernel_getter=None): """ Return a compiled kernel in terms of MaskedTypes that launches a diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 8ffd75b1d76..fa482d52104 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION +# Copyright (c) 2020-2022, NVIDIA CORPORATION import itertools @@ -11,11 +11,12 @@ from cudf.api.types import is_integer, is_number from cudf.core import column from cudf.core.column.column import as_column +from cudf.core.mixins import Reducible from cudf.utils import cudautils from cudf.utils.utils import GetAttrGetItemMixin -class Rolling(GetAttrGetItemMixin): +class Rolling(GetAttrGetItemMixin, Reducible): """ Rolling window calculations. 
@@ -163,6 +164,15 @@ class Rolling(GetAttrGetItemMixin): _time_window = False + _VALID_REDUCTIONS = { + "sum", + "min", + "max", + "mean", + "var", + "std", + } + def __init__( self, obj, @@ -198,8 +208,7 @@ def __getitem__(self, arg): center=self.center, ) - def _apply_agg_series(self, sr, agg_name): - source_column = sr._column + def _apply_agg_column(self, source_column, agg_name): min_periods = self.min_periods or 1 if isinstance(self.window, int): preceding_window = None @@ -230,7 +239,7 @@ def _apply_agg_series(self, sr, agg_name): ) window = None - result_col = libcudf.rolling.rolling( + return libcudf.rolling.rolling( source_column=source_column, pre_column_window=preceding_window, fwd_column_window=following_window, @@ -240,33 +249,40 @@ def _apply_agg_series(self, sr, agg_name): op=agg_name, agg_params=self.agg_params, ) - return sr._from_data({sr.name: result_col}, sr._index) def _apply_agg_dataframe(self, df, agg_name): - result_df = cudf.DataFrame({}) - for i, col_name in enumerate(df.columns): - result_col = self._apply_agg_series(df[col_name], agg_name) - result_df.insert(i, col_name, result_col) - result_df.index = df.index - return result_df + return cudf.DataFrame._from_data( + { + col_name: self._apply_agg_column(col, agg_name) + for col_name, col in df._data.items() + }, + index=df.index, + ) def _apply_agg(self, agg_name): if isinstance(self.obj, cudf.Series): - return self._apply_agg_series(self.obj, agg_name) + return cudf.Series._from_data( + { + self.obj.name: self._apply_agg_column( + self.obj._column, agg_name + ) + }, + index=self.obj.index, + ) else: return self._apply_agg_dataframe(self.obj, agg_name) - def sum(self): - return self._apply_agg("sum") - - def min(self): - return self._apply_agg("min") - - def max(self): - return self._apply_agg("max") + def _reduce( + self, op: str, *args, **kwargs, + ): + """Calculate the rolling {op}. - def mean(self): - return self._apply_agg("mean") + Returns + ------- + Series or DataFrame + Return type is the same as the original object. + """ + return self._apply_agg(op) def var(self, ddof=1): self.agg_params["ddof"] = ddof diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 4694243ad18..f15fef19c07 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -1,17 +1,17 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
from io import BytesIO, StringIO -from nvtx import annotate from pyarrow.lib import NativeFile import cudf from cudf import _lib as libcudf from cudf.api.types import is_scalar from cudf.utils import ioutils +from cudf.utils.utils import _cudf_nvtx_annotate -@annotate("READ_CSV", color="purple", domain="cudf_python") +@_cudf_nvtx_annotate @ioutils.doc_read_csv() def read_csv( filepath_or_buffer, @@ -106,7 +106,7 @@ def read_csv( ) -@annotate("WRITE_CSV", color="purple", domain="cudf_python") +@_cudf_nvtx_annotate @ioutils.doc_to_csv() def to_csv( df, diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 948428de4f0..253d7950c54 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -7,7 +7,6 @@ from uuid import uuid4 import numpy as np -from nvtx import annotate from pyarrow import dataset as ds, parquet as pq import cudf @@ -15,9 +14,10 @@ from cudf.api.types import is_list_like from cudf.core.column import as_column, build_categorical_column from cudf.utils import ioutils +from cudf.utils.utils import _cudf_nvtx_annotate -@annotate("_WRITE_PARQUET", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _write_parquet( df, paths, @@ -75,7 +75,7 @@ def _write_parquet( # Logic chosen to match: https://arrow.apache.org/ # docs/_modules/pyarrow/parquet.html#write_to_dataset -@annotate("WRITE_TO_DATASET", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def write_to_dataset( df, root_path, @@ -164,7 +164,7 @@ def write_to_dataset( @ioutils.doc_read_parquet_metadata() -@annotate("READ_PARQUET_METADATA", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def read_parquet_metadata(path): """{docstring}""" @@ -177,7 +177,7 @@ def read_parquet_metadata(path): return num_rows, num_row_groups, col_names -@annotate("_PROCESS_DATASET", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _process_dataset( paths, fs, filters=None, row_groups=None, categorical_partitions=True, ): @@ -313,7 +313,7 @@ def _process_dataset( @ioutils.doc_read_parquet() -@annotate("READ_PARQUET", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def read_parquet( filepath_or_buffer, engine="cudf", @@ -441,7 +441,7 @@ def read_parquet( ) -@annotate("_PARQUET_TO_FRAME", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _parquet_to_frame( paths_or_buffers, *args, @@ -509,7 +509,7 @@ def _parquet_to_frame( ) -@annotate("_WRITE_PARQUET", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _read_parquet( filepaths_or_buffers, engine, @@ -543,7 +543,7 @@ def _read_parquet( @ioutils.doc_to_parquet() -@annotate("TO_PARQUET", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def to_parquet( df, path, @@ -565,7 +565,7 @@ def to_parquet( if engine == "cudf": # Ensure that no columns dtype is 'category' - for col in df.columns: + for col in df._column_names: if partition_cols is None or col not in partition_cols: if df[col].dtype.name == "category": raise ValueError( @@ -655,7 +655,7 @@ def _generate_filename(): return uuid4().hex + ".parquet" -@annotate("_GET_PARTITIONED", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _get_partitioned( df, root_path, @@ -699,7 +699,7 @@ def _get_partitioned( class ParquetDatasetWriter: - @annotate("ParquetDatasetWriter_INIT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __init__( self, path, @@ -776,9 +776,7 @@ def __init__( self.path_cw_map: Dict[str, int] = {} self.filename = None - @annotate( - "ParquetDatasetWriter_WRITE_TABLE", 
color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def write_table(self, df): """ Write a dataframe to the file/dataset @@ -835,9 +833,7 @@ def write_table(self, df): self.path_cw_map.update({k: new_cw_idx for k in new_paths}) self._chunked_writers[-1][0].write_table(grouped_df, part_info) - @annotate( - "ParquetDatasetWriter_CLOSE", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def close(self, return_metadata=False): """ Close all open files and optionally return footer metadata as a binary diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index 705645b8349..e5a3beb7d61 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -1,18 +1,17 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. from io import BytesIO, StringIO -from nvtx import annotate - import cudf from cudf._lib import text as libtext from cudf.utils import ioutils +from cudf.utils.utils import _cudf_nvtx_annotate -@annotate("READ_TEXT", color="purple", domain="cudf_python") +@_cudf_nvtx_annotate @ioutils.doc_read_text() def read_text( - filepath_or_buffer, delimiter=None, **kwargs, + filepath_or_buffer, delimiter=None, byte_range=None, **kwargs, ): """{docstring}""" @@ -24,5 +23,7 @@ def read_text( ) return cudf.Series._from_data( - libtext.read_text(filepath_or_buffer, delimiter=delimiter,) + libtext.read_text( + filepath_or_buffer, delimiter=delimiter, byte_range=byte_range + ) ) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index e767c0c62be..f6b5e0f3ccc 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -333,6 +333,35 @@ def xfail_param(param, **kwargs): return pytest.param(param, marks=pytest.mark.xfail(**kwargs)) +def assert_column_memory_eq( + lhs: cudf.core.column.ColumnBase, rhs: cudf.core.column.ColumnBase +): + """Assert the memory location and size of `lhs` and `rhs` are equivalent. + + Both data pointer and mask pointer are checked. Also recursively check for + children to the same contarints. Also fails check if the number of children + mismatches at any level. + """ + assert lhs.base_data_ptr == rhs.base_data_ptr + assert lhs.base_mask_ptr == rhs.base_mask_ptr + assert lhs.base_size == rhs.base_size + assert lhs.offset == rhs.offset + assert lhs.size == rhs.size + assert len(lhs.base_children) == len(rhs.base_children) + for lhs_child, rhs_child in zip(lhs.base_children, rhs.base_children): + assert_column_memory_eq(lhs_child, rhs_child) + + +def assert_column_memory_ne( + lhs: cudf.core.column.ColumnBase, rhs: cudf.core.column.ColumnBase +): + try: + assert_column_memory_eq(lhs, rhs) + except AssertionError: + return + raise AssertionError("lhs and rhs holds the same memory.") + + parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize( "left_dtype,right_dtype", list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)), diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index b3e30fac7d5..5f7616cc75e 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from __future__ import annotations @@ -696,8 +696,8 @@ def assert_frame_equal( if PANDAS_GE_110: pd.testing.assert_index_equal( - left.columns, - right.columns, + left._data.to_pandas_index(), + right._data.to_pandas_index(), exact=check_column_type, check_names=check_names, check_exact=check_exact, @@ -708,8 +708,8 @@ def assert_frame_equal( ) else: pd.testing.assert_index_equal( - left.columns, - right.columns, + left._data.to_pandas_index(), + right._data.to_pandas_index(), exact=check_column_type, check_names=check_names, check_exact=check_exact, @@ -717,7 +717,7 @@ def assert_frame_equal( obj=f"{obj}.columns", ) - for col in left.columns: + for col in left._column_names: assert_column_equal( left._data[col], right._data[col], diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 4d5b5926d6e..4a42d811c80 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -1,12 +1,17 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. +import itertools import os import pathlib +import cupy as cp +import numpy as np import pytest import rmm # noqa: F401 +from cudf.testing._utils import assert_eq + _CURRENT_DIRECTORY = str(pathlib.Path(__file__).resolve().parent) @@ -15,6 +20,103 @@ def datadir(): return pathlib.Path(__file__).parent / "data" +@pytest.fixture( + params=itertools.product([0, 2, None], [0.3, None]), + ids=lambda arg: f"n={arg[0]}-frac={arg[1]}", +) +def sample_n_frac(request): + """ + Specific to `test_sample*` tests. + """ + n, frac = request.param + if n is not None and frac is not None: + pytest.skip("Cannot specify both n and frac.") + return n, frac + + +def shape_checker(expected, got): + assert expected.shape == got.shape + + +def exact_checker(expected, got): + assert_eq(expected, got) + + +@pytest.fixture( + params=[ + (None, None, shape_checker), + (42, 42, shape_checker), + (np.random.RandomState(42), np.random.RandomState(42), exact_checker), + ], + ids=["None", "IntSeed", "NumpyRandomState"], +) +def random_state_tuple_axis_1(request): + """ + Specific to `test_sample*_axis_1` tests. + A pytest fixture of valid `random_state` parameter pairs for pandas + and cudf. Valid parameter combinations, and what to check for each pair + are listed below: + + pandas: None, seed(int), np.random.RandomState + cudf: None, seed(int), np.random.RandomState + ------ + check: shape, shape, exact result + + Each column above stands for one valid parameter combination and check. + """ + + return request.param + + +@pytest.fixture( + params=[ + (None, None, shape_checker), + (42, 42, shape_checker), + (np.random.RandomState(42), np.random.RandomState(42), exact_checker), + (np.random.RandomState(42), cp.random.RandomState(42), shape_checker), + ], + ids=["None", "IntSeed", "NumpyRandomState", "CupyRandomState"], +) +def random_state_tuple_axis_0(request): + """ + Specific to `test_sample*_axis_0` tests. + A pytest fixture of valid `random_state` parameter pairs for pandas + and cudf. Valid parameter combinations, and what to check for each pair + are listed below: + + pandas: None, seed(int), np.random.RandomState, np.random.RandomState + cudf: None, seed(int), np.random.RandomState, cp.random.RandomState + ------ + check: shape, shape, exact result, shape + + Each column above stands for one valid parameter combination and check. + """ + + return request.param + + +@pytest.fixture(params=[None, "builtin_list", "ndarray"]) +def make_weights_axis_0(request): + """Specific to `test_sample*_axis_0` tests. 
+    Only weights arrays whose type matches the random state are tested.
+    """
+
+    if request.param is None:
+        return lambda *_: (None, None)
+    elif request.param == "builtin_list":
+        return lambda size, _: ([1] * size, [1] * size)
+    else:
+
+        def wrapped(size, numpy_weights_for_cudf):
+            # Uniform distribution, non-normalized
+            if numpy_weights_for_cudf:
+                return np.ones(size), np.ones(size)
+            else:
+                return np.ones(size), cp.ones(size)
+
+        return wrapped
+
+
 # To set and remove the NO_EXTERNAL_ONLY_APIS environment variable we must use
 # the sessionstart and sessionfinish hooks rather than a simple autouse,
 # session-scope fixture because we need to set these variable before collection
diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py
index e4b4d5020ea..9d762f26ebd 100644
--- a/python/cudf/cudf/tests/test_array_ufunc.py
+++ b/python/cudf/cudf/tests/test_array_ufunc.py
@@ -50,6 +50,83 @@ def _hide_ufunc_warnings(ufunc):
         yield
 
 
+@pytest.mark.parametrize("ufunc", _UFUNCS)
+def test_ufunc_index(ufunc):
+    # Note: This test assumes that all ufuncs are unary or binary.
+    fname = ufunc.__name__
+
+    N = 100
+    # Avoid zeros in either array to skip division by 0 errors. Also limit the
+    # scale to avoid issues with overflow, etc. We use ints because some
+    # operations (like bitwise ops) are not defined for floats.
+    pandas_args = args = [
+        cudf.Index(cp.random.randint(low=1, high=10, size=N),)
+        for _ in range(ufunc.nin)
+    ]
+
+    try:
+        got = ufunc(*args)
+    except AttributeError as e:
+        # We xfail if we don't have an explicit dispatch and cupy doesn't have
+        # the method so that we can easily identify these methods. As of this
+        # writing, the only missing methods are isnat and heaviside.
+        if "module 'cupy' has no attribute" in str(e):
+            pytest.xfail(reason="Operation not supported by cupy")
+        raise
+
+    expect = ufunc(*(arg.to_pandas() for arg in pandas_args))
+
+    try:
+        if ufunc.nout > 1:
+            for g, e in zip(got, expect):
+                assert_eq(g, e, check_exact=False)
+        else:
+            assert_eq(got, expect, check_exact=False)
+    except AssertionError:
+        # TODO: This branch can be removed when
+        # https://github.com/rapidsai/cudf/issues/10178 is resolved
+        if fname in ("power", "float_power"):
+            if (got - expect).abs().max() == 1:
+                pytest.xfail("https://github.com/rapidsai/cudf/issues/10178")
+        raise
+
+
+@pytest.mark.parametrize(
+    "ufunc", [np.add, np.greater, np.greater_equal, np.logical_and]
+)
+@pytest.mark.parametrize("type_", ["cupy", "numpy", "list"])
+@pytest.mark.parametrize("reflect", [True, False])
+def test_binary_ufunc_index_array(ufunc, type_, reflect):
+    N = 100
+    # These binary ufuncs are all well-defined for floating-point inputs, so
+    # uniformly random values in [0, 1) are sufficient here, unlike the
+    # integer inputs required by the bitwise operations tested above.
+ args = [cudf.Index(cp.random.rand(N)) for _ in range(ufunc.nin)] + + arg1 = args[1].to_cupy() if type_ == "cupy" else args[1].to_numpy() + if type_ == "list": + arg1 = arg1.tolist() + + if reflect: + got = ufunc(arg1, args[0]) + expect = ufunc(args[1].to_numpy(), args[0].to_pandas()) + else: + got = ufunc(args[0], arg1) + expect = ufunc(args[0].to_pandas(), args[1].to_numpy()) + + if ufunc.nout > 1: + for g, e in zip(got, expect): + if type_ == "cupy" and reflect: + assert (cp.asnumpy(g) == e).all() + else: + assert_eq(g, e, check_exact=False) + else: + if type_ == "cupy" and reflect: + assert (cp.asnumpy(got) == expect).all() + else: + assert_eq(got, expect, check_exact=False) + + @pytest.mark.parametrize("ufunc", _UFUNCS) @pytest.mark.parametrize("has_nulls", [True, False]) @pytest.mark.parametrize("indexed", [True, False]) @@ -117,11 +194,11 @@ def test_ufunc_series(ufunc, has_nulls, indexed): for g, e in zip(got, expect): if has_nulls: e[mask] = np.nan - assert_eq(g, e) + assert_eq(g, e, check_exact=False) else: if has_nulls: expect[mask] = np.nan - assert_eq(got, expect) + assert_eq(got, expect, check_exact=False) except AssertionError: # TODO: This branch can be removed when # https://github.com/rapidsai/cudf/issues/10178 is resolved @@ -195,14 +272,14 @@ def test_binary_ufunc_series_array(ufunc, has_nulls, indexed, type_, reflect): if type_ == "cupy" and reflect: assert (cp.asnumpy(g) == e).all() else: - assert_eq(g, e) + assert_eq(g, e, check_exact=False) else: if has_nulls: expect[mask] = np.nan if type_ == "cupy" and reflect: assert (cp.asnumpy(got) == expect).all() else: - assert_eq(got, expect) + assert_eq(got, expect, check_exact=False) @pytest.mark.parametrize( @@ -298,11 +375,11 @@ def test_ufunc_dataframe(ufunc, has_nulls, indexed): for g, e in zip(got, expect): if has_nulls: e[mask] = np.nan - assert_eq(g, e) + assert_eq(g, e, check_exact=False) else: if has_nulls: expect[mask] = np.nan - assert_eq(got, expect) + assert_eq(got, expect, check_exact=False) except AssertionError: # TODO: This branch can be removed when # https://github.com/rapidsai/cudf/issues/10178 is resolved diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index c98568d53a5..db12743ac17 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1768,10 +1768,6 @@ def test_binops_with_lhs_numpy_scalar(frame, dtype): expected = data.to_pandas() == val got = data == val - # In case of index, expected would be a numpy array - if isinstance(data, cudf.BaseIndex): - expected = pd.Index(expected) - utils.assert_eq(expected, got) @@ -2969,7 +2965,7 @@ def test_binops_non_cudf_types(obj_class, binop, other_type): data = range(1, 100) lhs = obj_class(data) rhs = other_type(data) - assert cp.all((binop(lhs, rhs) == binop(lhs, lhs)).values) + assert (binop(lhs, rhs) == binop(lhs, lhs)).all() @pytest.mark.parametrize("binop", _binops + _binops_compare) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index bc3ae721554..19a5cd4a49d 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -1,14 +1,17 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
import operator import string +import warnings +from contextlib import contextmanager +from textwrap import dedent import numpy as np import pandas as pd import pytest import cudf -from cudf.core._compat import PANDAS_GE_110 +from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_134 from cudf.testing._utils import ( NUMERIC_TYPES, assert_eq, @@ -16,6 +19,30 @@ ) +@contextmanager +def _hide_deprecated_pandas_categorical_inplace_warnings(function_name): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + ( + "The `inplace` parameter in " + f"pandas.Categorical.{function_name} is deprecated and will " + "be removed in a future version." + ), + category=FutureWarning, + ) + yield + + +@contextmanager +def _hide_cudf_safe_casting_warning(): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Can't safely cast column", category=UserWarning, + ) + yield + + @pytest.fixture def pd_str_cat(): categories = list("abc") @@ -51,9 +78,8 @@ def test_categorical_basic(): assert_eq(cat.codes, cudf_cat.codes.to_numpy()) +@pytest.mark.skipif(not PANDAS_GE_110, reason="requires pandas>=1.1.0") def test_categorical_integer(): - if not PANDAS_GE_110: - pytest.xfail(reason="pandas >=1.1 required") cat = pd.Categorical(["a", "_", "_", "c", "a"], categories=["a", "b", "c"]) pdsr = pd.Series(cat) sr = cudf.Series(cat) @@ -67,17 +93,17 @@ def test_categorical_integer(): sr.cat.codes.astype(pdsr.cat.codes.dtype).fillna(-1).to_numpy(), ) - string = str(sr) - expect_str = """ -0 a -1 -2 -3 c -4 a -dtype: category -Categories (3, object): ['a', 'b', 'c'] -""" - assert string.split() == expect_str.split() + expect_str = dedent( + """\ + 0 a + 1 + 2 + 3 c + 4 a + dtype: category + Categories (3, object): ['a', 'b', 'c']""" + ) + assert str(sr) == expect_str def test_categorical_compare_unordered(): @@ -152,23 +178,9 @@ def test_categorical_binary_add(): rfunc=operator.add, lfunc_args_and_kwargs=([pdsr, pdsr],), rfunc_args_and_kwargs=([sr, sr],), - expected_error_message="Series of dtype `category` cannot perform " - "the operation: add", - ) - - -def test_categorical_unary_ceil(): - cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) - pdsr = pd.Series(cat) - sr = cudf.Series(cat) - - assert_exceptions_equal( - lfunc=getattr, - rfunc=sr.ceil, - lfunc_args_and_kwargs=([pdsr, "ceil"],), - check_exception_type=False, - expected_error_message="Series of dtype `category` cannot " - "perform the operation: ceil", + expected_error_message=( + "Series of dtype `category` cannot perform the operation: add" + ), ) @@ -238,26 +250,25 @@ def test_cat_series_binop_error(): df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) df["b"] = np.arange(len(df)) - dfa = df["a"] - dfb = df["b"] + pdf = df.to_pandas() - # lhs is a categorical + # lhs is categorical assert_exceptions_equal( lfunc=operator.add, rfunc=operator.add, - lfunc_args_and_kwargs=([dfa, dfb],), - rfunc_args_and_kwargs=([dfa, dfb],), - check_exception_type=False, - expected_error_message="Series of dtype `category` cannot " - "perform the operation: add", + lfunc_args_and_kwargs=([pdf["a"], pdf["b"]],), + rfunc_args_and_kwargs=([df["a"], df["b"]],), + expected_error_message=( + "Series of dtype `category` cannot perform the operation: add" + ), ) - # if lhs is a numerical + + # lhs is numerical assert_exceptions_equal( lfunc=operator.add, rfunc=operator.add, - lfunc_args_and_kwargs=([dfb, dfa],), - rfunc_args_and_kwargs=([dfb, dfa],), - check_exception_type=False, + 
lfunc_args_and_kwargs=([pdf["b"], pdf["a"]],), + rfunc_args_and_kwargs=([df["b"], df["a"]],), expected_error_message="'add' operator not supported", ) @@ -367,8 +378,9 @@ def test_categorical_as_ordered(pd_str_cat, inplace): pd_sr_1 = pd_sr.cat.as_ordered(inplace=inplace) cd_sr_1 = cd_sr.cat.as_ordered(inplace=inplace) - pd_sr_1 = pd_sr if pd_sr_1 is None else pd_sr_1 - cd_sr_1 = cd_sr if cd_sr_1 is None else cd_sr_1 + if inplace: + pd_sr_1 = pd_sr + cd_sr_1 = cd_sr assert cd_sr_1.cat.ordered is True assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered @@ -386,8 +398,9 @@ def test_categorical_as_unordered(pd_str_cat, inplace): pd_sr_1 = pd_sr.cat.as_unordered(inplace=inplace) cd_sr_1 = cd_sr.cat.as_unordered(inplace=inplace) - pd_sr_1 = pd_sr if pd_sr_1 is None else pd_sr_1 - cd_sr_1 = cd_sr if cd_sr_1 is None else cd_sr_1 + if inplace: + pd_sr_1 = pd_sr + cd_sr_1 = cd_sr assert cd_sr_1.cat.ordered is False assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered @@ -401,8 +414,9 @@ def test_categorical_as_unordered(pd_str_cat, inplace): [ pytest.param( True, - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/43232" + marks=pytest.mark.skipif( + not PANDAS_GE_134, + reason="https://github.com/pandas-dev/pandas/issues/43232", ), ), False, @@ -421,10 +435,14 @@ def test_categorical_reorder_categories( kwargs = dict(ordered=to_ordered, inplace=inplace) - pd_sr_1 = pd_sr.cat.reorder_categories(list("cba"), **kwargs) + with _hide_deprecated_pandas_categorical_inplace_warnings( + "reorder_categories" + ): + pd_sr_1 = pd_sr.cat.reorder_categories(list("cba"), **kwargs) cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs) - pd_sr_1 = pd_sr if pd_sr_1 is None else pd_sr_1 - cd_sr_1 = cd_sr if cd_sr_1 is None else cd_sr_1 + if inplace: + pd_sr_1 = pd_sr + cd_sr_1 = cd_sr assert_eq(pd_sr_1, cd_sr_1) @@ -436,8 +454,9 @@ def test_categorical_reorder_categories( [ pytest.param( True, - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/43232" + marks=pytest.mark.skipif( + not PANDAS_GE_134, + reason="https://github.com/pandas-dev/pandas/issues/43232", ), ), False, @@ -452,10 +471,14 @@ def test_categorical_add_categories(pd_str_cat, inplace): assert str(pd_sr) == str(cd_sr) - pd_sr_1 = pd_sr.cat.add_categories(["d"], inplace=inplace) + with _hide_deprecated_pandas_categorical_inplace_warnings( + "add_categories" + ): + pd_sr_1 = pd_sr.cat.add_categories(["d"], inplace=inplace) cd_sr_1 = cd_sr.cat.add_categories(["d"], inplace=inplace) - pd_sr_1 = pd_sr if pd_sr_1 is None else pd_sr_1 - cd_sr_1 = cd_sr if cd_sr_1 is None else cd_sr_1 + if inplace: + pd_sr_1 = pd_sr + cd_sr_1 = cd_sr assert "d" in pd_sr_1.cat.categories.to_list() assert "d" in cd_sr_1.cat.categories.to_pandas().to_list() @@ -468,8 +491,9 @@ def test_categorical_add_categories(pd_str_cat, inplace): [ pytest.param( True, - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/43232" + marks=pytest.mark.skipif( + not PANDAS_GE_134, + reason="https://github.com/pandas-dev/pandas/issues/43232", ), ), False, @@ -484,10 +508,14 @@ def test_categorical_remove_categories(pd_str_cat, inplace): assert str(pd_sr) == str(cd_sr) - pd_sr_1 = pd_sr.cat.remove_categories(["a"], inplace=inplace) + with _hide_deprecated_pandas_categorical_inplace_warnings( + "remove_categories" + ): + pd_sr_1 = pd_sr.cat.remove_categories(["a"], inplace=inplace) cd_sr_1 = cd_sr.cat.remove_categories(["a"], inplace=inplace) - pd_sr_1 = pd_sr if 
pd_sr_1 is None else pd_sr_1 - cd_sr_1 = cd_sr if cd_sr_1 is None else cd_sr_1 + if inplace: + pd_sr_1 = pd_sr + cd_sr_1 = cd_sr assert "a" not in pd_sr_1.cat.categories.to_list() assert "a" not in cd_sr_1.cat.categories.to_pandas().to_list() @@ -495,13 +523,16 @@ def test_categorical_remove_categories(pd_str_cat, inplace): assert_eq(pd_sr_1, cd_sr_1) # test using ordered operators - assert_exceptions_equal( - lfunc=cd_sr.to_pandas().cat.remove_categories, - rfunc=cd_sr.cat.remove_categories, - lfunc_args_and_kwargs=([["a", "d"]], {"inplace": inplace}), - rfunc_args_and_kwargs=([["a", "d"]], {"inplace": inplace}), - expected_error_message="removals must all be in old categories", - ) + with _hide_deprecated_pandas_categorical_inplace_warnings( + "remove_categories" + ): + assert_exceptions_equal( + lfunc=cd_sr.to_pandas().cat.remove_categories, + rfunc=cd_sr.cat.remove_categories, + lfunc_args_and_kwargs=([["a", "d"]], {"inplace": inplace}), + rfunc_args_and_kwargs=([["a", "d"]], {"inplace": inplace}), + expected_error_message="removals must all be in old categories", + ) def test_categorical_dataframe_slice_copy(): @@ -583,19 +614,21 @@ def test_categorical_set_categories_categoricals(data, new_categories): pd_data = data.copy().astype("category") gd_data = cudf.from_pandas(pd_data) - assert_eq( - pd_data.cat.set_categories(new_categories=new_categories), - gd_data.cat.set_categories(new_categories=new_categories), - ) + expected = pd_data.cat.set_categories(new_categories=new_categories) + with _hide_cudf_safe_casting_warning(): + actual = gd_data.cat.set_categories(new_categories=new_categories) - assert_eq( - pd_data.cat.set_categories( - new_categories=pd.Series(new_categories, dtype="category") - ), - gd_data.cat.set_categories( - new_categories=cudf.Series(new_categories, dtype="category") - ), + assert_eq(expected, actual) + + expected = pd_data.cat.set_categories( + new_categories=pd.Series(new_categories, dtype="category") ) + with _hide_cudf_safe_casting_warning(): + actual = gd_data.cat.set_categories( + new_categories=cudf.Series(new_categories, dtype="category") + ) + + assert_eq(expected, actual) @pytest.mark.parametrize( @@ -703,7 +736,9 @@ def test_add_categories(data, add): gds = cudf.Series(data, dtype="category") expected = pds.cat.add_categories(add) - actual = gds.cat.add_categories(add) + with _hide_cudf_safe_casting_warning(): + actual = gds.cat.add_categories(add) + assert_eq( expected.cat.codes, actual.cat.codes.astype(expected.cat.codes.dtype) ) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index f3d69e1745e..6176184b670 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1315,7 +1315,7 @@ def test_csv_reader_aligned_byte_range(tmpdir): [(None, None), ("int", "hex"), ("int32", "hex32"), ("int64", "hex64")], ) def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype): - lines = ["0x0", "-0x1000", "0xfedcba", "0xABCDEF", "0xaBcDeF", "9512c20b"] + lines = ["0x0", "-0x1000", "0xfedcba", "0xABCDEF", "0xaBcDeF"] values = [int(hex_int, 16) for hex_int in lines] buffer = "\n".join(lines) @@ -1334,6 +1334,35 @@ def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype): assert_eq(pdf, gdf) +@pytest.mark.parametrize( + "np_dtype, gdf_dtype", + [("int", "hex"), ("int32", "hex32"), ("int64", "hex64")], +) +def test_csv_reader_hexadecimal_overflow(np_dtype, gdf_dtype): + # This tests values which cause an overflow warning that will become an + # error in pandas. 
NumPy wraps the overflow silently up to the bounds of a + # signed int64. + lines = [ + "0x0", + "-0x1000", + "0xfedcba", + "0xABCDEF", + "0xaBcDeF", + "0x9512c20b", + "0x7fffffff", + "0x7fffffffffffffff", + "-0x8000000000000000", + ] + values = [int(hex_int, 16) for hex_int in lines] + buffer = "\n".join(lines) + + gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"]) + + expected = np.array(values, dtype=np_dtype) + actual = gdf["hex_int"].to_numpy() + np.testing.assert_array_equal(expected, actual) + + @pytest.mark.parametrize("quoting", [0, 1, 2, 3]) def test_csv_reader_pd_consistent_quotes(quoting): names = ["text"] diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py index e8bd64b5061..7fdf9754534 100644 --- a/python/cudf/cudf/tests/test_cuda_apply.py +++ b/python/cudf/cudf/tests/test_cuda_apply.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. """ Test method that apply GPU kernel to a frame. @@ -98,7 +98,7 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): expect_out1 = extra2 * in1 - extra1 * in2 + in3 expect_out2 = np.hstack( - np.arange(e - s) for s, e in zip(chunks, chunks[1:] + [len(df)]) + [np.arange(e - s) for s, e in zip(chunks, chunks[1:] + [len(df)])] ) outdf = df.apply_chunks( @@ -141,7 +141,10 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): expect_out1 = extra2 * in1 - extra1 * in2 + in3 expect_out2 = np.hstack( - tpb * np.arange(e - s) for s, e in zip(chunks, chunks[1:] + [len(df)]) + [ + tpb * np.arange(e - s) + for s, e in zip(chunks, chunks[1:] + [len(df)]) + ] ) outdf = df.apply_chunks( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 1db91633c5e..5bde75c2e21 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -11,6 +11,7 @@ from copy import copy import cupy +import cupy as cp import numpy as np import pandas as pd import pyarrow as pa @@ -3342,14 +3343,6 @@ def test_select_dtype_datetime_with_frequency(): ) -def test_array_ufunc(): - gdf = cudf.DataFrame({"x": [2, 3, 4.0], "y": [9.0, 2.5, 1.1]}) - pdf = gdf.to_pandas() - - assert_eq(np.sqrt(gdf), np.sqrt(pdf)) - assert_eq(np.sqrt(gdf.x), np.sqrt(pdf.x)) - - def test_dataframe_describe_exclude(): np.random.seed(12) data_length = 10000 @@ -7151,120 +7144,165 @@ def test_cudf_arrow_array_error(): sr.__arrow_array__() -@pytest.mark.parametrize("n", [0, 2, 5, 10, None]) -@pytest.mark.parametrize("frac", [0.1, 0.5, 1, 2, None]) -@pytest.mark.parametrize("replace", [True, False]) -@pytest.mark.parametrize("axis", [0, 1]) -def test_dataframe_sample_basic(n, frac, replace, axis): - # as we currently don't support column with same name - if axis == 1 and replace: - return +@pytest.mark.parametrize( + "make_weights_axis_1", + [lambda _: None, lambda s: [1] * s, lambda s: np.ones(s)], +) +def test_sample_axis_1( + sample_n_frac, random_state_tuple_axis_1, make_weights_axis_1 +): + n, frac = sample_n_frac + pd_random_state, gd_random_state, checker = random_state_tuple_axis_1 + pdf = pd.DataFrame( { "a": [1, 2, 3, 4, 5], "float": [0.05, 0.2, 0.3, 0.2, 0.25], "int": [1, 3, 5, 4, 2], }, - index=[1, 2, 3, 4, 5], ) df = cudf.DataFrame.from_pandas(pdf) - random_state = 0 - - try: - pout = pdf.sample( - n=n, - frac=frac, - replace=replace, - random_state=random_state, - axis=axis, - ) - except BaseException: - assert_exceptions_equal( - lfunc=pdf.sample, - rfunc=df.sample, - 
lfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "random_state": random_state, - "axis": axis, - }, - ), - rfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "random_state": random_state, - "axis": axis, - }, - ), - ) - else: - gout = df.sample( - n=n, - frac=frac, - replace=replace, - random_state=random_state, - axis=axis, + + weights = make_weights_axis_1(len(pdf.columns)) + + expected = pdf.sample( + n=n, + frac=frac, + replace=False, + random_state=pd_random_state, + weights=weights, + axis=1, + ) + got = df.sample( + n=n, + frac=frac, + replace=False, + random_state=gd_random_state, + weights=weights, + axis=1, + ) + checker(expected, got) + + +@pytest.mark.parametrize( + "pdf", + [ + pd.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "float": [0.05, 0.2, 0.3, 0.2, 0.25], + "int": [1, 3, 5, 4, 2], + }, + ), + pd.Series([1, 2, 3, 4, 5]), + ], +) +@pytest.mark.parametrize("replace", [True, False]) +def test_sample_axis_0( + pdf, sample_n_frac, replace, random_state_tuple_axis_0, make_weights_axis_0 +): + n, frac = sample_n_frac + pd_random_state, gd_random_state, checker = random_state_tuple_axis_0 + + df = cudf.from_pandas(pdf) + + pd_weights, gd_weights = make_weights_axis_0( + len(pdf), isinstance(gd_random_state, np.random.RandomState) + ) + if ( + not replace + and not isinstance(gd_random_state, np.random.RandomState) + and gd_weights is not None + ): + pytest.skip( + "`cupy.random.RandomState` doesn't support weighted sampling " + "without replacement." ) - assert pout.shape == gout.shape + + expected = pdf.sample( + n=n, + frac=frac, + replace=replace, + random_state=pd_random_state, + weights=pd_weights, + axis=0, + ) + + got = df.sample( + n=n, + frac=frac, + replace=replace, + random_state=gd_random_state, + weights=gd_weights, + axis=0, + ) + checker(expected, got) @pytest.mark.parametrize("replace", [True, False]) -@pytest.mark.parametrize("random_state", [1, np.random.mtrand.RandomState(10)]) -def test_dataframe_reproducibility(replace, random_state): +@pytest.mark.parametrize( + "random_state_lib", [cp.random.RandomState, np.random.RandomState] +) +def test_sample_reproducibility(replace, random_state_lib): df = cudf.DataFrame({"a": cupy.arange(0, 1024)}) - expected = df.sample(1024, replace=replace, random_state=random_state) - out = df.sample(1024, replace=replace, random_state=random_state) + n = 1024 + expected = df.sample(n, replace=replace, random_state=random_state_lib(10)) + out = df.sample(n, replace=replace, random_state=random_state_lib(10)) assert_eq(expected, out) -@pytest.mark.parametrize("n", [0, 2, 5, 10, None]) -@pytest.mark.parametrize("frac", [0.1, 0.5, 1, 2, None]) -@pytest.mark.parametrize("replace", [True, False]) -def test_series_sample_basic(n, frac, replace): - psr = pd.Series([1, 2, 3, 4, 5]) - sr = cudf.Series.from_pandas(psr) - random_state = 0 - - try: - pout = psr.sample( - n=n, frac=frac, replace=replace, random_state=random_state - ) - except BaseException: - assert_exceptions_equal( - lfunc=psr.sample, - rfunc=sr.sample, - lfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "random_state": random_state, - }, - ), - rfunc_args_and_kwargs=( - [], - { - "n": n, - "frac": frac, - "replace": replace, - "random_state": random_state, - }, - ), - ) - else: - gout = sr.sample( - n=n, frac=frac, replace=replace, random_state=random_state +@pytest.mark.parametrize("axis", [0, 1]) +def test_sample_invalid_n_frac_combo(axis): + n, frac = 2, 0.5 + pdf = 
pd.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "float": [0.05, 0.2, 0.3, 0.2, 0.25], + "int": [1, 3, 5, 4, 2], + }, + ) + df = cudf.DataFrame.from_pandas(pdf) + + assert_exceptions_equal( + lfunc=pdf.sample, + rfunc=df.sample, + lfunc_args_and_kwargs=([], {"n": n, "frac": frac, "axis": axis}), + rfunc_args_and_kwargs=([], {"n": n, "frac": frac, "axis": axis}), + ) + + +@pytest.mark.parametrize("n, frac", [(100, None), (None, 3)]) +@pytest.mark.parametrize("axis", [0, 1]) +def test_oversample_without_replace(n, frac, axis): + pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}) + df = cudf.DataFrame.from_pandas(pdf) + + assert_exceptions_equal( + lfunc=pdf.sample, + rfunc=df.sample, + lfunc_args_and_kwargs=( + [], + {"n": n, "frac": frac, "axis": axis, "replace": False}, + ), + rfunc_args_and_kwargs=( + [], + {"n": n, "frac": frac, "axis": axis, "replace": False}, + ), + ) + + +@pytest.mark.parametrize("random_state", [None, cp.random.RandomState(42)]) +def test_sample_unsupported_arguments(random_state): + df = cudf.DataFrame({"float": [0.05, 0.2, 0.3, 0.2, 0.25]}) + with pytest.raises( + NotImplementedError, + match="Random sampling with cupy does not support these inputs.", + ): + df.sample( + n=2, replace=False, random_state=random_state, weights=[1] * 5 ) - assert pout.shape == gout.shape @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index 2f851e805b4..c67fc199710 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -82,7 +82,7 @@ def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame): assert dfo.num_columns() == len(df.columns) assert dfo.num_rows() == len(df) assert dfo.num_chunks() == 1 - assert dfo.column_names() == list(df.columns) + assert dfo.column_names() == tuple(df.columns) for col in df.columns: assert_column_equal(dfo.get_column_by_name(col), df[col]._column) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index faaa42ac7f8..80270e62da7 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -28,6 +28,8 @@ SIGNED_INTEGER_TYPES, SIGNED_TYPES, UNSIGNED_TYPES, + assert_column_memory_eq, + assert_column_memory_ne, assert_eq, assert_exceptions_equal, ) @@ -391,62 +393,12 @@ def test_index_copy_category(name, dtype, deep=True): ], ) def test_index_copy_deep(idx, deep): - """Test if deep copy creates a new instance for device data. - The general criterion is to compare `Buffer.ptr` between two data objects. - Specifically for: - - CategoricalIndex, this applies to both `.codes` and `.categories` - - StringIndex, to every element in `._base_children` - - Others, to `.base_data` - No test is defined for RangeIndex. 
- """ + """Test if deep copy creates a new instance for device data.""" idx_copy = idx.copy(deep=deep) - same_ref = not deep - if isinstance(idx, cudf.CategoricalIndex): - assert ( - idx._values.codes.base_data.ptr - == idx_copy._values.codes.base_data.ptr - ) == same_ref - if isinstance( - idx._values.categories, cudf.core.column.string.StringColumn - ): - children = idx._values.categories._base_children - copy_children = idx_copy._values.categories._base_children - assert all( - [ - ( - children[i].base_data.ptr - == copy_children[i].base_data.ptr - ) - == same_ref - for i in range(len(children)) - ] - ) - elif isinstance( - idx._values.categories, cudf.core.column.numerical.NumericalColumn - ): - assert ( - idx._values.categories.base_data.ptr - == idx_copy._values.categories.base_data.ptr - ) == same_ref - elif isinstance(idx, cudf.StringIndex): - children = idx._values._base_children - copy_children = idx_copy._values._base_children - assert all( - [ - ( - ( - children[i].base_data.ptr - == copy_children[i].base_data.ptr - ) - == same_ref - ) - for i in range(len(children)) - ] - ) + if not deep: + assert_column_memory_eq(idx._values, idx_copy._values) else: - assert ( - idx._values.base_data.ptr == idx_copy._values.base_data.ptr - ) == same_ref + assert_column_memory_ne(idx._values, idx_copy._values) @pytest.mark.parametrize("idx", [[1, None, 3, None, 5]]) @@ -1648,6 +1600,7 @@ def test_index_sample_basic(n, frac, replace): "random_state": random_state, }, ), + compare_error_message=False, ) else: gout = gindex.sample( @@ -1716,6 +1669,8 @@ def test_multiindex_sample_basic(n, frac, replace, axis): random_state=random_state, axis=axis, ) + if axis == 1 and n is None and frac is None: + pout = pout.iloc[:, 0] assert pout.shape == gout.shape diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 623098741a9..bd7335c577c 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -552,7 +552,7 @@ def test_orc_writer_sliced(tmpdir): df_select = df.iloc[1:3] df_select.to_orc(cudf_path) - assert_eq(cudf.read_orc(cudf_path), df_select.reset_index(drop=True)) + assert_eq(cudf.read_orc(cudf_path), df_select) @pytest.mark.parametrize( @@ -794,7 +794,8 @@ def test_orc_bool_encode_fail(): # Also validate data pdf = pa.orc.ORCFile(buffer).read().to_pandas() - assert_eq(okay_df, pdf) + + assert_eq(okay_df.to_pandas(nullable=True), pdf) def test_nanoseconds_overflow(): @@ -840,7 +841,12 @@ def test_empty_string_columns(data): got_df = cudf.read_orc(buffer) assert_eq(expected, got_df) - assert_eq(expected_pdf, got_df) + assert_eq( + expected_pdf, + got_df.to_pandas(nullable=True) + if expected_pdf["string"].dtype == pd.StringDtype() + else got_df, + ) @pytest.mark.parametrize("scale", [-3, 0, 3]) diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 8d504edd669..35ebd1b77c7 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
import sys @@ -62,7 +62,9 @@ def test_pickle_dataframe_categorical(): np.random.seed(0) df = DataFrame() - df["keys"] = pd.Categorical("aaabababac") + df["keys"] = pd.Categorical( + ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] + ) df["vals"] = np.random.random(len(df)) check_serialization(df) @@ -90,9 +92,7 @@ def test_pickle_index(): idx = GenericIndex(np.arange(nelem), name="a") pickled = pickle.dumps(idx) out = pickle.loads(pickled) - # TODO: Once operations like `all` are supported on Index objects, we can - # just use that without calling values first. - assert (idx == out).values.all() + assert (idx == out).all() def test_pickle_buffer(): diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index c146766c5e1..abf38f74b86 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -1,6 +1,7 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import math +from contextlib import contextmanager import numpy as np import pandas as pd @@ -12,6 +13,20 @@ from cudf.testing.dataset_generator import rand_dataframe +@contextmanager +def _hide_pandas_rolling_min_periods_warning(agg): + if agg == "count": + with pytest.warns( + FutureWarning, + match="min_periods=None will default to the size of window " + "consistent with other methods in a future version. Specify " + "min_periods=0 instead.", + ): + yield + else: + yield + + @pytest.mark.parametrize( "data,index", [ @@ -78,16 +93,16 @@ def test_rolling_dataframe_basic(data, agg, nulls, center): pdf = pd.DataFrame(data) if len(pdf) > 0: - for col_name in pdf.columns: + for col_idx in range(len(pdf.columns)): if nulls == "one": p = rng.integers(0, len(data)) - pdf[col_name][p] = np.nan + pdf.iloc[p, col_idx] = np.nan elif nulls == "some": p1, p2 = rng.integers(0, len(data), (2,)) - pdf[col_name][p1] = np.nan - pdf[col_name][p2] = np.nan + pdf.iloc[p1, col_idx] = np.nan + pdf.iloc[p2, col_idx] = np.nan elif nulls == "all": - pdf[col_name][:] = np.nan + pdf.iloc[:, col_idx] = np.nan gdf = cudf.from_pandas(pdf) for window_size in range(1, len(data) + 1): @@ -406,9 +421,10 @@ def test_rolling_groupby_simple(agg): gdf = cudf.from_pandas(pdf) for window_size in range(1, len(pdf) + 1): - expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna( - -1 - ) + with _hide_pandas_rolling_min_periods_warning(agg): + expect = getattr( + pdf.groupby("a").rolling(window_size), agg + )().fillna(-1) got = getattr(gdf.groupby("a").rolling(window_size), agg)().fillna(-1) assert_eq(expect, got, check_dtype=False) @@ -418,9 +434,10 @@ def test_rolling_groupby_simple(agg): gdf = cudf.from_pandas(pdf) for window_size in range(1, len(pdf) + 1): - expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna( - -1 - ) + with _hide_pandas_rolling_min_periods_warning(agg): + expect = getattr( + pdf.groupby("a").rolling(window_size), agg + )().fillna(-1) got = getattr(gdf.groupby("a").rolling(window_size), agg)().fillna(-1) assert_eq(expect, got, check_dtype=False) @@ -439,9 +456,10 @@ def test_rolling_groupby_multi(agg): gdf = cudf.from_pandas(pdf) for window_size in range(1, len(pdf) + 1): - expect = getattr( - pdf.groupby(["a", "b"], sort=True).rolling(window_size), agg - )().fillna(-1) + with _hide_pandas_rolling_min_periods_warning(agg): + expect = getattr( + pdf.groupby(["a", "b"], sort=True).rolling(window_size), agg + )().fillna(-1) got = getattr( gdf.groupby(["a", "b"], sort=True).rolling(window_size), agg )().fillna(-1) 
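The rolling tests above invoke each aggregation via `getattr(obj.rolling(window_size), agg)()`. With the change to `python/cudf/cudf/core/window/rolling.py` earlier in this patch, `Rolling` no longer defines `sum`/`min`/`max`/`mean` by hand; those names are listed in `_VALID_REDUCTIONS` and are expected to be synthesized by the new `Reducible` mixin on top of `Rolling._reduce` (which would also explain the unformatted `{op}` placeholder left in the `_reduce` docstring). The mixin's implementation is not part of this section, so the following is only a hypothetical sketch of such a mechanism; `ReducibleSketch` and `DemoRolling` are made-up names, not cuDF API.

# Hypothetical sketch of a Reducible-style mixin; the real
# cudf.core.mixins.Reducible may generate these methods differently.
class ReducibleSketch:
    _VALID_REDUCTIONS: frozenset = frozenset()

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)

        def _make(op):
            def method(self, *args, **kwargs):
                return self._reduce(op, *args, **kwargs)

            method.__name__ = op
            method.__doc__ = f"Calculate the rolling {op}."
            return method

        for op in cls.__dict__.get("_VALID_REDUCTIONS", ()):
            # Keep explicit overrides such as Rolling.var/std untouched.
            if op not in cls.__dict__:
                setattr(cls, op, _make(op))


class DemoRolling(ReducibleSketch):
    _VALID_REDUCTIONS = {"sum", "max"}

    def _reduce(self, op, *args, **kwargs):
        return f"rolling {op}"


assert DemoRolling().sum() == "rolling sum"
assert DemoRolling().max.__doc__ == "Calculate the rolling max."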
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 3e3c5d1b053..b35aab7ca07 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1109,7 +1109,7 @@ def test_series_drop_edge_inputs(): rfunc=gs.drop, lfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), rfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), - expected_error_message="Cannot specify both", + compare_error_message=False, ) assert_exceptions_equal( @@ -1584,6 +1584,12 @@ def test_isin_numeric(data, values): assert_eq(got, expected) +@pytest.mark.xfail(raises=ValueError) +def test_fill_new_category(): + gs = cudf.Series(pd.Categorical(["a", "b", "c"])) + gs[0:1] = "d" + + @pytest.mark.parametrize( "data", [ diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 1fce7853fdf..fd3f2732556 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -33,9 +33,14 @@ def test_dataframe_setitem_scaler_bool(): assert_eq(df, gdf) -@pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})]) +@pytest.mark.parametrize( + "df", + [pd.DataFrame({"a": [1, 2, 3]}), pd.DataFrame({"a": ["x", "y", "z"]})], +) @pytest.mark.parametrize("arg", [["a"], "a", "b"]) -@pytest.mark.parametrize("value", [-10, pd.DataFrame({"a": [-1, -2, -3]})]) +@pytest.mark.parametrize( + "value", [-10, pd.DataFrame({"a": [-1, -2, -3]}), "abc"] +) def test_dataframe_setitem_columns(df, arg, value): gdf = cudf.from_pandas(df) cudf_replace_value = value diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index cb3a369d067..7bf339d6ab7 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -1,6 +1,5 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. -import re from concurrent.futures import ThreadPoolExecutor import numpy as np @@ -513,9 +512,7 @@ def test_cov_corr_invalid_dtypes(gsr): rfunc=gsr.corr, lfunc_args_and_kwargs=([psr],), rfunc_args_and_kwargs=([gsr],), - expected_error_message=re.escape( - f"cannot perform corr with types {gsr.dtype}, {gsr.dtype}" - ), + compare_error_message=False, ) assert_exceptions_equal( @@ -523,7 +520,5 @@ def test_cov_corr_invalid_dtypes(gsr): rfunc=gsr.cov, lfunc_args_and_kwargs=([psr],), rfunc_args_and_kwargs=([gsr],), - expected_error_message=re.escape( - f"cannot perform covarience with types {gsr.dtype}, {gsr.dtype}" - ), + compare_error_message=False, ) diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index 0b27c562d75..4dc4d86d94c 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -1,7 +1,8 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf @@ -11,10 +12,28 @@ assert_index_equal, assert_series_equal, ) -from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq +from cudf.testing._utils import ( + NUMERIC_TYPES, + OTHER_TYPES, + assert_column_memory_eq, + assert_column_memory_ne, + assert_eq, +) from cudf.testing.testing import assert_column_equal +@pytest.fixture( + params=[ + pa.array([*range(10)]), + pa.array(["hello", "world", "rapids", "AI"]), + pa.array([[1, 2, 3], [4, 5], [6], [], [7]]), + pa.array([{"f0": "hello", "f1": 42}, {"f0": "world", "f1": 3}]), + ] +) +def arrow_arrays(request): + return request.param + + @pytest.mark.parametrize("rdata", [[1, 2, 5], [1, 2, 6], [1, 2, 5, 6]]) @pytest.mark.parametrize("exact", ["equiv", True, False]) @pytest.mark.parametrize("check_names", [True, False]) @@ -369,3 +388,42 @@ def test_basic_scalar_equality(left, right): def test_basic_scalar_inequality(left, right): with pytest.raises(AssertionError, match=r".*not (almost )?equal.*"): assert_eq(left, right) + + +def test_assert_column_memory_basic(arrow_arrays): + left = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) + right = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) + + with pytest.raises(AssertionError): + assert_column_memory_eq(left, right) + assert_column_memory_ne(left, right) + + +def test_assert_column_memory_slice(arrow_arrays): + col = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) + left = col[0:1] + right = col[1:2] + + with pytest.raises(AssertionError): + assert_column_memory_eq(left, right) + assert_column_memory_ne(left, right) + + with pytest.raises(AssertionError): + assert_column_memory_eq(left, col) + assert_column_memory_ne(left, col) + + with pytest.raises(AssertionError): + assert_column_memory_eq(right, col) + assert_column_memory_ne(right, col) + + +def test_assert_column_memory_basic_same(arrow_arrays): + data = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) + buf = cudf.core.buffer.Buffer(data=data.base_data, owner=data) + + left = cudf.core.column.build_column(buf, dtype=np.int32) + right = cudf.core.column.build_column(buf, dtype=np.int32) + + assert_column_memory_eq(left, right) + with pytest.raises(AssertionError): + assert_column_memory_ne(left, right) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 5ff66fc750f..fb6505f5f92 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. import numpy as np import pytest @@ -778,3 +778,54 @@ def test_read_text(datadir): actual = cudf.read_text(chess_file, delimiter=delimiter) assert_eq(expected, actual) + + +def test_read_text_byte_range(datadir): + chess_file = str(datadir) + "/chess.pgn" + delimiter = "1." 
+ + with open(chess_file, "r") as f: + data = f.read() + content = data.split(delimiter) + + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series( + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] + ) + + byte_range_size = (len(data) // 3) + (len(data) % 3 != 0) + + actual_0 = cudf.read_text( + chess_file, + delimiter=delimiter, + byte_range=[byte_range_size * 0, byte_range_size], + ) + actual_1 = cudf.read_text( + chess_file, + delimiter=delimiter, + byte_range=[byte_range_size * 1, byte_range_size], + ) + actual_2 = cudf.read_text( + chess_file, + delimiter=delimiter, + byte_range=[byte_range_size * 2, byte_range_size], + ) + + actual = cudf.concat([actual_0, actual_1, actual_2], ignore_index=True) + + assert_eq(expected, actual) + + +def test_read_text_byte_range_large(datadir): + content = str(("\n" if x % 5 == 0 else "x") for x in range(0, 300000000)) + delimiter = "1." + temp_file = str(datadir) + "/temp.txt" + + with open(temp_file, "w") as f: + f.write(content) + + cudf.read_text(temp_file, delimiter=delimiter) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index 2e8da615e3e..bc9edacb68a 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + import itertools import operator import re @@ -39,7 +41,8 @@ def test_series_not(dtype): arr = arr * (np.random.random(1000) * 100).astype(dtype) sr = Series(arr) - result = cudf.logical_not(sr).to_numpy() + with pytest.warns(FutureWarning, match="logical_not is deprecated"): + result = cudf.logical_not(sr).to_numpy() expect = np.logical_not(arr) np.testing.assert_equal(result, expect) np.testing.assert_equal((~sr).to_numpy(), ~arr) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 4dadfede866..1bd3fa7558e 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -2,14 +2,16 @@ import decimal import functools +import hashlib import os import traceback -from collections.abc import Sequence +from functools import partial from typing import FrozenSet, Set, Union import cupy as cp import numpy as np import pandas as pd +from nvtx import annotate import rmm @@ -24,12 +26,6 @@ _EQUALITY_OPS = { - "eq", - "ne", - "lt", - "gt", - "le", - "ge", "__eq__", "__ne__", "__lt__", @@ -38,6 +34,7 @@ "__ge__", } +_NVTX_COLORS = ["green", "blue", "purple", "rapids"] # The test root is set by pytest to support situations where tests are run from # a source tree on a built version of cudf. 
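The read_text byte-range tests above rely on a property worth spelling out: a byte_range=[offset, size] request returns the records that begin inside that window, so consecutive, non-overlapping ranges that cover the whole file concatenate back to the result of a plain read_text. A usage sketch under that assumption; the file name and delimiter below are illustrative, not repository fixtures:

import os

import cudf

path = "games.pgn"  # hypothetical delimited text file
delimiter = "1."

# Reading the whole file at once...
full = cudf.read_text(path, delimiter=delimiter)

# ...should match reading it as two consecutive byte ranges, since each
# range yields the records that start within it and none are duplicated
# at the seam.
half = (os.path.getsize(path) + 1) // 2
first = cudf.read_text(path, delimiter=delimiter, byte_range=[0, half])
second = cudf.read_text(path, delimiter=delimiter, byte_range=[half, half])
parts = cudf.concat([first, second], ignore_index=True)

assert full.to_pandas().equals(parts.to_pandas())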
@@ -323,136 +320,6 @@ def search_range(start, stop, x, step=1, side="left"): return max(min(length, i), 0) -_UFUNC_ALIASES = { - "power": "pow", - "equal": "eq", - "not_equal": "ne", - "less": "lt", - "less_equal": "le", - "greater": "gt", - "greater_equal": "ge", - "absolute": "abs", -} -# For op(., cudf.Series) -> cudf.Series.__r{op}__ -_REVERSED_NAMES = { - "lt": "__gt__", - "le": "__ge__", - "gt": "__lt__", - "ge": "__le__", - "eq": "__eq__", - "ne": "__ne__", -} - - -# todo: can probably be used to remove cudf/core/ops.py -def _get_cudf_series_ufunc(fname, args, kwargs, cudf_ser_submodule): - if isinstance(args[0], cudf.Series): - cudf_ser_func = getattr(cudf_ser_submodule, fname) - return cudf_ser_func(*args, **kwargs) - elif len(args) == 2 and isinstance(args[1], cudf.Series): - rev_name = _REVERSED_NAMES.get(fname, f"__r{fname}__") - cudf_ser_func = getattr(cudf_ser_submodule, rev_name) - return cudf_ser_func(args[1], args[0], **kwargs) - return NotImplemented - - -# Utils for using appropriate dispatch for array functions -def get_appropriate_dispatched_func( - cudf_submodule, cudf_ser_submodule, cupy_submodule, func, args, kwargs -): - if kwargs.get("out") is None: - fname = func.__name__ - # Dispatch these functions to appropiate alias from the _UFUNC_ALIASES - is_ufunc = fname in _UFUNC_ALIASES - fname = _UFUNC_ALIASES.get(fname, fname) - - if hasattr(cudf_submodule, fname): - cudf_func = getattr(cudf_submodule, fname) - return cudf_func(*args, **kwargs) - - elif hasattr(cudf_ser_submodule, fname): - if is_ufunc: - return _get_cudf_series_ufunc( - fname, args, kwargs, cudf_ser_submodule - ) - else: - cudf_ser_func = getattr(cudf_ser_submodule, fname) - return cudf_ser_func(*args, **kwargs) - - elif hasattr(cupy_submodule, fname): - cupy_func = getattr(cupy_submodule, fname) - # Handle case if cupy implements it as a numpy function - # Unsure if needed - if cupy_func is func: - return NotImplemented - - cupy_compatible_args, index = _get_cupy_compatible_args_index(args) - if cupy_compatible_args: - cupy_output = cupy_func(*cupy_compatible_args, **kwargs) - if isinstance(cupy_output, cp.ndarray): - return _cast_to_appropriate_cudf_type(cupy_output, index) - else: - return cupy_output - - return NotImplemented - - -def _cast_to_appropriate_cudf_type(val, index=None): - # Handle scalar - if val.ndim == 0: - return to_cudf_compatible_scalar(val) - # 1D array - elif (val.ndim == 1) or (val.ndim == 2 and val.shape[1] == 1): - # if index is not None and is of a different length - # than the index, cupy dispatching behaviour is undefined - # so we don't implement it - if (index is None) or (len(index) == len(val)): - return cudf.Series(val, index=index) - - return NotImplemented - - -def _get_cupy_compatible_args_index(args, ser_index=None): - """ - This function returns cupy compatible arguments and output index - if conversion is not possible it returns None - """ - - casted_ls = [] - for arg in args: - if isinstance(arg, cp.ndarray): - casted_ls.append(arg) - elif isinstance(arg, cudf.Series): - # check if indexes can be aligned - if (ser_index is None) or (ser_index.equals(arg.index)): - ser_index = arg.index - casted_ls.append(arg.values) - else: - # this throws a value-error if indexes are not aligned - # following pandas behavior for ufunc numpy dispatching - raise ValueError( - "Can only compare identically-labeled Series objects" - ) - elif isinstance(arg, Sequence): - # we dont handle list of inputs for functions as - # these form inputs for functions like - # np.concatenate, 
vstack have ambiguity around index alignment - return None, ser_index - else: - casted_ls.append(arg) - return casted_ls, ser_index - - -def get_relevant_submodule(func, module): - # point to the correct submodule - for submodule in func.__module__.split(".")[1:]: - if hasattr(module, submodule): - module = getattr(module, submodule) - else: - return None - return module - - def _categorical_scalar_broadcast_to(cat_scalar, size): if isinstance(cat_scalar, (cudf.Series, pd.Series)): cats = cat_scalar.cat.categories @@ -537,3 +404,25 @@ def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: if (indices == cp.arange(start, stop, step)).all(): return slice(start, stop, step) return indices + + +def _get_color_for_nvtx(name): + m = hashlib.sha256() + m.update(name.encode()) + hash_value = int(m.hexdigest(), 16) + idx = hash_value % len(_NVTX_COLORS) + return _NVTX_COLORS[idx] + + +def _cudf_nvtx_annotate(func, domain="cudf_python"): + """Decorator for applying nvtx annotations to methods in cudf.""" + return annotate( + message=func.__qualname__, + color=_get_color_for_nvtx(func.__qualname__), + domain=domain, + )(func) + + +_dask_cudf_nvtx_annotate = partial( + _cudf_nvtx_annotate, domain="dask_cudf_python" +) diff --git a/python/dask_cudf/.coveragerc b/python/dask_cudf/.coveragerc index dc953eefef3..10a8ce87884 100644 --- a/python/dask_cudf/.coveragerc +++ b/python/dask_cudf/.coveragerc @@ -1,4 +1,3 @@ # Configuration file for Python coverage tests [run] -include = dask_cudf/* -omit = dask_cudf/tests/* \ No newline at end of file +source = dask_cudf diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 1b1f3e29ab2..bd9a8fc2769 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -6,7 +6,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from nvtx import annotate from dask.dataframe.core import get_parallel_type, meta_nonempty from dask.dataframe.dispatch import ( @@ -31,6 +30,7 @@ import cudf from cudf.api.types import is_string_dtype +from cudf.utils.utils import _dask_cudf_nvtx_annotate from .core import DataFrame, Index, Series @@ -40,7 +40,7 @@ @meta_nonempty.register(cudf.BaseIndex) -@annotate("_nonempty_index", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _nonempty_index(idx): if isinstance(idx, cudf.core.index.RangeIndex): return cudf.core.index.RangeIndex(2, name=idx.name) @@ -75,7 +75,7 @@ def _nonempty_index(idx): raise TypeError(f"Don't know how to handle index of type {type(idx)}") -@annotate("_get_non_empty_data", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _get_non_empty_data(s): if isinstance(s._column, cudf.core.column.CategoricalColumn): categories = ( @@ -103,7 +103,7 @@ def _get_non_empty_data(s): @meta_nonempty.register(cudf.Series) -@annotate("_nonempty_series", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _nonempty_series(s, idx=None): if idx is None: idx = _nonempty_index(s.index) @@ -113,7 +113,7 @@ def _nonempty_series(s, idx=None): @meta_nonempty.register(cudf.DataFrame) -@annotate("meta_nonempty_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def meta_nonempty_cudf(x): idx = meta_nonempty(x.index) columns_with_dtype = dict() @@ -129,18 +129,18 @@ def meta_nonempty_cudf(x): @make_meta_dispatch.register((cudf.Series, cudf.DataFrame)) -@annotate("make_meta_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate 
def make_meta_cudf(x, index=None): return x.head(0) @make_meta_dispatch.register(cudf.BaseIndex) -@annotate("make_meta_cudf_index", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def make_meta_cudf_index(x, index=None): return x[:0] -@annotate("_empty_series", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _empty_series(name, dtype, index=None): if isinstance(dtype, str) and dtype == "category": return cudf.Series( @@ -150,7 +150,7 @@ def _empty_series(name, dtype, index=None): @make_meta_obj.register(object) -@annotate("make_meta_object_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def make_meta_object_cudf(x, index=None): """Create an empty cudf object containing the desired metadata. @@ -221,7 +221,7 @@ def make_meta_object_cudf(x, index=None): @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.BaseIndex)) -@annotate("concat_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def concat_cudf( dfs, axis=0, @@ -246,13 +246,13 @@ def concat_cudf( @categorical_dtype_dispatch.register( (cudf.DataFrame, cudf.Series, cudf.BaseIndex) ) -@annotate("categorical_dtype_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def categorical_dtype_cudf(categories=None, ordered=None): return cudf.CategoricalDtype(categories=categories, ordered=ordered) @tolist_dispatch.register((cudf.Series, cudf.BaseIndex)) -@annotate("tolist_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def tolist_cudf(obj): return obj.to_arrow().to_pylist() @@ -260,9 +260,7 @@ def tolist_cudf(obj): @is_categorical_dtype_dispatch.register( (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series) ) -@annotate( - "is_categorical_dtype_cudf", color="green", domain="dask_cudf_python" -) +@_dask_cudf_nvtx_annotate def is_categorical_dtype_cudf(obj): return cudf.api.types.is_categorical_dtype(obj) @@ -276,7 +274,7 @@ def is_categorical_dtype_cudf(obj): ) @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex)) - @annotate("percentile_cudf", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def percentile_cudf(a, q, interpolation="linear"): # Cudf dispatch to the equivalent of `np.percentile`: # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html @@ -321,7 +319,7 @@ def percentile_cudf(a, q, interpolation="linear"): @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex)) -@annotate("union_categoricals_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def union_categoricals_cudf( to_union, sort_categories=False, ignore_order=False ): @@ -330,13 +328,13 @@ def union_categoricals_cudf( ) -@annotate("safe_hash", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def safe_hash(frame): return cudf.Series(frame.hash_values(), index=frame.index) @hash_object_dispatch.register((cudf.DataFrame, cudf.Series)) -@annotate("hash_object_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def hash_object_cudf(frame, index=True): if index: return safe_hash(frame.reset_index()) @@ -344,7 +342,7 @@ def hash_object_cudf(frame, index=True): @hash_object_dispatch.register(cudf.BaseIndex) -@annotate("hash_object_cudf_index", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def hash_object_cudf_index(ind, index=None): if isinstance(ind, cudf.MultiIndex): @@ -355,7 +353,7 @@ def hash_object_cudf_index(ind, index=None): 
@group_split_dispatch.register((cudf.Series, cudf.DataFrame)) -@annotate("group_split_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def group_split_cudf(df, c, k, ignore_index=False): return dict( zip( @@ -370,12 +368,12 @@ def group_split_cudf(df, c, k, ignore_index=False): @sizeof_dispatch.register(cudf.DataFrame) -@annotate("sizeof_cudf_dataframe", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def sizeof_cudf_dataframe(df): return int(df.memory_usage().sum()) @sizeof_dispatch.register((cudf.Series, cudf.BaseIndex)) -@annotate("sizeof_cudf_series_index", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def sizeof_cudf_series_index(obj): return obj.memory_usage() diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index d8802f33941..4d193f34b9f 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -6,7 +6,6 @@ import numpy as np import pandas as pd -from nvtx import annotate from tlz import partition_all import dask @@ -26,6 +25,7 @@ import cudf from cudf import _lib as libcudf +from cudf.utils.utils import _dask_cudf_nvtx_annotate from dask_cudf import sorting from dask_cudf.accessors import ListMethods, StructMethods @@ -58,7 +58,7 @@ def __dask_postcompute__(self): def __dask_postpersist__(self): return type(self), (self._name, self._meta, self.divisions) - @annotate("_FRAME_INIT", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def __init__(self, dsk, name, meta, divisions): if not isinstance(dsk, HighLevelGraph): dsk = HighLevelGraph.from_collections(name, dsk, dependencies=[]) @@ -84,9 +84,7 @@ def __repr__(self): s = "" return s % (type(self).__name__, len(self.dask), self.npartitions) - @annotate( - "_FRAME_to_dask_dataframe", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def to_dask_dataframe(self, **kwargs): """Create a dask.dataframe object from a dask_cudf object""" nullable_pd_dtype = kwargs.get("nullable_pd_dtype", False) @@ -104,9 +102,7 @@ def to_dask_dataframe(self, **kwargs): class DataFrame(_Frame, dd.core.DataFrame): _partition_type = cudf.DataFrame - @annotate( - "DATAFRAME_assign_column", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def _assign_column(self, k, v): def assigner(df, k, v): out = df.copy() @@ -116,7 +112,7 @@ def assigner(df, k, v): meta = assigner(self._meta, k, dask_make_meta(v)) return self.map_partitions(assigner, k, v, meta=meta) - @annotate("DATAFRAME_apply_rows", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): import uuid @@ -136,7 +132,7 @@ def do_apply_rows(df, func, incols, outcols, kwargs): do_apply_rows, func, incols, outcols, kwargs, meta=meta ) - @annotate("DATAFRAME_merge", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def merge(self, other, **kwargs): if kwargs.pop("shuffle", "tasks") != "tasks": raise ValueError( @@ -148,7 +144,7 @@ def merge(self, other, **kwargs): on = list(on) return super().merge(other, on=on, shuffle="tasks", **kwargs) - @annotate("DATAFRAME_join", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def join(self, other, **kwargs): if kwargs.pop("shuffle", "tasks") != "tasks": raise ValueError( @@ -166,7 +162,7 @@ def join(self, other, **kwargs): on = list(on) return super().join(other, how=how, on=on, shuffle="tasks", **kwargs) - @annotate("DATAFRAME_set_index", 
color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def set_index(self, other, sorted=False, divisions=None, **kwargs): if kwargs.pop("shuffle", "tasks") != "tasks": raise ValueError( @@ -238,9 +234,7 @@ def set_index(self, other, sorted=False, divisions=None, **kwargs): **kwargs, ) - @annotate( - "DATAFRAME_sort_values", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def sort_values( self, by, @@ -276,14 +270,14 @@ def sort_values( return df.reset_index(drop=True) return df - @annotate("DATAFRAME_to_parquet", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def to_parquet(self, path, *args, **kwargs): """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" from dask_cudf.io import to_parquet return to_parquet(self, path, *args, **kwargs) - @annotate("DATAFRAME_to_orc", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def to_orc(self, path, **kwargs): """Calls dask_cudf.io.to_orc""" from dask_cudf.io import to_orc @@ -291,7 +285,7 @@ def to_orc(self, path, **kwargs): return to_orc(self, path, **kwargs) @derived_from(pd.DataFrame) - @annotate("DATAFRAME_var", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def var( self, axis=None, @@ -320,9 +314,7 @@ def var( else: return _parallel_var(self, meta, skipna, split_every, out) - @annotate( - "DATAFRAME_repartition", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def repartition(self, *args, **kwargs): """Wraps dask.dataframe DataFrame.repartition method. Uses DataFrame.shuffle if `columns=` is specified. @@ -345,7 +337,7 @@ def repartition(self, *args, **kwargs): ) return super().repartition(*args, **kwargs) - @annotate("DATAFRAME_shuffle", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def shuffle(self, *args, **kwargs): """Wraps dask.dataframe DataFrame.shuffle method""" shuffle_arg = kwargs.pop("shuffle", None) @@ -353,21 +345,21 @@ def shuffle(self, *args, **kwargs): raise ValueError("dask_cudf does not support disk-based shuffle.") return super().shuffle(*args, shuffle="tasks", **kwargs) - @annotate("DATAFRAME_groupby", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def groupby(self, by=None, **kwargs): from .groupby import CudfDataFrameGroupBy return CudfDataFrameGroupBy(self, by=by, **kwargs) -@annotate("DATAFRAME_sum_of_squares", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def sum_of_squares(x): x = x.astype("f8")._column outcol = libcudf.reduce.reduce("sum_of_squares", x) return cudf.Series(outcol) -@annotate("DATAFRAME_var_aggregate", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def var_aggregate(x2, x, n, ddof): try: with warnings.catch_warnings(record=True): @@ -380,12 +372,12 @@ def var_aggregate(x2, x, n, ddof): return np.float64(np.nan) -@annotate("DATAFRAME_nlargest_agg", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def nlargest_agg(x, **kwargs): return cudf.concat(x).nlargest(**kwargs) -@annotate("DATAFRAME_nsmallest_agg", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def nsmallest_agg(x, **kwargs): return cudf.concat(x).nsmallest(**kwargs) @@ -393,7 +385,7 @@ def nsmallest_agg(x, **kwargs): class Series(_Frame, dd.core.Series): _partition_type = cudf.Series - @annotate("Series_count", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def count(self, split_every=False): return reduction( [self], @@ -403,14 +395,14 @@ def 
count(self, split_every=False): meta="i8", ) - @annotate("Series_mean", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def mean(self, split_every=False): sum = self.sum(split_every=split_every) n = self.count(split_every=split_every) return sum / n @derived_from(pd.DataFrame) - @annotate("Series_var", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def var( self, axis=None, @@ -439,19 +431,19 @@ def var( else: return _parallel_var(self, meta, skipna, split_every, out) - @annotate("Series_groupby", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def groupby(self, *args, **kwargs): from .groupby import CudfSeriesGroupBy return CudfSeriesGroupBy(self, *args, **kwargs) @property - @annotate("Series_list", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def list(self): return ListMethods(self) @property - @annotate("Series_struct", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def struct(self): return StructMethods(self) @@ -460,7 +452,7 @@ class Index(Series, dd.core.Index): _partition_type = cudf.Index # type: ignore -@annotate("_naive_var", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _naive_var(ddf, meta, skipna, ddof, split_every, out): num = ddf._get_numeric_data() x = 1.0 * num.sum(skipna=skipna, split_every=split_every) @@ -475,7 +467,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): return handle_out(out, result) -@annotate("_parallel_var", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): if skipna: @@ -542,7 +534,7 @@ def _finalize_var(vals): return handle_out(out, result) -@annotate("_extract_meta", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _extract_meta(x): """ Extract internal cache data (``_meta``) from dask_cudf objects @@ -558,7 +550,7 @@ def _extract_meta(x): return x -@annotate("_emulate", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _emulate(func, *args, **kwargs): """ Apply a function using args / kwargs. If arguments contain dd.DataFrame / @@ -568,7 +560,7 @@ def _emulate(func, *args, **kwargs): return func(*_extract_meta(args), **_extract_meta(kwargs)) -@annotate("align_partitions", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def align_partitions(args): """Align partitions between dask_cudf objects. 
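The blanket @_dask_cudf_nvtx_annotate substitutions in this and the neighboring dask_cudf files all resolve to the small decorator factory added to cudf/utils/utils.py earlier in the diff. Its pieces are spread across several hunks, so here is a self-contained sketch of the pattern; the wrapper and example function names are illustrative, not the library's:

import hashlib
from functools import partial

from nvtx import annotate

_COLORS = ["green", "blue", "purple", "rapids"]  # palette mirroring _NVTX_COLORS


def _color_for(name):
    # Deterministic color choice: hash the qualified name, index the palette.
    digest = int(hashlib.sha256(name.encode()).hexdigest(), 16)
    return _COLORS[digest % len(_COLORS)]


def nvtx_annotate(func, domain="cudf_python"):
    # Wrap ``func`` in an NVTX range named after it, with a stable color.
    return annotate(
        message=func.__qualname__,
        color=_color_for(func.__qualname__),
        domain=domain,
    )(func)


# Bind the domain once so call sites need no arguments at all.
dask_nvtx_annotate = partial(nvtx_annotate, domain="dask_cudf_python")


@dask_nvtx_annotate
def make_meta_example(x, index=None):  # hypothetical stand-in function
    return x.head(0)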
@@ -584,7 +576,7 @@ def align_partitions(args): return args -@annotate("reduction", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def reduction( args, chunk=None, @@ -723,7 +715,7 @@ def reduction( return dd.core.new_dd_object(graph, b, meta, (None, None)) -@annotate("from_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): if isinstance(getattr(data, "index", None), cudf.MultiIndex): raise NotImplementedError( @@ -745,7 +737,7 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): ) -@annotate("from_dask_dataframe", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def from_dask_dataframe(df): return df.map_partitions(cudf.from_pandas) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 658e63ea923..76533706030 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -6,7 +6,6 @@ import numpy as np import pandas as pd -from nvtx import annotate from dask.base import tokenize from dask.dataframe.core import ( @@ -20,6 +19,7 @@ from dask.highlevelgraph import HighLevelGraph import cudf +from cudf.utils.utils import _dask_cudf_nvtx_annotate SUPPORTED_AGGS = ( "count", @@ -36,19 +36,13 @@ class CudfDataFrameGroupBy(DataFrameGroupBy): - @annotate( - "CudfDataFrameGroupBy_INIT", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def __init__(self, *args, **kwargs): self.sep = kwargs.pop("sep", "___") self.as_index = kwargs.pop("as_index", True) super().__init__(*args, **kwargs) - @annotate( - "CudfDataFrameGroupBy_GETITEM", - color="green", - domain="dask_cudf_python", - ) + @_dask_cudf_nvtx_annotate def __getitem__(self, key): if isinstance(key, list): g = CudfDataFrameGroupBy( @@ -62,9 +56,7 @@ def __getitem__(self, key): g._meta = g._meta[key] return g - @annotate( - "CudfDataFrameGroupBy_MEAN", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -78,11 +70,7 @@ def mean(self, split_every=None, split_out=1): as_index=self.as_index, ) - @annotate( - "CudfDataFrameGroupBy_COLLECT", - color="green", - domain="dask_cudf_python", - ) + @_dask_cudf_nvtx_annotate def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -96,11 +84,7 @@ def collect(self, split_every=None, split_out=1): as_index=self.as_index, ) - @annotate( - "CudfDataFrameGroupBy_AGGREGATE", - color="green", - domain="dask_cudf_python", - ) + @_dask_cudf_nvtx_annotate def aggregate(self, arg, split_every=None, split_out=1): if arg == "size": return self.size() @@ -140,17 +124,13 @@ def aggregate(self, arg, split_every=None, split_out=1): class CudfSeriesGroupBy(SeriesGroupBy): - @annotate( - "CudfSeriesGroupBy_INIT", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def __init__(self, *args, **kwargs): self.sep = kwargs.pop("sep", "___") self.as_index = kwargs.pop("as_index", True) super().__init__(*args, **kwargs) - @annotate( - "CudfSeriesGroupBy_MEAN", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -164,9 +144,7 @@ def mean(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] - @annotate( - "CudfSeriesGroupBy_STD", color="green", domain="dask_cudf_python" - ) + 
@_dask_cudf_nvtx_annotate def std(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -180,9 +158,7 @@ def std(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] - @annotate( - "CudfSeriesGroupBy_VAR", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def var(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -196,9 +172,7 @@ def var(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] - @annotate( - "CudfSeriesGroupBy_COLLECT", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -212,9 +186,7 @@ def collect(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] - @annotate( - "CudfSeriesGroupBy_AGGREGATE", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def aggregate(self, arg, split_every=None, split_out=1): if arg == "size": return self.size() @@ -245,7 +217,7 @@ def aggregate(self, arg, split_every=None, split_out=1): ) -@annotate("groupby_agg", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def groupby_agg( ddf, gb_cols, @@ -412,7 +384,7 @@ def groupby_agg( return new_dd_object(graph, gb_agg_name, _meta, divisions) -@annotate("_redirect_aggs", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _redirect_aggs(arg): """Redirect aggregations to their corresponding name in cuDF""" redirects = { @@ -439,7 +411,7 @@ def _redirect_aggs(arg): return redirects.get(arg, arg) -@annotate("_is_supported", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _is_supported(arg, supported: set): """Check that aggregations in `arg` are a subset of `supported`""" if isinstance(arg, (list, dict)): @@ -465,7 +437,7 @@ def _make_name(*args, sep="_"): return sep.join(_args) -@annotate("_groupby_partition_agg", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _groupby_partition_agg( df, gb_cols, aggs, columns, split_out, dropna, sort, sep ): @@ -523,7 +495,7 @@ def _groupby_partition_agg( return output -@annotate("_tree_node_agg", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep): """Node in groupby-aggregation reduction tree. 
@@ -558,7 +530,7 @@ def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep): return gb -@annotate("_var_agg", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): """Calculate variance (given count, sum, and sum-squared columns).""" @@ -580,7 +552,7 @@ def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): return var -@annotate("_finalize_gb_agg", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _finalize_gb_agg( gb, gb_cols, diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index ada738c5a9b..5b286b0ff3d 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -5,7 +5,6 @@ import cupy import numpy as np import tlz as toolz -from nvtx import annotate from dask.base import tokenize from dask.dataframe import methods @@ -16,16 +15,17 @@ import cudf as gd from cudf.api.types import is_categorical_dtype +from cudf.utils.utils import _dask_cudf_nvtx_annotate -@annotate("set_index_post", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def set_index_post(df, index_name, drop, column_dtype): df2 = df.set_index(index_name, drop=drop) df2.columns = df2.columns.astype(column_dtype) return df2 -@annotate("_set_partitions_pre", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): if ascending: partitions = divisions.searchsorted(s, side="right") - 1 @@ -42,7 +42,7 @@ def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): return partitions -@annotate("_quantile", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _quantile(a, q): n = len(a) if not len(a): @@ -50,7 +50,7 @@ def _quantile(a, q): return (a.quantiles(q=q.tolist(), interpolation="nearest"), n) -@annotate("merge_quantiles", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def merge_quantiles(finalq, qs, vals): """Combine several quantile calculations of different data. [NOTE: Same logic as dask.array merge_percentiles] @@ -113,7 +113,7 @@ def _append_counts(val, count): return rv.reset_index(drop=True) -@annotate("_approximate_quantile", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _approximate_quantile(df, q): """Approximate quantiles of DataFrame or Series. [NOTE: Same logic as dask.dataframe Series quantile] @@ -187,7 +187,7 @@ def set_quantile_index(df): return df -@annotate("quantile_divisions", color="green", domain="cudf_python") +@_dask_cudf_nvtx_annotate def quantile_divisions(df, by, npartitions): qn = np.linspace(0.0, 1.0, npartitions + 1).tolist() divisions = _approximate_quantile(df[by], qn).compute() @@ -221,7 +221,7 @@ def quantile_divisions(df, by, npartitions): return divisions -@annotate("sort_values", color="green", domain="cudf_python") +@_dask_cudf_nvtx_annotate def sort_values( df, by,