diff --git a/build.sh b/build.sh index 48182ca1a6f..ab3bd0e7a89 100755 --- a/build.sh +++ b/build.sh @@ -112,16 +112,22 @@ function buildLibCudfJniInDocker { local localMavenRepo=${LOCAL_MAVEN_REPO:-"$HOME/.m2/repository"} local workspaceRepoDir="$workspaceDir/cudf" local workspaceMavenRepoDir="$workspaceDir/.m2/repository" + local workspaceCcacheDir="$workspaceDir/.ccache" mkdir -p "$CUDF_JAR_JAVA_BUILD_DIR/libcudf-cmake-build" + mkdir -p "$HOME/.ccache" "$HOME/.m2" nvidia-docker build \ -f java/ci/Dockerfile.centos7 \ --build-arg CUDA_VERSION=${cudaVersion} \ -t $imageName . nvidia-docker run -it -u $(id -u):$(id -g) --rm \ + -e PARALLEL_LEVEL \ + -e CCACHE_DISABLE \ + -e CCACHE_DIR="$workspaceCcacheDir" \ -v "/etc/group:/etc/group:ro" \ -v "/etc/passwd:/etc/passwd:ro" \ -v "/etc/shadow:/etc/shadow:ro" \ -v "/etc/sudoers.d:/etc/sudoers.d:ro" \ + -v "$HOME/.ccache:$workspaceCcacheDir:rw" \ -v "$REPODIR:$workspaceRepoDir:rw" \ -v "$localMavenRepo:$workspaceMavenRepoDir:rw" \ --workdir "$workspaceRepoDir/java/target/libcudf-cmake-build" \ @@ -129,11 +135,16 @@ function buildLibCudfJniInDocker { scl enable devtoolset-9 \ "cmake $workspaceRepoDir/cpp \ -G${CMAKE_GENERATOR} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_LINKER_LAUNCHER=ccache \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DCUDA_STATIC_RUNTIME=ON \ -DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} \ - -DCMAKE_INSTALL_PREFIX==/usr/local/rapids \ - -DUSE_NVTX=ON -DCUDF_USE_ARROW_STATIC=ON \ + -DCMAKE_INSTALL_PREFIX=/usr/local/rapids \ + -DUSE_NVTX=ON \ + -DCUDF_USE_ARROW_STATIC=ON \ -DCUDF_ENABLE_ARROW_S3=OFF \ -DBUILD_TESTS=OFF \ -DPER_THREAD_DEFAULT_STREAM=ON \ @@ -145,6 +156,10 @@ function buildLibCudfJniInDocker { -Dmaven.repo.local=$workspaceMavenRepoDir \ -DskipTests=${SKIP_TESTS:-false} \ -Dparallel.level=${PARALLEL_LEVEL} \ + -Dcmake.ccache.opts='-DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_LINKER_LAUNCHER=ccache' \ -DCUDF_CPP_BUILD_DIR=$workspaceRepoDir/java/target/libcudf-cmake-build \ -DCUDA_STATIC_RUNTIME=ON \ -DPER_THREAD_DEFAULT_STREAM=ON \ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cbe2811afe4..42a434ba53d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -189,7 +189,6 @@ add_library( src/ast/expression_parser.cpp src/ast/expressions.cpp src/binaryop/binaryop.cpp - src/binaryop/compiled/binary_ops.cu src/binaryop/compiled/Add.cu src/binaryop/compiled/ATan2.cu src/binaryop/compiled/BitwiseAnd.cu @@ -220,6 +219,7 @@ add_library( src/binaryop/compiled/ShiftRightUnsigned.cu src/binaryop/compiled/Sub.cu src/binaryop/compiled/TrueDiv.cu + src/binaryop/compiled/binary_ops.cu src/binaryop/compiled/util.cpp src/labeling/label_bins.cu src/bitmask/null_mask.cu @@ -362,6 +362,7 @@ add_library( src/join/mixed_join_size_kernel_nulls.cu src/join/mixed_join_size_kernels_semi.cu src/join/semi_join.cu + src/lists/apply_boolean_mask.cu src/lists/contains.cu src/lists/combine/concatenate_list_elements.cu src/lists/combine/concatenate_rows.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index e93b2bf4f25..04dcf51dd40 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -242,6 +242,10 @@ ConfigureBench(PARQUET_WRITER_BENCH io/parquet/parquet_writer.cpp) # * orc writer benchmark -------------------------------------------------------------------------- 
ConfigureBench(ORC_WRITER_BENCH io/orc/orc_writer.cpp) +# ################################################################################################## +# * orc writer chunks benchmark --------------------------------------------------------------- +ConfigureNVBench(ORC_WRITER_CHUNKS_NVBENCH io/orc/orc_writer_chunks.cpp) + # ################################################################################################## # * csv writer benchmark -------------------------------------------------------------------------- ConfigureBench(CSV_WRITER_BENCH io/csv/csv_writer.cpp) diff --git a/cpp/benchmarks/io/orc/orc_writer_chunks.cpp b/cpp/benchmarks/io/orc/orc_writer_chunks.cpp new file mode 100644 index 00000000000..dc82772fa83 --- /dev/null +++ b/cpp/benchmarks/io/orc/orc_writer_chunks.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include + +// to enable, run cmake with -DBUILD_BENCHMARKS=ON + +constexpr int64_t data_size = 512 << 20; + +namespace cudf_io = cudf::io; + +void nvbench_orc_write(nvbench::state& state) +{ + cudf::size_type num_cols = state.get_int64("num_columns"); + + auto tbl = + create_random_table(cycle_dtypes(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), + int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), + int32_t(type_group_id::TIMESTAMP), + int32_t(cudf::type_id::STRING), + int32_t(cudf::type_id::STRUCT), + int32_t(cudf::type_id::LIST)}), + num_cols), + table_size_bytes{data_size}); + cudf::table_view view = tbl->view(); + + auto mem_stats_logger = cudf::memory_stats_logger(); + + state.add_global_memory_reads(data_size); + state.add_element_count(view.num_columns() * view.num_rows()); + + size_t encoded_file_size = 0; + + state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, + [&](nvbench::launch& launch, auto& timer) { + cuio_source_sink_pair source_sink(io_type::VOID); + timer.start(); + + cudf_io::orc_writer_options opts = + cudf_io::orc_writer_options::builder(source_sink.make_sink_info(), view); + cudf_io::write_orc(opts); + + timer.stop(); + encoded_file_size = source_sink.size(); + }); + + state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "pmu", "Peak Memory Usage"); + state.add_buffer_size(encoded_file_size, "efs", "Encoded File Size"); + state.add_buffer_size(view.num_rows(), "trc", "Total Rows"); +} + +void nvbench_orc_chunked_write(nvbench::state& state) +{ + cudf::size_type num_cols = state.get_int64("num_columns"); + cudf::size_type num_tables = state.get_int64("num_chunks"); + + std::vector> tables; + for (cudf::size_type idx = 0; idx < num_tables; idx++) { + tables.push_back( + create_random_table(cycle_dtypes(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED), + int32_t(type_group_id::FLOATING_POINT), + int32_t(type_group_id::FIXED_POINT), + int32_t(type_group_id::TIMESTAMP), + int32_t(cudf::type_id::STRING), + 
int32_t(cudf::type_id::STRUCT), + int32_t(cudf::type_id::LIST)}), + num_cols), + table_size_bytes{size_t(data_size / num_tables)})); + } + + auto mem_stats_logger = cudf::memory_stats_logger(); + + auto size_iter = thrust::make_transform_iterator( + tables.begin(), [](auto const& i) { return i->num_columns() * i->num_rows(); }); + auto row_count_iter = + thrust::make_transform_iterator(tables.begin(), [](auto const& i) { return i->num_rows(); }); + auto total_elements = std::accumulate(size_iter, size_iter + num_tables, 0); + auto total_rows = std::accumulate(row_count_iter, row_count_iter + num_tables, 0); + + state.add_global_memory_reads(data_size); + state.add_element_count(total_elements); + + size_t encoded_file_size = 0; + + state.exec( + nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) { + cuio_source_sink_pair source_sink(io_type::VOID); + timer.start(); + + cudf_io::chunked_orc_writer_options opts = + cudf_io::chunked_orc_writer_options::builder(source_sink.make_sink_info()); + cudf_io::orc_chunked_writer writer(opts); + std::for_each(tables.begin(), + tables.end(), + [&writer](std::unique_ptr const& tbl) { writer.write(*tbl); }); + writer.close(); + + timer.stop(); + encoded_file_size = source_sink.size(); + }); + + state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "pmu", "Peak Memory Usage"); + state.add_buffer_size(encoded_file_size, "efs", "Encoded File Size"); + state.add_buffer_size(total_rows, "trc", "Total Rows"); +} + +NVBENCH_BENCH(nvbench_orc_write) + .set_name("orc_write") + .set_min_samples(4) + .add_int64_axis("num_columns", {8, 64}); + +NVBENCH_BENCH(nvbench_orc_chunked_write) + .set_name("orc_chunked_write") + .set_min_samples(4) + .add_int64_axis("num_columns", {8, 64}) + .add_int64_axis("num_chunks", {8, 64}); diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 5232821d113..332b0d9dc96 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -22,7 +22,7 @@ function(find_and_configure_cucollections) GLOBAL_TARGETS cuco::cuco BUILD_EXPORT_SET cudf-exports CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG fb58a38701f1c24ecfe07d8f1f208bbe80930da5 + GIT_TAG 8b15f06f38d034e815bc72045ca3403787f75e07 EXCLUDE_FROM_ALL ${BUILD_SHARED_LIBS} OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" ) diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp index 751b7c00e8a..45d4c3b5ae4 100644 --- a/cpp/include/cudf/detail/structs/utilities.hpp +++ b/cpp/include/cudf/detail/structs/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -245,6 +245,20 @@ std::tuple> superimpose_parent table_view const& table, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Checks if a column or any of its children is a struct column with structs that are null. + * + * This function searches for structs that are null -- differentiating between structs that are null + * and structs containing null values. 
Null structs add a column to the result of the flatten column
+ * utility and necessitate column_nullability::FORCE when flattening the column for comparison
+ * operations.
+ *
+ * @param col Column to check for null structs
+ * @return A boolean indicating if the column is or contains a struct column that contains a null
+ * struct.
+ */
+bool contains_null_structs(column_view const& col);
 }  // namespace detail
 }  // namespace structs
 }  // namespace cudf
diff --git a/cpp/include/cudf/lists/detail/stream_compaction.hpp b/cpp/include/cudf/lists/detail/stream_compaction.hpp
new file mode 100644
index 00000000000..0e9f2ec16c4
--- /dev/null
+++ b/cpp/include/cudf/lists/detail/stream_compaction.hpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+
+#include
+
+namespace cudf::lists::detail {
+
+/**
+ * @copydoc cudf::lists::apply_boolean_mask(lists_column_view const&, lists_column_view const&,
+ * rmm::mr::device_memory_resource*)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+std::unique_ptr<column> apply_boolean_mask(
+  lists_column_view const& input,
+  lists_column_view const& boolean_mask,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+}  // namespace cudf::lists::detail
diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp
new file mode 100644
index 00000000000..c7a9731eb65
--- /dev/null
+++ b/cpp/include/cudf/lists/stream_compaction.hpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+
+#include
+
+namespace cudf::lists {
+
+/**
+ * @brief Filters elements in each row of `input` LIST column using `boolean_mask`
+ * LIST of booleans as a mask.
+ *
+ * Given an input `LIST` column and a list-of-bools column, the function produces
+ * a new `LIST` column of the same type as `input`, where each element is copied
+ * from the input row *only* if the corresponding `boolean_mask` is non-null and `true`.
+ *
+ * E.g.
+ * @code{.pseudo}
+ * input        = { {0,1,2}, {3,4}, {5,6,7}, {8,9} };
+ * boolean_mask = { {0,1,1}, {1,0}, {1,1,1}, {0,0} };
+ * results      = { {1,2}, {3}, {5,6,7}, {} };
+ * @endcode
+ *
+ * `input` and `boolean_mask` must have the same number of rows.
+ * The output column has the same number of rows as the input column.
+ * An element is copied to an output row *only* if the corresponding boolean_mask element is `true`.
+ * An output row is invalid only if the input row is invalid.
+ *
+ * @throws cudf::logic_error if `boolean_mask` is not a "lists of bools" column
+ * @throws cudf::logic_error if `input` and `boolean_mask` have different numbers of rows
+ *
+ * @param input The input list column view to be filtered
+ * @param boolean_mask A nullable list of bools column used to filter `input` elements
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of the same type as `input`, containing filtered list rows
+ */
+std::unique_ptr<column> apply_boolean_mask(
+  lists_column_view const& input,
+  lists_column_view const& boolean_mask,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+}  // namespace cudf::lists
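
For context, a minimal host-side usage sketch of the new API (illustrative only, not part of the patch; the cudf_test column wrappers are used purely for brevity):

#include <cudf/lists/lists_column_view.hpp>
#include <cudf/lists/stream_compaction.hpp>
#include <cudf_test/column_wrapper.hpp>

void filter_lists_example()
{
  // Same data as the @code example in the header above.
  auto const input =
    cudf::test::lists_column_wrapper<int32_t>{{0, 1, 2}, {3, 4}, {5, 6, 7}, {8, 9}};
  auto const mask = cudf::test::lists_column_wrapper<bool>{
    {false, true, true}, {true, false}, {true, true, true}, {false, false}};

  // Expected result: { {1,2}, {3}, {5,6,7}, {} }
  auto const result = cudf::lists::apply_boolean_mask(cudf::lists_column_view{input},
                                                      cudf::lists_column_view{mask});
}
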
diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
index 4eca03a800c..4d503cd53b8 100644
--- a/cpp/include/cudf/table/row_operators.cuh
+++ b/cpp/include/cudf/table/row_operators.cuh
@@ -74,7 +74,7 @@ __device__ weak_ordering compare_elements(Element lhs, Element rhs)
  * @brief A specialization for floating-point `Element` type relational comparison
  * to derive the order of the elements with respect to `lhs`.
  *
- * This Specialization handles `nan` in the following order:
+ * This specialization handles `nan` in the following order:
  * `[-Inf, -ve, 0, -0, +ve, +Inf, NaN, NaN, null] (for null_order::AFTER)`
  * `[null, -Inf, -ve, 0, -0, +ve, +Inf, NaN, NaN] (for null_order::BEFORE)`
  *
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index ed24517f55b..d8fa7bff0b8 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -699,6 +699,37 @@ constexpr inline bool is_nested(data_type type)
   return cudf::type_dispatcher(type, is_nested_impl{});
 }
 
+/**
+ * @brief Indicates whether `T` is a struct type.
+ *
+ * @param T The type to verify
+ * @return A boolean indicating if T is a struct type
+ */
+template <typename T>
+constexpr inline bool is_struct()
+{
+  return std::is_same_v<T, cudf::struct_view>;
+}
+
+struct is_struct_impl {
+  template <typename T>
+  constexpr bool operator()()
+  {
+    return is_struct<T>();
+  }
+};
+
+/**
+ * @brief Indicates whether `type` is a struct type.
+ *
+ * @param type The `data_type` to verify
+ * @return A boolean indicating if `type` is a struct type
+ */
+constexpr inline bool is_struct(data_type type)
+{
+  return cudf::type_dispatcher(type, is_struct_impl{});
+}
+
 template
 struct is_bit_castable_to_impl {
   template ()>* = nullptr>
diff --git a/cpp/src/binaryop/compiled/ATan2.cu b/cpp/src/binaryop/compiled/ATan2.cu
index 8e5cbf57f55..f43a469a2c9 100644
--- a/cpp/src/binaryop/compiled/ATan2.cu
+++ b/cpp/src/binaryop/compiled/ATan2.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
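
The new trait follows this header's usual pairing of a compile-time predicate with a type-dispatched runtime overload; a short sketch of how each form is used (illustrative only, not part of the patch):

#include <cudf/column/column_view.hpp>
#include <cudf/utilities/traits.hpp>

// Runtime form: branch on a column's data_type.
bool needs_struct_path(cudf::column_view const& col) { return cudf::is_struct(col.type()); }

// Compile-time form: branch inside a type-dispatched functor.
struct example_functor {
  template <typename T>
  bool operator()() const
  {
    if constexpr (cudf::is_struct<T>()) {
      return true;  // struct-specific handling would go here
    }
    return false;
  }
};
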
@@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/Add.cu b/cpp/src/binaryop/compiled/Add.cu index 4cd2ced66f4..1dbfa5b4718 100644 --- a/cpp/src/binaryop/compiled/Add.cu +++ b/cpp/src/binaryop/compiled/Add.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/BitwiseAnd.cu b/cpp/src/binaryop/compiled/BitwiseAnd.cu index 6abac2bd197..cfabb1402ce 100644 --- a/cpp/src/binaryop/compiled/BitwiseAnd.cu +++ b/cpp/src/binaryop/compiled/BitwiseAnd.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/BitwiseOr.cu b/cpp/src/binaryop/compiled/BitwiseOr.cu index 6d523cbf1d1..01ef118665b 100644 --- a/cpp/src/binaryop/compiled/BitwiseOr.cu +++ b/cpp/src/binaryop/compiled/BitwiseOr.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/BitwiseXor.cu b/cpp/src/binaryop/compiled/BitwiseXor.cu index 45175681574..44f74bab876 100644 --- a/cpp/src/binaryop/compiled/BitwiseXor.cu +++ b/cpp/src/binaryop/compiled/BitwiseXor.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/Div.cu b/cpp/src/binaryop/compiled/Div.cu index 7cc895ecd06..f377778c427 100644 --- a/cpp/src/binaryop/compiled/Div.cu +++ b/cpp/src/binaryop/compiled/Div.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/FloorDiv.cu b/cpp/src/binaryop/compiled/FloorDiv.cu index 99ea2706b86..f9cd323caec 100644 --- a/cpp/src/binaryop/compiled/FloorDiv.cu +++ b/cpp/src/binaryop/compiled/FloorDiv.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/Greater.cu b/cpp/src/binaryop/compiled/Greater.cu index 679e029b5fc..db06cc409da 100644 --- a/cpp/src/binaryop/compiled/Greater.cu +++ b/cpp/src/binaryop/compiled/Greater.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/GreaterEqual.cu b/cpp/src/binaryop/compiled/GreaterEqual.cu index 23b0c6aaa0d..c239e1e1345 100644 --- a/cpp/src/binaryop/compiled/GreaterEqual.cu +++ b/cpp/src/binaryop/compiled/GreaterEqual.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/Less.cu b/cpp/src/binaryop/compiled/Less.cu index 7ab5dfe3478..e8663715c87 100644 --- a/cpp/src/binaryop/compiled/Less.cu +++ b/cpp/src/binaryop/compiled/Less.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/LessEqual.cu b/cpp/src/binaryop/compiled/LessEqual.cu index 983c50c9575..d2f88fab81b 100644 --- a/cpp/src/binaryop/compiled/LessEqual.cu +++ b/cpp/src/binaryop/compiled/LessEqual.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/LogBase.cu b/cpp/src/binaryop/compiled/LogBase.cu index bdc709b86bf..8a2162c4ca4 100644 --- a/cpp/src/binaryop/compiled/LogBase.cu +++ b/cpp/src/binaryop/compiled/LogBase.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/LogicalAnd.cu b/cpp/src/binaryop/compiled/LogicalAnd.cu index 08112fadfff..64e5c1a31c0 100644 --- a/cpp/src/binaryop/compiled/LogicalAnd.cu +++ b/cpp/src/binaryop/compiled/LogicalAnd.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/LogicalOr.cu b/cpp/src/binaryop/compiled/LogicalOr.cu index bc400afd4cd..a4b64cc6afc 100644 --- a/cpp/src/binaryop/compiled/LogicalOr.cu +++ b/cpp/src/binaryop/compiled/LogicalOr.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/Mod.cu b/cpp/src/binaryop/compiled/Mod.cu index 0b82c09c8a6..fcdd01b7be8 100644 --- a/cpp/src/binaryop/compiled/Mod.cu +++ b/cpp/src/binaryop/compiled/Mod.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/Mul.cu b/cpp/src/binaryop/compiled/Mul.cu index 15394245259..de6506d43f1 100644 --- a/cpp/src/binaryop/compiled/Mul.cu +++ b/cpp/src/binaryop/compiled/Mul.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/NullEquals.cu b/cpp/src/binaryop/compiled/NullEquals.cu index 3fc76e804f7..f4780c13bef 100644 --- a/cpp/src/binaryop/compiled/NullEquals.cu +++ b/cpp/src/binaryop/compiled/NullEquals.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/NullLogicalAnd.cu b/cpp/src/binaryop/compiled/NullLogicalAnd.cu index 48ae125bc93..55e71a52dae 100644 --- a/cpp/src/binaryop/compiled/NullLogicalAnd.cu +++ b/cpp/src/binaryop/compiled/NullLogicalAnd.cu @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/NullLogicalOr.cu b/cpp/src/binaryop/compiled/NullLogicalOr.cu index e0ea95ac3ee..ee3b27c0934 100644 --- a/cpp/src/binaryop/compiled/NullLogicalOr.cu +++ b/cpp/src/binaryop/compiled/NullLogicalOr.cu @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/NullMax.cu b/cpp/src/binaryop/compiled/NullMax.cu index 78a44041cba..6fae253d41f 100644 --- a/cpp/src/binaryop/compiled/NullMax.cu +++ b/cpp/src/binaryop/compiled/NullMax.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/NullMin.cu b/cpp/src/binaryop/compiled/NullMin.cu index 629ab600fd7..cb7fdb4f76a 100644 --- a/cpp/src/binaryop/compiled/NullMin.cu +++ b/cpp/src/binaryop/compiled/NullMin.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/PMod.cu b/cpp/src/binaryop/compiled/PMod.cu index 36902c0ed10..63b1f1f8269 100644 --- a/cpp/src/binaryop/compiled/PMod.cu +++ b/cpp/src/binaryop/compiled/PMod.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/Pow.cu b/cpp/src/binaryop/compiled/Pow.cu index c6f897ee18d..435e1ac044a 100644 --- a/cpp/src/binaryop/compiled/Pow.cu +++ b/cpp/src/binaryop/compiled/Pow.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/PyMod.cu b/cpp/src/binaryop/compiled/PyMod.cu index b05dcd8e7bc..1e213598681 100644 --- a/cpp/src/binaryop/compiled/PyMod.cu +++ b/cpp/src/binaryop/compiled/PyMod.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/ShiftLeft.cu b/cpp/src/binaryop/compiled/ShiftLeft.cu index 6cc950b2d50..797821a9057 100644 --- a/cpp/src/binaryop/compiled/ShiftLeft.cu +++ b/cpp/src/binaryop/compiled/ShiftLeft.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/ShiftRight.cu b/cpp/src/binaryop/compiled/ShiftRight.cu index 1ddd7100a73..8a2566ff775 100644 --- a/cpp/src/binaryop/compiled/ShiftRight.cu +++ b/cpp/src/binaryop/compiled/ShiftRight.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
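
Each of these per-operator .cu files carries a single explicit instantiation of apply_binary_op, one operator functor per translation unit; the patch only swaps device views for host views in that signature. A sketch of the full post-change form of one such instantiation, with its template argument spelled out (ops::Pow shown):

#include "binary_ops.cuh"

namespace cudf::binops::compiled {
template void apply_binary_op<ops::Pow>(mutable_column_view&,
                                        column_view const&,
                                        column_view const&,
                                        bool is_lhs_scalar,
                                        bool is_rhs_scalar,
                                        rmm::cuda_stream_view);
}  // namespace cudf::binops::compiled
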
@@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/ShiftRightUnsigned.cu b/cpp/src/binaryop/compiled/ShiftRightUnsigned.cu index a87b4b9f9ac..827029bc75c 100644 --- a/cpp/src/binaryop/compiled/ShiftRightUnsigned.cu +++ b/cpp/src/binaryop/compiled/ShiftRightUnsigned.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/Sub.cu b/cpp/src/binaryop/compiled/Sub.cu index e0cf47c1310..3022294f86f 100644 --- a/cpp/src/binaryop/compiled/Sub.cu +++ b/cpp/src/binaryop/compiled/Sub.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/TrueDiv.cu b/cpp/src/binaryop/compiled/TrueDiv.cu index d8f1d956340..4d0fc2d456b 100644 --- a/cpp/src/binaryop/compiled/TrueDiv.cu +++ b/cpp/src/binaryop/compiled/TrueDiv.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,9 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -template void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view); diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index c01359b80d0..d260aa6d6a0 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -37,23 +37,20 @@ namespace compiled { namespace { /** - * @brief Converts scalar to column_device_view with single element. + * @brief Converts scalar to column_view with single element. 
 *
- * @return pair with column_device_view and column containing any auxilary data to create
- * column_view from scalar
+ * @return pair with column_view and column containing any auxiliary data to create column_view from
+ * scalar
  */
-struct scalar_as_column_device_view {
-  using return_type = typename std::pair>;
+struct scalar_as_column_view {
+  using return_type = typename std::pair<column_view, std::unique_ptr<column>>;
   template ())>* = nullptr>
-  return_type operator()(scalar const& s,
-                         rmm::cuda_stream_view stream,
-                         rmm::mr::device_memory_resource*)
+  return_type operator()(scalar const& s, rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
   {
     auto& h_scalar_type_view = static_cast&>(const_cast(s));
     auto col_v =
       column_view(s.type(), 1, h_scalar_type_view.data(), (bitmask_type const*)s.validity_data());
-    return std::pair{column_device_view::create(col_v, stream), std::unique_ptr(nullptr)};
+    return std::pair{col_v, std::unique_ptr<column>(nullptr)};
   }
   template ())>* = nullptr>
   return_type operator()(scalar const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
@@ -63,10 +60,8 @@
 };
 // specialization for cudf::string_view
 template <>
-scalar_as_column_device_view::return_type
-scalar_as_column_device_view::operator()(scalar const& s,
-                                         rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+scalar_as_column_view::return_type scalar_as_column_view::operator()(
+  scalar const& s, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
 {
   using T                  = cudf::string_view;
   auto& h_scalar_type_view = static_cast&>(const_cast(s));
@@ -87,24 +82,24 @@ scalar_as_column_device_view::operator()(scalar const& s,
                             cudf::UNKNOWN_NULL_COUNT,
                             0,
                             {offsets_column->view(), chars_column_v});
-  return std::pair{column_device_view::create(col_v, stream), std::move(offsets_column)};
+  return std::pair{col_v, std::move(offsets_column)};
 }
 
 /**
- * @brief Converts scalar to column_device_view with single element.
+ * @brief Converts scalar to column_view with single element.
  *
  * @param scal scalar to convert
  * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource used to allocate the returned column's device memory
- * @return pair with column_device_view and column containing any auxilary data to create
+ * @return pair with column_view and column containing any auxiliary data to create
  * column_view from scalar
  */
-auto scalar_to_column_device_view(
+auto scalar_to_column_view(
   scalar const& scal,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  return type_dispatcher(scal.type(), scalar_as_column_device_view{}, scal, stream, mr);
+  return type_dispatcher(scal.type(), scalar_as_column_view{}, scal, stream, mr);
 }
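
The conversion above works because a fixed-width scalar's device storage already has the layout of a one-row column: one value slot plus a validity flag. A standalone sketch of the idea (illustrative only, not part of the patch; fixed-width numeric case shown):

#include <cudf/column/column_view.hpp>
#include <cudf/scalar/scalar.hpp>

// Alias a numeric scalar's device storage as a single-element, possibly-null column.
cudf::column_view as_one_row_view(cudf::numeric_scalar<int32_t>& s)
{
  return cudf::column_view{
    s.type(), 1, s.data(), reinterpret_cast<cudf::bitmask_type const*>(s.validity_data())};
}
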
 
 // This functor does the actual comparison between string column value and a scalar string
@@ -300,9 +295,9 @@ std::unique_ptr string_null_min_max(column_view const& lhs,
     *lhs_device_view, *rhs_device_view, op, output_type, lhs.size(), stream, mr);
 }
 
-void operator_dispatcher(mutable_column_device_view& out,
-                         column_device_view const& lhs,
-                         column_device_view const& rhs,
+void operator_dispatcher(mutable_column_view& out,
+                         column_view const& lhs,
+                         column_view const& rhs,
                          bool is_lhs_scalar,
                          bool is_rhs_scalar,
                          binary_operator op,
@@ -358,10 +353,7 @@ void binary_operation(mutable_column_view& out,
                       binary_operator op,
                       rmm::cuda_stream_view stream)
 {
-  auto lhsd = column_device_view::create(lhs, stream);
-  auto rhsd = column_device_view::create(rhs, stream);
-  auto outd = mutable_column_device_view::create(out, stream);
-  operator_dispatcher(*outd, *lhsd, *rhsd, false, false, op, stream);
+  operator_dispatcher(out, lhs, rhs, false, false, op, stream);
 }
 // scalar_vector
 void binary_operation(mutable_column_view& out,
@@ -370,10 +362,8 @@ void binary_operation(mutable_column_view& out,
                       binary_operator op,
                       rmm::cuda_stream_view stream)
 {
-  auto [lhsd, aux] = scalar_to_column_device_view(lhs, stream);
-  auto rhsd        = column_device_view::create(rhs, stream);
-  auto outd        = mutable_column_device_view::create(out, stream);
-  operator_dispatcher(*outd, *lhsd, *rhsd, true, false, op, stream);
+  auto [lhsv, aux] = scalar_to_column_view(lhs, stream);
+  operator_dispatcher(out, lhsv, rhs, true, false, op, stream);
 }
 // vector_scalar
 void binary_operation(mutable_column_view& out,
@@ -382,12 +372,9 @@ void binary_operation(mutable_column_view& out,
                       binary_operator op,
                       rmm::cuda_stream_view stream)
 {
-  auto lhsd        = column_device_view::create(lhs, stream);
-  auto [rhsd, aux] = scalar_to_column_device_view(rhs, stream);
-  auto outd        = mutable_column_device_view::create(out, stream);
-  operator_dispatcher(*outd, *lhsd, *rhsd, false, true, op, stream);
+  auto [rhsv, aux] = scalar_to_column_view(rhs, stream);
+  operator_dispatcher(out, lhs, rhsv, false, true, op, stream);
 }
-
 }  // namespace compiled
 }  // namespace binops
 }  // namespace cudf
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index ec41fbb8883..d88d2be2499 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -20,6 +20,7 @@
 #include "operation.cuh"
 
 #include
+#include
 #include
 #include
 
@@ -271,30 +272,36 @@ void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f)
   const int grid_size = util::div_rounding_up_safe(size, 2 * block_size);
   for_each_kernel<<>>(size, std::forward(f));
 }
-
+namespace detail {
+template <class T, class... Ts>
+inline constexpr bool is_any_v = std::disjunction<std::is_same<T, Ts>...>::value;
+}
 template
-void apply_binary_op(mutable_column_device_view& outd,
-                     column_device_view const& lhsd,
-
column_device_view const& rhsd, +void apply_binary_op(mutable_column_view& out, + column_view const& lhs, + column_view const& rhs, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view stream) { - auto common_dtype = get_common_type(outd.type(), lhsd.type(), rhsd.type()); + auto common_dtype = get_common_type(out.type(), lhs.type(), rhs.type()); + auto lhsd = column_device_view::create(lhs, stream); + auto rhsd = column_device_view::create(rhs, stream); + auto outd = mutable_column_device_view::create(out, stream); // Create binop functor instance if (common_dtype) { // Execute it on every element for_each(stream, - outd.size(), + out.size(), binary_op_device_dispatcher{ - *common_dtype, outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar}); + *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } else { // Execute it on every element for_each(stream, - outd.size(), + out.size(), binary_op_double_device_dispatcher{ - outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar}); + *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } } diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp index 26a0f26b59c..d1a40e15326 100644 --- a/cpp/src/binaryop/compiled/binary_ops.hpp +++ b/cpp/src/binaryop/compiled/binary_ops.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -175,45 +175,45 @@ bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_ // Defined in individual .cu files. /** * @brief Deploys single type or double type dispatcher that runs binary operation on each element - * of @p lhsd and @p rhsd columns. + * of @p lhs and @p rhs columns. * * This template is instantiated for each binary operator. * * @tparam BinaryOperator Binary operator functor - * @param outd mutable device view of output column - * @param lhsd device view of left operand column - * @param rhsd device view of right operand column - * @param is_lhs_scalar true if @p lhsd is a single element column representing a scalar - * @param is_rhs_scalar true if @p rhsd is a single element column representing a scalar + * @param out mutable view of output column + * @param lhs view of left operand column + * @param rhs view of right operand column + * @param is_lhs_scalar true if @p lhs is a single element column representing a scalar + * @param is_rhs_scalar true if @p rhs is a single element column representing a scalar * @param stream CUDA stream used for device memory operations */ template -void apply_binary_op(mutable_column_device_view&, - column_device_view const&, - column_device_view const&, +void apply_binary_op(mutable_column_view& out, + column_view const& lhs, + column_view const& rhs, bool is_lhs_scalar, bool is_rhs_scalar, rmm::cuda_stream_view stream); /** * @brief Deploys single type or double type dispatcher that runs equality operation on each element - * of @p lhsd and @p rhsd columns. + * of @p lhs and @p rhs columns. * * Comparison operators are EQUAL, NOT_EQUAL, NULL_EQUALS. - * @p outd type is boolean. + * @p out type is boolean. * * This template is instantiated for each binary operator. 
* - * @param outd mutable device view of output column - * @param lhsd device view of left operand column - * @param rhsd device view of right operand column - * @param is_lhs_scalar true if @p lhsd is a single element column representing a scalar - * @param is_rhs_scalar true if @p rhsd is a single element column representing a scalar + * @param out mutable view of output column + * @param lhs view of left operand column + * @param rhs view of right operand column + * @param is_lhs_scalar true if @p lhs is a single element column representing a scalar + * @param is_rhs_scalar true if @p rhs is a single element column representing a scalar * @param op comparison binary operator * @param stream CUDA stream used for device memory operations */ -void dispatch_equality_op(mutable_column_device_view& outd, - column_device_view const& lhsd, - column_device_view const& rhsd, +void dispatch_equality_op(mutable_column_view& out, + column_view const& lhs, + column_view const& rhs, bool is_lhs_scalar, bool is_rhs_scalar, binary_operator op, diff --git a/cpp/src/binaryop/compiled/equality_ops.cu b/cpp/src/binaryop/compiled/equality_ops.cu index 03c3e373476..61f02252a26 100644 --- a/cpp/src/binaryop/compiled/equality_ops.cu +++ b/cpp/src/binaryop/compiled/equality_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,41 +17,43 @@ #include "binary_ops.cuh" namespace cudf::binops::compiled { -void dispatch_equality_op(mutable_column_device_view& outd, - column_device_view const& lhsd, - column_device_view const& rhsd, +void dispatch_equality_op(mutable_column_view& out, + column_view const& lhs, + column_view const& rhs, bool is_lhs_scalar, bool is_rhs_scalar, binary_operator op, rmm::cuda_stream_view stream) { - auto common_dtype = get_common_type(outd.type(), lhsd.type(), rhsd.type()); - - // Execute it on every element - + CUDF_EXPECTS(op == binary_operator::EQUAL || op == binary_operator::NOT_EQUAL, + "Unsupported operator for these types"); + auto common_dtype = get_common_type(out.type(), lhs.type(), rhs.type()); + auto outd = mutable_column_device_view::create(out, stream); + auto lhsd = column_device_view::create(lhs, stream); + auto rhsd = column_device_view::create(rhs, stream); if (common_dtype) { if (op == binary_operator::EQUAL) { for_each(stream, - outd.size(), + out.size(), binary_op_device_dispatcher{ - *common_dtype, outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar}); + *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } else if (op == binary_operator::NOT_EQUAL) { for_each(stream, - outd.size(), + out.size(), binary_op_device_dispatcher{ - *common_dtype, outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar}); + *common_dtype, *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } } else { if (op == binary_operator::EQUAL) { for_each(stream, - outd.size(), + out.size(), binary_op_double_device_dispatcher{ - outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar}); + *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } else if (op == binary_operator::NOT_EQUAL) { for_each(stream, - outd.size(), + out.size(), binary_op_double_device_dispatcher{ - outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar}); + *outd, *lhsd, *rhsd, is_lhs_scalar, is_rhs_scalar}); } } } diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index ecd2d6f6ec0..0ad33821dd7 100644 --- 
a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -54,6 +54,9 @@ #include #include +#include +#include + #include namespace cudf { @@ -1233,8 +1236,7 @@ writer::impl::encoded_footer_statistics writer::impl::finish_statistic_blobs( auto const num_stripe_blobs = thrust::reduce(stripe_size_iter, stripe_size_iter + per_chunk_stats.stripe_stat_merge.size()); auto const num_file_blobs = num_columns; - auto const num_blobs = single_write_mode ? static_cast(num_stripe_blobs + num_file_blobs) - : static_cast(num_stripe_blobs); + auto const num_blobs = static_cast(num_stripe_blobs + num_file_blobs); if (num_stripe_blobs == 0) { return {}; } @@ -1242,46 +1244,53 @@ writer::impl::encoded_footer_statistics writer::impl::finish_statistic_blobs( rmm::device_uvector stat_chunks(num_blobs, stream); hostdevice_vector stats_merge(num_blobs, stream); - size_t chunk_offset = 0; - size_t merge_offset = 0; + // we need to merge the stat arrays from the persisted data. + // this needs to be done carefully because each array can contain + // a different number of stripes and stripes from each column must be + // located next to each other. We know the total number of stripes and + // we know the size of each array. The number of stripes per column in a chunk array can + // be calculated by dividing the number of chunks by the number of columns. + // That many chunks need to be copied at a time to the proper destination. + size_t num_entries_seen = 0; for (size_t i = 0; i < per_chunk_stats.stripe_stat_chunks.size(); ++i) { - auto chunk_bytes = per_chunk_stats.stripe_stat_chunks[i].size() * sizeof(statistics_chunk); - auto merge_bytes = per_chunk_stats.stripe_stat_merge[i].size() * sizeof(statistics_merge_group); - cudaMemcpyAsync(stat_chunks.data() + chunk_offset, - per_chunk_stats.stripe_stat_chunks[i].data(), - chunk_bytes, - cudaMemcpyDeviceToDevice, - stream); - cudaMemcpyAsync(stats_merge.device_ptr() + merge_offset, - per_chunk_stats.stripe_stat_merge[i].device_ptr(), - merge_bytes, - cudaMemcpyDeviceToDevice, - stream); - chunk_offset += per_chunk_stats.stripe_stat_chunks[i].size(); - merge_offset += per_chunk_stats.stripe_stat_merge[i].size(); + auto const stripes_per_col = per_chunk_stats.stripe_stat_chunks[i].size() / num_columns; + + auto const chunk_bytes = stripes_per_col * sizeof(statistics_chunk); + auto const merge_bytes = stripes_per_col * sizeof(statistics_merge_group); + for (size_t col = 0; col < num_columns; ++col) { + cudaMemcpyAsync(stat_chunks.data() + (num_stripes * col) + num_entries_seen, + per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col, + chunk_bytes, + cudaMemcpyDeviceToDevice, + stream); + cudaMemcpyAsync(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen, + per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col, + merge_bytes, + cudaMemcpyDeviceToDevice, + stream); + } + num_entries_seen += stripes_per_col; } - if (single_write_mode) { - std::vector file_stats_merge(num_file_blobs); - for (auto i = 0u; i < num_file_blobs; ++i) { - auto col_stats = &file_stats_merge[i]; - col_stats->col_dtype = per_chunk_stats.col_types[i]; - col_stats->stats_dtype = per_chunk_stats.stats_dtypes[i]; - col_stats->start_chunk = static_cast(i * num_stripes); - col_stats->num_chunks = static_cast(num_stripes); - } + std::vector file_stats_merge(num_file_blobs); + for (auto i = 0u; i < num_file_blobs; ++i) { + auto col_stats = &file_stats_merge[i]; + col_stats->col_dtype = per_chunk_stats.col_types[i]; + col_stats->stats_dtype = 
per_chunk_stats.stats_dtypes[i]; + col_stats->start_chunk = static_cast(i * num_stripes); + col_stats->num_chunks = static_cast(num_stripes); + } - auto d_file_stats_merge = stats_merge.device_ptr(num_stripe_blobs); - cudaMemcpyAsync(d_file_stats_merge, - file_stats_merge.data(), - num_file_blobs * sizeof(statistics_merge_group), - cudaMemcpyHostToDevice, - stream); + auto d_file_stats_merge = stats_merge.device_ptr(num_stripe_blobs); + cudaMemcpyAsync(d_file_stats_merge, + file_stats_merge.data(), + num_file_blobs * sizeof(statistics_merge_group), + cudaMemcpyHostToDevice, + stream); - auto file_stat_chunks = stat_chunks.data() + num_stripe_blobs; - detail::merge_group_statistics( - file_stat_chunks, stat_chunks.data(), d_file_stats_merge, num_file_blobs, stream); - } + auto file_stat_chunks = stat_chunks.data() + num_stripe_blobs; + detail::merge_group_statistics( + file_stat_chunks, stat_chunks.data(), d_file_stats_merge, num_file_blobs, stream); hostdevice_vector blobs = allocate_and_encode_blobs(stats_merge, stat_chunks, num_blobs, stream); @@ -1295,14 +1304,12 @@ writer::impl::encoded_footer_statistics writer::impl::finish_statistic_blobs( stripe_blobs[i].assign(stat_begin, stat_end); } - std::vector file_blobs(single_write_mode ? num_file_blobs : 0); - if (single_write_mode) { - auto file_stat_merge = stats_merge.host_ptr(num_stripe_blobs); - for (auto i = 0u; i < num_file_blobs; i++) { - auto const stat_begin = blobs.host_ptr(file_stat_merge[i].start_chunk); - auto const stat_end = stat_begin + file_stat_merge[i].num_chunks; - file_blobs[i].assign(stat_begin, stat_end); - } + std::vector file_blobs(num_file_blobs); + auto file_stat_merge = stats_merge.host_ptr(num_stripe_blobs); + for (auto i = 0u; i < num_file_blobs; i++) { + auto const stat_begin = blobs.host_ptr(file_stat_merge[i].start_chunk); + auto const stat_end = stat_begin + file_stat_merge[i].num_chunks; + file_blobs[i].assign(stat_begin, stat_end); } return {std::move(stripe_blobs), std::move(file_blobs)}; @@ -1937,6 +1944,91 @@ string_dictionaries allocate_dictionaries(orc_table_view const& orc_table, std::move(is_dict_enabled)}; } +struct string_length_functor { + __device__ inline size_type operator()(int const i) const + { + // we translate from 0 -> num_chunks * 2 because each statistic has a min and max + // string and we need to calculate lengths for both. + if (i >= num_chunks * 2) return 0; + + // min strings are even values, max strings are odd values of i + auto const should_copy_min = i % 2 == 0; + // index of the chunk + auto const idx = i / 2; + auto& str_val = should_copy_min ? stripe_stat_chunks[idx].min_value.str_val + : stripe_stat_chunks[idx].max_value.str_val; + auto const str = stripe_stat_merge[idx].stats_dtype == dtype_string; + return str ? str_val.length : 0; + } + + int const num_chunks; + statistics_chunk const* stripe_stat_chunks; + statistics_merge_group const* stripe_stat_merge; +}; + +__global__ void copy_string_data(char* string_pool, + size_type* offsets, + statistics_chunk* chunks, + statistics_merge_group const* groups) +{ + auto const idx = blockIdx.x / 2; + if (groups[idx].stats_dtype == dtype_string) { + // min strings are even values, max strings are odd values of i + auto const should_copy_min = blockIdx.x % 2 == 0; + auto& str_val = should_copy_min ? 
chunks[idx].min_value.str_val : chunks[idx].max_value.str_val; + auto dst = &string_pool[offsets[blockIdx.x]]; + auto src = str_val.ptr; + + for (int i = threadIdx.x; i < str_val.length; i += blockDim.x) { + dst[i] = src[i]; + } + if (threadIdx.x == 0) { str_val.ptr = dst; } + } +} + +void writer::impl::persisted_statistics::persist(int num_table_rows, + bool single_write_mode, + intermediate_statistics& intermediate_stats, + rmm::cuda_stream_view stream) +{ + if (not single_write_mode) { + // persist the strings in the chunks into a string pool and update pointers + auto const num_chunks = static_cast(intermediate_stats.stripe_stat_chunks.size()); + // min offset and max offset + 1 for total size + rmm::device_uvector offsets((num_chunks * 2) + 1, stream); + + auto iter = cudf::detail::make_counting_transform_iterator( + 0, + string_length_functor{num_chunks, + intermediate_stats.stripe_stat_chunks.data(), + intermediate_stats.stripe_stat_merge.device_ptr()}); + thrust::exclusive_scan(rmm::exec_policy(stream), iter, iter + offsets.size(), offsets.begin()); + + // pull size back to host + auto const total_string_pool_size = offsets.element(num_chunks * 2, stream); + if (total_string_pool_size > 0) { + rmm::device_uvector string_pool(total_string_pool_size, stream); + + // offsets describes where in the string pool each string goes. Going with the simple + // approach for now, but it is possible something fancier with breaking up each thread into + // copying x bytes instead of a single string is the better method since we are dealing in + // min/max strings they almost certainly will not be uniform length. + copy_string_data<<>>( + string_pool.data(), + offsets.data(), + intermediate_stats.stripe_stat_chunks.data(), + intermediate_stats.stripe_stat_merge.device_ptr()); + string_pools.emplace_back(std::move(string_pool)); + } + } + + stripe_stat_chunks.emplace_back(std::move(intermediate_stats.stripe_stat_chunks)); + stripe_stat_merge.emplace_back(std::move(intermediate_stats.stripe_stat_merge)); + stats_dtypes = std::move(intermediate_stats.stats_dtypes); + col_types = std::move(intermediate_stats.col_types); + num_rows = num_table_rows; +} + void writer::impl::write(table_view const& table) { CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); @@ -2075,13 +2167,8 @@ void writer::impl::write(table_view const& table) auto intermediate_stats = gather_statistic_blobs(stats_freq_, orc_table, segmentation); if (intermediate_stats.stripe_stat_chunks.size() > 0) { - persisted_stripe_statistics.stripe_stat_chunks.emplace_back( - std::move(intermediate_stats.stripe_stat_chunks)); - persisted_stripe_statistics.stripe_stat_merge.emplace_back( - std::move(intermediate_stats.stripe_stat_merge)); - persisted_stripe_statistics.stats_dtypes = std::move(intermediate_stats.stats_dtypes); - persisted_stripe_statistics.col_types = std::move(intermediate_stats.col_types); - persisted_stripe_statistics.num_rows = orc_table.num_rows(); + persisted_stripe_statistics.persist( + orc_table.num_rows(), single_write_mode, intermediate_stats, stream); } // Write stripes @@ -2141,7 +2228,6 @@ void writer::impl::write(table_view const& table) } out_sink_->host_write(buffer_.data(), buffer_.size()); } - for (auto const& task : write_tasks) { task.wait(); } @@ -2204,7 +2290,7 @@ void writer::impl::close() auto const statistics = finish_statistic_blobs(ff.stripes.size(), persisted_stripe_statistics); // File-level statistics - if (single_write_mode and not statistics.file_level.empty()) { + if (not 
statistics.file_level.empty()) { buffer_.resize(0); pbw_.put_uint(encode_field_number(1)); pbw_.put_uint(persisted_stripe_statistics.num_rows); diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index d823c73007f..577c22f8ac3 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -304,7 +304,7 @@ class writer::impl { stats_dtypes(std::move(sdt)), col_types(std::move(sct)){}; - // blobs for the rowgroups and stripes. Not persisted + // blobs for the rowgroups. Not persisted std::vector rowgroup_blobs; rmm::device_uvector stripe_stat_chunks; @@ -322,13 +322,20 @@ class writer::impl { { stripe_stat_chunks.clear(); stripe_stat_merge.clear(); + string_pools.clear(); stats_dtypes.clear(); col_types.clear(); num_rows = 0; } + void persist(int num_table_rows, + bool single_write_mode, + intermediate_statistics& intermediate_stats, + rmm::cuda_stream_view stream); + std::vector> stripe_stat_chunks; std::vector> stripe_stat_merge; + std::vector> string_pools; std::vector stats_dtypes; std::vector col_types; int num_rows = 0; diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 687e553fefd..b7b33000707 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -137,19 +137,28 @@ std::unique_ptr> left_semi_anti_join( auto gather_map = std::make_unique>(left_num_rows, stream, mr); - // gather_map_end will be the end of valid data in gather_map - auto gather_map_end = thrust::copy_if( + rmm::device_uvector flagged(left_num_rows, stream, mr); + auto flagged_d = flagged.data(); + + auto counting_iter = thrust::counting_iterator(0); + thrust::for_each( rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(left_num_rows), - gather_map->begin(), - [hash_table_view, join_type_boolean, hash_probe, equality_probe] __device__( - size_type const idx) { - // Look up this row. The hash function used here needs to map a (left) row index to the hash - // of the row, so it's a row hash. The equality check needs to verify - return hash_table_view.contains(idx, hash_probe, equality_probe) == join_type_boolean; + counting_iter, + counting_iter + left_num_rows, + [flagged_d, hash_table_view, join_type_boolean, hash_probe, equality_probe] __device__( + const size_type idx) { + flagged_d[idx] = + hash_table_view.contains(idx, hash_probe, equality_probe) == join_type_boolean; }); + // gather_map_end will be the end of valid data in gather_map + auto gather_map_end = + thrust::copy_if(rmm::exec_policy(stream), + counting_iter, + counting_iter + left_num_rows, + gather_map->begin(), + [flagged_d] __device__(size_type const idx) { return flagged_d[idx]; }); + auto join_size = thrust::distance(gather_map->begin(), gather_map_end); gather_map->resize(join_size, stream); return gather_map; diff --git a/cpp/src/lists/apply_boolean_mask.cu b/cpp/src/lists/apply_boolean_mask.cu new file mode 100644 index 00000000000..670e99dfbc8 --- /dev/null +++ b/cpp/src/lists/apply_boolean_mask.cu @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
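// --- Illustrative aside (not part of the patch) --------------------------------
// A standalone sketch of the two-phase filter adopted in left_semi_anti_join
// above: materialize the per-row predicate once with for_each, then compact
// indices with a copy_if that reads only the cheap flag array. build_gather_map
// and is_match are hypothetical names; the real code probes a hash table.
#include <thrust/copy.h>
#include <thrust/distance.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

template <typename Predicate>
rmm::device_uvector<int> build_gather_map(int num_rows,
                                          Predicate is_match,  // expensive per-row test
                                          rmm::cuda_stream_view stream)
{
  rmm::device_uvector<bool> flags(num_rows, stream);
  auto flags_d = flags.data();
  auto first   = thrust::counting_iterator<int>(0);

  // phase 1: evaluate the expensive predicate exactly once per row
  thrust::for_each(rmm::exec_policy(stream),
                   first,
                   first + num_rows,
                   [flags_d, is_match] __device__(int idx) { flags_d[idx] = is_match(idx); });

  // phase 2: stream-compact row indices using only the flag lookup
  rmm::device_uvector<int> gather_map(num_rows, stream);
  auto gather_end = thrust::copy_if(rmm::exec_policy(stream),
                                    first,
                                    first + num_rows,
                                    gather_map.begin(),
                                    [flags_d] __device__(int idx) { return flags_d[idx]; });
  gather_map.resize(thrust::distance(gather_map.begin(), gather_end), stream);
  return gather_map;
}
// --------------------------------------------------------------------------------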
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cudf::lists { +namespace detail { + +std::unique_ptr apply_boolean_mask(lists_column_view const& input, + lists_column_view const& boolean_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(boolean_mask.child().type().id() == type_id::BOOL8, "Mask must be of type BOOL8."); + CUDF_EXPECTS(input.size() == boolean_mask.size(), + "Boolean masks column must have same number of rows as input."); + auto const num_rows = input.size(); + + if (num_rows == 0) { return cudf::empty_like(input.parent()); } + + auto constexpr offset_data_type = data_type{type_id::INT32}; + + auto const boolean_mask_sliced_child = boolean_mask.get_sliced_child(stream); + + auto const make_filtered_child = [&] { + auto filtered = + cudf::detail::apply_boolean_mask( + cudf::table_view{{input.get_sliced_child(stream)}}, boolean_mask_sliced_child, stream, mr) + ->release(); + return std::move(filtered.front()); + }; + + auto const make_output_offsets = [&] { + auto boolean_mask_sliced_offsets = + cudf::detail::slice( + boolean_mask.offsets(), {boolean_mask.offset(), boolean_mask.size() + 1}, stream) + .front(); + auto const sizes = cudf::reduction::segmented_sum(boolean_mask_sliced_child, + boolean_mask_sliced_offsets, + offset_data_type, + null_policy::EXCLUDE, + stream); + auto const d_sizes = column_device_view::create(*sizes, stream); + auto const sizes_begin = cudf::detail::make_null_replacement_iterator(*d_sizes, offset_type{0}); + auto const sizes_end = sizes_begin + sizes->size(); + auto output_offsets = cudf::make_numeric_column( + offset_data_type, num_rows + 1, mask_state::UNALLOCATED, stream, mr); + auto output_offsets_view = output_offsets->mutable_view(); + + // Could have attempted an exclusive_scan(), but it would not compute the last entry. + // Instead, inclusive_scan(), followed by writing `0` to the head of the offsets column. + thrust::inclusive_scan(rmm::exec_policy(stream), + sizes_begin, + sizes_end, + output_offsets_view.begin() + 1); + CUDF_CUDA_TRY(cudaMemsetAsync( + output_offsets_view.begin(), 0, sizeof(offset_type), stream.value())); + return output_offsets; + }; + + return cudf::make_lists_column(input.size(), + make_output_offsets(), + make_filtered_child(), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} +} // namespace detail + +std::unique_ptr apply_boolean_mask(lists_column_view const& input, + lists_column_view const& boolean_mask, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_boolean_mask(input, boolean_mask, rmm::cuda_stream_default, mr); +} + +} // namespace cudf::lists diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index c4ffa7f0fb1..987cd076fd0 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -15,9 +15,7 @@ */ #include -#include -#include -#include +#include #include #include @@ -27,13 +25,8 @@ #include #include #include -#include #include -#include - -#include -#include namespace cudf { namespace strings { @@ -41,51 +34,52 @@ namespace detail { namespace { /** - * @brief This functor handles both contains_re and match_re to minimize the number - * of regex calls to find() to be inlined greatly reducing compile time. 
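// --- Illustrative aside (not part of the patch) --------------------------------
// The offsets trick used by apply_boolean_mask above, reduced to plain device
// vectors: an inclusive_scan written one slot ahead, plus a zeroed first
// element, produces the same offsets an exclusive scan would, while also
// leaving the total element count in the last slot. sizes_to_offsets is a
// made-up helper name.
#include <thrust/scan.h>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <cuda_runtime.h>

#include <cstdint>

rmm::device_uvector<int32_t> sizes_to_offsets(rmm::device_uvector<int32_t> const& sizes,
                                              rmm::cuda_stream_view stream)
{
  rmm::device_uvector<int32_t> offsets(sizes.size() + 1, stream);
  // offsets[1..n] become the running totals of sizes[0..n-1]
  thrust::inclusive_scan(
    rmm::exec_policy(stream), sizes.begin(), sizes.end(), offsets.begin() + 1);
  // offsets[0] = 0, mirroring the cudaMemsetAsync in the code above
  cudaMemsetAsync(offsets.data(), 0, sizeof(int32_t), stream.value());
  return offsets;
}
// --------------------------------------------------------------------------------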
+ * @brief This functor handles both contains_re and match_re to regex-match a pattern + * to each string in a column. */ -template struct contains_fn { - reprog_device prog; column_device_view const d_strings; - bool const beginning_only; // do not make this a template parameter to keep compile times down + bool const beginning_only; - __device__ bool operator()(size_type idx) + __device__ bool operator()(size_type const idx, + reprog_device const prog, + int32_t const thread_idx) { if (d_strings.is_null(idx)) return false; auto const d_str = d_strings.element(idx); - int32_t begin = 0; - int32_t end = beginning_only ? 1 // match only the beginning of the string; - : -1; // match anywhere in the string - return static_cast(prog.find(idx, d_str, begin, end)); + + size_type begin = 0; + size_type end = beginning_only ? 1 // match only the beginning of the string; + : -1; // match anywhere in the string + return static_cast(prog.find(thread_idx, d_str, begin, end)); } }; -struct contains_dispatch_fn { - reprog_device d_prog; - bool const beginning_only; +std::unique_ptr contains_impl(strings_column_view const& input, + std::string const& pattern, + regex_flags const flags, + bool const beginning_only, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto results = make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + if (input.is_empty()) { return results; } - template - std::unique_ptr operator()(strings_column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto results = make_numeric_column(data_type{type_id::BOOL8}, - input.size(), - cudf::detail::copy_bitmask(input.parent(), stream, mr), - input.null_count(), - stream, - mr); - - auto const d_strings = column_device_view::create(input.parent(), stream); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - results->mutable_view().data(), - contains_fn{d_prog, *d_strings, beginning_only}); - return results; - } -}; + auto d_prog = reprog_device::create(pattern, flags, stream); + + auto d_results = results->mutable_view().data(); + auto const d_strings = column_device_view::create(input.parent(), stream); + + launch_transform_kernel( + contains_fn{*d_strings, beginning_only}, *d_prog, d_results, input.size(), stream); + + return results; +} } // namespace @@ -96,10 +90,7 @@ std::unique_ptr contains_re( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto d_prog = - reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); - - return regex_dispatcher(*d_prog, contains_dispatch_fn{*d_prog, false}, input, stream, mr); + return contains_impl(input, pattern, flags, false, stream, mr); } std::unique_ptr matches_re( @@ -109,21 +100,18 @@ std::unique_ptr matches_re( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto d_prog = - reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); - - return regex_dispatcher(*d_prog, contains_dispatch_fn{*d_prog, true}, input, stream, mr); + return contains_impl(input, pattern, flags, true, stream, mr); } -std::unique_ptr count_re(strings_column_view const& input, - std::string const& pattern, - regex_flags const flags, - rmm::cuda_stream_view stream, - 
rmm::mr::device_memory_resource* mr) +std::unique_ptr count_re( + strings_column_view const& input, + std::string const& pattern, + regex_flags const flags, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { // compile regex into device object - auto d_prog = - reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); + auto d_prog = reprog_device::create(pattern, flags, stream); auto const d_strings = column_device_view::create(input.parent(), stream); diff --git a/cpp/src/strings/count_matches.cu b/cpp/src/strings/count_matches.cu index a850315dfec..d807482a3a7 100644 --- a/cpp/src/strings/count_matches.cu +++ b/cpp/src/strings/count_matches.cu @@ -15,41 +15,35 @@ */ #include -#include -#include +#include #include #include #include -#include - -#include -#include - namespace cudf { namespace strings { namespace detail { namespace { /** - * @brief Functor counts the total matches to the given regex in each string. + * @brief Kernel counts the total matches for the given regex in each string. */ -template -struct count_matches_fn { +struct count_fn { column_device_view const d_strings; - reprog_device prog; - __device__ size_type operator()(size_type idx) + __device__ int32_t operator()(size_type const idx, + reprog_device const prog, + int32_t const thread_idx) { - if (d_strings.is_null(idx)) { return 0; } - size_type count = 0; + if (d_strings.is_null(idx)) return 0; auto const d_str = d_strings.element(idx); auto const nchars = d_str.length(); + int32_t count = 0; - int32_t begin = 0; - int32_t end = nchars; - while ((begin < end) && (prog.find(idx, d_str, begin, end) > 0)) { + size_type begin = 0; + size_type end = nchars; + while ((begin < end) && (prog.find(thread_idx, d_str, begin, end) > 0)) { ++count; begin = end + (begin == end); end = nchars; @@ -58,41 +52,26 @@ struct count_matches_fn { } }; -struct count_dispatch_fn { - reprog_device d_prog; - - template - std::unique_ptr operator()(column_device_view const& d_strings, - size_type output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - assert(output_size >= d_strings.size() and "Unexpected output size"); - - auto results = make_numeric_column( - data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr); - - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(d_strings.size()), - results->mutable_view().data(), - count_matches_fn{d_strings, d_prog}); - return results; - } -}; - } // namespace -/** - * @copydoc cudf::strings::detail::count_matches - */ std::unique_ptr count_matches(column_device_view const& d_strings, - reprog_device const& d_prog, + reprog_device& d_prog, size_type output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return regex_dispatcher(d_prog, count_dispatch_fn{d_prog}, d_strings, output_size, stream, mr); + assert(output_size >= d_strings.size() and "Unexpected output size"); + + auto results = make_numeric_column( + data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr); + + if (d_strings.size() == 0) return results; + + auto d_results = results->mutable_view().data(); + + launch_transform_kernel(count_fn{d_strings}, d_prog, d_results, d_strings.size(), stream); + + return results; } } // namespace detail diff --git a/cpp/src/strings/count_matches.hpp b/cpp/src/strings/count_matches.hpp index efff3958c65..d4bcdaf4042 100644 --- 
a/cpp/src/strings/count_matches.hpp +++ b/cpp/src/strings/count_matches.hpp @@ -39,10 +39,11 @@ class reprog_device; * @param output_size Number of rows for the output column. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. + * @return Integer column of match counts */ std::unique_ptr count_matches( column_device_view const& d_strings, - reprog_device const& d_prog, + reprog_device& d_prog, size_type output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index 9e987cf5879..59b90952d97 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -14,9 +14,7 @@ * limitations under the License. */ -#include -#include -#include +#include #include #include @@ -31,7 +29,7 @@ #include #include -#include +#include #include #include #include @@ -47,28 +45,26 @@ using string_index_pair = thrust::pair; /** * @brief This functor handles extracting strings by applying the compiled regex pattern * and creating string_index_pairs for all the substrings. - * - * @tparam stack_size Correlates to the regex instructions state to maintain for each string. - * Each instruction requires a fixed amount of overhead data. */ -template struct extract_fn { - reprog_device prog; column_device_view const d_strings; cudf::detail::device_2dspan d_indices; - __device__ void operator()(size_type idx) + __device__ void operator()(size_type const idx, + reprog_device const d_prog, + int32_t const prog_idx) { - auto const groups = prog.group_counts(); + auto const groups = d_prog.group_counts(); auto d_output = d_indices[idx]; if (d_strings.is_valid(idx)) { auto const d_str = d_strings.element(idx); - int32_t begin = 0; - int32_t end = -1; // handles empty strings automatically - if (prog.find(idx, d_str, begin, end) > 0) { + + size_type begin = 0; + size_type end = -1; // handles empty strings automatically + if (d_prog.find(prog_idx, d_str, begin, end) > 0) { for (auto col_idx = 0; col_idx < groups; ++col_idx) { - auto const extracted = prog.extract(idx, d_str, begin, end, col_idx); + auto const extracted = d_prog.extract(prog_idx, d_str, begin, end, col_idx); d_output[col_idx] = [&] { if (!extracted) return string_index_pair{nullptr, 0}; auto const offset = d_str.byte_offset((*extracted).first); @@ -85,33 +81,17 @@ struct extract_fn { } }; -struct extract_dispatch_fn { - reprog_device d_prog; - - template - void operator()(column_device_view const& d_strings, - cudf::detail::device_2dspan& d_indices, - rmm::cuda_stream_view stream) - { - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - d_strings.size(), - extract_fn{d_prog, d_strings, d_indices}); - } -}; } // namespace // -std::unique_ptr extract( - strings_column_view const& input, - std::string const& pattern, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr
extract(strings_column_view const& input, + std::string const& pattern, + regex_flags const flags, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // compile regex into device object - auto d_prog = - reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); + auto d_prog = reprog_device::create(pattern, flags, stream); auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "Group indicators not found in regex pattern"); @@ -121,7 +101,8 @@ std::unique_ptr
extract( cudf::detail::device_2dspan(indices.data(), input.size(), groups); auto const d_strings = column_device_view::create(input.parent(), stream); - regex_dispatcher(*d_prog, extract_dispatch_fn{*d_prog}, *d_strings, d_indices, stream); + + launch_for_each_kernel(extract_fn{*d_strings, d_indices}, *d_prog, input.size(), stream); // build a result column for each group std::vector> results(groups); diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index 7dce369a24f..95b8a43a9d4 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -15,9 +15,7 @@ */ #include -#include -#include -#include +#include #include #include @@ -30,9 +28,7 @@ #include #include -#include #include -#include #include namespace cudf { @@ -49,14 +45,14 @@ namespace { * The `d_offsets` are pre-computed to identify the location of where each * string's output groups are to be written. */ -template struct extract_fn { column_device_view const d_strings; - reprog_device d_prog; offset_type const* d_offsets; string_index_pair* d_indices; - __device__ void operator()(size_type idx) + __device__ void operator()(size_type const idx, + reprog_device const d_prog, + int32_t const prog_idx) { if (d_strings.is_null(idx)) { return; } @@ -64,16 +60,17 @@ struct extract_fn { auto d_output = d_indices + d_offsets[idx]; size_type output_idx = 0; - auto const d_str = d_strings.element(idx); + auto const d_str = d_strings.element(idx); + auto const nchars = d_str.length(); - int32_t begin = 0; - int32_t end = d_str.length(); + size_type begin = 0; + size_type end = nchars; // match the regex - while ((begin < end) && d_prog.find(idx, d_str, begin, end) > 0) { + while ((begin < end) && d_prog.find(prog_idx, d_str, begin, end) > 0) { // extract each group into the output for (auto group_idx = 0; group_idx < groups; ++group_idx) { // result is an optional containing the bounds of the extracted string at group_idx - auto const extracted = d_prog.extract(idx, d_str, begin, end, group_idx); + auto const extracted = d_prog.extract(prog_idx, d_str, begin, end, group_idx); d_output[group_idx + output_idx] = [&] { if (!extracted) { return string_index_pair{nullptr, 0}; } @@ -84,33 +81,12 @@ struct extract_fn { } // continue to next match begin = end; - end = d_str.length(); + end = nchars; output_idx += groups; } } }; -struct extract_dispatch_fn { - reprog_device d_prog; - - template - std::unique_ptr operator()(column_device_view const& d_strings, - size_type total_groups, - offset_type const* d_offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - rmm::device_uvector indices(total_groups, stream); - - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - d_strings.size(), - extract_fn{d_strings, d_prog, d_offsets, indices.data()}); - - return make_strings_column(indices.begin(), indices.end(), stream, mr); - } -}; - } // namespace /** @@ -129,8 +105,7 @@ std::unique_ptr extract_all_record( auto const d_strings = column_device_view::create(input.parent(), stream); // Compile regex into device object. - auto d_prog = - reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); + auto d_prog = reprog_device::create(pattern, flags, stream); // The extract pattern should always include groups. 
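// --- Illustrative aside (not part of the patch) --------------------------------
// The shape of a functor written for the new launch helpers used above:
// operator() receives the row index, the program (possibly loaded from shared
// memory), and the thread slot that selects this thread's strided working
// memory. count_words_fn is a made-up example that visits matches the same way
// count_fn does; it assumes the cudf and reprog_device headers are available.
struct count_words_fn {
  cudf::column_device_view const d_strings;

  __device__ int32_t operator()(cudf::size_type const idx,
                                cudf::strings::detail::reprog_device const prog,
                                int32_t const thread_idx)
  {
    if (d_strings.is_null(idx)) { return 0; }
    auto const d_str  = d_strings.element<cudf::string_view>(idx);
    auto const nchars = d_str.length();
    int32_t count     = 0;

    cudf::size_type begin = 0;
    cudf::size_type end   = nchars;
    // find() narrows [begin, end) to the match; loop to visit every match
    while ((begin < end) && (prog.find(thread_idx, d_str, begin, end) > 0)) {
      ++count;
      begin = end + (begin == end);  // guarantee progress past empty matches
      end   = nchars;
    }
    return count;
  }
};
// launched as: launch_transform_kernel(count_words_fn{*d_strings}, *d_prog,
//                                      d_output, input.size(), stream);
// --------------------------------------------------------------------------------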
auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "extract_all requires group indicators in the regex pattern."); @@ -168,8 +143,12 @@ std::unique_ptr extract_all_record( auto const total_groups = cudf::detail::get_value(offsets->view(), strings_count, stream); - auto strings_output = regex_dispatcher( - *d_prog, extract_dispatch_fn{*d_prog}, *d_strings, total_groups, d_offsets, stream, mr); + rmm::device_uvector indices(total_groups, stream); + + launch_for_each_kernel( + extract_fn{*d_strings, d_offsets, indices.data()}, *d_prog, strings_count, stream); + + auto strings_output = make_strings_column(indices.begin(), indices.end(), stream, mr); // Build the lists column from the offsets and the strings. return make_lists_column(strings_count, diff --git a/cpp/src/strings/regex/dispatcher.hpp b/cpp/src/strings/regex/dispatcher.hpp deleted file mode 100644 index 9ff51d1c979..00000000000 --- a/cpp/src/strings/regex/dispatcher.hpp +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace cudf { -namespace strings { -namespace detail { - -/** - * The stack is used to keep progress (state) on evaluating the regex instructions on each string. - * So the size of the stack is in proportion to the number of instructions in the given regex - * pattern. - * - * There are four call types based on the number of regex instructions in the given pattern. - * Small, medium, and large instruction counts can use the stack effectively. - * Smaller stack sizes execute faster. - * - * Patterns with instruction counts bigger than large use global memory rather than the stack - * for managing the evaluation state data. - * - * @tparam Functor The functor to invoke with stack size templated value. - * @tparam Ts Parameter types for the functor call. - */ -template -constexpr decltype(auto) regex_dispatcher(reprog_device d_prog, Functor f, Ts&&... 
args) -{ - auto const num_regex_insts = d_prog.insts_counts(); - if (num_regex_insts <= RX_SMALL_INSTS) { - return f.template operator()(std::forward(args)...); - } - if (num_regex_insts <= RX_MEDIUM_INSTS) { - return f.template operator()(std::forward(args)...); - } - if (num_regex_insts <= RX_LARGE_INSTS) { - return f.template operator()(std::forward(args)...); - } - - return f.template operator()(std::forward(args)...); -} - -} // namespace detail -} // namespace strings -} // namespace cudf diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index bcdd15bceda..5ccc70222d5 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -39,23 +39,9 @@ struct relist; using match_pair = thrust::pair; using match_result = thrust::optional; -constexpr int32_t RX_STACK_SMALL = 112; ///< fastest stack size -constexpr int32_t RX_STACK_MEDIUM = 1104; ///< faster stack size -constexpr int32_t RX_STACK_LARGE = 2560; ///< fast stack size -constexpr int32_t RX_STACK_ANY = 8; ///< slowest: uses global memory - -/** - * @brief Mapping the number of instructions to device code stack memory size. - * - * ``` - * 10128 ≈ 1000 instructions - * Formula is based on relist::data_size_for() calculation; - * Stack ≈ (8+2)*x + (x/8) = 10.125x < 11x where x is number of instructions - * ``` - */ -constexpr int32_t RX_SMALL_INSTS = (RX_STACK_SMALL / 11); -constexpr int32_t RX_MEDIUM_INSTS = (RX_STACK_MEDIUM / 11); -constexpr int32_t RX_LARGE_INSTS = (RX_STACK_LARGE / 11); +constexpr int32_t MAX_SHARED_MEM = 2048; ///< Memory size for storing prog instruction data +constexpr std::size_t MAX_WORKING_MEM = 0x01FFFFFFFF; ///< Memory size for state data +constexpr int32_t MINIMUM_THREADS = 256; // Minimum threads for computing working memory /** * @brief Regex class stored on the device and executed by reprog_device. @@ -75,6 +61,12 @@ struct alignas(16) reclass_device { * * Once created, the find/extract methods are used to evaluate the regex instructions * against a single string. + * + * An instance of the class requires working memory for evaluating the regex + * instructions for the string. Determine the size of the required memory by + * calling either `working_memory_size()` or `compute_strided_working_memory()`. + * Once the buffer is allocated, pass the device pointer to the `set_working_memory()` + * member function. */ class reprog_device { public: @@ -92,33 +84,22 @@ class reprog_device { * regex. * * @param pattern The regex pattern to compile. - * @param codepoint_flags The code point lookup table for character types. - * @param strings_count Number of strings that will be evaluated. * @param stream CUDA stream used for device memory operations and kernel launches. * @return The program device object. */ static std::unique_ptr> create( - std::string const& pattern, - uint8_t const* codepoint_flags, - size_type strings_count, - rmm::cuda_stream_view stream); + std::string const& pattern, rmm::cuda_stream_view stream); /** * @brief Create the device program instance from a regex pattern. * * @param pattern The regex pattern to compile. * @param re_flags Regex flags for interpreting special characters in the pattern. - * @param codepoint_flags The code point lookup table for character types. - * @param strings_count Number of strings that will be evaluated. * @param stream CUDA stream used for device memory operations and kernel launches * @return The program device object. 
*/ static std::unique_ptr> create( - std::string const& pattern, - regex_flags const re_flags, - uint8_t const* codepoint_flags, - size_type strings_count, - rmm::cuda_stream_view stream); + std::string const& pattern, regex_flags const re_flags, rmm::cuda_stream_view stream); /** * @brief Called automatically by the unique_ptr returned from create(). @@ -143,12 +124,75 @@ class reprog_device { */ [[nodiscard]] __device__ inline bool is_empty() const; + /** + * @brief Returns the size needed for working memory for the given thread count. + * + * @param num_threads Number of threads to be executed in parallel + * @return Size of working memory in bytes + */ + [[nodiscard]] std::size_t working_memory_size(int32_t num_threads) const; + + /** + * @brief Compute working memory for the given thread count with a maximum size. + * + * The `min_rows` overrules the `requested_max_size`. + * That is, the `requested_max_size` may be + * exceeded to keep the number of rows greater than `min_rows`. + * Also, if `rows < min_rows` then `min_rows` is not enforced. + * + * @param rows Number of rows to execute in parallel + * @param min_rows The least number of rows to meet `max_size` + * @param requested_max_size Requested maximum bytes for the working memory + * @return The size of the working memory and the number of parallel rows it will support + */ + [[nodiscard]] std::pair compute_strided_working_memory( + int32_t rows, + int32_t min_rows = MINIMUM_THREADS, + std::size_t requested_max_size = MAX_WORKING_MEM) const; + + /** + * @brief Set the device working memory buffer to use for the regex execution. + * + * @param buffer Device memory pointer. + * @param thread_count Number of threads the memory buffer will support. + * @param max_insts Set to the maximum instruction count if reusing the + * memory buffer for other regex calls. + */ + void set_working_memory(void* buffer, int32_t thread_count, int32_t max_insts = 0); + + /** + * @brief Returns the size of shared memory required to hold this instance. + * + * This can be called on the CPU for specifying the shared-memory size in the + * kernel launch parameters. + * This may return 0 if the MAX_SHARED_MEM value is exceeded. + */ + [[nodiscard]] int32_t compute_shared_memory_size() const; + + /** + * @brief Returns the thread count passed on `set_working_memory`. + */ + [[nodiscard]] __device__ inline int32_t thread_count() const { return _thread_count; } + + /** + * @brief Store this object into the given device pointer (e.g. shared memory). + * + * No data is stored if MAX_SHARED_MEM is exceeded for this object. + */ + __device__ inline void store(void* buffer) const; + + /** + * @brief Load an instance of this class from a device buffer (e.g. shared memory). + * + * Data is loaded from the given buffer if MAX_SHARED_MEM is not exceeded for the given object. + * Otherwise, a copy of the object is returned. + */ + [[nodiscard]] __device__ static inline reprog_device load(reprog_device const prog, void* buffer); + /** * @brief Does a find evaluation using the compiled expression on the given string. * - * @tparam stack_size One of the `RX_STACK_` values based on the `insts_count`. - * @param idx The string index used for mapping the state memory for this string in global memory - * (if necessary). + * @param thread_idx The index used for mapping the state memory for this string in global memory. * @param d_str The string to search. * @param[in,out] begin Position index to begin the search. If found, returns the position found * in the string. 
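// --- Illustrative aside (not part of the patch) --------------------------------
// The call sequence these new members imply, as the launch helpers later in
// this patch use them; pattern, flags, and size are assumed inputs, and the
// buffer must stay alive for the duration of any kernel using the program.
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <string>

void prepare_prog(std::string const& pattern,
                  cudf::strings::regex_flags const flags,
                  cudf::size_type const size,  // number of rows to process
                  rmm::cuda_stream_view stream)
{
  auto d_prog = cudf::strings::detail::reprog_device::create(pattern, flags, stream);

  // pick a thread count whose relist state stays under the size cap;
  // rows beyond thread_count are handled by striding inside the kernel
  auto const [buffer_size, thread_count] = d_prog->compute_strided_working_memory(size);

  rmm::device_buffer d_buffer(buffer_size, stream);
  d_prog->set_working_memory(d_buffer.data(), thread_count);

  // dynamic shared-memory bytes to stage the program, or 0 if it is too large
  auto const shmem_size = d_prog->compute_shared_memory_size();
  (void)shmem_size;  // would be passed as the kernel's shared-memory parameter
}
// --------------------------------------------------------------------------------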
@@ -156,8 +200,7 @@ class reprog_device { * matching in the string. * @return Returns 0 if no match is found. */ - template - __device__ inline int32_t find(int32_t idx, + __device__ inline int32_t find(int32_t const thread_idx, string_view const d_str, cudf::size_type& begin, cudf::size_type& end) const; @@ -169,9 +212,7 @@ class reprog_device { * The find() function should be called first to locate the begin/end bounds of the * the matched section. * - * @tparam stack_size One of the `RX_STACK_` values based on the `insts_count`. - * @param idx The string index used for mapping the state memory for this string in global - * memory (if necessary). + * @param thread_idx The index used for mapping the state memory for this string in global memory. * @param d_str The string to search. * @param begin Position index to begin the search. If found, returns the position found * in the string. @@ -180,8 +221,7 @@ class reprog_device { * @param group_id The specific group to return its matching position values. * @return If valid, returns the character position of the matched group in the given string, */ - template - __device__ inline match_result extract(cudf::size_type idx, + __device__ inline match_result extract(int32_t const thread_idx, string_view const d_str, cudf::size_type begin, cudf::size_type end, @@ -220,8 +260,7 @@ class reprog_device { /** * @brief Utility wrapper to setup state memory structures for calling regexec */ - template - __device__ inline int32_t call_regexec(int32_t idx, + __device__ inline int32_t call_regexec(int32_t const thread_idx, string_view const d_str, cudf::size_type& begin, cudf::size_type& end, @@ -234,13 +273,16 @@ class reprog_device { int32_t _insts_count; // number of instructions int32_t _starts_count; // number of start-insts ids int32_t _classes_count; // number of classes + int32_t _max_insts; // for partitioning working memory uint8_t const* _codepoint_flags{}; // table of character types reinst const* _insts{}; // array of regex instructions int32_t const* _startinst_ids{}; // array of start instruction ids reclass_device const* _classes{}; // array of regex classes - void* _relists_mem{}; // runtime relist memory for regexec() + std::size_t _prog_size{}; // total size of this instance + void* _buffer{}; // working memory buffer + int32_t _thread_count{}; // threads available in working memory }; } // namespace detail diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 9fe4440d7ec..bae6fb275f6 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -45,10 +45,9 @@ struct alignas(8) relist { /** * @brief Compute the aligned memory allocation size. */ - constexpr inline static std::size_t alloc_size(int32_t insts) + constexpr inline static std::size_t alloc_size(int32_t insts, int32_t num_threads) { - return cudf::util::round_up_unsafe(data_size_for(insts) + sizeof(relist), - sizeof(ranges[0])); + return cudf::util::round_up_unsafe(data_size_for(insts) * num_threads, sizeof(restate)); } struct alignas(16) restate { @@ -57,16 +56,16 @@ struct alignas(8) relist { int32_t reserved; }; - __device__ __forceinline__ relist(int16_t insts, u_char* data = nullptr) - : masksize(cudf::util::div_rounding_up_unsafe(insts, 8)) + __device__ __forceinline__ + relist(int16_t insts, int32_t num_threads, u_char* gp_ptr, int32_t index) + : masksize(cudf::util::div_rounding_up_unsafe(insts, 8)), stride(num_threads) { - auto ptr = data == nullptr ? 
reinterpret_cast(this) + sizeof(relist) : data; - ranges = reinterpret_cast(ptr); - ptr += insts * sizeof(ranges[0]); - inst_ids = reinterpret_cast(ptr); - ptr += insts * sizeof(inst_ids[0]); - mask = ptr; - reset(); + auto const rdata_size = sizeof(ranges[0]); + auto const idata_size = sizeof(inst_ids[0]); + ranges = reinterpret_cast(gp_ptr + (index * rdata_size)); + inst_ids = + reinterpret_cast(gp_ptr + (rdata_size * stride * insts) + (index * idata_size)); + mask = gp_ptr + ((rdata_size + idata_size) * stride * insts) + (index * masksize); } __device__ __forceinline__ void reset() @@ -79,15 +78,15 @@ struct alignas(8) relist { { if (readMask(id)) { return false; } writeMask(id); - inst_ids[size] = static_cast(id); - ranges[size] = int2{begin, end}; + inst_ids[size * stride] = static_cast(id); + ranges[size * stride] = int2{begin, end}; ++size; return true; } __device__ __forceinline__ restate get_state(int16_t idx) const { - return restate{ranges[idx], inst_ids[idx]}; + return restate{ranges[idx * stride], inst_ids[idx * stride]}; } __device__ __forceinline__ int16_t get_size() const { return size; } @@ -95,7 +94,7 @@ struct alignas(8) relist { private: int16_t size{}; int16_t const masksize; - int32_t reserved; + int32_t const stride; int2* __restrict__ ranges; // pair per instruction int16_t* __restrict__ inst_ids; // one per instruction u_char* __restrict__ mask; // bit per instruction @@ -177,6 +176,49 @@ __device__ __forceinline__ bool reprog_device::is_empty() const return insts_counts() == 0 || get_inst(0).type == END; } +__device__ __forceinline__ void reprog_device::store(void* buffer) const +{ + if (_prog_size > MAX_SHARED_MEM) { return; } + + auto ptr = static_cast(buffer); + + // create instance inside the given buffer + auto result = new (ptr) reprog_device(*this); + + // add the insts array + ptr += sizeof(reprog_device); + auto insts = reinterpret_cast(ptr); + result->_insts = insts; + for (int idx = 0; idx < _insts_count; ++idx) + *insts++ = _insts[idx]; + + // add the startinst_ids array + ptr += cudf::util::round_up_unsafe(_insts_count * sizeof(_insts[0]), sizeof(_startinst_ids[0])); + auto ids = reinterpret_cast(ptr); + result->_startinst_ids = ids; + for (int idx = 0; idx < _starts_count; ++idx) + *ids++ = _startinst_ids[idx]; + + // add the classes array + ptr += cudf::util::round_up_unsafe(_starts_count * sizeof(int32_t), sizeof(_classes[0])); + auto classes = reinterpret_cast(ptr); + result->_classes = classes; + // fill in each class + auto d_ptr = reinterpret_cast(classes + _classes_count); + for (int idx = 0; idx < _classes_count; ++idx) { + classes[idx] = _classes[idx]; + classes[idx].literals = d_ptr; + for (int jdx = 0; jdx < _classes[idx].count * 2; ++jdx) + *d_ptr++ = _classes[idx].literals[jdx]; + } +} + +__device__ __forceinline__ reprog_device reprog_device::load(reprog_device const prog, void* buffer) +{ + return (prog._prog_size > MAX_SHARED_MEM) ? reprog_device(prog) + : reinterpret_cast(buffer)[0]; +} + /** * @brief Evaluate a specific string against regex pattern compiled to this instance. 
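// --- Illustrative aside (not part of the patch) --------------------------------
// The strided layout the new relist constructor above sets up: for N
// instructions and T threads, one relist pool is laid out as
// [int2 ranges: N*T][int16_t inst_ids: N*T][u_char mask: masksize*T], and
// thread t accesses element k at base[k * T + t], so neighboring threads touch
// neighboring addresses (coalesced). strided_offsets is a made-up helper that
// restates the constructor's pointer math.
#include <cstddef>
#include <cstdint>

struct relist_offsets {
  std::size_t inst_ids_base;  // byte offset of the inst_ids block
  std::size_t mask_base;      // byte offset of the mask block
};

constexpr relist_offsets strided_offsets(std::size_t insts, std::size_t num_threads)
{
  auto const rdata = sizeof(std::int32_t) * 2;  // sizeof(int2): a begin/end pair
  auto const idata = sizeof(std::int16_t);      // one instruction id
  return {rdata * num_threads * insts, (rdata + idata) * num_threads * insts};
}
// --------------------------------------------------------------------------------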
* @@ -352,65 +394,43 @@ __device__ __forceinline__ int32_t reprog_device::regexec(string_view const dstr return match; } -template -__device__ __forceinline__ int32_t reprog_device::find(int32_t idx, +__device__ __forceinline__ int32_t reprog_device::find(int32_t const thread_idx, string_view const dstr, cudf::size_type& begin, cudf::size_type& end) const { - int32_t rtn = call_regexec(idx, dstr, begin, end); + auto const rtn = call_regexec(thread_idx, dstr, begin, end); if (rtn <= 0) begin = end = -1; return rtn; } -template -__device__ __forceinline__ match_result reprog_device::extract(cudf::size_type idx, +__device__ __forceinline__ match_result reprog_device::extract(int32_t const thread_idx, string_view const dstr, cudf::size_type begin, cudf::size_type end, cudf::size_type const group_id) const { end = begin + 1; - return call_regexec(idx, dstr, begin, end, group_id + 1) > 0 - ? match_result({begin, end}) - : thrust::nullopt; + return call_regexec(thread_idx, dstr, begin, end, group_id + 1) > 0 ? match_result({begin, end}) + : thrust::nullopt; } -template -__device__ __forceinline__ int32_t reprog_device::call_regexec(int32_t idx, +__device__ __forceinline__ int32_t reprog_device::call_regexec(int32_t const thread_idx, string_view const dstr, cudf::size_type& begin, cudf::size_type& end, cudf::size_type const group_id) const { - u_char data1[stack_size], data2[stack_size]; + auto gp_ptr = reinterpret_cast(_buffer); + relist list1(static_cast(_max_insts), _thread_count, gp_ptr, thread_idx); - relist list1(static_cast(_insts_count), data1); - relist list2(static_cast(_insts_count), data2); + gp_ptr += relist::alloc_size(_max_insts, _thread_count); + relist list2(static_cast(_max_insts), _thread_count, gp_ptr, thread_idx); reljunk jnk(&list1, &list2, get_inst(_startinst_id)); return regexec(dstr, jnk, begin, end, group_id); } -template <> -__device__ __forceinline__ int32_t -reprog_device::call_regexec(int32_t idx, - string_view const dstr, - cudf::size_type& begin, - cudf::size_type& end, - cudf::size_type const group_id) const -{ - auto const relists_size = relist::alloc_size(_insts_count); - auto* listmem = reinterpret_cast(_relists_mem); // beginning of relist buffer; - listmem += (idx * relists_size * 2); // two relist ptrs in reljunk: - - auto* list1 = new (listmem) relist(static_cast(_insts_count)); - auto* list2 = new (listmem + relists_size) relist(static_cast(_insts_count)); - - reljunk jnk(list1, list2, get_inst(_startinst_id)); - return regexec(dstr, jnk, begin, end, group_id); -} - } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu index 70d6079972a..4b58d9d8a88 100644 --- a/cpp/src/strings/regex/regexec.cu +++ b/cpp/src/strings/regex/regexec.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -35,27 +36,21 @@ reprog_device::reprog_device(reprog& prog) _num_capturing_groups{prog.groups_count()}, _insts_count{prog.insts_count()}, _starts_count{prog.starts_count()}, - _classes_count{prog.classes_count()} + _classes_count{prog.classes_count()}, + _max_insts{prog.insts_count()}, + _codepoint_flags{get_character_flags_table()} { } std::unique_ptr> reprog_device::create( - std::string const& pattern, - uint8_t const* codepoint_flags, - size_type strings_count, - rmm::cuda_stream_view stream) + std::string const& pattern, rmm::cuda_stream_view stream) { - return reprog_device::create( - pattern, regex_flags::MULTILINE, codepoint_flags, strings_count, stream); + return 
reprog_device::create(pattern, regex_flags::MULTILINE, stream); } // Create instance of the reprog that can be passed into a device kernel std::unique_ptr> reprog_device::create( - std::string const& pattern, - regex_flags const flags, - uint8_t const* codepoint_flags, - size_type strings_count, - rmm::cuda_stream_view stream) + std::string const& pattern, regex_flags const flags, rmm::cuda_stream_view stream) { // compile pattern into host object reprog h_prog = reprog::create_from(pattern, flags); @@ -82,7 +77,7 @@ std::unique_ptr> reprog_devic auto d_buffer = new rmm::device_buffer(memsize, stream); // output device memory; auto d_ptr = reinterpret_cast(d_buffer->data()); // running device pointer - // put everything into a flat host buffer first + // create our device object; this is managed separately and returned to the caller reprog_device* d_prog = new reprog_device(h_prog); // copy the instructions array first (fixed-sized structs) @@ -120,32 +115,58 @@ std::unique_ptr> reprog_devic } // initialize the rest of the elements - d_prog->_codepoint_flags = codepoint_flags; - - // allocate execute memory if needed - rmm::device_buffer* d_relists{}; - if (insts_count > RX_LARGE_INSTS) { - // two relist state structures are needed for execute per string - auto const rlm_size = relist::alloc_size(insts_count) * 2 * strings_count; - d_relists = new rmm::device_buffer(rlm_size, stream); - d_prog->_relists_mem = d_relists->data(); - } + d_prog->_max_insts = insts_count; + d_prog->_prog_size = memsize + sizeof(reprog_device); // copy flat prog to device memory CUDF_CUDA_TRY(cudaMemcpyAsync( d_buffer->data(), h_buffer.data(), memsize, cudaMemcpyHostToDevice, stream.value())); // build deleter to cleanup device memory - auto deleter = [d_buffer, d_relists](reprog_device* t) { + auto deleter = [d_buffer](reprog_device* t) { t->destroy(); delete d_buffer; - delete d_relists; }; + return std::unique_ptr>(d_prog, deleter); } void reprog_device::destroy() { delete this; } +std::size_t reprog_device::working_memory_size(int32_t num_threads) const +{ + return relist::alloc_size(_insts_count, num_threads) * 2; +} + +std::pair reprog_device::compute_strided_working_memory( + int32_t rows, int32_t min_rows, std::size_t requested_max_size) const +{ + auto thread_count = rows; + auto buffer_size = working_memory_size(thread_count); + while ((buffer_size > requested_max_size) && (thread_count > min_rows)) { + thread_count = thread_count / 2; + buffer_size = working_memory_size(thread_count); + } + // clamp to min_rows but only if rows is greater than min_rows + if (rows > min_rows && thread_count < min_rows) { + thread_count = min_rows; + buffer_size = working_memory_size(thread_count); + } + return std::make_pair(buffer_size, thread_count); +} + +void reprog_device::set_working_memory(void* buffer, int32_t thread_count, int32_t max_insts) +{ + _buffer = buffer; + _thread_count = thread_count; + _max_insts = max_insts > 0 ? max_insts : _insts_count; +} + +int32_t reprog_device::compute_shared_memory_size() const +{ + return _prog_size < MAX_SHARED_MEM ? static_cast(_prog_size) : 0; +} + } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/regex/utilities.cuh new file mode 100644 index 00000000000..9a80be25b3b --- /dev/null +++ b/cpp/src/strings/regex/utilities.cuh @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +constexpr auto regex_launch_kernel_block_size = 256; + +template +__global__ void for_each_kernel(ForEachFunction fn, reprog_device const d_prog, size_type size) +{ + extern __shared__ u_char shmem[]; + if (threadIdx.x == 0) { d_prog.store(shmem); } + __syncthreads(); + auto const s_prog = reprog_device::load(d_prog, shmem); + + auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + auto const stride = s_prog.thread_count(); + for (auto idx = thread_idx; idx < size; idx += stride) { + fn(idx, s_prog, thread_idx); + } +} + +template +void launch_for_each_kernel(ForEachFunction fn, + reprog_device& d_prog, + size_type size, + rmm::cuda_stream_view stream) +{ + auto [buffer_size, thread_count] = d_prog.compute_strided_working_memory(size); + + auto d_buffer = rmm::device_buffer(buffer_size, stream); + d_prog.set_working_memory(d_buffer.data(), thread_count); + + auto const shmem_size = d_prog.compute_shared_memory_size(); + cudf::detail::grid_1d grid{thread_count, regex_launch_kernel_block_size}; + for_each_kernel<<>>( + fn, d_prog, size); +} + +template +__global__ void transform_kernel(TransformFunction fn, + reprog_device const d_prog, + OutputType* d_output, + size_type size) +{ + extern __shared__ u_char shmem[]; + if (threadIdx.x == 0) { d_prog.store(shmem); } + __syncthreads(); + auto const s_prog = reprog_device::load(d_prog, shmem); + + auto const thread_idx = threadIdx.x + blockIdx.x * blockDim.x; + auto const stride = s_prog.thread_count(); + for (auto idx = thread_idx; idx < size; idx += stride) { + d_output[idx] = fn(idx, s_prog, thread_idx); + } +} + +template +void launch_transform_kernel(TransformFunction fn, + reprog_device& d_prog, + OutputType* d_output, + size_type size, + rmm::cuda_stream_view stream) +{ + auto [buffer_size, thread_count] = d_prog.compute_strided_working_memory(size); + + auto d_buffer = rmm::device_buffer(buffer_size, stream); + d_prog.set_working_memory(d_buffer.data(), thread_count); + + auto const shmem_size = d_prog.compute_shared_memory_size(); + cudf::detail::grid_1d grid{thread_count, regex_launch_kernel_block_size}; + transform_kernel<<>>( + fn, d_prog, d_output, size); +} + +template +auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, + reprog_device& d_prog, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto offsets = make_numeric_column( + data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().template data(); + size_and_exec_fn.d_offsets = d_offsets; + + auto [buffer_size, thread_count] = d_prog.compute_strided_working_memory(strings_count); + + auto d_buffer = rmm::device_buffer(buffer_size, stream); + d_prog.set_working_memory(d_buffer.data(), thread_count); + 
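// --- Illustrative aside (not part of the patch) --------------------------------
// The functor contract that make_strings_children (continued just below)
// drives twice: pass 1 runs with d_chars == nullptr and records each row's
// output size in d_offsets; an exclusive_scan then turns sizes into offsets,
// and pass 2 writes bytes at d_chars + d_offsets[idx]. passthrough_fn is a
// made-up example that copies each string unchanged; a real functor would call
// prog.find(thread_idx, ...), as backrefs_fn does further below.
struct passthrough_fn {
  cudf::column_device_view const d_strings;
  int32_t* d_offsets{};
  char* d_chars{};

  __device__ void operator()(cudf::size_type const idx,
                             cudf::strings::detail::reprog_device const prog,
                             int32_t const thread_idx)
  {
    if (d_strings.is_null(idx)) {
      if (!d_chars) { d_offsets[idx] = 0; }  // sizing pass: null rows take 0 bytes
      return;
    }
    auto const d_str = d_strings.element<cudf::string_view>(idx);
    if (!d_chars) {
      d_offsets[idx] = d_str.size_bytes();  // pass 1: record the output size
    } else {
      memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes());  // pass 2
    }
  }
};
// --------------------------------------------------------------------------------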
auto const shmem_size = d_prog.compute_shared_memory_size(); + cudf::detail::grid_1d grid{thread_count, 256}; + + // Compute the output size for each row + if (strings_count > 0) { + for_each_kernel<<>>( + size_and_exec_fn, d_prog, strings_count); + } + + // Convert sizes to offsets + thrust::exclusive_scan( + rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + + // Now build the chars column + auto const char_bytes = cudf::detail::get_value(offsets->view(), strings_count, stream); + std::unique_ptr chars = create_chars_child_column(char_bytes, stream, mr); + if (char_bytes > 0) { + size_and_exec_fn.d_chars = chars->mutable_view().template data(); + for_each_kernel<<>>( + size_and_exec_fn, d_prog, strings_count); + } + + return std::make_pair(std::move(offsets), std::move(chars)); +} + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 384813d6e3d..107adf07263 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -16,9 +16,7 @@ #include "backref_re.cuh" -#include -#include -#include +#include #include #include @@ -43,7 +41,7 @@ namespace { * @brief Return the capturing group index pattern to use with the given replacement string. * * Only two patterns are supported at this time `\d` and `${d}` where `d` is an integer in - * the range 1-99. The `\d` pattern is returned by default unless no `\d` pattern is found in + * the range 0-99. The `\d` pattern is returned by default unless no `\d` pattern is found in * the `repl` string, * * Reference: https://www.regular-expressions.info/refreplacebackref.html @@ -98,45 +96,15 @@ std::pair> parse_backrefs(std::string con return {rtn, backrefs}; } -template -struct replace_dispatch_fn { - reprog_device d_prog; - - template - std::unique_ptr operator()(strings_column_view const& input, - string_view const& d_repl_template, - Iterator backrefs_begin, - Iterator backrefs_end, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto const d_strings = column_device_view::create(input.parent(), stream); - - auto children = make_strings_children( - backrefs_fn{ - *d_strings, d_prog, d_repl_template, backrefs_begin, backrefs_end}, - input.size(), - stream, - mr); - - return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), - input.null_count(), - cudf::detail::copy_bitmask(input.parent(), stream, mr)); - } -}; - } // namespace // -std::unique_ptr replace_with_backrefs( - strings_column_view const& input, - std::string const& pattern, - std::string const& replacement, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr replace_with_backrefs(strings_column_view const& input, + std::string const& pattern, + std::string const& replacement, + regex_flags const flags, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return make_empty_column(type_id::STRING); @@ -144,8 +112,7 @@ std::unique_ptr replace_with_backrefs( CUDF_EXPECTS(!replacement.empty(), "Parameter replacement must not be empty"); // compile regex into device object - auto d_prog = - reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); + auto d_prog = reprog_device::create(pattern, flags, stream); // parse the repl string for back-ref indicators auto group_count = 
std::min(99, d_prog->group_counts()); // group count should NOT exceed 99 @@ -155,15 +122,21 @@ std::unique_ptr replace_with_backrefs( string_scalar repl_scalar(parse_result.first, true, stream); string_view const d_repl_template = repl_scalar.value(); + auto const d_strings = column_device_view::create(input.parent(), stream); + using BackRefIterator = decltype(backrefs.begin()); - return regex_dispatcher(*d_prog, - replace_dispatch_fn{*d_prog}, - input, - d_repl_template, - backrefs.begin(), - backrefs.end(), - stream, - mr); + auto children = make_strings_children( + backrefs_fn{*d_strings, d_repl_template, backrefs.begin(), backrefs.end()}, + *d_prog, + input.size(), + stream, + mr); + + return make_strings_column(input.size(), + std::move(children.first), + std::move(children.second), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } } // namespace detail diff --git a/cpp/src/strings/replace/backref_re.cuh b/cpp/src/strings/replace/backref_re.cuh index 13a67e3b4d7..db5b8a1eb17 100644 --- a/cpp/src/strings/replace/backref_re.cuh +++ b/cpp/src/strings/replace/backref_re.cuh @@ -14,13 +14,13 @@ * limitations under the License. */ +#include + #include #include #include #include -#include - #include #include @@ -39,17 +39,16 @@ using backref_type = thrust::pair; * * The logic includes computing the size of each string and also writing the output. */ -template +template struct backrefs_fn { column_device_view const d_strings; - reprog_device prog; string_view const d_repl; // string replacement template Iterator backrefs_begin; Iterator backrefs_end; int32_t* d_offsets{}; char* d_chars{}; - __device__ void operator()(size_type idx) + __device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx) { if (d_strings.is_null(idx)) { if (!d_chars) d_offsets[idx] = 0; @@ -65,7 +64,7 @@ struct backrefs_fn { size_type end = nchars; // last character position (exclusive) // copy input to output replacing strings as we go - while (prog.find(idx, d_str, begin, end) > 0) // inits the begin/end vars + while (prog.find(prog_idx, d_str, begin, end) > 0) // inits the begin/end vars { auto spos = d_str.byte_offset(begin); // get offset for the auto epos = d_str.byte_offset(end); // character position values; @@ -84,7 +83,7 @@ struct backrefs_fn { lpos_template += copy_length; } // extract the specific group's string for this backref's index - auto extracted = prog.extract(idx, d_str, begin, end, backref.first - 1); + auto extracted = prog.extract(prog_idx, d_str, begin, end, backref.first - 1); if (!extracted || (extracted.value().second <= extracted.value().first)) { return; // no value for this backref number; that is ok } diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 3189739e492..a3f2631f424 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -14,9 +14,7 @@ * limitations under the License. */ -#include #include -#include #include #include @@ -32,6 +30,7 @@ #include #include +#include #include #include @@ -47,7 +46,6 @@ using found_range = thrust::pair; * @brief This functor handles replacing strings by applying the compiled regex patterns * and inserting the corresponding new string within the matched range of characters. 
*/ -template struct replace_multi_regex_fn { column_device_view const d_strings; device_span progs; // array of regex progs @@ -84,9 +82,9 @@ struct replace_multi_regex_fn { continue; // or later in the string reprog_device prog = progs[ptn_idx]; - auto begin = static_cast(ch_pos); - auto end = static_cast(nchars); - if (!prog.is_empty() && prog.find(idx, d_str, begin, end) > 0) + auto begin = ch_pos; + auto end = nchars; + if (!prog.is_empty() && prog.find(idx, d_str, begin, end) > 0) d_ranges[ptn_idx] = found_range{begin, end}; // found a match else d_ranges[ptn_idx] = found_range{nchars, nchars}; // this pattern is done @@ -123,33 +121,6 @@ struct replace_multi_regex_fn { } }; -struct replace_dispatch_fn { - template - std::unique_ptr operator()(strings_column_view const& input, - device_span d_progs, - strings_column_view const& replacements, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto const d_strings = column_device_view::create(input.parent(), stream); - auto const d_repls = column_device_view::create(replacements.parent(), stream); - - auto found_ranges = rmm::device_uvector(d_progs.size() * input.size(), stream); - - auto children = make_strings_children( - replace_multi_regex_fn{*d_strings, d_progs, found_ranges.data(), *d_repls}, - input.size(), - stream, - mr); - - return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), - input.null_count(), - cudf::detail::copy_bitmask(input.parent(), stream, mr)); - } -}; - } // namespace std::unique_ptr replace_re( @@ -168,15 +139,12 @@ std::unique_ptr replace_re( CUDF_EXPECTS(!replacements.has_nulls(), "Parameter replacements must not have any nulls"); // compile regexes into device objects - auto const d_char_table = get_character_flags_table(); auto h_progs = std::vector>>( patterns.size()); - std::transform(patterns.begin(), - patterns.end(), - h_progs.begin(), - [flags, d_char_table, input, stream](auto const& ptn) { - return reprog_device::create(ptn, flags, d_char_table, input.size(), stream); - }); + std::transform( + patterns.begin(), patterns.end(), h_progs.begin(), [flags, stream](auto const& ptn) { + return reprog_device::create(ptn, flags, stream); + }); // get the longest regex for the dispatcher auto const max_prog = @@ -184,15 +152,37 @@ std::unique_ptr replace_re( return lhs->insts_counts() < rhs->insts_counts(); }); + auto d_max_prog = **max_prog; + auto const buffer_size = d_max_prog.working_memory_size(input.size()); + auto d_buffer = rmm::device_buffer(buffer_size, stream); + // copy all the reprog_device instances to a device memory array std::vector progs; - std::transform(h_progs.begin(), h_progs.end(), std::back_inserter(progs), [](auto const& d_prog) { - return *d_prog; - }); + std::transform(h_progs.begin(), + h_progs.end(), + std::back_inserter(progs), + [d_buffer = d_buffer.data(), size = input.size()](auto& prog) { + prog->set_working_memory(d_buffer, size); + return *prog; + }); auto d_progs = cudf::detail::make_device_uvector_async(progs, stream); - return regex_dispatcher( - **max_prog, replace_dispatch_fn{}, input, d_progs, replacements, stream, mr); + auto const d_strings = column_device_view::create(input.parent(), stream); + auto const d_repls = column_device_view::create(replacements.parent(), stream); + + auto found_ranges = rmm::device_uvector(d_progs.size() * input.size(), stream); + + auto children = make_strings_children( + replace_multi_regex_fn{*d_strings, d_progs, found_ranges.data(), *d_repls}, + input.size(), + 
stream, + mr); + + return make_strings_column(input.size(), + std::move(children.first), + std::move(children.second), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } } // namespace detail diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index af74d8bdb92..159f83453bd 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -14,9 +14,7 @@ * limitations under the License. */ -#include -#include -#include +#include #include #include @@ -38,16 +36,14 @@ namespace { * @brief This functor handles replacing strings by applying the compiled regex pattern * and inserting the new string within the matched range of characters. */ -template struct replace_regex_fn { column_device_view const d_strings; - reprog_device prog; string_view const d_repl; size_type const maxrepl; int32_t* d_offsets{}; char* d_chars{}; - __device__ void operator()(size_type idx) + __device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx) { if (d_strings.is_null(idx)) { if (!d_chars) d_offsets[idx] = 0; @@ -62,13 +58,13 @@ struct replace_regex_fn { auto out_ptr = d_chars ? d_chars + d_offsets[idx] // output pointer (o) : nullptr; size_type last_pos = 0; - int32_t begin = 0; // these are for calling prog.find - int32_t end = -1; // matches final word-boundary if at the end of the string + size_type begin = 0; // these are for calling prog.find + size_type end = -1; // matches final word-boundary if at the end of the string // copy input to output replacing strings as we go while (mxn-- > 0 && begin <= nchars) { // maximum number of replaces - if (prog.is_empty() || prog.find(idx, d_str, begin, end) <= 0) { + if (prog.is_empty() || prog.find(prog_idx, d_str, begin, end) <= 0) { break; // no more matches } @@ -100,32 +96,6 @@ struct replace_regex_fn { } }; -struct replace_dispatch_fn { - reprog_device d_prog; - - template - std::unique_ptr operator()(strings_column_view const& input, - string_view const& d_replacement, - size_type max_replace_count, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto const d_strings = column_device_view::create(input.parent(), stream); - - auto children = make_strings_children( - replace_regex_fn{*d_strings, d_prog, d_replacement, max_replace_count}, - input.size(), - stream, - mr); - - return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), - input.null_count(), - cudf::detail::copy_bitmask(input.parent(), stream, mr)); - } -}; - } // namespace // @@ -144,13 +114,20 @@ std::unique_ptr replace_re( string_view d_repl(replacement.data(), replacement.size()); // compile regex into device object - auto d_prog = - reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream); + auto d_prog = reprog_device::create(pattern, flags, stream); auto const maxrepl = max_replace_count.value_or(-1); - return regex_dispatcher( - *d_prog, replace_dispatch_fn{*d_prog}, input, d_repl, maxrepl, stream, mr); + auto const d_strings = column_device_view::create(input.parent(), stream); + + auto children = make_strings_children( + replace_regex_fn{*d_strings, d_repl, maxrepl}, *d_prog, input.size(), stream, mr); + + return make_strings_column(input.size(), + std::move(children.first), + std::move(children.second), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } } // namespace detail diff --git a/cpp/src/strings/search/findall.cu 
b/cpp/src/strings/search/findall.cu index 323ad2cbc09..64e46d07e25 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -15,9 +15,7 @@ */ #include -#include -#include -#include +#include #include #include @@ -33,7 +31,6 @@ #include #include -#include #include #include @@ -52,14 +49,12 @@ namespace { * For strings with fewer matches, null entries are appended into `d_indices` * up to the maximum column count. */ -template struct findall_fn { column_device_view const d_strings; - reprog_device prog; size_type const* d_counts; ///< match counts for each string indices_span d_indices; ///< 2D-span: output matches added here - __device__ void operator()(size_type idx) + __device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx) { auto const match_count = d_counts[idx]; @@ -72,7 +67,7 @@ struct findall_fn { int32_t begin = 0; int32_t end = -1; for (auto col_idx = 0; col_idx < match_count; ++col_idx) { - if (prog.find(idx, d_str, begin, end) > 0) { + if (prog.find(prog_idx, d_str, begin, end) > 0) { auto const begin_offset = d_str.byte_offset(begin); auto const end_offset = d_str.byte_offset(end); d_output[col_idx] = @@ -82,28 +77,12 @@ struct findall_fn { end = nchars; } } - // fill the remaining entries for this row with nulls thrust::fill( thrust::seq, d_output.begin() + match_count, d_output.end(), string_index_pair{nullptr, 0}); } }; -struct findall_dispatch_fn { - reprog_device d_prog; - - template - void operator()(column_device_view const& d_strings, - size_type const* d_find_counts, - indices_span& d_indices, - rmm::cuda_stream_view stream) - { - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - d_strings.size(), - findall_fn{d_strings, d_prog, d_find_counts, d_indices}); - } -}; } // namespace std::unique_ptr
findall(strings_column_view const& input, @@ -115,11 +94,10 @@ std::unique_ptr
findall(strings_column_view const& input, auto const strings_count = input.size(); // compile regex into device object - auto const d_prog = - reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); + auto const d_prog = reprog_device::create(pattern, flags, stream); auto const d_strings = column_device_view::create(input.parent(), stream); - auto find_counts = count_matches(*d_strings, *d_prog, strings_count + 1, stream); + auto find_counts = count_matches(*d_strings, *d_prog, strings_count, stream); auto d_find_counts = find_counts->view().data(); size_type const columns_count = thrust::reduce( @@ -139,9 +117,8 @@ std::unique_ptr
findall(strings_column_view const& input, } else { // place all matching strings into the indices vector auto d_indices = indices_span(indices.data(), strings_count, columns_count); - regex_dispatcher( - *d_prog, findall_dispatch_fn{*d_prog}, *d_strings, d_find_counts, d_indices, stream); - + launch_for_each_kernel( + findall_fn{*d_strings, d_find_counts, d_indices}, *d_prog, strings_count, stream); results.resize(columns_count); } diff --git a/cpp/src/strings/search/findall_record.cu b/cpp/src/strings/search/findall_record.cu index 46155bd7cf5..2f4b9ce5b24 100644 --- a/cpp/src/strings/search/findall_record.cu +++ b/cpp/src/strings/search/findall_record.cu @@ -15,9 +15,7 @@ */ #include -#include -#include -#include +#include #include #include @@ -32,8 +30,6 @@ #include #include -#include -#include #include #include @@ -49,55 +45,48 @@ namespace { * @brief This functor handles extracting matched strings by applying the compiled regex pattern * and creating string_index_pairs for all the substrings. */ -template struct findall_fn { column_device_view const d_strings; - reprog_device prog; offset_type const* d_offsets; string_index_pair* d_indices; - __device__ void operator()(size_type const idx) + __device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx) { if (d_strings.is_null(idx)) { return; } - auto const d_str = d_strings.element(idx); + auto const d_str = d_strings.element(idx); + auto const nchars = d_str.length(); auto d_output = d_indices + d_offsets[idx]; size_type output_idx = 0; - int32_t begin = 0; - int32_t end = d_str.length(); - while ((begin < end) && (prog.find(idx, d_str, begin, end) > 0)) { + size_type begin = 0; + size_type end = nchars; + while ((begin < end) && (prog.find(prog_idx, d_str, begin, end) > 0)) { auto const spos = d_str.byte_offset(begin); // convert auto const epos = d_str.byte_offset(end); // to bytes d_output[output_idx++] = string_index_pair{d_str.data() + spos, (epos - spos)}; begin = end + (begin == end); - end = d_str.length(); + end = nchars; } } }; -struct findall_dispatch_fn { - reprog_device d_prog; - - template - std::unique_ptr operator()(column_device_view const& d_strings, +std::unique_ptr findall_util(column_device_view const& d_strings, + reprog_device& d_prog, size_type total_matches, offset_type const* d_offsets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - { - rmm::device_uvector indices(total_matches, stream); +{ + rmm::device_uvector indices(total_matches, stream); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - d_strings.size(), - findall_fn{d_strings, d_prog, d_offsets, indices.data()}); + launch_for_each_kernel( + findall_fn{d_strings, d_offsets, indices.data()}, d_prog, d_strings.size(), stream); - return make_strings_column(indices.begin(), indices.end(), stream, mr); - } -}; + return make_strings_column(indices.begin(), indices.end(), stream, mr); +} } // namespace @@ -113,8 +102,7 @@ std::unique_ptr findall_record( auto const d_strings = column_device_view::create(input.parent(), stream); // compile regex into device object - auto const d_prog = - reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); + auto const d_prog = reprog_device::create(pattern, flags, stream); // Create lists offsets column auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr); @@ -128,8 +116,7 @@ std::unique_ptr findall_record( auto const total_matches = 
cudf::detail::get_value(offsets->view(), strings_count, stream); - auto strings_output = regex_dispatcher( - *d_prog, findall_dispatch_fn{*d_prog}, *d_strings, total_matches, d_offsets, stream, mr); + auto strings_output = findall_util(*d_strings, *d_prog, total_matches, d_offsets, stream, mr); // Build the lists column from the offsets and the strings return make_lists_column(strings_count, diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 3ec6df058c6..16edd0606e9 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -15,9 +15,7 @@ */ #include -#include -#include -#include +#include #include #include @@ -28,12 +26,10 @@ #include #include #include -#include #include #include -#include #include #include #include @@ -59,18 +55,17 @@ enum class split_direction { * The `d_token_offsets` specifies the output position within `d_tokens` * for each string. */ -template struct token_reader_fn { column_device_view const d_strings; - reprog_device prog; split_direction const direction; offset_type const* d_token_offsets; string_index_pair* d_tokens; - __device__ void operator()(size_type idx) + __device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx) { if (d_strings.is_null(idx)) { return; } - auto const d_str = d_strings.element(idx); + auto const d_str = d_strings.element(idx); + auto const nchars = d_str.length(); auto const token_offset = d_token_offsets[idx]; auto const token_count = d_token_offsets[idx + 1] - token_offset; @@ -78,9 +73,9 @@ struct token_reader_fn { size_type token_idx = 0; size_type begin = 0; // characters - size_type end = d_str.length(); + size_type end = nchars; size_type last_pos = 0; // bytes - while (prog.find(idx, d_str, begin, end) > 0) { + while (prog.find(prog_idx, d_str, begin, end) > 0) { // get the token (characters just before this match) auto const token = string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos}; @@ -97,7 +92,7 @@ struct token_reader_fn { // setup for next match last_pos = d_str.byte_offset(end); begin = end + (begin == end); - end = d_str.length(); + end = nchars; } // set the last token to the remainder of the string @@ -116,28 +111,6 @@ struct token_reader_fn { } }; -struct generate_dispatch_fn { - reprog_device d_prog; - - template - rmm::device_uvector operator()(column_device_view const& d_strings, - size_type total_tokens, - split_direction direction, - offset_type const* d_offsets, - rmm::cuda_stream_view stream) - { - rmm::device_uvector tokens(total_tokens, stream); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - d_strings.size(), - token_reader_fn{d_strings, d_prog, direction, d_offsets, tokens.data()}); - - return tokens; - } -}; - /** * @brief Call regex to split each input string into tokens. * @@ -176,8 +149,15 @@ rmm::device_uvector generate_tokens(column_device_view const& // the last offset entry is the total number of tokens to be generated auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); - return regex_dispatcher( - d_prog, generate_dispatch_fn{d_prog}, d_strings, total_tokens, direction, d_offsets, stream); + rmm::device_uvector tokens(total_tokens, stream); + if (total_tokens == 0) { return tokens; } + + launch_for_each_kernel(token_reader_fn{d_strings, direction, d_offsets, tokens.data()}, + d_prog, + d_strings.size(), + stream); + + return tokens; } /** @@ -221,7 +201,7 @@ std::unique_ptr
split_re(strings_column_view const& input, } // create the regex device prog from the given pattern - auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_prog = reprog_device::create(pattern, stream); auto d_strings = column_device_view::create(input.parent(), stream); // count the number of delimiters matched in each string @@ -283,7 +263,7 @@ std::unique_ptr split_record_re(strings_column_view const& input, auto const strings_count = input.size(); // create the regex device prog from the given pattern - auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_prog = reprog_device::create(pattern, stream); auto d_strings = column_device_view::create(input.parent(), stream); // count the number of delimiters matched in each string diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index a2c173cae5f..5baab0f09a2 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -441,6 +441,12 @@ std::tuple> superimpose_parent return {table_view{superimposed_columns}, std::move(superimposed_nullmasks)}; } +bool contains_null_structs(column_view const& col) +{ + return (is_struct(col) && col.has_nulls()) || + std::any_of(col.child_begin(), col.child_end(), contains_null_structs); +} + } // namespace detail } // namespace structs } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 95c54d7596e..eadcd985de3 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -470,6 +470,7 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp) # * lists tests ---------------------------------------------------------------------------------- ConfigureTest( LISTS_TEST + lists/apply_boolean_mask_test.cpp lists/combine/concatenate_list_elements_tests.cpp lists/combine/concatenate_rows_tests.cpp lists/contains_tests.cpp diff --git a/cpp/tests/lists/apply_boolean_mask_test.cpp b/cpp/tests/lists/apply_boolean_mask_test.cpp new file mode 100644 index 00000000000..a5b036210ba --- /dev/null +++ b/cpp/tests/lists/apply_boolean_mask_test.cpp @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf::test { + +using namespace iterators; +using cudf::lists_column_view; +using cudf::lists::apply_boolean_mask; + +template +using lists = lists_column_wrapper; +using filter_t = lists_column_wrapper; + +template +using fwcw = fixed_width_column_wrapper; +using offsets = fwcw; +using strings = strings_column_wrapper; + +auto constexpr X = int32_t{0}; // Placeholder for NULL. 
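+// The cases below exercise apply_boolean_mask() on unsliced and sliced inputs,
+// with and without nulls, and for fixed-width, string, and struct child types.
+// As exercised here, each filter row has the same length as its corresponding
+// input row, and an element is kept only where its mask value is true.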
+ +struct ApplyBooleanMaskTest : public BaseFixture { +}; + +template +struct ApplyBooleanMaskTypedTest : ApplyBooleanMaskTest { +}; + +TYPED_TEST_SUITE(ApplyBooleanMaskTypedTest, cudf::test::NumericTypes); + +TYPED_TEST(ApplyBooleanMaskTypedTest, StraightLine) +{ + using T = TypeParam; + auto input = lists{{0, 1, 2, 3}, {4, 5}, {6, 7, 8, 9}, {0, 1}, {2, 3, 4, 5}, {6, 7}}.release(); + auto filter = filter_t{{1, 0, 1, 0}, {1, 0}, {1, 0, 1, 0}, {1, 0}, {1, 0, 1, 0}, {1, 0}}; + + { + // Unsliced. + auto filtered = apply_boolean_mask(lists_column_view{*input}, lists_column_view{filter}); + auto expected = lists{{0, 2}, {4}, {6, 8}, {0}, {2, 4}, {6}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*filtered, expected); + } + { + // Sliced input: Remove the first row. + auto sliced = cudf::slice(*input, {1, input->size()}).front(); + // == lists_t {{4, 5}, {6, 7, 8, 9}, {0, 1}, {2, 3, 4, 5}, {6, 7}}; + auto filter = filter_t{{0, 1}, {0, 1, 0, 1}, {1, 1}, {0, 1, 0, 1}, {0, 0}}; + auto filtered = apply_boolean_mask(lists_column_view{sliced}, lists_column_view{filter}); + auto expected = lists{{5}, {7, 9}, {0, 1}, {3, 5}, {}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*filtered, expected); + } +} + +TYPED_TEST(ApplyBooleanMaskTypedTest, NullElementsInTheListRows) +{ + using T = TypeParam; + auto input = + lists{ + {0, 1, 2, 3}, + lists{{X, 5}, null_at(0)}, + {6, 7, 8, 9}, + {0, 1}, + lists{{X, 3, 4, X}, nulls_at({0, 3})}, + lists{{X, X}, nulls_at({0, 1})}, + } + .release(); + auto filter = filter_t{{1, 0, 1, 0}, {1, 0}, {1, 0, 1, 0}, {1, 0}, {1, 0, 1, 0}, {1, 0}}; + + { + // Unsliced. + auto filtered = apply_boolean_mask(lists_column_view{*input}, lists_column_view{filter}); + auto expected = lists{{0, 2}, + lists{{X}, null_at(0)}, + {6, 8}, + {0}, + lists{{X, 4}, null_at(0)}, + lists{{X}, null_at(0)}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*filtered, expected); + } + { + // Sliced input: Remove the first row. + auto sliced = cudf::slice(*input, {1, input->size()}).front(); + // == lists_t {{X, 5}, {6, 7, 8, 9}, {0, 1}, {X, 3, 4, X}, {X, X}}; + auto filter = filter_t{{0, 1}, {0, 1, 0, 1}, {1, 1}, {0, 1, 0, 1}, {0, 0}}; + auto filtered = apply_boolean_mask(lists_column_view{sliced}, lists_column_view{filter}); + auto expected = lists{{5}, {7, 9}, {0, 1}, lists{{3, X}, null_at(1)}, {}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*filtered, expected); + } +} + +TYPED_TEST(ApplyBooleanMaskTypedTest, NullListRowsInTheInputColumn) +{ + using T = TypeParam; + auto input = + lists{{{0, 1, 2, 3}, {}, {6, 7, 8, 9}, {}, {2, 3, 4, 5}, {6, 7}}, nulls_at({1, 3})} + .release(); + auto filter = filter_t{{1, 0, 1, 0}, {}, {1, 0, 1, 0}, {}, {1, 0, 1, 0}, {1, 0}}; + + { + // Unsliced. + auto filtered = apply_boolean_mask(lists_column_view{*input}, lists_column_view{filter}); + auto expected = lists{{{0, 2}, {}, {6, 8}, {}, {2, 4}, {6}}, nulls_at({1, 3})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*filtered, expected); + } + { + // Sliced input: Remove the first row. + auto sliced = cudf::slice(*input, {1, input->size()}).front(); + // == lists_t{{{}, {6, 7, 8, 9}, {}, {2, 3, 4, 5}, {6, 7}}, nulls_at({0,2})}; + auto filter = filter_t{{}, {0, 1, 0, 1}, {}, {0, 1, 0, 1}, {0, 0}}; + auto filtered = apply_boolean_mask(lists_column_view{sliced}, lists_column_view{filter}); + auto expected = lists{{{}, {7, 9}, {}, {3, 5}, {}}, nulls_at({0, 2})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*filtered, expected); + } + { + // Sliced input: Remove the first two rows. 
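+    // As with the single-row slice above, this case checks that the non-zero
+    // column offset introduced by cudf::slice() is honored by apply_boolean_mask().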
+ auto sliced = cudf::slice(*input, {2, input->size()}).front(); + // == lists_t{{{6, 7, 8, 9}, {}, {2, 3, 4, 5}, {6, 7}}, null_at(1)}; + auto filter = filter_t{{0, 1, 0, 1}, {}, {0, 1, 0, 1}, {0, 0}}; + auto filtered = apply_boolean_mask(lists_column_view{sliced}, lists_column_view{filter}); + auto expected = lists{{{7, 9}, {}, {3, 5}, {}}, null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*filtered, expected); + } +} + +TYPED_TEST(ApplyBooleanMaskTypedTest, StructInput) +{ + using T = TypeParam; + using fwcw = fwcw; + + auto constexpr num_input_rows = 7; + auto const input = [] { + auto child_num = fwcw{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto child_str = strings{"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}; + auto const null_mask_begin = null_at(5); + auto const null_mask_end = null_mask_begin + num_input_rows; + return cudf::make_lists_column(num_input_rows, + offsets{0, 2, 3, 6, 6, 8, 8, 10}.release(), + structs_column_wrapper{{child_num, child_str}}.release(), + 1, + detail::make_null_mask(null_mask_begin, null_mask_end)); + }(); + { + // Unsliced. + // The input should now look as follows: (String child dropped for brevity.) + // Input: {[0, 1], [2], [3, 4, 5], [], [6, 7], [], [8, 9]} + auto const filter = filter_t{{1, 1}, {0}, {0, 1, 0}, {}, {1, 0}, {}, {0, 1}}; + auto const result = apply_boolean_mask(lists_column_view{*input}, lists_column_view{filter}); + auto const expected = [] { + auto child_num = fwcw{0, 1, 4, 6, 9}; + auto child_str = strings{"0", "1", "4", "6", "9"}; + auto const null_mask_begin = null_at(5); + auto const null_mask_end = null_mask_begin + num_input_rows; + return cudf::make_lists_column(num_input_rows, + offsets{0, 2, 2, 3, 3, 4, 4, 5}.release(), + structs_column_wrapper{{child_num, child_str}}.release(), + 1, + detail::make_null_mask(null_mask_begin, null_mask_end)); + }(); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); + } + { + // Sliced. Remove the first row. + auto const sliced_input = cudf::slice(*input, {1, input->size()}).front(); + // The input should now look as follows: (String child dropped for brevity.) + // Input: {[2], [3, 4, 5], [], [6, 7], [], [8, 9]} + auto const filter = filter_t{{0}, {0, 1, 0}, {}, {1, 0}, {}, {0, 1}}; + auto const result = + apply_boolean_mask(lists_column_view{sliced_input}, lists_column_view{filter}); + auto const expected = [] { + auto child_num = fwcw{4, 6, 9}; + auto child_str = strings{"4", "6", "9"}; + auto const null_mask_begin = null_at(4); + auto const null_mask_end = null_mask_begin + num_input_rows; + return cudf::make_lists_column(num_input_rows - 1, + offsets{0, 0, 1, 1, 2, 2, 3}.release(), + structs_column_wrapper{{child_num, child_str}}.release(), + 1, + detail::make_null_mask(null_mask_begin, null_mask_end)); + }(); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); + } +} + +TEST_F(ApplyBooleanMaskTest, Trivial) +{ + auto const input = lists{}; + auto const filter = filter_t{}; + auto const result = apply_boolean_mask(lists_column_view{input}, lists_column_view{filter}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, lists{}); +} + +TEST_F(ApplyBooleanMaskTest, Failure) +{ + { + // Invalid mask type. + auto const input = lists{{1, 2, 3}, {4, 5, 6}}; + auto const filter = lists{{0, 0, 0}}; + CUDF_EXPECT_THROW_MESSAGE( + apply_boolean_mask(lists_column_view{input}, lists_column_view{filter}), + "Mask must be of type BOOL8."); + } + { + // Mismatched number of rows. 
+ auto const input = lists{{1, 2, 3}, {4, 5, 6}}; + auto const filter = filter_t{{0, 0, 0}}; + CUDF_EXPECT_THROW_MESSAGE( + apply_boolean_mask(lists_column_view{input}, lists_column_view{filter}), + "Boolean masks column must have same number of rows as input."); + } +} +} // namespace cudf::test diff --git a/docs/cudf/source/_static/params.css b/docs/cudf/source/_static/params.css index 9e6be7ca75f..17c9d5accbd 100644 --- a/docs/cudf/source/_static/params.css +++ b/docs/cudf/source/_static/params.css @@ -50,11 +50,17 @@ table.io-supported-types-table thead{ } +/* Used to make special-table scrollable when it overflows */ +.special-table-wrapper { + width: 100%; + overflow: auto !important; +} + .special-table td, .special-table th { border: 1px solid #dee2e6; } -/* Needed to resolve https://github.com/executablebooks/jupyter-book/issues/1611 */ +/* Needed to resolve https://github.com/executablebooks/jupyter-book/issues/1611 */ .output.text_html { overflow: auto; } diff --git a/docs/cudf/source/basics/PandasCompat.rst b/docs/cudf/source/basics/PandasCompat.rst deleted file mode 100644 index fe9161e49c3..00000000000 --- a/docs/cudf/source/basics/PandasCompat.rst +++ /dev/null @@ -1,4 +0,0 @@ -Pandas Compatibility Notes -========================== - -.. pandas-compat-list:: diff --git a/docs/cudf/source/basics/basics.rst b/docs/cudf/source/basics/basics.rst deleted file mode 100644 index 9b8983fba49..00000000000 --- a/docs/cudf/source/basics/basics.rst +++ /dev/null @@ -1,62 +0,0 @@ -Basics -====== - - -Supported Dtypes ----------------- - -cuDF uses dtypes for Series or individual columns of a DataFrame. cuDF uses NumPy dtypes; NumPy provides support for ``float``, ``int``, ``bool``, -``'timedelta64[s]'``, ``'timedelta64[ms]'``, ``'timedelta64[us]'``, ``'timedelta64[ns]'``, ``'datetime64[s]'``, ``'datetime64[ms]'``, -``'datetime64[us]'``, ``'datetime64[ns]'`` (note that NumPy does not support timezone-aware datetimes). - - -The following table lists all of cuDF's types. For methods requiring dtype arguments, strings can be specified as indicated. See the respective documentation sections for more on each type. - -.. rst-class:: special-table -.. 
table:: - - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Kind of Data | Data Type | Scalar | String Aliases | - +=================+==================+==============================================================+==============================================+ - | Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, | ``'int8'``, ``'int16'``, ``'int32'``, | - | | | np.uint16_, np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | - | | | | ``'uint32'``, ``'uint64'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Float | | np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Strings | | `str `_ | ``'string'``, ``'object'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | - | | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``, | - | (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Categorical | CategoricalDtype | (none) | ``'category'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Boolean | | np.bool_ | ``'bool'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Decimal | Decimal32Dtype, | (none) | (none) | - | | Decimal64Dtype, | | | - | | Decimal128Dtype | | | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Lists | ListDtype | list | ``'list'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - | Structs | StructDtype | dict | ``'struct'`` | - +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ - -**Note: All dtypes above are Nullable** - -.. _np.int8: -.. _np.int16: -.. _np.int32: -.. _np.int64: -.. _np.uint8: -.. _np.uint16: -.. _np.uint32: -.. _np.uint64: -.. _np.float32: -.. _np.float64: -.. _np.bool: https://numpy.org/doc/stable/user/basics.types.html -.. _np.datetime64: https://numpy.org/doc/stable/reference/arrays.datetime.html#basic-datetimes -.. 
_np.timedelta64: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-and-timedelta-arithmetic diff --git a/docs/cudf/source/basics/dask-cudf.rst b/docs/cudf/source/basics/dask-cudf.rst deleted file mode 100644 index a9c65dfbfae..00000000000 --- a/docs/cudf/source/basics/dask-cudf.rst +++ /dev/null @@ -1,107 +0,0 @@ -Multi-GPU with Dask-cuDF -======================== - -cuDF is a single-GPU library. For multi-GPU cuDF solutions we use -`Dask `__ and the `dask-cudf -package `__, -which is able to scale cuDF across multiple GPUs on a single machine, or -multiple GPUs across many machines in a cluster. - -`Dask DataFrame `__ was -originally designed to scale Pandas, orchestrating many Pandas -DataFrames spread across many CPUs into a cohesive parallel DataFrame. -Because cuDF currently implements only a subset of Pandas’s API, not all -Dask DataFrame operations work with cuDF. - -The following is tested and expected to work: - -What works ---------- - -- Data ingestion - - - ``dask_cudf.read_csv`` - - Use standard Dask ingestion with Pandas, then convert to cuDF (for - Parquet and other formats this is often decently fast) - -- Linear operations - - - Element-wise operations: ``df.x + df.y``, ``df ** 2`` - - Assignment: ``df['z'] = df.x + df.y`` - - Row-wise selections: ``df[df.x > 0]`` - - Loc: ``df.loc['2001-01-01': '2005-02-02']`` - - Date time/string accessors: ``df.timestamp.dt.dayofweek`` - - ... and most similar operations in this category that are already - implemented in cuDF - -- Reductions - - - Like ``sum``, ``mean``, ``max``, ``count``, and so on, on - ``Series`` objects - - Support for reductions on full dataframes - - \ ``std``\ - - Custom reductions with - `dask.dataframe.reduction `__ - -- Groupby aggregations - - - On single columns: ``df.groupby('x').y.max()`` - - With custom aggregations: - - groupby standard deviation - - grouping on multiple columns - - groupby agg for multiple outputs - -- Joins: - - - On full unsorted columns: ``left.merge(right, on='id')`` - (expensive) - - On sorted indexes: - ``left.merge(right, left_index=True, right_index=True)`` (fast) - - On large and small dataframes: ``left.merge(cudf_df, on='id')`` - (fast) - -- Rolling operations -- Converting to and from other forms - - - Dask + Pandas to Dask + cuDF - ``df.map_partitions(cudf.from_pandas)`` - - Dask + cuDF to Dask + Pandas - ``df.map_partitions(lambda df: df.to_pandas())`` - - cuDF to Dask + cuDF: - ``dask.dataframe.from_pandas(df, npartitions=20)`` - - Dask + cuDF to cuDF: ``df.compute()`` - -Additionally, all generic Dask operations, like ``compute``, ``persist``, -``visualize`` and so on, work regardless. - -Developing the API ------------------- - -Above we mention the following: - - and most similar operations in this category that are already - implemented in cuDF - -This is because it is difficult to create a comprehensive list of -operations in the cuDF and Pandas libraries. The API is large enough to -be difficult to track effectively. Any operation that operates -row-wise, like ``fillna`` or ``query``, will likely, but not -certainly, work. If operations don't work, it is often due to a slight -inconsistency between Pandas and cuDF that is generally easy to fix. We -encourage users to look at the `cuDF issue -tracker `__ to see if their -issue has already been reported and, if not, `raise a new -issue `__.
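A quick way to evaluate an operation is simply to try it on a small frame first. The following sketch is illustrative (it is not part of the original page) and assumes working ``cudf`` and ``dask_cudf`` installations:

.. code:: python

    import cudf
    import dask_cudf

    # A tiny frame with nulls, to probe a row-wise operation end-to-end.
    gdf = cudf.DataFrame({'x': [1, None, 3], 'y': [0.1, 0.2, None]})
    dgdf = dask_cudf.from_cudf(gdf, npartitions=2)

    # Row-wise operations such as fillna generally work; if one raises,
    # check the issue tracker linked above before filing a new report.
    print(dgdf.fillna(0).compute())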
- -Navigating the API ------------------ - -This project reuses the `Dask -DataFrame `__ project, -which was originally designed for Pandas, with the newer library cuDF. -Because we use the same Dask classes for both projects there are often -methods that are implemented for Pandas, but not yet for cuDF. As a -result, the full Dask DataFrame API can be misleading for users, -and often leads to frustration when operations that are advertised in the -Dask API do not work as expected with cuDF. We apologize for this in -advance. diff --git a/docs/cudf/source/basics/groupby.rst b/docs/cudf/source/basics/groupby.rst deleted file mode 100644 index f74853769f6..00000000000 --- a/docs/cudf/source/basics/groupby.rst +++ /dev/null @@ -1,274 +0,0 @@ -.. _basics.groupby: - -GroupBy -======= - -cuDF supports a small (but important) subset of Pandas' `groupby -API `__. - -Summary of supported operations ------------------------------- - -1. Grouping by one or more columns -2. Basic aggregations such as "sum", "mean", etc. -3. Quantile aggregation -4. A "collect" or ``list`` aggregation for collecting values in a group - into lists -5. Automatic exclusion of columns with unsupported dtypes ("nuisance" - columns) when aggregating -6. Iterating over the groups of a GroupBy object -7. ``GroupBy.groups`` API that returns a mapping of group keys to row - labels -8. ``GroupBy.apply`` API for performing arbitrary operations on each - group. Note that this has very limited functionality compared to the - equivalent Pandas function. See the section on - `apply <#groupby-apply>`__ for more details. -9. ``GroupBy.pipe`` similar to - `Pandas `__. - -Grouping -------- - -A GroupBy object is created by grouping the values of a ``Series`` or -``DataFrame`` by one or more columns: - -.. code:: python - - import cudf - - >>> df = cudf.DataFrame({'a': [1, 1, 1, 2, 2], 'b': [1, 1, 2, 2, 3], 'c': [1, 2, 3, 4, 5]}) - >>> df - >>> gb1 = df.groupby('a') # grouping by a single column - >>> gb2 = df.groupby(['a', 'b']) # grouping by multiple columns - >>> gb3 = df.groupby(cudf.Series(['a', 'a', 'b', 'b', 'b'])) # grouping by an external column - -.. warning:: - - cuDF uses `sort=False` by default to achieve better performance, which provides no guarantee of the group order in outputs. This deviates from Pandas' default behavior. - - For example: - - .. code-block:: python - - >>> df = cudf.DataFrame({'a' : [2, 2, 1], 'b' : [42, 21, 11]}) - >>> df.groupby('a').sum() - b - a - 2 63 - 1 11 - >>> df.to_pandas().groupby('a').sum() - b - a - 1 11 - 2 63 - - Setting `sort=True` will produce Pandas-like output, but with some performance penalty: - - .. code-block:: python - - >>> df.groupby('a', sort=True).sum() - b - a - 1 11 - 2 63 - -Grouping by index levels -~~~~~~~~~~~~~~~~~~~~~~~~ - -You can also group by one or more levels of a MultiIndex: - -.. code:: python - - >>> df = cudf.DataFrame( - ... {'a': [1, 1, 1, 2, 2], 'b': [1, 1, 2, 2, 3], 'c': [1, 2, 3, 4, 5]} - ... ).set_index(['a', 'b']) - ... - >>> df.groupby(level='a') - -The ``Grouper`` object -~~~~~~~~~~~~~~~~~~~~~~ - -A ``Grouper`` can be used to disambiguate between columns and levels -when they have the same name: - -.. code:: python - - >>> df - b c - b - 1 1 1 - 1 1 2 - 1 2 3 - 2 2 4 - 2 3 5 - >>> df.groupby('b', level='b') # ValueError: Cannot specify both by and level - >>> df.groupby([cudf.Grouper(key='b'), cudf.Grouper(level='b')]) # OK - -Aggregation ----------- - -Aggregations on groups are supported via the ``agg`` method: - -.. 
code:: python - - >>> df - a b c - 0 1 1 1 - 1 1 1 2 - 2 1 2 3 - 3 2 2 4 - 4 2 3 5 - >>> df.groupby('a').agg('sum') - b c - a - 1 4 6 - 2 5 9 - >>> df.groupby('a').agg({'b': ['sum', 'min'], 'c': 'mean'}) - b c - sum min mean - a - 1 4 1 2.0 - 2 5 2 4.5 - >>> df.groupby("a").corr(method="pearson") - b c - a - 1 b 1.000000 0.866025 - c 0.866025 1.000000 - 2 b 1.000000 1.000000 - c 1.000000 1.000000 - -The following table summarizes the available aggregations and the types -that support them: - -.. rst-class:: special-table -.. table:: - - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | Aggregations / dtypes | Numeric | Datetime | String | Categorical | List | Struct | Interval | Decimal | - +====================================+===========+============+==========+===============+========+==========+============+===========+ - | count | ✅ | ✅ | ✅ | ✅ | | | | ✅ | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | size | ✅ | ✅ | ✅ | ✅ | | | | ✅ | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | sum | ✅ | ✅ | | | | | | ✅ | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | idxmin | ✅ | ✅ | | | | | | ✅ | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | idxmax | ✅ | ✅ | | | | | | ✅ | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | min | ✅ | ✅ | ✅ | | | | | ✅ | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | max | ✅ | ✅ | ✅ | | | | | ✅ | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | mean | ✅ | ✅ | | | | | | | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | var | ✅ | ✅ | | | | | | | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | std | ✅ | ✅ | | | | | | | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | quantile | ✅ | ✅ | | | | | | | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | median | ✅ | ✅ | | | | | | | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | nunique | ✅ | ✅ | ✅ | ✅ | | | | ✅ | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | nth | ✅ | ✅ | ✅ | | | | | ✅ | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | collect | ✅ | ✅ | ✅ | | ✅ | | | ✅ | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | unique | ✅ | ✅ | ✅ | ✅ | | | | | - 
+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | corr | ✅ | | | | | | | ✅ | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - | cov | ✅ | | | | | | | ✅ | - +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - -GroupBy apply ------------- - -To apply a function to each group, use the ``GroupBy.apply()`` method: - -.. code:: python - - >>> df - a b c - 0 1 1 1 - 1 1 1 2 - 2 1 2 3 - 3 2 2 4 - 4 2 3 5 - >>> df.groupby('a').apply(lambda x: x.max() - x.min()) - a b c - a - 0 0 1 2 - 1 0 1 1 - -Limitations -~~~~~~~~~~~ - -- ``apply`` works by applying the provided function to each group - sequentially, and concatenating the results together. **This can be - very slow**, especially for a large number of small groups. For a - small number of large groups, it can give acceptable performance. - -- The results may not always match Pandas exactly. For example, cuDF - may return a ``DataFrame`` containing a single column where Pandas - returns a ``Series``. Some post-processing may be required to match - Pandas behavior. - -- cuDF does not support some of the exceptional cases that Pandas - supports with ``apply``, such as calling |describe|_ inside the - callable. - - .. |describe| replace:: ``describe`` - .. _describe: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#flexible-apply - - -Transform --------- - -The ``.transform()`` method aggregates per group, and broadcasts the -result to the group size, resulting in a Series/DataFrame that is of -the same size as the input Series/DataFrame. - -.. code:: python - - >>> import cudf - >>> df = cudf.DataFrame({'a': [2, 1, 1, 2, 2], 'b': [1, 2, 3, 4, 5]}) - >>> df.groupby('a').transform('max') - b - 0 5 - 1 3 - 2 3 - 3 5 - 4 5 - - -Rolling window calculations --------------------------- - -Use the ``GroupBy.rolling()`` method to perform rolling window -calculations on each group: - -.. code:: python - - >>> df - a b c - 0 1 1 1 - 1 1 1 2 - 2 1 2 3 - 3 2 2 4 - 4 2 3 5 - -Rolling window sum on each group with a window size of 2: - -.. code:: python - - >>> df.groupby('a').rolling(2).sum() - a b c - a - 1 0 <NA> <NA> <NA> - 1 2 2 3 - 2 2 3 5 - 2 3 <NA> <NA> <NA> - 4 4 5 9 diff --git a/docs/cudf/source/basics/index.rst b/docs/cudf/source/basics/index.rst deleted file mode 100644 index a29866d7e32..00000000000 --- a/docs/cudf/source/basics/index.rst +++ /dev/null @@ -1,15 +0,0 @@ -====== -Basics -====== - - -.. toctree:: - :maxdepth: 2 - - basics - io.rst - groupby.rst - PandasCompat.rst - dask-cudf.rst - internals.rst - \ No newline at end of file diff --git a/docs/cudf/source/basics/internals.rst b/docs/cudf/source/basics/internals.rst deleted file mode 100644 index 96ef40d51e6..00000000000 --- a/docs/cudf/source/basics/internals.rst +++ /dev/null @@ -1,216 +0,0 @@ -cuDF internals -============== - -The cuDF API closely matches that of the -`Pandas `__ library. Thus, we have the types -``cudf.Series``, ``cudf.DataFrame`` and ``cudf.Index`` which look and -feel very much like their Pandas counterparts. - -Under the hood, however, cuDF uses data structures very different from -Pandas. In this document, we describe these internal data structures. - -Column ------ - -Columns are cuDF's core data structure and they are modeled after the -`Apache Arrow Columnar -Format `__.
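One way to see the Arrow relationship concretely (an illustrative aside, not part of the original page; it uses only the public ``to_arrow`` API) is to hand a column straight to Arrow:

.. code:: python

    import cudf

    s = cudf.Series(['do', 'you', 'have', 'any', 'cheese?'])
    # Conversion is natural because the device layout (character data plus
    # an offsets child, described below) mirrors Arrow's string layout.
    print(s.to_arrow())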
- -A column represents a sequence of values, any number of which may be -"null". Columns are specialized based on the type of data they contain. -Thus we have ``NumericalColumn``, ``StringColumn``, ``DatetimeColumn``, -etc. - -A column is composed of the following: - -- A **data type**, specifying the type of each element. -- A **data buffer** that may store the data for the column elements. - Some column types do not have a data buffer, instead storing data in - the children columns. -- A **mask buffer** whose bits represent the validity (null or not - null) of each element. Columns whose elements are all "valid" may not - have a mask buffer. Mask buffers are padded to a multiple of 64 bytes. -- A tuple of **children** columns, which enable the representation of - complex types, such as columns with non-fixed-width elements like - strings or lists. -- A **size** indicating the number of elements in the column. -- An integer **offset**: a column may represent a "slice" of another - column, in which case this offset represents the first element of the - slice. The size of the column then gives the extent of the slice. A - column that is not a slice has an offset of 0. - -For example, the ``NumericalColumn`` backing a Series with 1000 elements -of type 'int32' and containing nulls is composed of: - -1. A data buffer of size 4000 bytes (sizeof(int32) \* 1000) -2. A mask buffer of size 128 bytes (1000/8 padded to a multiple of 64 - bytes) -3. No children columns - -As another example, the ``StringColumn`` backing the Series -``['do', 'you', 'have', 'any', 'cheese?']`` is composed of: - -1. No data buffer -2. No mask buffer as there are no nulls in the Series -3. Two children columns: - - - A column of UTF-8 characters - ``['d', 'o', 'y', 'o', 'u', 'h' ... '?']`` - - A column of "offsets" to the characters column (in this case, - ``[0, 2, 5, 9, 12, 19]``) - -Buffer ------ - -The data and mask buffers of a column represent data in GPU memory -(a.k.a. *device memory*), and are objects of type -``cudf.core.buffer.Buffer``. - -Buffers can be constructed from array-like objects that live either on -the host (e.g., numpy arrays) or the device (e.g., cupy arrays). Arrays -must be of ``uint8`` dtype or viewed as such. - -When constructing a Buffer from a host object such as a numpy array, new -device memory is allocated: - -.. code:: python - - >>> from cudf.core.buffer import Buffer - >>> buf = Buffer(np.array([1, 2, 3], dtype='int64').view("uint8")) - >>> print(buf.ptr) # address of new device memory allocation - 140050901762560 - >>> print(buf.size) - 24 - >>> print(buf._owner) - - -cuDF uses the `RMM `__ library for -allocating device memory. You can read more about device memory -allocation with RMM -`here `__. - -When constructing a Buffer from a device object such as a CuPy array, no -new device memory is allocated. Instead, the Buffer points to the -existing allocation, keeping a reference to the device array: - -.. code:: python - - >>> import cupy as cp - >>> c_ary = cp.asarray([1, 2, 3], dtype='int64') - >>> buf = Buffer(c_ary.view("uint8")) - >>> print(c_ary.data.mem.ptr) - 140050901762560 - >>> print(buf.ptr) - 140050901762560 - >>> print(buf.size) - 24 - >>> print(buf._owner is c_ary) - True - -An uninitialized block of device memory can be allocated with -``Buffer.empty``: - -.. 
code:: python - - >>> buf = Buffer.empty(10) - >>> print(buf.size) - 10 - >>> print(buf._owner) - - -ColumnAccessor --------------- - -cuDF ``Series``, ``DataFrame`` and ``Index`` are all subclasses of an -internal ``Frame`` class. The underlying data structure of ``Frame`` is -an ordered, dictionary-like object known as ``ColumnAccessor``, which -can be accessed via the ``._data`` attribute: - -.. code:: python - - >>> a = cudf.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c']}) - >>> a._data - ColumnAccessor(OrderedColumnDict([('x', ), ('y', )]), multiindex=False, level_names=(None,)) - -ColumnAccessor is an ordered mapping of column labels to columns. In -addition to behaving like an OrderedDict, it supports things like -selecting multiple columns (both by index and label), as well as -hierarchical indexing. - -.. code:: python - - >>> from cudf.core.column_accessor import ColumnAccessor - -The values of a ColumnAccessor are coerced to Columns during -construction: - -.. code:: python - - >>> ca = ColumnAccessor({'x': [1, 2, 3], 'y': ['a', 'b', 'c']}) - >>> ca['x'] - - >>> ca['y'] - - >>> ca.pop('x') - - >>> ca - ColumnAccessor(OrderedColumnDict([('y', )]), multiindex=False, level_names=(None,)) - -Columns can be inserted at a specified location: - -.. code:: python - - >>> ca.insert('z', [3, 4, 5], loc=1) - >>> ca - ColumnAccessor(OrderedColumnDict([('x', ), ('z', ), ('y', )]), multiindex=False, level_names=(None,)) - -Selecting columns by index: - -.. code:: python - - >>> ca = ColumnAccessor({'x': [1, 2, 3], 'y': ['a', 'b', 'c'], 'z': [4, 5, 6]}) - >>> ca.select_by_index(1) - ColumnAccessor(OrderedColumnDict([('y', )]), multiindex=False, level_names=(None,)) - >>> ca.select_by_index([0, 1]) - ColumnAccessor(OrderedColumnDict([('x', ), ('y', )]), multiindex=False, level_names=(None,)) - >>> ca.select_by_index(slice(1, 3)) - ColumnAccessor(OrderedColumnDict([('y', ), ('z', )]), multiindex=False, level_names=(None,)) - -Selecting columns by label: - -.. code:: python - - >>> ca.select_by_label(['y', 'z']) - ColumnAccessor(OrderedColumnDict([('y', ), ('z', )]), multiindex=False, level_names=(None,)) - >>> ca.select_by_label(slice('x', 'y')) - ColumnAccessor(OrderedColumnDict([('x', ), ('y', )]), multiindex=False, level_names=(None,)) - -A ColumnAccessor with tuple keys (and constructed with -``multiindex=True``) can be hierarchically indexed: - -.. code:: python - - >>> ca = ColumnAccessor({('a', 'b'): [1, 2, 3], ('a', 'c'): [2, 3, 4], 'b': [4, 5, 6]}, multiindex=True) - >>> ca.select_by_label('a') - ColumnAccessor(OrderedColumnDict([('b', ), ('c', )]), multiindex=False, level_names=(None,)) - >>> ca.select_by_label(('a', 'b')) - ColumnAccessor(OrderedColumnDict([(('a', 'b'), )]), multiindex=False, level_names=(None,)) - -"Wildcard" indexing is also allowed: - -.. code:: python - - >>> ca = ColumnAccessor({('a', 'b'): [1, 2, 3], ('a', 'c'): [2, 3, 4], ('d', 'b'): [4, 5, 6]}, multiindex=True) - >>> ca.select_by_label((slice(None), 'b')) - ColumnAccessor(OrderedColumnDict([(('a', 'b'), ), (('d', 'b'), )]), multiindex=True, level_names=(None, None)) - -Finally, ColumnAccessors can convert to Pandas ``Index`` or -``MultiIndex`` objects: - -.. 
code:: python - - >>> ca.to_pandas_index() - MultiIndex([('a', 'b'), - ('a', 'c'), - ('d', 'b')], - ) diff --git a/docs/cudf/source/basics/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst deleted file mode 100644 index ce774453386..00000000000 --- a/docs/cudf/source/basics/io-gds-integration.rst +++ /dev/null @@ -1,42 +0,0 @@ -GPUDirect Storage Integration -============================= - -Many IO APIs can use the GPUDirect Storage (GDS) library to optimize IO operations. -GDS enables a direct data path for direct memory access (DMA) transfers between GPU memory and storage, which avoids a bounce buffer through the CPU. -GDS also has a compatibility mode that allows the library to fall back to copying through a CPU bounce buffer. -The SDK is available for download `here `_. -GDS is also included in CUDA Toolkit 11.4 and higher. - -Use of GPUDirect Storage in cuDF is enabled by default, but can be disabled through the environment variable ``LIBCUDF_CUFILE_POLICY``. -This variable also controls the GDS compatibility mode. - -There are four valid values for the environment variable: - -- "GDS": Enable GDS use; GDS compatibility mode is *off*. -- "ALWAYS": Enable GDS use; GDS compatibility mode is *on*. -- "KVIKIO": Enable GDS through `KvikIO `_. -- "OFF": Completely disable GDS use. - -If no value is set, behavior will be the same as the "GDS" option. - -This environment variable also affects how cuDF treats GDS errors. -When ``LIBCUDF_CUFILE_POLICY`` is set to "GDS" and a GDS API call fails for any reason, cuDF falls back to the internal implementation with bounce buffers. -When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on), -cuDF throws an exception to propagate the error to the user. -When ``LIBCUDF_CUFILE_POLICY`` is set to "KVIKIO" and a KvikIO API call fails for any reason (unlikely, given that KvikIO implements its own compatibility mode), cuDF throws an exception to propagate the error to the user. -For more information about error handling, compatibility mode, and tuning parameters in KvikIO, see: https://github.com/rapidsai/kvikio - -Operations that support the use of GPUDirect Storage: - -- :py:func:`cudf.read_avro` -- :py:func:`cudf.read_parquet` -- :py:func:`cudf.read_orc` -- :py:meth:`cudf.DataFrame.to_csv` -- :py:meth:`cudf.DataFrame.to_parquet` -- :py:meth:`cudf.DataFrame.to_orc` - -Several parameters that can be used to tune the performance of GDS-enabled I/O are exposed through environment variables: - -- ``LIBCUDF_CUFILE_THREAD_COUNT``: Integral value, maximum number of parallel reads/writes per file (default 16); -- ``LIBCUDF_CUFILE_SLICE_SIZE``: Integral value, maximum size of each GDS read/write, in bytes (default 4MB). - Larger I/O operations are split into multiple calls. diff --git a/docs/cudf/source/basics/io-nvcomp-integration.rst b/docs/cudf/source/basics/io-nvcomp-integration.rst deleted file mode 100644 index fc24e0c15f4..00000000000 --- a/docs/cudf/source/basics/io-nvcomp-integration.rst +++ /dev/null @@ -1,27 +0,0 @@ -nvCOMP Integration -============================= - -Some types of compression/decompression can be performed using either the `nvCOMP library `_ or the internal implementation. - -Which implementation is used by default depends on the data format and the compression type. -Behavior can be influenced through the environment variable ``LIBCUDF_NVCOMP_POLICY``.
- -There are three valid values for the environment variable: - -- "STABLE": Only enable the nvCOMP in places where it has been deemed stable for production use. -- "ALWAYS": Enable all available uses of nvCOMP, including new, experimental combinations. -- "OFF": Disable nvCOMP use whenever possible and use the internal implementations instead. - -If no value is set, behavior will be the same as the "STABLE" option. - - -.. table:: Current policy for nvCOMP use for different types - :widths: 20 15 15 15 15 15 15 15 15 15 - - +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+ - | | CSV | Parquet | JSON | ORC | AVRO | - +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+ - | Compression Type | Writer | Reader | Writer | Reader | Writer¹ | Reader | Writer | Reader | Reader | - +=======================+========+========+========+========+=========+========+========+========+========+ - | snappy | ❌ | ❌ | Stable | Stable | ❌ | ❌ | Stable | Stable | ❌ | - +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+ diff --git a/docs/cudf/source/basics/io.rst b/docs/cudf/source/basics/io.rst deleted file mode 100644 index ee3d997d664..00000000000 --- a/docs/cudf/source/basics/io.rst +++ /dev/null @@ -1,13 +0,0 @@ -~~~~~~~~~~~~~~ -Input / Output -~~~~~~~~~~~~~~ - -This page contains Input / Output related APIs in cuDF. - -.. toctree:: - :maxdepth: 2 - :caption: Contents: - - io-supported-types.rst - io-gds-integration.rst - io-nvcomp-integration.rst \ No newline at end of file diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c8b30120924..0ffbdf47d54 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -197,8 +197,9 @@ # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { - "python": ("https://docs.python.org/", None), + "python": ("https://docs.python.org/3", None), "cupy": ("https://docs.cupy.dev/en/stable/", None), + "numpy": ("https://numpy.org/doc/stable", None), } # Config numpydoc diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst index 90b287bd1b6..2c1df4a0c12 100644 --- a/docs/cudf/source/index.rst +++ b/docs/cudf/source/index.rst @@ -14,7 +14,6 @@ the details of CUDA programming. :caption: Contents: user_guide/index - basics/index api_docs/index diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index 9bb95406e8a..080fce3c55c 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "e9357872", "metadata": {}, "source": [ "10 Minutes to cuDF and Dask-cuDF\n", @@ -26,6 +27,7 @@ { "cell_type": "code", "execution_count": 1, + "id": "92eed4cb", "metadata": {}, "outputs": [], "source": [ @@ -45,6 +47,7 @@ }, { "cell_type": "markdown", + "id": "ed6c6047", "metadata": {}, "source": [ "Object Creation\n", @@ -53,6 +56,7 @@ }, { "cell_type": "markdown", + "id": "aeedd961", "metadata": {}, "source": [ "Creating a `cudf.Series` and `dask_cudf.Series`." 
@@ -61,6 +65,7 @@ { "cell_type": "code", "execution_count": 2, + "id": "cf8b08e5", "metadata": {}, "outputs": [ { @@ -87,6 +92,7 @@ { "cell_type": "code", "execution_count": 3, + "id": "083a5898", "metadata": {}, "outputs": [ { @@ -112,6 +118,7 @@ }, { "cell_type": "markdown", + "id": "6346e1b1", "metadata": {}, "source": [ "Creating a `cudf.DataFrame` and a `dask_cudf.DataFrame` by specifying values for each column." @@ -120,6 +127,7 @@ { "cell_type": "code", "execution_count": 4, + "id": "83d1e7f5", "metadata": {}, "outputs": [ { @@ -313,6 +321,7 @@ { "cell_type": "code", "execution_count": 5, + "id": "71b61d62", "metadata": {}, "outputs": [ { @@ -502,6 +511,7 @@ }, { "cell_type": "markdown", + "id": "c7cb5abc", "metadata": {}, "source": [ "Creating a `cudf.DataFrame` from a pandas `Dataframe` and a `dask_cudf.Dataframe` from a `cudf.Dataframe`.\n", @@ -512,6 +522,7 @@ { "cell_type": "code", "execution_count": 6, + "id": "07a62244", "metadata": {}, "outputs": [ { @@ -586,6 +597,7 @@ { "cell_type": "code", "execution_count": 7, + "id": "f5cb0c65", "metadata": {}, "outputs": [ { @@ -658,6 +670,7 @@ }, { "cell_type": "markdown", + "id": "025eac40", "metadata": {}, "source": [ "Viewing Data\n", @@ -666,6 +679,7 @@ }, { "cell_type": "markdown", + "id": "47a567e8", "metadata": {}, "source": [ "Viewing the top rows of a GPU dataframe." @@ -674,6 +688,7 @@ { "cell_type": "code", "execution_count": 8, + "id": "ab8cbdb8", "metadata": {}, "outputs": [ { @@ -737,6 +752,7 @@ { "cell_type": "code", "execution_count": 9, + "id": "2e923d8a", "metadata": {}, "outputs": [ { @@ -799,6 +815,7 @@ }, { "cell_type": "markdown", + "id": "61257b4b", "metadata": {}, "source": [ "Sorting by values." @@ -807,6 +824,7 @@ { "cell_type": "code", "execution_count": 10, + "id": "512770f9", "metadata": {}, "outputs": [ { @@ -996,6 +1014,7 @@ { "cell_type": "code", "execution_count": 11, + "id": "1a13993f", "metadata": {}, "outputs": [ { @@ -1184,6 +1203,7 @@ }, { "cell_type": "markdown", + "id": "19bce4c4", "metadata": {}, "source": [ "Selection\n", @@ -1194,6 +1214,7 @@ }, { "cell_type": "markdown", + "id": "ba55980e", "metadata": {}, "source": [ "Selecting a single column, which initially yields a `cudf.Series` or `dask_cudf.Series`. Calling `compute` results in a `cudf.Series` (equivalent to `df.a`)." @@ -1202,6 +1223,7 @@ { "cell_type": "code", "execution_count": 12, + "id": "885989a6", "metadata": {}, "outputs": [ { @@ -1242,6 +1264,7 @@ { "cell_type": "code", "execution_count": 13, + "id": "14a74255", "metadata": {}, "outputs": [ { @@ -1281,6 +1304,7 @@ }, { "cell_type": "markdown", + "id": "498d79f2", "metadata": {}, "source": [ "## Selection by Label" @@ -1288,6 +1312,7 @@ }, { "cell_type": "markdown", + "id": "4b8b8e13", "metadata": {}, "source": [ "Selecting rows from index 2 to index 5 from columns 'a' and 'b'." @@ -1296,6 +1321,7 @@ { "cell_type": "code", "execution_count": 14, + "id": "d40bc19c", "metadata": {}, "outputs": [ { @@ -1368,6 +1394,7 @@ { "cell_type": "code", "execution_count": 15, + "id": "7688535b", "metadata": {}, "outputs": [ { @@ -1439,6 +1466,7 @@ }, { "cell_type": "markdown", + "id": "8a64ce7a", "metadata": {}, "source": [ "## Selection by Position" @@ -1446,6 +1474,7 @@ }, { "cell_type": "markdown", + "id": "dfba2bb2", "metadata": {}, "source": [ "Selecting via integers and integer slices, like numpy/pandas. Note that this functionality is not available for Dask-cuDF DataFrames." 
@@ -1454,6 +1483,7 @@ { "cell_type": "code", "execution_count": 16, + "id": "fb8d6d43", "metadata": {}, "outputs": [ { @@ -1477,6 +1507,7 @@ { "cell_type": "code", "execution_count": 17, + "id": "263231da", "metadata": {}, "outputs": [ { @@ -1542,6 +1573,7 @@ }, { "cell_type": "markdown", + "id": "2223b089", "metadata": {}, "source": [ "You can also select elements of a `DataFrame` or `Series` with direct index access." @@ -1550,6 +1582,7 @@ { "cell_type": "code", "execution_count": 18, + "id": "13f6158b", "metadata": {}, "outputs": [ { @@ -1613,6 +1646,7 @@ { "cell_type": "code", "execution_count": 19, + "id": "3cf4aa26", "metadata": {}, "outputs": [ { @@ -1634,6 +1668,7 @@ }, { "cell_type": "markdown", + "id": "ff633b2d", "metadata": {}, "source": [ "## Boolean Indexing" @@ -1641,6 +1676,7 @@ }, { "cell_type": "markdown", + "id": "bbdef48f", "metadata": {}, "source": [ "Selecting rows in a `DataFrame` or `Series` by direct Boolean indexing." @@ -1649,6 +1685,7 @@ { "cell_type": "code", "execution_count": 20, + "id": "becb916f", "metadata": {}, "outputs": [ { @@ -1726,6 +1763,7 @@ { "cell_type": "code", "execution_count": 21, + "id": "b9475c43", "metadata": {}, "outputs": [ { @@ -1802,6 +1840,7 @@ }, { "cell_type": "markdown", + "id": "ecf982f5", "metadata": {}, "source": [ "Selecting values from a `DataFrame` where a Boolean condition is met, via the `query` API." @@ -1810,6 +1849,7 @@ { "cell_type": "code", "execution_count": 22, + "id": "fc2fc9f9", "metadata": {}, "outputs": [ { @@ -1866,6 +1906,7 @@ { "cell_type": "code", "execution_count": 23, + "id": "1a05a07f", "metadata": {}, "outputs": [ { @@ -1921,6 +1962,7 @@ }, { "cell_type": "markdown", + "id": "7f8955a0", "metadata": {}, "source": [ "You can also pass local variables to Dask-cuDF queries, via the `local_dict` keyword. With standard cuDF, you may either use the `local_dict` keyword or directly pass the variable via the `@` keyword. Supported logical operators include `>`, `<`, `>=`, `<=`, `==`, and `!=`." @@ -1929,6 +1971,7 @@ { "cell_type": "code", "execution_count": 24, + "id": "49485a4b", "metadata": {}, "outputs": [ { @@ -1986,6 +2029,7 @@ { "cell_type": "code", "execution_count": 25, + "id": "0f3a9116", "metadata": {}, "outputs": [ { @@ -2042,6 +2086,7 @@ }, { "cell_type": "markdown", + "id": "c355af07", "metadata": {}, "source": [ "Using the `isin` method for filtering." @@ -2050,6 +2095,7 @@ { "cell_type": "code", "execution_count": 26, + "id": "f44a5a57", "metadata": {}, "outputs": [ { @@ -2112,6 +2158,7 @@ }, { "cell_type": "markdown", + "id": "79a50beb", "metadata": {}, "source": [ "## MultiIndex" @@ -2119,6 +2166,7 @@ }, { "cell_type": "markdown", + "id": "14e70234", "metadata": {}, "source": [ "cuDF supports hierarchical indexing of DataFrames using MultiIndex. Grouping hierarchically (see `Grouping` below) automatically produces a DataFrame with a MultiIndex." @@ -2127,6 +2175,7 @@ { "cell_type": "code", "execution_count": 27, + "id": "882973ed", "metadata": {}, "outputs": [ { @@ -2153,6 +2202,7 @@ }, { "cell_type": "markdown", + "id": "c10971cc", "metadata": {}, "source": [ "This index can back either axis of a DataFrame." @@ -2161,6 +2211,7 @@ { "cell_type": "code", "execution_count": 28, + "id": "5417aeb9", "metadata": {}, "outputs": [ { @@ -2238,6 +2289,7 @@ { "cell_type": "code", "execution_count": 29, + "id": "4d6fb4ff", "metadata": {}, "outputs": [ { @@ -2311,6 +2363,7 @@ }, { "cell_type": "markdown", + "id": "63dc11d8", "metadata": {}, "source": [ "Accessing values of a DataFrame with a MultiIndex. 
Note that slicing is not yet supported." @@ -2319,6 +2372,7 @@ { "cell_type": "code", "execution_count": 30, + "id": "3644920c", "metadata": {}, "outputs": [ { @@ -2340,6 +2394,7 @@ }, { "cell_type": "markdown", + "id": "697a9a36", "metadata": {}, "source": [ "Missing Data\n", @@ -2348,6 +2403,7 @@ }, { "cell_type": "markdown", + "id": "86655274", "metadata": {}, "source": [ "Missing data can be replaced by using the `fillna` method." @@ -2356,6 +2412,7 @@ { "cell_type": "code", "execution_count": 31, + "id": "28b06c52", "metadata": {}, "outputs": [ { @@ -2381,6 +2438,7 @@ { "cell_type": "code", "execution_count": 32, + "id": "7fb6a126", "metadata": {}, "outputs": [ { @@ -2405,6 +2463,7 @@ }, { "cell_type": "markdown", + "id": "7a0b732f", "metadata": {}, "source": [ "Operations\n", @@ -2413,6 +2472,7 @@ }, { "cell_type": "markdown", + "id": "1e8b0464", "metadata": {}, "source": [ "## Stats" @@ -2420,6 +2480,7 @@ }, { "cell_type": "markdown", + "id": "7523512b", "metadata": {}, "source": [ "Calculating descriptive statistics for a `Series`." @@ -2428,6 +2489,7 @@ { "cell_type": "code", "execution_count": 33, + "id": "f7cb604e", "metadata": {}, "outputs": [ { @@ -2448,6 +2510,7 @@ { "cell_type": "code", "execution_count": 34, + "id": "b8957a5f", "metadata": {}, "outputs": [ { @@ -2467,6 +2530,7 @@ }, { "cell_type": "markdown", + "id": "71fa928a", "metadata": {}, "source": [ "## Applymap" @@ -2474,6 +2538,7 @@ }, { "cell_type": "markdown", + "id": "d98d6f7b", "metadata": {}, "source": [ "Applying functions to a `Series`. Note that applying user defined functions directly with Dask-cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe." @@ -2482,6 +2547,7 @@ { "cell_type": "code", "execution_count": 35, + "id": "5e627811", "metadata": {}, "outputs": [ { @@ -2533,6 +2599,7 @@ { "cell_type": "code", "execution_count": 36, + "id": "96cf628e", "metadata": {}, "outputs": [ { @@ -2572,6 +2639,7 @@ }, { "cell_type": "markdown", + "id": "cd69c00a", "metadata": {}, "source": [ "## Histogramming" @@ -2579,6 +2647,7 @@ }, { "cell_type": "markdown", + "id": "39982866", "metadata": {}, "source": [ "Counting the number of occurrences of each unique value of variable." @@ -2587,6 +2656,7 @@ { "cell_type": "code", "execution_count": 37, + "id": "62808675", "metadata": {}, "outputs": [ { @@ -2627,6 +2697,7 @@ { "cell_type": "code", "execution_count": 38, + "id": "5b2a42ce", "metadata": {}, "outputs": [ { @@ -2666,6 +2737,7 @@ }, { "cell_type": "markdown", + "id": "2d7e62e4", "metadata": {}, "source": [ "## String Methods" @@ -2673,6 +2745,7 @@ }, { "cell_type": "markdown", + "id": "4e704eca", "metadata": {}, "source": [ "Like pandas, cuDF provides string processing methods in the `str` attribute of `Series`. Full documentation of string methods is a work in progress. Please see the cuDF API documentation for more information." @@ -2681,6 +2754,7 @@ { "cell_type": "code", "execution_count": 39, + "id": "c73e70bb", "metadata": {}, "outputs": [ { @@ -2711,6 +2785,7 @@ { "cell_type": "code", "execution_count": 40, + "id": "697c1c94", "metadata": {}, "outputs": [ { @@ -2740,6 +2815,7 @@ }, { "cell_type": "markdown", + "id": "dfc1371e", "metadata": {}, "source": [ "## Concat" @@ -2747,6 +2823,7 @@ }, { "cell_type": "markdown", + "id": "f6fb9b53", "metadata": {}, "source": [ "Concatenating `Series` and `DataFrames` row-wise." 
@@ -2755,6 +2832,7 @@ { "cell_type": "code", "execution_count": 41, + "id": "60538bbd", "metadata": {}, "outputs": [ { @@ -2786,6 +2864,7 @@ { "cell_type": "code", "execution_count": 42, + "id": "17953847", "metadata": {}, "outputs": [ { @@ -2816,6 +2895,7 @@ }, { "cell_type": "markdown", + "id": "27f0d621", "metadata": {}, "source": [ "## Join" @@ -2823,6 +2903,7 @@ }, { "cell_type": "markdown", + "id": "fd35f1a7", "metadata": {}, "source": [ "Performing SQL style merges. Note that the dataframe order is not maintained, but may be restored post-merge by sorting by the index." @@ -2831,6 +2912,7 @@ { "cell_type": "code", "execution_count": 43, + "id": "52ada00a", "metadata": {}, "outputs": [ { @@ -2924,6 +3006,7 @@ { "cell_type": "code", "execution_count": 44, + "id": "409fcf92", "metadata": {}, "outputs": [ { @@ -3011,6 +3094,7 @@ }, { "cell_type": "markdown", + "id": "d9dcb86b", "metadata": {}, "source": [ "## Append" @@ -3018,6 +3102,7 @@ }, { "cell_type": "markdown", + "id": "1f896819", "metadata": {}, "source": [ "Appending values from another `Series` or array-like object." @@ -3026,6 +3111,7 @@ { "cell_type": "code", "execution_count": 45, + "id": "9976c1ce", "metadata": {}, "outputs": [ { @@ -3064,6 +3150,7 @@ { "cell_type": "code", "execution_count": 46, + "id": "fe5c54ab", "metadata": {}, "outputs": [ { @@ -3093,6 +3180,7 @@ }, { "cell_type": "markdown", + "id": "9fa10ef3", "metadata": {}, "source": [ "## Grouping" @@ -3100,6 +3188,7 @@ }, { "cell_type": "markdown", + "id": "8a6e41f5", "metadata": {}, "source": [ "Like pandas, cuDF and Dask-cuDF support the Split-Apply-Combine groupby paradigm." @@ -3108,6 +3197,7 @@ { "cell_type": "code", "execution_count": 47, + "id": "2a8cafa7", "metadata": {}, "outputs": [], "source": [ @@ -3119,6 +3209,7 @@ }, { "cell_type": "markdown", + "id": "0179d60c", "metadata": {}, "source": [ "Grouping and then applying the `sum` function to the grouped data." @@ -3127,6 +3218,7 @@ { "cell_type": "code", "execution_count": 48, + "id": "7c56d186", "metadata": {}, "outputs": [ { @@ -3201,6 +3293,7 @@ { "cell_type": "code", "execution_count": 49, + "id": "f8823b30", "metadata": {}, "outputs": [ { @@ -3274,6 +3367,7 @@ }, { "cell_type": "markdown", + "id": "a84cb883", "metadata": {}, "source": [ "Grouping hierarchically then applying the `sum` function to grouped data." @@ -3282,6 +3376,7 @@ { "cell_type": "code", "execution_count": 50, + "id": "2184e3ad", "metadata": {}, "outputs": [ { @@ -3372,6 +3467,7 @@ { "cell_type": "code", "execution_count": 51, + "id": "4ec311c1", "metadata": {}, "outputs": [ { @@ -3461,6 +3557,7 @@ }, { "cell_type": "markdown", + "id": "dedfeb1b", "metadata": {}, "source": [ "Grouping and applying statistical functions to specific columns, using `agg`." @@ -3469,6 +3566,7 @@ { "cell_type": "code", "execution_count": 52, + "id": "2563d8b2", "metadata": {}, "outputs": [ { @@ -3539,6 +3637,7 @@ { "cell_type": "code", "execution_count": 53, + "id": "22c77e75", "metadata": {}, "outputs": [ { @@ -3608,6 +3707,7 @@ }, { "cell_type": "markdown", + "id": "6d074822", "metadata": {}, "source": [ "## Transpose" @@ -3615,6 +3715,7 @@ }, { "cell_type": "markdown", + "id": "16c0f0a8", "metadata": {}, "source": [ "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask-cuDF." 
@@ -3623,6 +3724,7 @@ { "cell_type": "code", "execution_count": 54, + "id": "e265861e", "metadata": {}, "outputs": [ { @@ -3690,6 +3792,7 @@ { "cell_type": "code", "execution_count": 55, + "id": "1fe9b972", "metadata": {}, "outputs": [ { @@ -3752,14 +3855,16 @@ }, { "cell_type": "markdown", + "id": "9ce02827", "metadata": {}, "source": [ "Time Series\n", - "------------\n" + "------------" ] }, { "cell_type": "markdown", + "id": "fec907ff", "metadata": {}, "source": [ "`DataFrames` supports `datetime` typed columns, which allow users to interact with and filter data based on specific timestamps." @@ -3768,6 +3873,7 @@ { "cell_type": "code", "execution_count": 56, + "id": "7a425d3f", "metadata": {}, "outputs": [ { @@ -3847,6 +3953,7 @@ { "cell_type": "code", "execution_count": 57, + "id": "87f0e56e", "metadata": {}, "outputs": [ { @@ -3919,6 +4026,7 @@ }, { "cell_type": "markdown", + "id": "0d0e541c", "metadata": {}, "source": [ "Categoricals\n", @@ -3927,6 +4035,7 @@ }, { "cell_type": "markdown", + "id": "a36f9543", "metadata": {}, "source": [ "`DataFrames` support categorical columns." @@ -3935,6 +4044,7 @@ { "cell_type": "code", "execution_count": 58, + "id": "05bd8be8", "metadata": {}, "outputs": [ { @@ -4021,6 +4131,7 @@ { "cell_type": "code", "execution_count": 59, + "id": "676b4963", "metadata": {}, "outputs": [ { @@ -4105,6 +4216,7 @@ }, { "cell_type": "markdown", + "id": "e24f2e7b", "metadata": {}, "source": [ "Accessing the categories of a column. Note that this is currently not supported in Dask-cuDF." @@ -4113,6 +4225,7 @@ { "cell_type": "code", "execution_count": 60, + "id": "06310c36", "metadata": {}, "outputs": [ { @@ -4132,6 +4245,7 @@ }, { "cell_type": "markdown", + "id": "4eb6f858", "metadata": {}, "source": [ "Accessing the underlying code values of each categorical observation." @@ -4140,6 +4254,7 @@ { "cell_type": "code", "execution_count": 61, + "id": "0f6db260", "metadata": {}, "outputs": [ { @@ -4166,6 +4281,7 @@ { "cell_type": "code", "execution_count": 62, + "id": "b87c4375", "metadata": {}, "outputs": [ { @@ -4191,6 +4307,7 @@ }, { "cell_type": "markdown", + "id": "3f816916", "metadata": {}, "source": [ "Converting Data Representation\n", @@ -4199,6 +4316,7 @@ }, { "cell_type": "markdown", + "id": "64a17f6d", "metadata": {}, "source": [ "## Pandas" @@ -4206,6 +4324,7 @@ }, { "cell_type": "markdown", + "id": "3acdcacc", "metadata": {}, "source": [ "Converting a cuDF and Dask-cuDF `DataFrame` to a pandas `DataFrame`." @@ -4214,6 +4333,7 @@ { "cell_type": "code", "execution_count": 63, + "id": "d1fed919", "metadata": {}, "outputs": [ { @@ -4310,6 +4430,7 @@ { "cell_type": "code", "execution_count": 64, + "id": "567c7363", "metadata": {}, "outputs": [ { @@ -4405,6 +4526,7 @@ }, { "cell_type": "markdown", + "id": "c2121453", "metadata": {}, "source": [ "## Numpy" @@ -4412,6 +4534,7 @@ }, { "cell_type": "markdown", + "id": "a9faa2c5", "metadata": {}, "source": [ "Converting a cuDF or Dask-cuDF `DataFrame` to a numpy `ndarray`." @@ -4420,6 +4543,7 @@ { "cell_type": "code", "execution_count": 65, + "id": "5490d226", "metadata": {}, "outputs": [ { @@ -4459,6 +4583,7 @@ { "cell_type": "code", "execution_count": 66, + "id": "b77ac8ae", "metadata": {}, "outputs": [ { @@ -4497,6 +4622,7 @@ }, { "cell_type": "markdown", + "id": "1d24d30f", "metadata": {}, "source": [ "Converting a cuDF or Dask-cuDF `Series` to a numpy `ndarray`." 
@@ -4505,6 +4631,7 @@ { "cell_type": "code", "execution_count": 67, + "id": "f71a0ba3", "metadata": {}, "outputs": [ { @@ -4526,6 +4653,7 @@ { "cell_type": "code", "execution_count": 68, + "id": "a45a74b5", "metadata": {}, "outputs": [ { @@ -4546,6 +4674,7 @@ }, { "cell_type": "markdown", + "id": "0d78a4d2", "metadata": {}, "source": [ "## Arrow" @@ -4553,6 +4682,7 @@ }, { "cell_type": "markdown", + "id": "7e35b829", "metadata": {}, "source": [ "Converting a cuDF or Dask-cuDF `DataFrame` to a PyArrow `Table`." @@ -4561,6 +4691,7 @@ { "cell_type": "code", "execution_count": 69, + "id": "bb9e9a2a", "metadata": {}, "outputs": [ { @@ -4592,6 +4723,7 @@ { "cell_type": "code", "execution_count": 70, + "id": "4d020de7", "metadata": {}, "outputs": [ { @@ -4622,14 +4754,16 @@ }, { "cell_type": "markdown", + "id": "ace7b4f9", "metadata": {}, "source": [ "Getting Data In/Out\n", - "------------------------\n" + "------------------------" ] }, { "cell_type": "markdown", + "id": "161abb12", "metadata": {}, "source": [ "## CSV" @@ -4637,6 +4771,7 @@ }, { "cell_type": "markdown", + "id": "7e5dc381", "metadata": {}, "source": [ "Writing to a CSV file." @@ -4645,6 +4780,7 @@ { "cell_type": "code", "execution_count": 71, + "id": "3a59715f", "metadata": {}, "outputs": [], "source": [ @@ -4657,6 +4793,7 @@ { "cell_type": "code", "execution_count": 72, + "id": "4ebe98ed", "metadata": {}, "outputs": [], "source": [ @@ -4665,6 +4802,7 @@ }, { "cell_type": "markdown", + "id": "0479fc4f", "metadata": {}, "source": [ "Reading from a csv file." @@ -4673,6 +4811,7 @@ { "cell_type": "code", "execution_count": 73, + "id": "1a70e831", "metadata": {}, "outputs": [ { @@ -4905,6 +5044,7 @@ { "cell_type": "code", "execution_count": 74, + "id": "4c3d9ca3", "metadata": {}, "outputs": [ { @@ -5136,6 +5276,7 @@ }, { "cell_type": "markdown", + "id": "3d739c6e", "metadata": {}, "source": [ "Reading all CSV files in a directory into a single `dask_cudf.DataFrame`, using the star wildcard." @@ -5144,6 +5285,7 @@ { "cell_type": "code", "execution_count": 75, + "id": "cb7187d2", "metadata": {}, "outputs": [ { @@ -5555,6 +5697,7 @@ }, { "cell_type": "markdown", + "id": "c0939a1e", "metadata": {}, "source": [ "## Parquet" @@ -5562,6 +5705,7 @@ }, { "cell_type": "markdown", + "id": "14e6a634", "metadata": {}, "source": [ "Writing to parquet files, using the CPU via PyArrow." @@ -5570,6 +5714,7 @@ { "cell_type": "code", "execution_count": 76, + "id": "1812346f", "metadata": {}, "outputs": [], "source": [ @@ -5578,6 +5723,7 @@ }, { "cell_type": "markdown", + "id": "093cd0fe", "metadata": {}, "source": [ "Reading parquet files with a GPU-accelerated parquet reader." @@ -5586,6 +5732,7 @@ { "cell_type": "code", "execution_count": 77, + "id": "2354b20b", "metadata": {}, "outputs": [ { @@ -5817,6 +5964,7 @@ }, { "cell_type": "markdown", + "id": "132c3ff2", "metadata": {}, "source": [ "Writing to parquet files from a `dask_cudf.DataFrame` using PyArrow under the hood." @@ -5825,6 +5973,7 @@ { "cell_type": "code", "execution_count": 78, + "id": "c5d7686c", "metadata": {}, "outputs": [ { @@ -5844,6 +5993,7 @@ }, { "cell_type": "markdown", + "id": "0d73d1dd", "metadata": {}, "source": [ "## ORC" @@ -5851,6 +6001,7 @@ }, { "cell_type": "markdown", + "id": "61b5f466", "metadata": {}, "source": [ "Reading ORC files." 
@@ -5858,16 +6009,17 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 79, + "id": "93364ff3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'/home/mmccarty/sandbox/rapids/cudf/python/cudf/cudf/tests/data/orc/TestOrcFile.test1.orc'" + "'/home/ashwin/workspace/rapids/cudf/python/cudf/cudf/tests/data/orc/TestOrcFile.test1.orc'" ] }, - "execution_count": 80, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } @@ -5883,7 +6035,8 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 80, + "id": "2b6785c7", "metadata": {}, "outputs": [ { @@ -5974,7 +6127,7 @@ "1 [{'key': 'chani', 'value': {'int1': 5, 'string... " ] }, - "execution_count": 81, + "execution_count": 80, "metadata": {}, "output_type": "execute_result" } @@ -5986,6 +6139,7 @@ }, { "cell_type": "markdown", + "id": "238ce6a4", "metadata": {}, "source": [ "Dask Performance Tips\n", @@ -6000,6 +6154,7 @@ }, { "cell_type": "markdown", + "id": "3de9aeca", "metadata": {}, "source": [ "First, we set up a GPU cluster. With our `client` set up, Dask-cuDF computation will be distributed across the GPUs in the cluster." @@ -6007,17 +6162,16 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 81, + "id": "e4852d48", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2022-04-21 10:11:07,360 - distributed.diskutils - INFO - Found stale lock file and directory '/home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-ghcx5g0e', purging\n", - "2022-04-21 10:11:07,360 - distributed.diskutils - INFO - Found stale lock file and directory '/home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-wh16f0h3', purging\n", - "2022-04-21 10:11:07,360 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n", - "2022-04-21 10:11:07,388 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n" + "2022-04-21 13:26:06,860 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n", + "2022-04-21 13:26:06,904 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize\n" ] }, { @@ -6027,7 +6181,7 @@ "
\n", "
\n", "

Client

\n", - "

Client-e3492c89-c17c-11ec-813e-fc3497a62adc

\n", + "

Client-20d00fd5-c198-11ec-906c-c8d9d2247354

\n", "
\n", "\n", " \n", @@ -6056,7 +6210,7 @@ " \n", "
\n", "

LocalCUDACluster

\n", - "

db2501e1

\n", + "

47648c26

\n", "
\n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", "
\n", @@ -6093,11 +6247,11 @@ "
\n", "
\n", "

Scheduler

\n", - "

Scheduler-6f476508-e52f-49e9-8f1f-6a8641e177bd

\n", + "

Scheduler-f28bff16-cb70-452c-b8af-b9299a8d7b20

\n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", "
\n", - " Comm: tcp://127.0.0.1:39755\n", + " Comm: tcp://127.0.0.1:33995\n", " \n", " Workers: 2\n", @@ -6139,7 +6293,7 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -6193,7 +6347,7 @@ "
\n", - " Comm: tcp://127.0.0.1:33491\n", + " Comm: tcp://127.0.0.1:40479\n", " \n", " Total threads: 1\n", @@ -6147,7 +6301,7 @@ "
\n", - " Dashboard: http://127.0.0.1:34333/status\n", + " Dashboard: http://127.0.0.1:38985/status\n", " \n", " Memory: 62.82 GiB\n", @@ -6155,13 +6309,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:43093\n", + " Nanny: tcp://127.0.0.1:33447\n", "
\n", - " Local directory: /home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-jsuvfju4\n", + " Local directory: /home/ashwin/workspace/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-be7zg92w\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", @@ -6251,10 +6405,10 @@ "" ], "text/plain": [ - "" + "" ] }, - "execution_count": 82, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } @@ -6272,6 +6426,7 @@ }, { "cell_type": "markdown", + "id": "181e4d10", "metadata": {}, "source": [ "### Persisting Data\n", @@ -6280,7 +6435,8 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 82, + "id": "d47a1142", "metadata": {}, "outputs": [ { @@ -6356,7 +6512,7 @@ "" ] }, - "execution_count": 83, + "execution_count": 82, "metadata": {}, "output_type": "execute_result" } @@ -6372,45 +6528,37 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 83, + "id": "c3cb612a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Thu Apr 21 10:11:07 2022 \n", - "+-----------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\n", - "|-------------------------------+----------------------+----------------------+\n", - "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", - "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", - "| | | MIG M. |\n", - "|===============================+======================+======================|\n", - "| 0 NVIDIA RTX A6000 On | 00000000:01:00.0 On | Off |\n", - "| 30% 48C P2 83W / 300W | 2970MiB / 48651MiB | 7% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - "| 1 NVIDIA RTX A6000 On | 00000000:02:00.0 Off | Off |\n", - "| 30% 36C P2 25W / 300W | 265MiB / 48685MiB | 5% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - " \n", - "+-----------------------------------------------------------------------------+\n", - "| Processes: |\n", - "| GPU GI CI PID Type Process name GPU Memory |\n", - "| ID ID Usage |\n", - "|=============================================================================|\n", - "| 0 N/A N/A 2292 G /usr/lib/xorg/Xorg 871MiB |\n", - "| 0 N/A N/A 2441 G /usr/bin/gnome-shell 316MiB |\n", - "| 0 N/A N/A 1240494 G ...AAAAAAAAA= --shared-files 68MiB |\n", - "| 0 N/A N/A 1240525 G ...RendererForSitePerProcess 41MiB |\n", - "| 0 N/A N/A 1243689 C .../envs/cudf_dev/bin/python 593MiB |\n", - "| 0 N/A N/A 1245502 C .../envs/cudf_dev/bin/python 753MiB |\n", - "| 0 N/A N/A 1245751 C .../envs/cudf_dev/bin/python 257MiB |\n", - "| 1 N/A N/A 2292 G /usr/lib/xorg/Xorg 4MiB |\n", - "| 1 N/A N/A 1245748 C .../envs/cudf_dev/bin/python 257MiB |\n", - "+-----------------------------------------------------------------------------+\n" + "Thu Apr 21 13:26:07 2022 \r\n", + "+-----------------------------------------------------------------------------+\r\n", + "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\r\n", + "|-------------------------------+----------------------+----------------------+\r\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n", + "| | | MIG M. 
|\r\n", + "|===============================+======================+======================|\r\n", + "| 0 Quadro GV100 Off | 00000000:15:00.0 Off | Off |\r\n", + "| 39% 52C P2 51W / 250W | 1115MiB / 32508MiB | 0% Default |\r\n", + "| | | N/A |\r\n", + "+-------------------------------+----------------------+----------------------+\r\n", + "| 1 Quadro GV100 Off | 00000000:2D:00.0 Off | Off |\r\n", + "| 43% 57C P2 52W / 250W | 306MiB / 32498MiB | 0% Default |\r\n", + "| | | N/A |\r\n", + "+-------------------------------+----------------------+----------------------+\r\n", + " \r\n", + "+-----------------------------------------------------------------------------+\r\n", + "| Processes: |\r\n", + "| GPU GI CI PID Type Process name GPU Memory |\r\n", + "| ID ID Usage |\r\n", + "|=============================================================================|\r\n", + "+-----------------------------------------------------------------------------+\r\n" ] } ], @@ -6420,6 +6568,7 @@ }, { "cell_type": "markdown", + "id": "b98810c4", "metadata": {}, "source": [ "Because Dask is lazy, the computation has not yet occurred. We can see that there are twenty tasks in the task graph and we've used about 800 MB of memory. We can force computation by using `persist`. By forcing execution, the result is now explicitly in memory and our task graph only contains one task per partition (the baseline)." @@ -6427,7 +6576,8 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 84, + "id": "a929577c", "metadata": {}, "outputs": [ { @@ -6503,7 +6653,7 @@ "" ] }, - "execution_count": 85, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } @@ -6515,45 +6665,37 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 85, + "id": "8aa7c079", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Thu Apr 21 10:11:08 2022 \n", - "+-----------------------------------------------------------------------------+\n", - "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\n", - "|-------------------------------+----------------------+----------------------+\n", - "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", - "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", - "| | | MIG M. 
|\n", - "|===============================+======================+======================|\n", - "| 0 NVIDIA RTX A6000 On | 00000000:01:00.0 On | Off |\n", - "| 30% 48C P2 84W / 300W | 2970MiB / 48651MiB | 3% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - "| 1 NVIDIA RTX A6000 On | 00000000:02:00.0 Off | Off |\n", - "| 30% 36C P2 37W / 300W | 265MiB / 48685MiB | 0% Default |\n", - "| | | N/A |\n", - "+-------------------------------+----------------------+----------------------+\n", - " \n", - "+-----------------------------------------------------------------------------+\n", - "| Processes: |\n", - "| GPU GI CI PID Type Process name GPU Memory |\n", - "| ID ID Usage |\n", - "|=============================================================================|\n", - "| 0 N/A N/A 2292 G /usr/lib/xorg/Xorg 871MiB |\n", - "| 0 N/A N/A 2441 G /usr/bin/gnome-shell 316MiB |\n", - "| 0 N/A N/A 1240494 G ...AAAAAAAAA= --shared-files 68MiB |\n", - "| 0 N/A N/A 1240525 G ...RendererForSitePerProcess 41MiB |\n", - "| 0 N/A N/A 1243689 C .../envs/cudf_dev/bin/python 593MiB |\n", - "| 0 N/A N/A 1245502 C .../envs/cudf_dev/bin/python 753MiB |\n", - "| 0 N/A N/A 1245751 C .../envs/cudf_dev/bin/python 257MiB |\n", - "| 1 N/A N/A 2292 G /usr/lib/xorg/Xorg 4MiB |\n", - "| 1 N/A N/A 1245748 C .../envs/cudf_dev/bin/python 257MiB |\n", - "+-----------------------------------------------------------------------------+\n" + "Thu Apr 21 13:26:08 2022 \r\n", + "+-----------------------------------------------------------------------------+\r\n", + "| NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 |\r\n", + "|-------------------------------+----------------------+----------------------+\r\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n", + "| | | MIG M. |\r\n", + "|===============================+======================+======================|\r\n", + "| 0 Quadro GV100 Off | 00000000:15:00.0 Off | Off |\r\n", + "| 39% 52C P2 52W / 250W | 1115MiB / 32508MiB | 3% Default |\r\n", + "| | | N/A |\r\n", + "+-------------------------------+----------------------+----------------------+\r\n", + "| 1 Quadro GV100 Off | 00000000:2D:00.0 Off | Off |\r\n", + "| 43% 57C P2 51W / 250W | 306MiB / 32498MiB | 0% Default |\r\n", + "| | | N/A |\r\n", + "+-------------------------------+----------------------+----------------------+\r\n", + " \r\n", + "+-----------------------------------------------------------------------------+\r\n", + "| Processes: |\r\n", + "| GPU GI CI PID Type Process name GPU Memory |\r\n", + "| ID ID Usage |\r\n", + "|=============================================================================|\r\n", + "+-----------------------------------------------------------------------------+\r\n" ] } ], @@ -6563,6 +6705,7 @@ }, { "cell_type": "markdown", + "id": "ff9e14b6", "metadata": {}, "source": [ "Because we forced computation, we now have a larger object in distributed GPU memory." 
@@ -6570,6 +6713,7 @@ }, { "cell_type": "markdown", + "id": "bb3b3dee", "metadata": {}, "source": [ "### Wait\n", @@ -6580,7 +6724,8 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 86, + "id": "ef71bf00", "metadata": {}, "outputs": [], "source": [ @@ -6598,6 +6743,7 @@ }, { "cell_type": "markdown", + "id": "e1099ec0", "metadata": {}, "source": [ "This function will do a basic transformation of every column in the dataframe, but the time spent in the function will vary due to the `time.sleep` statement randomly adding 1-60 seconds of time. We'll run this on every partition of our dataframe using `map_partitions`, which adds the task to our task-graph, and store the result. We can then call `persist` to force execution." @@ -6605,7 +6751,8 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 87, + "id": "700dd799", "metadata": {}, "outputs": [], "source": [ @@ -6615,6 +6762,7 @@ }, { "cell_type": "markdown", + "id": "5eb83a7e", "metadata": {}, "source": [ "However, some partitions will be done **much** sooner than others. If we had downstream processes that should wait for all partitions to be completed, we can enforce that behavior using `wait`." @@ -6622,16 +6770,17 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 88, + "id": "73bccf94", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "DoneAndNotDoneFutures(done={, , , , }, not_done=set())" + "DoneAndNotDoneFutures(done={, , , , }, not_done=set())" ] }, - "execution_count": 89, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" } @@ -6642,21 +6791,22 @@ }, { "cell_type": "markdown", + "id": "447301f5", "metadata": {}, "source": [ - "## With `wait`, we can safely proceed on in our workflow." + "With `wait`, we can safely proceed on in our workflow." ] }, { "cell_type": "code", "execution_count": null, + "id": "7e06fcf4", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { - "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -6673,21 +6823,8 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/docs/cudf/source/user_guide/PandasCompat.md b/docs/cudf/source/user_guide/PandasCompat.md new file mode 100644 index 00000000000..a33a354e2f8 --- /dev/null +++ b/docs/cudf/source/user_guide/PandasCompat.md @@ -0,0 +1,5 @@ +# Pandas Compatibility Notes + +```{eval-rst} +.. 
pandas-compat-list:: +``` diff --git a/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb b/docs/cudf/source/user_guide/cupy-interop.ipynb similarity index 87% rename from docs/cudf/source/user_guide/10min-cudf-cupy.ipynb rename to docs/cudf/source/user_guide/cupy-interop.ipynb index 35ca21f380e..9fbac3b2578 100644 --- a/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb +++ b/docs/cudf/source/user_guide/cupy-interop.ipynb @@ -2,9 +2,10 @@ "cells": [ { "cell_type": "markdown", + "id": "8e5e6878", "metadata": {}, "source": [ - "# 10 Minutes to cuDF and CuPy\n", + "# Interoperability between cuDF and CuPy\n", "\n", "This notebook provides introductory examples of how you can use cuDF and CuPy together to take advantage of CuPy array functionality (such as advanced linear algebra operations)." ] @@ -12,6 +13,7 @@ { "cell_type": "code", "execution_count": 1, + "id": "8b2d45c3", "metadata": {}, "outputs": [], "source": [ @@ -29,6 +31,7 @@ }, { "cell_type": "markdown", + "id": "e7e64b1a", "metadata": {}, "source": [ "### Converting a cuDF DataFrame to a CuPy Array\n", @@ -45,15 +48,16 @@ { "cell_type": "code", "execution_count": 2, + "id": "45c482ab", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "183 µs ± 1.15 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n", - "553 µs ± 6.25 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n", - "546 µs ± 2.25 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" + "118 µs ± 77.2 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", + "360 µs ± 6.04 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n", + "355 µs ± 722 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], @@ -72,6 +76,7 @@ { "cell_type": "code", "execution_count": 3, + "id": "a565effc", "metadata": {}, "outputs": [ { @@ -98,6 +103,7 @@ }, { "cell_type": "markdown", + "id": "0759ab29", "metadata": {}, "source": [ "### Converting a cuDF Series to a CuPy Array" @@ -105,27 +111,29 @@ }, { "cell_type": "markdown", + "id": "4f35ffbd", "metadata": {}, "source": [ "There are also multiple ways to convert a cuDF Series to a CuPy array:\n", "\n", "1. We can pass the Series to `cupy.asarray` as cuDF Series exposes [`__cuda_array_interface__`](https://docs-cupy.chainer.org/en/stable/reference/interoperability.html).\n", "2. We can leverage the dlpack interface `to_dlpack()`. \n", - "3. We can also use `Series.values` \n" + "3. We can also use `Series.values`" ] }, { "cell_type": "code", "execution_count": 4, + "id": "8f97f304", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "76.8 µs ± 636 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n", - "198 µs ± 2.72 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n", - "181 µs ± 1.1 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" + "54.4 µs ± 66 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", + "125 µs ± 1.21 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", + "119 µs ± 805 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" ] } ], @@ -140,6 +148,7 @@ { "cell_type": "code", "execution_count": 5, + "id": "f96d5676", "metadata": {}, "outputs": [ { @@ -160,6 +169,7 @@ }, { "cell_type": "markdown", + "id": "c36e5b88", "metadata": {}, "source": [ "From here, we can proceed with normal CuPy workflows, such as reshaping the array, getting the diagonal, or calculating the norm." 
@@ -168,6 +178,7 @@ { "cell_type": "code", "execution_count": 6, + "id": "2a7ae43f", "metadata": {}, "outputs": [ { @@ -195,6 +206,7 @@ { "cell_type": "code", "execution_count": 7, + "id": "b442a30c", "metadata": {}, "outputs": [ { @@ -219,6 +231,7 @@ { "cell_type": "code", "execution_count": 8, + "id": "be7f4d32", "metadata": {}, "outputs": [ { @@ -238,6 +251,7 @@ }, { "cell_type": "markdown", + "id": "b353bded", "metadata": {}, "source": [ "### Converting a CuPy Array to a cuDF DataFrame\n", @@ -256,13 +270,14 @@ { "cell_type": "code", "execution_count": 9, + "id": "8887b253", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "23.9 ms ± 119 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "14.3 ms ± 33.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], @@ -273,6 +288,7 @@ { "cell_type": "code", "execution_count": 10, + "id": "08ec4ffa", "metadata": {}, "outputs": [ { @@ -475,6 +491,7 @@ }, { "cell_type": "markdown", + "id": "6804d291", "metadata": {}, "source": [ "We can check whether our array is Fortran contiguous by using cupy.isfortran or looking at the [flags](https://docs-cupy.chainer.org/en/stable/reference/generated/cupy.ndarray.html#cupy.ndarray.flags) of the array." @@ -483,6 +500,7 @@ { "cell_type": "code", "execution_count": 11, + "id": "65b8bd0d", "metadata": {}, "outputs": [ { @@ -502,6 +520,7 @@ }, { "cell_type": "markdown", + "id": "151982ad", "metadata": {}, "source": [ "In this case, we'll need to convert it before going to a cuDF DataFrame. In the next two cells, we create the DataFrame by leveraging dlpack and the CUDA array interface, respectively." @@ -510,13 +529,14 @@ { "cell_type": "code", "execution_count": 12, + "id": "27b2f563", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "9.15 ms ± 131 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "6.57 ms ± 9.08 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], @@ -530,13 +550,14 @@ { "cell_type": "code", "execution_count": 13, + "id": "0a0cc290", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "5.74 ms ± 29.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "4.48 ms ± 7.89 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], @@ -550,6 +571,7 @@ { "cell_type": "code", "execution_count": 14, + "id": "0d2c5beb", "metadata": {}, "outputs": [ { @@ -753,6 +775,7 @@ }, { "cell_type": "markdown", + "id": "395e2bba", "metadata": {}, "source": [ "### Converting a CuPy Array to a cuDF Series\n", @@ -763,6 +786,7 @@ { "cell_type": "code", "execution_count": 15, + "id": "d8518208", "metadata": {}, "outputs": [ { @@ -787,6 +811,7 @@ }, { "cell_type": "markdown", + "id": "7e159619", "metadata": {}, "source": [ "### Interweaving CuDF and CuPy for Smooth PyData Workflows\n", @@ -799,6 +824,7 @@ { "cell_type": "code", "execution_count": 16, + "id": "2bb8ed81", "metadata": {}, "outputs": [ { @@ -1000,6 +1026,7 @@ }, { "cell_type": "markdown", + "id": "2f3d4e78", "metadata": {}, "source": [ "We can just transform it into a CuPy array and use the `axis` argument of `sum`." @@ -1008,6 +1035,7 @@ { "cell_type": "code", "execution_count": 17, + "id": "2dde030d", "metadata": {}, "outputs": [ { @@ -1035,6 +1063,7 @@ }, { "cell_type": "markdown", + "id": "4450dcc3", "metadata": {}, "source": [ "With just that single line, we're able to seamlessly move between data structures in this ecosystem, giving us enormous flexibility without sacrificing speed." 
@@ -1042,6 +1071,7 @@ }, { "cell_type": "markdown", + "id": "61bfb868", "metadata": {}, "source": [ "### Converting a cuDF DataFrame to a CuPy Sparse Matrix\n", @@ -1054,6 +1084,7 @@ { "cell_type": "code", "execution_count": 18, + "id": "e531fd15", "metadata": {}, "outputs": [], "source": [ @@ -1072,6 +1103,7 @@ }, { "cell_type": "markdown", + "id": "3f5e6ade", "metadata": {}, "source": [ "We can define a sparsely populated DataFrame to illustrate this conversion to either sparse matrix format." @@ -1080,6 +1112,7 @@ { "cell_type": "code", "execution_count": 19, + "id": "58c7e074", "metadata": {}, "outputs": [], "source": [ @@ -1095,6 +1128,7 @@ { "cell_type": "code", "execution_count": 20, + "id": "9265228d", "metadata": {}, "outputs": [ { @@ -1143,115 +1177,115 @@ " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1261,19 +1295,19 @@ "" ], "text/plain": [ - " a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 \\\n", - "0 0.000000 0.0 0.0 0.000000 0.0 9.37476 0.000000 0.0 0.0 0.000000 \n", - "1 0.000000 0.0 0.0 0.000000 0.0 0.00000 0.000000 0.0 0.0 0.000000 \n", - "2 3.232751 0.0 0.0 0.000000 0.0 0.00000 8.341915 0.0 0.0 0.000000 \n", - "3 0.000000 0.0 0.0 0.000000 0.0 0.00000 0.000000 0.0 0.0 0.000000 \n", - "4 0.000000 0.0 0.0 7.743024 0.0 0.00000 0.000000 0.0 0.0 5.987098 \n", + " a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 \\\n", + "0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 -5.241297 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 0.0 0.0 2.526274 0.0 0.0 0.000000 0.0 0.0 0.0 \n", "\n", - " a10 a11 a12 a13 a14 a15 a16 a17 a18 a19 \n", - "0 6.237859 0.0 0.0 0.000000 0.0 0.0 0.00000 0.0 0.0 0.000000 \n", - "1 0.000000 0.0 0.0 0.065878 0.0 0.0 12.35705 0.0 0.0 0.000000 \n", - "2 0.000000 0.0 0.0 0.000000 0.0 0.0 0.00000 0.0 0.0 3.110362 \n", - "3 0.000000 0.0 0.0 0.000000 0.0 0.0 0.00000 0.0 0.0 0.000000 \n", - "4 0.000000 0.0 0.0 0.000000 0.0 0.0 0.00000 0.0 0.0 0.000000 " + " a13 a14 a15 a16 a17 a18 a19 \n", + "0 0.00000 0.000000 0.0 0.0 0.0 0.0 11.308953 \n", + "1 17.58476 0.000000 0.0 0.0 0.0 0.0 0.000000 \n", + "2 0.00000 0.000000 0.0 0.0 0.0 0.0 0.000000 \n", + "3 0.00000 10.869279 0.0 0.0 0.0 0.0 0.000000 \n", + "4 0.00000 0.000000 0.0 0.0 0.0 0.0 0.000000 " ] }, "execution_count": 20, @@ -1288,63 +1322,64 @@ { "cell_type": "code", 
"execution_count": 21, + "id": "5ba1a551", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " (2, 0)\t3.2327506467190874\n", - " (259, 0)\t10.723428115951062\n", - " (643, 0)\t0.47763624588488707\n", - " (899, 0)\t8.857065309921685\n", - " (516, 0)\t8.792407143276648\n", - " (262, 0)\t2.1900894573805396\n", - " (390, 0)\t5.007630701229646\n", - " (646, 0)\t6.630703075588639\n", - " (392, 0)\t5.573713453854357\n", - " (776, 0)\t10.501281989515688\n", - " (904, 0)\t8.261890175181366\n", - " (1033, 0)\t-0.41106824704220446\n", - " (522, 0)\t12.619952511457068\n", - " (139, 0)\t12.753348070606792\n", - " (141, 0)\t4.936902335394504\n", - " (270, 0)\t-1.7695949916946174\n", - " (782, 0)\t4.378746787324408\n", - " (15, 0)\t8.554141682891935\n", - " (527, 0)\t5.1994882136423\n", - " (912, 0)\t2.6101212854793125\n", - " (401, 0)\t5.614628764689268\n", - " (403, 0)\t9.999468341523317\n", - " (787, 0)\t7.6170790481600985\n", - " (404, 0)\t5.105328903336744\n", - " (916, 0)\t1.395526391114967\n", + " (770, 0)\t-1.373354548007899\n", + " (771, 0)\t11.641890592020793\n", + " (644, 0)\t-1.4820515981598015\n", + " (773, 0)\t4.374245789758399\n", + " (646, 0)\t4.58071340724814\n", + " (776, 0)\t5.115792716318899\n", + " (649, 0)\t8.676941295251092\n", + " (522, 0)\t-0.11573951593420229\n", + " (396, 0)\t8.124303607236273\n", + " (652, 0)\t9.359339954077681\n", + " (141, 0)\t8.50710863345112\n", + " (272, 0)\t7.440244879175392\n", + " (1042, 0)\t4.286859524587998\n", + " (275, 0)\t-0.6091666840632348\n", + " (787, 0)\t10.124449357828695\n", + " (915, 0)\t11.391560911074649\n", + " (1043, 0)\t11.478396096078907\n", + " (408, 0)\t11.204049991287349\n", + " (536, 0)\t13.239689100708974\n", + " (26, 0)\t4.951917355877771\n", + " (794, 0)\t2.736556006961319\n", + " (539, 0)\t12.553519350929216\n", + " (412, 0)\t2.8682583361020786\n", + " (540, 0)\t-1.2121388231076713\n", + " (796, 0)\t6.986443354019786\n", " :\t:\n", - " (9328, 19)\t5.938629381103238\n", - " (9457, 19)\t4.463547879031807\n", - " (9458, 19)\t-0.8034946631917106\n", - " (8051, 19)\t-1.904327616912268\n", - " (8819, 19)\t8.314944347687199\n", - " (7543, 19)\t1.4303204025224376\n", - " (8824, 19)\t5.1559713157589\n", - " (7673, 19)\t7.478681299798863\n", - " (7802, 19)\t0.502526238006068\n", - " (8186, 19)\t-3.824944685072472\n", - " (8570, 19)\t8.442324394481236\n", - " (8571, 19)\t6.204199957873215\n", - " (7420, 19)\t0.297737356585836\n", - " (9212, 19)\t3.934797966994188\n", - " (7421, 19)\t14.26161925450462\n", - " (8574, 19)\t5.826108027573207\n", - " (9214, 19)\t7.209975861932724\n", - " (9825, 19)\t11.155342644729613\n", - " (9702, 19)\t3.55144040779287\n", - " (9578, 19)\t12.638681362546228\n", - " (9712, 19)\t2.3542852760656348\n", - " (9969, 19)\t-2.645175092587592\n", - " (9973, 19)\t-2.2666402312025213\n", - " (9851, 19)\t-4.293381721466055\n", - " (9596, 19)\t6.6580506888430415\n" + " (9087, 19)\t-2.9543770156500395\n", + " (9440, 19)\t3.903613949374532\n", + " (9186, 19)\t0.3141028170017329\n", + " (9571, 19)\t1.7347840594688502\n", + " (9188, 19)\t14.68745562157488\n", + " (9316, 19)\t13.808308442016436\n", + " (9957, 19)\t9.705810918221086\n", + " (9318, 19)\t9.984168186940485\n", + " (9446, 19)\t5.173000114288142\n", + " (9830, 19)\t3.2442816093793607\n", + " (9835, 19)\t5.713078257113576\n", + " (9580, 19)\t5.373437384911853\n", + " (9326, 19)\t10.736403419943093\n", + " (9711, 19)\t-4.003216472911014\n", + " (9200, 19)\t5.560182026578174\n", + " (9844, 19)\t6.17251145210342\n", + " (9333, 
19)\t7.085353006324948\n", + " (9208, 19)\t6.789030498520347\n", + " (9464, 19)\t4.314887636528589\n", + " (9720, 19)\t12.446300974563027\n", + " (9594, 19)\t4.317523130615451\n", + " (9722, 19)\t-2.3257161477576336\n", + " (9723, 19)\t1.9288133227037407\n", + " (9469, 19)\t0.268312217498608\n", + " (9599, 19)\t4.100996763787237\n" ] } ], @@ -1355,6 +1390,7 @@ }, { "cell_type": "markdown", + "id": "e8e58cd5", "metadata": {}, "source": [ "From here, we could continue our workflow with a CuPy sparse matrix.\n", @@ -1379,9 +1415,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.8.13" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/docs/cudf/source/user_guide/dask-cudf.md b/docs/cudf/source/user_guide/dask-cudf.md new file mode 100644 index 00000000000..0c0b37f641c --- /dev/null +++ b/docs/cudf/source/user_guide/dask-cudf.md @@ -0,0 +1,104 @@ +# Multi-GPU with Dask-cuDF + +cuDF is a single-GPU library. For Multi-GPU cuDF solutions we use +[Dask](https://dask.org/) and the [dask-cudf +package](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf), +which is able to scale cuDF across multiple GPUs on a single machine, +or multiple GPUs across many machines in a cluster. + +[Dask DataFrame](http://docs.dask.org/en/latest/dataframe.html) was +originally designed to scale Pandas, orchestrating many Pandas +DataFrames spread across many CPUs into a cohesive parallel DataFrame. +Because cuDF currently implements only a subset of the Pandas API, not +all Dask DataFrame operations work with cuDF. + +The following is tested and expected to work: + +## What works + +- Data ingestion + + - `dask_cudf.read_csv` + - Use standard Dask ingestion with Pandas, then convert to cuDF (For + Parquet and other formats this is often decently fast) + +- Linear operations + + - Element-wise operations: `df.x + df.y`, `df ** 2` + - Assignment: `df['z'] = df.x + df.y` + - Row-wise selections: `df[df.x > 0]` + - Loc: `df.loc['2001-01-01': '2005-02-02']` + - Date time/string accessors: `df.timestamp.dt.dayofweek` + - ... and most similar operations in this category that are already + implemented in cuDF + +- Reductions + + - Like `sum`, `mean`, `max`, `count`, and so on on + `Series` objects + - Support for reductions on full dataframes + - `std` + - Custom reductions with + [dask.dataframe.reduction](https://docs.dask.org/en/latest/generated/dask.dataframe.Series.reduction.html) + +- Groupby aggregations + + - On single columns: `df.groupby('x').y.max()` + - With custom aggregations: + - groupby standard deviation + - grouping on multiple columns + - groupby agg for multiple outputs + +- Joins: + + - On full unsorted columns: `left.merge(right, on='id')` + (expensive) + - On sorted indexes: + `left.merge(right, left_index=True, right_index=True)` (fast) + - On large and small dataframes: `left.merge(cudf_df, on='id')` + (fast) + +- Rolling operations + +- Converting to and from other forms + + - Dask + Pandas to Dask + cuDF + `df.map_partitions(cudf.from_pandas)` + - Dask + cuDF to Dask + Pandas + `df.map_partitions(lambda df: df.to_pandas())` + - cuDF to Dask + cuDF: + `dask.dataframe.from_pandas(df, npartitions=20)` + - Dask + cuDF to cuDF: `df.compute()` + +Additionally all generic Dask operations, like `compute`, `persist`, +`visualize` and so on work regardless. 
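+
+As a rough illustration, the short sketch below combines several of the
+operations listed above into one workflow (the file name `data.csv` and
+its columns `id`, `x`, and `y` are hypothetical):
+
+```python
+import dask_cudf
+
+# Ingest: read CSV data directly into a Dask-cuDF DataFrame
+df = dask_cudf.read_csv("data.csv")
+
+# Element-wise operation and assignment (recorded lazily in the task graph)
+df["z"] = df.x + df.y
+
+# Row-wise selection followed by a single-column groupby aggregation
+result = df[df.z > 0].groupby("id").y.max()
+
+# Nothing has executed yet; compute() runs the graph and returns a cudf.Series
+print(result.compute())
+```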
+
+## Developing the API
+
+Above we mention the following:
+
+> and most similar operations in this category that are already
+> implemented in cuDF
+
+This is because it is difficult to create a comprehensive list of
+operations in the cuDF and Pandas libraries. The API is large enough to
+be difficult to track effectively. For any operation that operates
+row-wise, like `fillna` or `query`, things will likely, but not
+certainly, work. If an operation doesn't work, it is often due to a
+slight inconsistency between Pandas and cuDF that is generally easy to
+fix. We encourage users to look at the [cuDF issue
+tracker](https://github.com/rapidsai/cudf/issues) to see if their
+issue has already been reported and, if not, [raise a new
+issue](https://github.com/rapidsai/cudf/issues/new).
+
+## Navigating the API
+
+This project reuses the [Dask
+DataFrame](https://docs.dask.org/en/latest/dataframe.html) project,
+which was originally designed for Pandas, with the newer library cuDF.
+Because we use the same Dask classes for both projects, there are often
+methods that are implemented for Pandas, but not yet for cuDF. As a
+result, the full Dask DataFrame API can be misleading to users, and
+often leads to frustration when operations that are advertised in the
+Dask API do not work as expected with cuDF. We apologize for this in
+advance.
diff --git a/docs/cudf/source/user_guide/data-types.md b/docs/cudf/source/user_guide/data-types.md
new file mode 100644
index 00000000000..8963f87d52e
--- /dev/null
+++ b/docs/cudf/source/user_guide/data-types.md
@@ -0,0 +1,153 @@
+# Supported Data Types
+
+cuDF supports many data types supported by NumPy and Pandas, including
+numeric, datetime, timedelta, categorical and string data types. We
+also provide special data types for working with decimals, list-like,
+and dictionary-like data.
+
+All data types in cuDF are [nullable](missing-data); a short example
+follows the table below.
+
+
+| Kind of data         | Data type(s)                                                                       |
+|----------------------|------------------------------------------------------------------------------------|
+| Signed integer       | `'int8'`, `'int16'`, `'int32'`, `'int64'`                                          |
+| Unsigned integer     | `'uint32'`, `'uint64'`                                                             |
+| Floating-point       | `'float32'`, `'float64'`                                                           |
+| Datetime             | `'datetime64[s]'`, `'datetime64[ms]'`, `'datetime64[us]'`, `'datetime64[ns]'`      |
+| Timedelta (duration) | `'timedelta64[s]'`, `'timedelta64[ms]'`, `'timedelta64[us]'`, `'timedelta64[ns]'`  |
+| Category             | `cudf.CategoricalDtype`                                                            |
+| String               | `'object'` or `'string'`                                                           |
+| Decimal              | `cudf.Decimal32Dtype`, `cudf.Decimal64Dtype`, `cudf.Decimal128Dtype`               |
+| List                 | `cudf.ListDtype`                                                                   |
+| Struct               | `cudf.StructDtype`                                                                 |
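+
+As a quick sketch of the nullability noted above: unlike NumPy-backed
+Pandas, cuDF can hold missing values in columns of any of these types,
+including integers, without changing the column's data type (a pandas
+`Series` built the same way would typically be promoted to `float64`):
+
+```python
+>>> import cudf
+>>> s = cudf.Series([1, None, 3])  # the null keeps the int64 dtype
+>>> s
+0       1
+1    <NA>
+2       3
+dtype: int64
+```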
+
+## NumPy data types
+
+We use NumPy data types for integer, floating-point, datetime,
+timedelta, and string data types. Thus, just like in NumPy,
+`np.dtype("float32")`, `np.float32`, and `"float32"` are all acceptable
+ways to specify the `float32` data type:
+
+```python
+>>> import cudf
+>>> s = cudf.Series([1, 2, 3], dtype="float32")
+>>> s
+0    1.0
+1    2.0
+2    3.0
+dtype: float32
+```
+
+## A note on `object`
+
+The data type associated with string data in cuDF is `"object"`.
+
+```python
+>>> import cudf
+>>> s = cudf.Series(["abc", "def", "ghi"])
+>>> s.dtype
+dtype("object")
+```
+
+This is for compatibility with Pandas, but it can be misleading. In
+both NumPy and Pandas, `"object"` is the data type associated with data
+composed of arbitrary Python objects (not just strings). However,
+cuDF does not support storing arbitrary Python objects.
+
+## Decimal data types
+
+We provide special data types for working with decimal data, namely
+`Decimal32Dtype`, `Decimal64Dtype`, and `Decimal128Dtype`. Use these
+data types when you need to store values with greater precision than
+allowed by floating-point representation.
+
+Decimal data types in cuDF are based on fixed-point representation. A
+decimal data type is composed of a _precision_ and a _scale_. The
+precision represents the total number of digits in each value of this
+dtype. For example, the precision associated with the decimal value
+`1.023` is `4`. The scale is the total number of digits to the right
+of the decimal point. The scale associated with the value `1.023` is
+`3`.
+
+Each decimal data type is associated with a maximum precision:
+
+```python
+>>> cudf.Decimal32Dtype.MAX_PRECISION
+9.0
+>>> cudf.Decimal64Dtype.MAX_PRECISION
+18.0
+>>> cudf.Decimal128Dtype.MAX_PRECISION
+38
+```
+
+One way to create a decimal Series is from values of type
+[decimal.Decimal][python-decimal]:
+
+```python
+>>> from decimal import Decimal
+>>> s = cudf.Series([Decimal("1.01"), Decimal("4.23"), Decimal("0.5")])
+>>> s
+0    1.01
+1    4.23
+2    0.50
+dtype: decimal128
+>>> s.dtype
+Decimal128Dtype(precision=3, scale=2)
+```
+
+Notice the data type of the result: `1.01`, `4.23`, and `0.50` can all
+be represented with a precision of at least 3 and a scale of at least 2.
+
+However, the value `1.234` needs a precision of at least 4 and a
+scale of at least 3, and so cannot be fully represented using this data
+type:
+
+```python
+>>> s[1] = Decimal("1.234")  # raises an error
+```
+
+## Nested data types (`List` and `Struct`)
+
+`ListDtype` and `StructDtype` are special data types in cuDF for
+working with list-like and dictionary-like data. These are referred to
+as "nested" data types, because they enable you to store a list of
+lists, or a struct of lists, or a struct of lists of lists, and so on.
+
+You can create list and struct Series from existing Pandas Series of
+lists and dictionaries respectively:
+
+```python
+>>> import pandas as pd
+>>> psr = pd.Series([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])
+>>> psr
+0    {'a': 1, 'b': 2}
+1    {'a': 3, 'b': 4}
+dtype: object
+>>> gsr = cudf.from_pandas(psr)
+>>> gsr
+0    {'a': 1, 'b': 2}
+1    {'a': 3, 'b': 4}
+dtype: struct
+>>> gsr.dtype
+StructDtype({'a': dtype('int64'), 'b': dtype('int64')})
+```
+
+Or by reading them from disk, using a [file format that supports
+nested data](io).
+ +```python +>>> pdf = pd.DataFrame({"a": [[1, 2], [3, 4, 5], [6, 7, 8]]}) +>>> pdf.to_parquet("lists.pq") +>>> gdf = cudf.read_parquet("lists.pq") +>>> gdf + a +0 [1, 2] +1 [3, 4, 5] +2 [6, 7, 8] +>>> gdf["a"].dtype +ListDtype(int64) +``` + +[numpy-dtype]: https://numpy.org/doc/stable/reference/arrays.dtypes.html#arrays-dtypes +[python-decimal]: https://docs.python.org/3/library/decimal.html#decimal.Decimal diff --git a/docs/cudf/source/user_guide/groupby.md b/docs/cudf/source/user_guide/groupby.md new file mode 100644 index 00000000000..66b548727e1 --- /dev/null +++ b/docs/cudf/source/user_guide/groupby.md @@ -0,0 +1,273 @@ +--- +substitutions: + describe: '`describe`' +--- + +(basics-groupby)= + +# GroupBy + +cuDF supports a small (but important) subset of Pandas' [groupby +API](https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html). + +## Summary of supported operations + +1. Grouping by one or more columns +2. Basic aggregations such as "sum", "mean", etc. +3. Quantile aggregation +4. A "collect" or `list` aggregation for collecting values in a group + into lists +5. Automatic exclusion of columns with unsupported dtypes ("nuisance" + columns) when aggregating +6. Iterating over the groups of a GroupBy object +7. `GroupBy.groups` API that returns a mapping of group keys to row + labels +8. `GroupBy.apply` API for performing arbitrary operations on each + group. Note that this has very limited functionality compared to the + equivalent Pandas function. See the section on + [apply](#groupby-apply) for more details. +9. `GroupBy.pipe` similar to + [Pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls). + +## Grouping + +A GroupBy object is created by grouping the values of a `Series` or +`DataFrame` by one or more columns: + +```python +>>> import cudf +>>> df = cudf.DataFrame({'a': [1, 1, 1, 2, 2], 'b': [1, 1, 2, 2, 3], 'c': [1, 2, 3, 4, 5]}) +>>> df + a b c +0 1 1 1 +1 1 1 2 +2 1 2 3 +3 2 2 4 +4 2 3 5 +>>> gb1 = df.groupby('a') # grouping by a single column +>>> gb2 = df.groupby(['a', 'b']) # grouping by multiple columns +>>> gb3 = df.groupby(cudf.Series(['a', 'a', 'b', 'b', 'b'])) # grouping by an external column +``` + +````{warning} +Unlike Pandas, cuDF uses `sort=False` by default to achieve better +performance, which does not guarantee any particular group order in +the result. + +For example: + +```python +>>> df = cudf.DataFrame({'a' : [2, 2, 1], 'b' : [42, 21, 11]}) +>>> df.groupby('a').sum() + b +a +2 63 +1 11 +>>> df.to_pandas().groupby('a').sum() + b +a +1 11 +2 63 +``` + +Setting `sort=True` will produce Pandas-like output, but with some performance penalty: + +```python +>>> df.groupby('a', sort=True).sum() + b +a +1 11 +2 63 +``` +```` + +### Grouping by index levels + +You can also group by one or more levels of a MultiIndex: + +```python +>>> df = cudf.DataFrame( +... {'a': [1, 1, 1, 2, 2], 'b': [1, 1, 2, 2, 3], 'c': [1, 2, 3, 4, 5]} +... ).set_index(['a', 'b']) +... 
+>>> df.groupby(level='a') +``` + +### The `Grouper` object + +A `Grouper` can be used to disambiguate between columns and levels +when they have the same name: + +```python +>>> df + b c +b +1 1 1 +1 1 2 +1 2 3 +2 2 4 +2 3 5 +>>> df.groupby('b', level='b') # ValueError: Cannot specify both by and level +>>> df.groupby([cudf.Grouper(key='b'), cudf.Grouper(level='b')]) # OK +``` + +## Aggregation + +Aggregations on groups are supported via the `agg` method: + +```python +>>> df + a b c +0 1 1 1 +1 1 1 2 +2 1 2 3 +3 2 2 4 +4 2 3 5 +>>> df.groupby('a').agg('sum') + b c +a +1 4 6 +2 5 9 +>>> df.groupby('a').agg({'b': ['sum', 'min'], 'c': 'mean'}) + b c + sum min mean +a +1 4 1 2.0 +2 5 2 4.5 +>>> df.groupby("a").corr(method="pearson") + b c +a +1 b 1.000000 0.866025 + c 0.866025 1.000000 +2 b 1.000000 1.000000 + c 1.000000 1.000000 +``` + +The following table summarizes the available aggregations and the types +that support them: + +```{eval-rst} +.. table:: + :class: special-table + + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | Aggregations / dtypes | Numeric | Datetime | String | Categorical | List | Struct | Interval | Decimal | + +====================================+===========+============+==========+===============+========+==========+============+===========+ + | count | ✅ | ✅ | ✅ | ✅ | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | size | ✅ | ✅ | ✅ | ✅ | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | sum | ✅ | ✅ | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | idxmin | ✅ | ✅ | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | idxmax | ✅ | ✅ | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | min | ✅ | ✅ | ✅ | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | max | ✅ | ✅ | ✅ | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | mean | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | var | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | std | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | quantile | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | median | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | nunique | ✅ | ✅ | ✅ | ✅ | | | | ✅ | + 
+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+
+    | nth                                | ✅        | ✅         | ✅       |               |        |          |            | ✅        |
+    +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+
+    | collect                            | ✅        | ✅         | ✅       |               | ✅     |          |            | ✅        |
+    +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+
+    | unique                             | ✅        | ✅         | ✅       | ✅            |        |          |            |           |
+    +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+
+    | corr                               | ✅        |            |          |               |        |          |            | ✅        |
+    +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+
+    | cov                                | ✅        |            |          |               |        |          |            | ✅        |
+    +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+
+```
+
+## GroupBy apply
+
+To apply a function to each group, use the `GroupBy.apply()` method:
+
+```python
+>>> df
+   a  b  c
+0  1  1  1
+1  1  1  2
+2  1  2  3
+3  2  2  4
+4  2  3  5
+>>> df.groupby('a').apply(lambda x: x.max() - x.min())
+   a  b  c
+a
+0  0  1  2
+1  0  1  1
+```
+
+### Limitations
+
+- `apply` works by applying the provided function to each group
+  sequentially, and concatenating the results together. **This can be
+  very slow**, especially for a large number of small groups. For a
+  small number of large groups, it can give acceptable performance.
+- The results may not always match Pandas exactly. For example, cuDF
+  may return a `DataFrame` containing a single column where Pandas
+  returns a `Series`. Some post-processing may be required to match
+  Pandas behavior.
+- cuDF does not support some of the exceptional cases that Pandas
+  supports with `apply`, such as calling [describe] inside the
+  callable.
+
+## Transform
+
+The `.transform()` method aggregates per group and broadcasts the
+result to the group size, resulting in a Series/DataFrame that is of
+the same size as the input Series/DataFrame.
+ +```python +>>> import cudf +>>> df = cudf.DataFrame({'a': [2, 1, 1, 2, 2], 'b': [1, 2, 3, 4, 5]}) +>>> df.groupby('a').transform('max') + b +0 5 +1 3 +2 3 +3 5 +4 5 +``` + +## Rolling window calculations + +Use the `GroupBy.rolling()` method to perform rolling window +calculations on each group: + +```python +>>> df + a b c +0 1 1 1 +1 1 1 2 +2 1 2 3 +3 2 2 4 +4 2 3 5 +``` + +Rolling window sum on each group with a window size of 2: + +```python +>>> df.groupby('a').rolling(2).sum() + a b c +a +1 0 + 1 2 2 3 + 2 2 3 5 +2 3 + 4 4 5 9 +``` + +[describe]: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#flexible-apply diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 8026c378156..ef7500a2be9 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -2,15 +2,16 @@ "cells": [ { "cell_type": "markdown", + "id": "77149e57", "metadata": {}, "source": [ - "Overview of User Defined Functions with cuDF\n", - "====================================" + "# Overview of User Defined Functions with cuDF" ] }, { "cell_type": "code", "execution_count": 1, + "id": "0c6b65ce", "metadata": {}, "outputs": [], "source": [ @@ -21,6 +22,7 @@ }, { "cell_type": "markdown", + "id": "8826af13", "metadata": {}, "source": [ "Like many tabular data processing APIs, cuDF provides a range of composable, DataFrame style operators. While out of the box functions are flexible and useful, it is sometimes necessary to write custom code, or user-defined functions (UDFs), that can be applied to rows, columns, and other groupings of the cells making up the DataFrame.\n", @@ -39,10 +41,10 @@ }, { "cell_type": "markdown", + "id": "32a8f4fb", "metadata": {}, "source": [ - "Series UDFs\n", - "--------------\n", + "## Series UDFs\n", "\n", "You can execute UDFs on Series in two ways:\n", "\n", @@ -54,14 +56,15 @@ }, { "cell_type": "markdown", + "id": "49399a84", "metadata": {}, "source": [ - "`cudf.Series.apply`\n", - "---------------------" + "### `cudf.Series.apply`" ] }, { "cell_type": "markdown", + "id": "0a209ea2", "metadata": {}, "source": [ "cuDF provides a similar API to `pandas.Series.apply` for applying scalar UDFs to series objects. Here is a very basic example." @@ -70,6 +73,7 @@ { "cell_type": "code", "execution_count": 2, + "id": "e28d5b82", "metadata": {}, "outputs": [], "source": [ @@ -79,6 +83,7 @@ }, { "cell_type": "markdown", + "id": "48a9fa5e", "metadata": {}, "source": [ "UDFs destined for `cudf.Series.apply` might look something like this:" @@ -87,6 +92,7 @@ { "cell_type": "code", "execution_count": 3, + "id": "96aeb19f", "metadata": {}, "outputs": [], "source": [ @@ -97,6 +103,7 @@ }, { "cell_type": "markdown", + "id": "e61d0169", "metadata": {}, "source": [ "`cudf.Series.apply` is called like `pd.Series.apply` and returns a new `Series` object:" @@ -105,6 +112,7 @@ { "cell_type": "code", "execution_count": 4, + "id": "8ca08834", "metadata": {}, "outputs": [ { @@ -127,14 +135,15 @@ }, { "cell_type": "markdown", + "id": "c98dab03", "metadata": {}, "source": [ - "Functions with Additional Scalar Arguments\n", - "---------------------------------------------------" + "### Functions with Additional Scalar Arguments" ] }, { "cell_type": "markdown", + "id": "2aa3df6f", "metadata": {}, "source": [ "In addition, `cudf.Series.apply` supports `args=` just like pandas, allowing you to write UDFs that accept an arbitrary number of scalar arguments. 
Here is an example of such a function and it's API call in both pandas and cuDF:" @@ -143,6 +152,7 @@ { "cell_type": "code", "execution_count": 5, + "id": "8d156d01", "metadata": {}, "outputs": [], "source": [ @@ -153,6 +163,7 @@ { "cell_type": "code", "execution_count": 6, + "id": "1dee82d7", "metadata": {}, "outputs": [ { @@ -176,6 +187,7 @@ }, { "cell_type": "markdown", + "id": "22739e28", "metadata": {}, "source": [ "As a final note, `**kwargs` is not yet supported." @@ -183,14 +195,15 @@ }, { "cell_type": "markdown", + "id": "afbf33dc", "metadata": {}, "source": [ - "Nullable Data\n", - "----------------" + "### Nullable Data" ] }, { "cell_type": "markdown", + "id": "5dc06e8c", "metadata": {}, "source": [ "The null value `NA` an propagates through unary and binary operations. Thus, `NA + 1`, `abs(NA)`, and `NA == NA` all return `NA`. To make this concrete, let's look at the same example from above, this time using nullable data:" @@ -199,6 +212,7 @@ { "cell_type": "code", "execution_count": 7, + "id": "bda261dd", "metadata": {}, "outputs": [ { @@ -224,6 +238,7 @@ { "cell_type": "code", "execution_count": 8, + "id": "0123ae07", "metadata": {}, "outputs": [], "source": [ @@ -235,6 +250,7 @@ { "cell_type": "code", "execution_count": 9, + "id": "e95868dd", "metadata": {}, "outputs": [ { @@ -258,6 +274,7 @@ }, { "cell_type": "markdown", + "id": "97372e15", "metadata": {}, "source": [ "Often however you want explicit null handling behavior inside the function. cuDF exposes this capability the same way as pandas, by interacting directly with the `NA` singleton object. Here's an example of a function with explicit null handling:" @@ -266,6 +283,7 @@ { "cell_type": "code", "execution_count": 10, + "id": "6c65241b", "metadata": {}, "outputs": [], "source": [ @@ -280,6 +298,7 @@ { "cell_type": "code", "execution_count": 11, + "id": "ab0f4dbf", "metadata": {}, "outputs": [ { @@ -303,6 +322,7 @@ }, { "cell_type": "markdown", + "id": "bdddc4e8", "metadata": {}, "source": [ "In addition, `cudf.NA` can be returned from a function directly or conditionally. This capability should allow you to implement custom null handling in a wide variety of cases." @@ -310,14 +330,15 @@ }, { "cell_type": "markdown", + "id": "54cafbc0", "metadata": {}, "source": [ - "Lower level control with custom `numba` kernels\n", - "---------------------------------------------------------" + "### Lower level control with custom `numba` kernels" ] }, { "cell_type": "markdown", + "id": "00914f2a", "metadata": {}, "source": [ "In addition to the Series.apply() method for performing custom operations, you can also pass Series objects directly into [CUDA kernels written with Numba](https://numba.pydata.org/numba-doc/latest/cuda/kernels.html).\n", @@ -329,6 +350,7 @@ { "cell_type": "code", "execution_count": 12, + "id": "732434f6", "metadata": {}, "outputs": [], "source": [ @@ -338,6 +360,7 @@ { "cell_type": "code", "execution_count": 13, + "id": "4f5997e5", "metadata": {}, "outputs": [], "source": [ @@ -352,6 +375,7 @@ }, { "cell_type": "markdown", + "id": "d9667a55", "metadata": {}, "source": [ "This kernel will take an input array, multiply it by a configurable value (supplied at runtime), and store the result in an output array. Notice that we wrapped our logic in an `if` statement. Because we can launch more threads than the size of our array, we need to make sure that we don't use threads with an index that would be out of bounds. 
Leaving this out can result in undefined behavior.\n", @@ -362,6 +386,7 @@ { "cell_type": "code", "execution_count": 14, + "id": "ea6008a6", "metadata": {}, "outputs": [], "source": [ @@ -372,6 +397,7 @@ }, { "cell_type": "markdown", + "id": "3fb69909", "metadata": {}, "source": [ "After calling our kernel, our DataFrame is now populated with the result." @@ -380,6 +406,7 @@ { "cell_type": "code", "execution_count": 15, + "id": "183a82ed", "metadata": {}, "outputs": [ { @@ -469,6 +496,7 @@ }, { "cell_type": "markdown", + "id": "ab9c305e", "metadata": {}, "source": [ "This API allows a you to theoretically write arbitrary kernel logic, potentially accessing and using elements of the series at arbitrary indices and use them on cuDF data structures. Advanced developers with some CUDA experience can often use this capability to implement iterative transformations, or spot treat problem areas of a data pipeline with a custom kernel that does the same job faster." @@ -476,28 +504,29 @@ }, { "cell_type": "markdown", + "id": "0acc6ef2", "metadata": {}, "source": [ - "DataFrame UDFs\n", - "--------------------\n", + "## DataFrame UDFs\n", "\n", "Like `cudf.Series`, there are multiple ways of using UDFs on dataframes, which essentially amount to UDFs that expect multiple columns as input:\n", "\n", "- `cudf.DataFrame.apply`, which functions like `pd.DataFrame.apply` and expects a row udf\n", "- `cudf.DataFrame.apply_rows`, which is a thin wrapper around numba and expects a numba kernel\n", - "- `cudf.DataFrame.apply_chunks`, which is similar to `cudf.DataFrame.apply_rows` but offers lower level control.\n" + "- `cudf.DataFrame.apply_chunks`, which is similar to `cudf.DataFrame.apply_rows` but offers lower level control." ] }, { "cell_type": "markdown", + "id": "2102c3ed", "metadata": {}, "source": [ - "`cudf.DataFrame.apply`\n", - "---------------------------" + "### `cudf.DataFrame.apply`" ] }, { "cell_type": "markdown", + "id": "238bec41", "metadata": {}, "source": [ "`cudf.DataFrame.apply` is the main entrypoint for UDFs that expect multiple columns as input and produce a single output column. Functions intended to be consumed by this API are written in terms of a \"row\" argument. The \"row\" is considered to be like a dictionary and contains all of the column values at a certain `iloc` in a `DataFrame`. The function can access these values by key within the function, the keys being the column names corresponding to the desired value. Below is an example function that would be used to add column `A` and column `B` together inside a UDF." @@ -506,6 +535,7 @@ { "cell_type": "code", "execution_count": 16, + "id": "73653918", "metadata": {}, "outputs": [], "source": [ @@ -515,6 +545,7 @@ }, { "cell_type": "markdown", + "id": "b5eb32dd", "metadata": {}, "source": [ "Let's create some very basic toy data containing at least one null." 
@@ -523,6 +554,7 @@ { "cell_type": "code", "execution_count": 17, + "id": "077feb75", "metadata": {}, "outputs": [ { @@ -592,14 +624,16 @@ }, { "cell_type": "markdown", + "id": "609a3da5", "metadata": {}, "source": [ - "Finally call the function as you would in pandas - by using a lambda function to map the UDF onto \"rows\" of the DataFrame: " + "Finally call the function as you would in pandas - by using a lambda function to map the UDF onto \"rows\" of the DataFrame:" ] }, { "cell_type": "code", "execution_count": 18, + "id": "091e39e1", "metadata": {}, "outputs": [ { @@ -622,6 +656,7 @@ }, { "cell_type": "markdown", + "id": "44e54c31", "metadata": {}, "source": [ "The same function should produce the same result as pandas:" @@ -630,6 +665,7 @@ { "cell_type": "code", "execution_count": 19, + "id": "bd345fab", "metadata": {}, "outputs": [ { @@ -652,6 +688,7 @@ }, { "cell_type": "markdown", + "id": "004fbbba", "metadata": {}, "source": [ "Notice that Pandas returns `object` dtype - see notes on this in the caveats section." @@ -659,6 +696,7 @@ }, { "cell_type": "markdown", + "id": "0b11c172", "metadata": {}, "source": [ "Like `cudf.Series.apply`, these functions support generalized null handling. Here's a function that conditionally returns a different value if a certain input is null:" @@ -667,6 +705,7 @@ { "cell_type": "code", "execution_count": 20, + "id": "b70f4b3b", "metadata": {}, "outputs": [ { @@ -737,6 +776,7 @@ { "cell_type": "code", "execution_count": 21, + "id": "0313c8df", "metadata": {}, "outputs": [ { @@ -759,6 +799,7 @@ }, { "cell_type": "markdown", + "id": "313c77f3", "metadata": {}, "source": [ "`cudf.NA` can also be directly returned from a function resulting in data that has the the correct nulls in the end, just as if it were run in Pandas. For the following data, the last row fulfills the condition that `1 + 3 > 3` and returns `NA` for that row:" @@ -767,6 +808,7 @@ { "cell_type": "code", "execution_count": 22, + "id": "96a7952a", "metadata": {}, "outputs": [ { @@ -845,6 +887,7 @@ { "cell_type": "code", "execution_count": 23, + "id": "e0815f60", "metadata": {}, "outputs": [ { @@ -867,6 +910,7 @@ }, { "cell_type": "markdown", + "id": "b9c674f4", "metadata": {}, "source": [ "Mixed types are allowed, but will return the common type, rather than object as in Pandas. Here's a null aware op between an int and a float column:" @@ -875,6 +919,7 @@ { "cell_type": "code", "execution_count": 24, + "id": "495efd14", "metadata": {}, "outputs": [ { @@ -948,6 +993,7 @@ { "cell_type": "code", "execution_count": 25, + "id": "678b0b5a", "metadata": {}, "outputs": [ { @@ -970,6 +1016,7 @@ }, { "cell_type": "markdown", + "id": "ce0897c0", "metadata": {}, "source": [ "Functions may also return scalar values, however the result will be promoted to a safe type regardless of the data. 
This means even if you have a function like:\n", @@ -991,6 +1038,7 @@ { "cell_type": "code", "execution_count": 26, + "id": "acf48d56", "metadata": {}, "outputs": [ { @@ -1063,6 +1111,7 @@ { "cell_type": "code", "execution_count": 27, + "id": "78a98172", "metadata": {}, "outputs": [ { @@ -1085,6 +1134,7 @@ }, { "cell_type": "markdown", + "id": "2ceaece4", "metadata": {}, "source": [ "Any number of columns and many arithmetic operators are supported, allowing for complex UDFs:" @@ -1093,6 +1143,7 @@ { "cell_type": "code", "execution_count": 28, + "id": "142c30a9", "metadata": {}, "outputs": [ { @@ -1181,6 +1232,7 @@ { "cell_type": "code", "execution_count": 29, + "id": "fee9198a", "metadata": {}, "outputs": [ { @@ -1203,17 +1255,17 @@ }, { "cell_type": "markdown", + "id": "9c587bd2", "metadata": {}, "source": [ - "Numba kernels for DataFrames\n", - "------------------------------------" + "### Numba kernels for DataFrames" ] }, { "cell_type": "markdown", + "id": "adc6a459", "metadata": {}, "source": [ - "\n", "We could apply a UDF on a DataFrame like we did above with `forall`. We'd need to write a kernel that expects multiple inputs, and pass multiple Series as arguments when we execute our kernel. Because this is fairly common and can be difficult to manage, cuDF provides two APIs to streamline this: `apply_rows` and `apply_chunks`. Below, we walk through an example of using `apply_rows`. `apply_chunks` works in a similar way, but also offers more control over low-level kernel behavior.\n", "\n", "Now that we have two numeric columns in our DataFrame, let's write a kernel that uses both of them." @@ -1222,6 +1274,7 @@ { "cell_type": "code", "execution_count": 30, + "id": "90cbcd85", "metadata": {}, "outputs": [], "source": [ @@ -1235,6 +1288,7 @@ }, { "cell_type": "markdown", + "id": "bce045f2", "metadata": {}, "source": [ "Notice that we need to `enumerate` through our `zipped` function arguments (which either match or are mapped to our input column names). We can pass this kernel to `apply_rows`. We'll need to specify a few arguments:\n", @@ -1251,6 +1305,7 @@ { "cell_type": "code", "execution_count": 31, + "id": "e782daff", "metadata": {}, "outputs": [ { @@ -1337,6 +1392,7 @@ }, { "cell_type": "markdown", + "id": "6b838b89", "metadata": {}, "source": [ "As expected, we see our conditional addition worked. At this point, we've successfully executed UDFs on the core data structures of cuDF." @@ -1344,9 +1400,10 @@ }, { "cell_type": "markdown", + "id": "fca97003", "metadata": {}, "source": [ - "## Null Handling in `apply_rows` and `apply_chunks`\n", + "### Null Handling in `apply_rows` and `apply_chunks`\n", "\n", "By default, DataFrame methods for applying UDFs like `apply_rows` will handle nulls pessimistically (all rows with a null value will be removed from the output if they are used in the kernel). Exploring how not handling not pessimistically can lead to undefined behavior is outside the scope of this guide. Suffice it to say, pessimistic null handling is the safe and consistent approach. You can see an example below." ] @@ -1354,6 +1411,7 @@ { "cell_type": "code", "execution_count": 32, + "id": "befd8333", "metadata": {}, "outputs": [ { @@ -1445,6 +1503,7 @@ }, { "cell_type": "markdown", + "id": "c710ce86", "metadata": {}, "source": [ "In the dataframe above, there are three null values. Each column has a null in a different row. 
When we use our UDF with `apply_rows`, our output should have two nulls due to pessimistic null handling (because we're not using column `c`, the null value there does not matter to us)." @@ -1453,6 +1512,7 @@ { "cell_type": "code", "execution_count": 33, + "id": "d1f3dcaf", "metadata": {}, "outputs": [ { @@ -1546,6 +1606,7 @@ }, { "cell_type": "markdown", + "id": "53b9a2f8", "metadata": {}, "source": [ "As expected, we end up with two nulls in our output. The null values from the columns we used propogated to our output, but the null from the column we ignored did not." @@ -1553,10 +1614,10 @@ }, { "cell_type": "markdown", + "id": "4bbefa67", "metadata": {}, "source": [ - "Rolling Window UDFs\n", - "-------------------------\n", + "## Rolling Window UDFs\n", "\n", "For time-series data, we may need to operate on a small \\\"window\\\" of our column at a time, processing each portion independently. We could slide (\\\"roll\\\") this window over the entire column to answer questions like \\\"What is the 3-day moving average of a stock price over the past year?\"\n", "\n", @@ -1566,6 +1627,7 @@ { "cell_type": "code", "execution_count": 34, + "id": "6bc6aea3", "metadata": {}, "outputs": [ { @@ -1593,6 +1655,7 @@ { "cell_type": "code", "execution_count": 35, + "id": "a4c31df1", "metadata": {}, "outputs": [ { @@ -1613,6 +1676,7 @@ }, { "cell_type": "markdown", + "id": "ff40d863", "metadata": {}, "source": [ "Next, we'll define a function to use on our rolling windows. We created this one to highlight how you can include things like loops, mathematical functions, and conditionals. Rolling window UDFs do not yet support null values." @@ -1621,6 +1685,7 @@ { "cell_type": "code", "execution_count": 36, + "id": "eb5a081b", "metadata": {}, "outputs": [], "source": [ @@ -1637,6 +1702,7 @@ }, { "cell_type": "markdown", + "id": "df8ba31d", "metadata": {}, "source": [ "We can execute the function by passing it to `apply`. With `window=3`, `min_periods=3`, and `center=False`, our first two values are `null`." @@ -1645,6 +1711,7 @@ { "cell_type": "code", "execution_count": 37, + "id": "ddec3263", "metadata": {}, "outputs": [ { @@ -1670,6 +1737,7 @@ }, { "cell_type": "markdown", + "id": "187478db", "metadata": {}, "source": [ "We can apply this function to every column in a DataFrame, too." @@ -1678,6 +1746,7 @@ { "cell_type": "code", "execution_count": 38, + "id": "8b61094a", "metadata": {}, "outputs": [ { @@ -1759,6 +1828,7 @@ { "cell_type": "code", "execution_count": 39, + "id": "bb8c3019", "metadata": {}, "outputs": [ { @@ -1867,10 +1937,10 @@ }, { "cell_type": "markdown", + "id": "d4785060", "metadata": {}, "source": [ - "GroupBy DataFrame UDFs\n", - "-------------------------------\n", + "## GroupBy DataFrame UDFs\n", "\n", "We can also apply UDFs to grouped DataFrames using `apply_grouped`. This example is also drawn and adapted from the RAPIDS [API documentation]().\n", "\n", @@ -1880,6 +1950,7 @@ { "cell_type": "code", "execution_count": 40, + "id": "3dc272ab", "metadata": {}, "outputs": [ { @@ -1971,6 +2042,7 @@ { "cell_type": "code", "execution_count": 41, + "id": "c0578e0a", "metadata": {}, "outputs": [], "source": [ @@ -1979,6 +2051,7 @@ }, { "cell_type": "markdown", + "id": "4808726f", "metadata": {}, "source": [ "Next we'll define a function to apply to each group independently. In this case, we'll take the rolling average of column `e`, and call that new column `rolling_avg_e`." 
@@ -1987,6 +2060,7 @@ { "cell_type": "code", "execution_count": 42, + "id": "19f0f7fe", "metadata": {}, "outputs": [], "source": [ @@ -2006,6 +2080,7 @@ }, { "cell_type": "markdown", + "id": "7566f359", "metadata": {}, "source": [ "We can execute this with a very similar API to `apply_rows`. This time, though, it's going to execute independently for each group." @@ -2014,6 +2089,7 @@ { "cell_type": "code", "execution_count": 43, + "id": "c43426c3", "metadata": {}, "outputs": [ { @@ -2157,6 +2233,7 @@ }, { "cell_type": "markdown", + "id": "c8511306", "metadata": {}, "source": [ "Notice how, with a window size of three in the kernel, the first two values in each group for our output column are null." @@ -2164,10 +2241,10 @@ }, { "cell_type": "markdown", + "id": "0060678c", "metadata": {}, "source": [ - "Numba Kernels on CuPy Arrays\n", - "-------------------------------------\n", + "## Numba Kernels on CuPy Arrays\n", "\n", "We can also execute Numba kernels on CuPy NDArrays, again thanks to the `__cuda_array_interface__`. We can even run the same UDF on the Series and the CuPy array. First, we define a Series and then create a CuPy array from that Series." ] @@ -2175,6 +2252,7 @@ { "cell_type": "code", "execution_count": 44, + "id": "aa6a8509", "metadata": {}, "outputs": [ { @@ -2198,6 +2276,7 @@ }, { "cell_type": "markdown", + "id": "0fed556f", "metadata": {}, "source": [ "Next, we define a UDF and execute it on our Series. We need to allocate a Series of the same size for our output, which we'll call `out`." @@ -2206,6 +2285,7 @@ { "cell_type": "code", "execution_count": 45, + "id": "0bb8bf93", "metadata": {}, "outputs": [ { @@ -2238,6 +2318,7 @@ }, { "cell_type": "markdown", + "id": "a857b169", "metadata": {}, "source": [ "Finally, we execute the same function on our array. We allocate an empty array `out` to store our results." @@ -2246,6 +2327,7 @@ { "cell_type": "code", "execution_count": 46, + "id": "ce60b639", "metadata": {}, "outputs": [ { @@ -2267,14 +2349,15 @@ }, { "cell_type": "markdown", + "id": "b899d51c", "metadata": {}, "source": [ - "Caveats\n", - "---------" + "## Caveats" ] }, { "cell_type": "markdown", + "id": "fe7eb68b", "metadata": {}, "source": [ "- Only numeric nondecimal scalar types are currently supported as of yet, but strings and structured types are in planning. Attempting to use this API with those types will throw a `TypeError`.\n", @@ -2283,10 +2366,10 @@ }, { "cell_type": "markdown", + "id": "c690563b", "metadata": {}, "source": [ - "Summary\n", - "-----------\n", + "## Summary\n", "\n", "This guide has covered a lot of content. At this point, you should hopefully feel comfortable writing UDFs (with or without null values) that operate on\n", "\n", @@ -2323,5 +2406,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/docs/cudf/source/user_guide/index.md b/docs/cudf/source/user_guide/index.md new file mode 100644 index 00000000000..2750c75790a --- /dev/null +++ b/docs/cudf/source/user_guide/index.md @@ -0,0 +1,16 @@ +# User Guide + +```{toctree} +:maxdepth: 2 + +10min +data-types +io +missing-data +groupby +guide-to-udfs +cupy-interop +dask-cudf +internals +PandasCompat +``` diff --git a/docs/cudf/source/user_guide/index.rst b/docs/cudf/source/user_guide/index.rst deleted file mode 100644 index 1061008eb3c..00000000000 --- a/docs/cudf/source/user_guide/index.rst +++ /dev/null @@ -1,12 +0,0 @@ -========== -User Guide -========== - - -.. 
toctree::
-   :maxdepth: 2
-
-   10min.ipynb
-   10min-cudf-cupy.ipynb
-   guide-to-udfs.ipynb
-   Working-with-missing-data.ipynb
diff --git a/docs/cudf/source/user_guide/internals.md b/docs/cudf/source/user_guide/internals.md
new file mode 100644
index 00000000000..6ceef3d3492
--- /dev/null
+++ b/docs/cudf/source/user_guide/internals.md
@@ -0,0 +1,212 @@
+# cuDF internals
+
+The cuDF API closely matches that of the
+[Pandas](https://pandas.pydata.org/) library. Thus, we have the types
+`cudf.Series`, `cudf.DataFrame` and `cudf.Index`, which look and
+feel very much like their Pandas counterparts.
+
+Under the hood, however, cuDF uses data structures very different from
+Pandas. In this document, we describe these internal data structures.
+
+## Column
+
+Columns are cuDF's core data structure and they are modeled after the
+[Apache Arrow Columnar
+Format](https://arrow.apache.org/docs/format/Columnar.html).
+
+A column represents a sequence of values, any number of which may be
+"null". Columns are specialized based on the type of data they contain.
+Thus we have `NumericalColumn`, `StringColumn`, `DatetimeColumn`,
+etc.
+
+A column is composed of the following:
+
+- A **data type**, specifying the type of each element.
+- A **data buffer** that may store the data for the column elements.
+  Some column types do not have a data buffer, instead storing data in
+  the children columns.
+- A **mask buffer** whose bits represent the validity (null or not
+  null) of each element. Columns whose elements are all "valid" may not
+  have a mask buffer. Mask buffers are padded to 64 bytes.
+- A tuple of **children** columns, which enable the representation of
+  complex types such as columns with non-fixed-width elements, such as
+  strings or lists.
+- A **size** indicating the number of elements in the column.
+- An integer **offset**: a column may represent a "slice" of another
+  column, in which case this offset represents the first element of the
+  slice. The size of the column then gives the extent of the slice. A
+  column that is not a slice has an offset of 0.
+
+For example, the `NumericalColumn` backing a Series with 1000 elements
+of type 'int32' and containing nulls is composed of:
+
+1. A data buffer of size 4000 bytes (sizeof(int32) * 1000)
+2. A mask buffer of size 128 bytes (1000/8 padded to a multiple of 64
+   bytes)
+3. No children columns
+
+As another example, the `StringColumn` backing the Series
+`['do', 'you', 'have', 'any', 'cheese?']` is composed of:
+
+1. No data buffer
+2. No mask buffer as there are no nulls in the Series
+3. Two children columns:
+
+   > - A column of UTF-8 characters
+   >   `['d', 'o', 'y', 'o', 'u', 'h' ..., '?']`
+   > - A column of "offsets" to the characters column (in this case,
+   >   `[0, 2, 5, 9, 12, 19]`)
+
+## Buffer
+
+The data and mask buffers of a column represent data in GPU memory
+(a.k.a. *device memory*), and are objects of type
+`cudf.core.buffer.Buffer`.
+
+Buffers can be constructed from array-like objects that live either on
+the host (e.g., numpy arrays) or the device (e.g., cupy arrays). Arrays
+must be of `uint8` dtype or viewed as such.
+ +When constructing a Buffer from a host object such as a numpy array, new +device memory is allocated: + +```python +>>> from cudf.core.buffer import Buffer +>>> buf = Buffer(np.array([1, 2, 3], dtype='int64').view("uint8")) +>>> print(buf.ptr) # address of new device memory allocation +140050901762560 +>>> print(buf.size) +24 +>>> print(buf._owner) + +``` + +cuDF uses the [RMM](https://github.com/rapidsai/rmm) library for +allocating device memory. You can read more about device memory +allocation with RMM +[here](https://github.com/rapidsai/rmm#devicebuffers). + +When constructing a Buffer from a device object such as a CuPy array, no +new device memory is allocated. Instead, the Buffer points to the +existing allocation, keeping a reference to the device array: + +```python +>>> import cupy as cp +>>> c_ary = cp.asarray([1, 2, 3], dtype='int64') +>>> buf = Buffer(c_ary.view("uint8")) +>>> print(c_ary.data.mem.ptr) +140050901762560 +>>> print(buf.ptr) +140050901762560 +>>> print(buf.size) +24 +>>> print(buf._owner is c_ary) +True +``` + +An uninitialized block of device memory can be allocated with +`Buffer.empty`: + +```python +>>> buf = Buffer.empty(10) +>>> print(buf.size) +10 +>>> print(buf._owner) + +``` + +## ColumnAccessor + +cuDF `Series`, `DataFrame` and `Index` are all subclasses of an +internal `Frame` class. The underlying data structure of `Frame` is +an ordered, dictionary-like object known as `ColumnAccessor`, which +can be accessed via the `._data` attribute: + +```python +>>> a = cudf.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c']}) +>>> a._data +ColumnAccessor(OrderedColumnDict([('x', ), ('y', )]), multiindex=False, level_names=(None,)) +``` + +ColumnAccessor is an ordered mapping of column labels to columns. In +addition to behaving like an OrderedDict, it supports things like +selecting multiple columns (both by index and label), as well as +hierarchical indexing. 
+ +```python +>>> from cudf.core.column_accessor import ColumnAccessor +``` + +The values of a ColumnAccessor are coerced to Columns during +construction: + +```python +>>> ca = ColumnAccessor({'x': [1, 2, 3], 'y': ['a', 'b', 'c']}) +>>> ca['x'] + +>>> ca['y'] + +>>> ca.pop('x') + +>>> ca +ColumnAccessor(OrderedColumnDict([('y', )]), multiindex=False, level_names=(None,)) +``` + +Columns can be inserted at a specified location: + +```python +>>> ca.insert('z', [3, 4, 5], loc=1) +>>> ca +ColumnAccessor(OrderedColumnDict([('x', ), ('z', ), ('y', )]), multiindex=False, level_names=(None,)) +``` + +Selecting columns by index: + +```python +>>> ca = ColumnAccessor({'x': [1, 2, 3], 'y': ['a', 'b', 'c'], 'z': [4, 5, 6]}) +>>> ca.select_by_index(1) +ColumnAccessor(OrderedColumnDict([('y', )]), multiindex=False, level_names=(None,)) +>>> ca.select_by_index([0, 1]) +ColumnAccessor(OrderedColumnDict([('x', ), ('y', )]), multiindex=False, level_names=(None,)) +>>> ca.select_by_index(slice(1, 3)) +ColumnAccessor(OrderedColumnDict([('y', ), ('z', )]), multiindex=False, level_names=(None,)) +``` + +Selecting columns by label: + +```python +>>> ca.select_by_label(['y', 'z']) +ColumnAccessor(OrderedColumnDict([('y', ), ('z', )]), multiindex=False, level_names=(None,)) +>>> ca.select_by_label(slice('x', 'y')) +ColumnAccessor(OrderedColumnDict([('x', ), ('y', )]), multiindex=False, level_names=(None,)) +``` + +A ColumnAccessor with tuple keys (and constructed with +`multiindex=True`) can be hierarchically indexed: + +```python +>>> ca = ColumnAccessor({('a', 'b'): [1, 2, 3], ('a', 'c'): [2, 3, 4], 'b': [4, 5, 6]}, multiindex=True) +>>> ca.select_by_label('a') +ColumnAccessor(OrderedColumnDict([('b', ), ('c', )]), multiindex=False, level_names=(None,)) +>>> ca.select_by_label(('a', 'b')) +ColumnAccessor(OrderedColumnDict([(('a', 'b'), )]), multiindex=False, level_names=(None,)) +``` + +"Wildcard" indexing is also allowed: + +```python +>>> ca = ColumnAccessor({('a', 'b'): [1, 2, 3], ('a', 'c'): [2, 3, 4], ('d', 'b'): [4, 5, 6]}, multiindex=True) +>>> ca.select_by_label((slice(None), 'b')) +ColumnAccessor(OrderedColumnDict([(('a', 'b'), ), (('d', 'b'), )]), multiindex=True, level_names=(None, None)) +``` + +Finally, ColumnAccessors can convert to Pandas `Index` or +`MultiIndex` objects: + +```python +>>> ca.to_pandas_index() +MultiIndex([('a', 'b'), + ('a', 'c'), + ('d', 'b')], + ) +``` diff --git a/docs/cudf/source/basics/io-supported-types.rst b/docs/cudf/source/user_guide/io.md similarity index 69% rename from docs/cudf/source/basics/io-supported-types.rst rename to docs/cudf/source/user_guide/io.md index 4a7da60fa85..672375eedaf 100644 --- a/docs/cudf/source/basics/io-supported-types.rst +++ b/docs/cudf/source/user_guide/io.md @@ -1,10 +1,17 @@ -I/O Supported dtypes -==================== +# Input / Output -The following table lists are compatible cudf types for each supported IO format. +This page contains Input / Output related APIs in cuDF. -.. rst-class:: io-supported-types-table special-table +## I/O Supported dtypes + +The following table lists are compatible cudf types for each supported +IO format. + +
+ +```{eval-rst} .. table:: + :class: io-supported-types-table special-table :widths: 15 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+-------------------+--------+--------+---------+---------+ @@ -64,7 +71,103 @@ The following table lists are compatible cudf types for each supported IO format +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ | decimal128 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ +``` + +
+ **Notes:** -* [¹] - Not GPU-accelerated. +- \[¹\] - Not GPU-accelerated. + +## GPUDirect Storage Integration + +Many IO APIs can use GPUDirect Storage (GDS) library to optimize IO +operations. GDS enables a direct data path for direct memory access +(DMA) transfers between GPU memory and storage, which avoids a bounce +buffer through the CPU. GDS also has a compatibility mode that allows +the library to fall back to copying through a CPU bounce buffer. The +SDK is available for download +[here](https://developer.nvidia.com/gpudirect-storage). GDS is also +included in CUDA Toolkit 11.4 and higher. + +Use of GPUDirect Storage in cuDF is enabled by default, but can be +disabled through the environment variable `LIBCUDF_CUFILE_POLICY`. +This variable also controls the GDS compatibility mode. + +There are four valid values for the environment variable: + +- "GDS": Enable GDS use; GDS compatibility mode is *off*. +- "ALWAYS": Enable GDS use; GDS compatibility mode is *on*. +- "KVIKIO": Enable GDS through [KvikIO](https://github.com/rapidsai/kvikio). +- "OFF": Completely disable GDS use. + +If no value is set, behavior will be the same as the "GDS" option. + +This environment variable also affects how cuDF treats GDS errors. + +- When `LIBCUDF_CUFILE_POLICY` is set to "GDS" and a GDS API call + fails for any reason, cuDF falls back to the internal implementation + with bounce buffers. +- When `LIBCUDF_CUFILE_POLICY` is set to "ALWAYS" and a GDS API call +fails for any reason (unlikely, given that the compatibility mode is +on), cuDF throws an exception to propagate the error to the user. +- When `LIBCUDF_CUFILE_POLICY` is set to "KVIKIO" and a KvikIO API + call fails for any reason (unlikely, given that KvikIO implements + its own compatibility mode) cuDF throws an exception to propagate + the error to the user. + +For more information about error handling, compatibility mode, and +tuning parameters in KvikIO see: + +Operations that support the use of GPUDirect Storage: + +- {py:func}`cudf.read_avro` +- {py:func}`cudf.read_parquet` +- {py:func}`cudf.read_orc` +- {py:meth}`cudf.DataFrame.to_csv` +- {py:meth}`cudf.DataFrame.to_parquet` +- {py:meth}`cudf.DataFrame.to_orc` + +Several parameters that can be used to tune the performance of +GDS-enabled I/O are exposed through environment variables: + +- `LIBCUDF_CUFILE_THREAD_COUNT`: Integral value, maximum number of + parallel reads/writes per file (default 16); +- `LIBCUDF_CUFILE_SLICE_SIZE`: Integral value, maximum size of each + GDS read/write, in bytes (default 4MB). Larger I/O operations are + split into multiple calls. + +## nvCOMP Integration + +Some types of compression/decompression can be performed using either +the [nvCOMP library](https://github.com/NVIDIA/nvcomp) or the internal +implementation. + +Which implementation is used by default depends on the data format and +the compression type. Behavior can be influenced through environment +variable `LIBCUDF_NVCOMP_POLICY`. + +There are three valid values for the environment variable: + +- "STABLE": Only enable the nvCOMP in places where it has been deemed + stable for production use. +- "ALWAYS": Enable all available uses of nvCOMP, including new, + experimental combinations. +- "OFF": Disable nvCOMP use whenever possible and use the internal + implementations instead. + +If no value is set, behavior will be the same as the "STABLE" option. + +```{eval-rst} +.. 
table:: Current policy for nvCOMP use for different types + :widths: 20 15 15 15 15 15 15 15 15 15 + + +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+ + | | CSV | Parquet | JSON | ORC | AVRO | + +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+ + | Compression Type | Writer | Reader | Writer | Reader | Writer¹ | Reader | Writer | Reader | Reader | + +=======================+========+========+========+========+=========+========+========+========+========+ + | snappy | ❌ | ❌ | Stable | Stable | ❌ | ❌ | Stable | Stable | ❌ | + +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+ +``` diff --git a/docs/cudf/source/user_guide/Working-with-missing-data.ipynb b/docs/cudf/source/user_guide/missing-data.ipynb similarity index 87% rename from docs/cudf/source/user_guide/Working-with-missing-data.ipynb rename to docs/cudf/source/user_guide/missing-data.ipynb index 54fe774060e..ad12c675373 100644 --- a/docs/cudf/source/user_guide/Working-with-missing-data.ipynb +++ b/docs/cudf/source/user_guide/missing-data.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "f8ffbea7", "metadata": {}, "source": [ "# Working with missing data" @@ -9,6 +10,7 @@ }, { "cell_type": "markdown", + "id": "7e3ab093", "metadata": {}, "source": [ "In this section, we will discuss missing (also referred to as `NA`) values in cudf. cudf supports having missing values in all dtypes. These missing values are represented by ``. These values are also referenced as \"null values\"." @@ -16,25 +18,7 @@ }, { "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. [How to Detect missing values](#How-to-Detect-missing-values)\n", - "2. [Float dtypes and missing data](#Float-dtypes-and-missing-data)\n", - "3. [Datetimes](#Datetimes)\n", - "4. [Calculations with missing data](#Calculations-with-missing-data)\n", - "5. [Sum/product of Null/nans](#Sum/product-of-Null/nans)\n", - "6. [NA values in GroupBy](#NA-values-in-GroupBy)\n", - "7. [Inserting missing data](#Inserting-missing-data)\n", - "8. [Filling missing values: fillna](#Filling-missing-values:-fillna)\n", - "9. [Filling with cudf Object](#Filling-with-cudf-Object)\n", - "10. [Dropping axis labels with missing data: dropna](#Dropping-axis-labels-with-missing-data:-dropna)\n", - "11. [Replacing generic values](#Replacing-generic-values)\n", - "12. [String/regular expression replacement](#String/regular-expression-replacement)\n", - "13. [Numeric replacement](#Numeric-replacement)" - ] - }, - { - "cell_type": "markdown", + "id": "8d657a82", "metadata": {}, "source": [ "## How to Detect missing values" @@ -42,6 +26,7 @@ }, { "cell_type": "markdown", + "id": "9ea9f672", "metadata": {}, "source": [ "To detect missing values, you can use `isna()` and `notna()` functions." 
@@ -50,6 +35,7 @@ { "cell_type": "code", "execution_count": 1, + "id": "58050adb", "metadata": {}, "outputs": [], "source": [ @@ -60,6 +46,7 @@ { "cell_type": "code", "execution_count": 2, + "id": "416d73da", "metadata": {}, "outputs": [], "source": [ @@ -69,6 +56,7 @@ { "cell_type": "code", "execution_count": 3, + "id": "5dfc6bc3", "metadata": {}, "outputs": [ { @@ -141,6 +129,7 @@ { "cell_type": "code", "execution_count": 4, + "id": "4d7f7a6d", "metadata": {}, "outputs": [ { @@ -213,6 +202,7 @@ { "cell_type": "code", "execution_count": 5, + "id": "40edca67", "metadata": {}, "outputs": [ { @@ -236,6 +226,7 @@ }, { "cell_type": "markdown", + "id": "acdf29d7", "metadata": {}, "source": [ "One has to be mindful that in Python (and NumPy), the nan's don’t compare equal, but None's do. Note that cudf/NumPy uses the fact that `np.nan != np.nan`, and treats `None` like `np.nan`." @@ -244,6 +235,7 @@ { "cell_type": "code", "execution_count": 6, + "id": "c269c1f5", "metadata": {}, "outputs": [ { @@ -264,6 +256,7 @@ { "cell_type": "code", "execution_count": 7, + "id": "99fb083a", "metadata": {}, "outputs": [ { @@ -283,22 +276,23 @@ }, { "cell_type": "markdown", + "id": "4fdb8bc7", "metadata": {}, "source": [ - "So as compared to above, a scalar equality comparison versus a None/np.nan doesn’t provide useful information.\n", - "\n" + "So as compared to above, a scalar equality comparison versus a None/np.nan doesn’t provide useful information." ] }, { "cell_type": "code", "execution_count": 8, + "id": "630ef6bb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 False\n", - "1 False\n", + "1 \n", "2 False\n", "3 False\n", "Name: b, dtype: bool" @@ -316,6 +310,7 @@ { "cell_type": "code", "execution_count": 9, + "id": "8162e383", "metadata": {}, "outputs": [], "source": [ @@ -325,6 +320,7 @@ { "cell_type": "code", "execution_count": 10, + "id": "199775b3", "metadata": {}, "outputs": [ { @@ -348,14 +344,15 @@ { "cell_type": "code", "execution_count": 11, + "id": "cd09d80c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 False\n", - "1 False\n", - "2 False\n", + "0 \n", + "1 \n", + "2 \n", "dtype: bool" ] }, @@ -371,6 +368,7 @@ { "cell_type": "code", "execution_count": 12, + "id": "6b23bb0c", "metadata": {}, "outputs": [], "source": [ @@ -380,6 +378,7 @@ { "cell_type": "code", "execution_count": 13, + "id": "cafb79ee", "metadata": {}, "outputs": [ { @@ -403,6 +402,7 @@ { "cell_type": "code", "execution_count": 14, + "id": "13363897", "metadata": {}, "outputs": [ { @@ -425,6 +425,7 @@ }, { "cell_type": "markdown", + "id": "208a3776", "metadata": {}, "source": [ "## Float dtypes and missing data" @@ -432,16 +433,18 @@ }, { "cell_type": "markdown", + "id": "2c174b88", "metadata": {}, "source": [ "Because ``NaN`` is a float, a column of integers with even one missing values is cast to floating-point dtype. However this doesn't happen by default.\n", "\n", - "By default if a ``NaN`` value is passed to `Series` constructor, it is treated as `` value. " + "By default if a ``NaN`` value is passed to `Series` constructor, it is treated as `` value." ] }, { "cell_type": "code", "execution_count": 15, + "id": "c59c3c54", "metadata": {}, "outputs": [ { @@ -464,6 +467,7 @@ }, { "cell_type": "markdown", + "id": "a9eb2d9c", "metadata": {}, "source": [ "Hence to consider a ``NaN`` as ``NaN`` you will have to pass `nan_as_null=False` parameter into `Series` constructor." 
@@ -472,6 +476,7 @@ { "cell_type": "code", "execution_count": 16, + "id": "ecc5ae92", "metadata": {}, "outputs": [ { @@ -494,6 +499,7 @@ }, { "cell_type": "markdown", + "id": "d1db7b08", "metadata": {}, "source": [ "## Datetimes" @@ -501,15 +507,16 @@ }, { "cell_type": "markdown", + "id": "548d3734", "metadata": {}, "source": [ - "For `datetime64` types, cudf doesn't support having `NaT` values. Instead these values which are specific to numpy and pandas are considered as null values(``) in cudf. The actual underlying value of `NaT` is `min(int64)` and cudf retains the underlying value when converting a cudf object to pandas object.\n", - "\n" + "For `datetime64` types, cudf doesn't support having `NaT` values. Instead these values which are specific to numpy and pandas are considered as null values(``) in cudf. The actual underlying value of `NaT` is `min(int64)` and cudf retains the underlying value when converting a cudf object to pandas object." ] }, { "cell_type": "code", "execution_count": 17, + "id": "de70f244", "metadata": {}, "outputs": [ { @@ -535,6 +542,7 @@ { "cell_type": "code", "execution_count": 18, + "id": "8411a914", "metadata": {}, "outputs": [ { @@ -557,6 +565,7 @@ }, { "cell_type": "markdown", + "id": "df664145", "metadata": {}, "source": [ "any operations on rows having `` values in `datetime` column will result in `` value at the same location in resulting column:" @@ -565,6 +574,7 @@ { "cell_type": "code", "execution_count": 19, + "id": "829c32d0", "metadata": {}, "outputs": [ { @@ -587,6 +597,7 @@ }, { "cell_type": "markdown", + "id": "aa8031ef", "metadata": {}, "source": [ "## Calculations with missing data" @@ -594,6 +605,7 @@ }, { "cell_type": "markdown", + "id": "c587fae2", "metadata": {}, "source": [ "Null values propagate naturally through arithmetic operations between pandas objects." @@ -602,6 +614,7 @@ { "cell_type": "code", "execution_count": 20, + "id": "f8f2aec7", "metadata": {}, "outputs": [], "source": [ @@ -611,6 +624,7 @@ { "cell_type": "code", "execution_count": 21, + "id": "0c8a3011", "metadata": {}, "outputs": [], "source": [ @@ -620,6 +634,7 @@ { "cell_type": "code", "execution_count": 22, + "id": "052f6c2b", "metadata": {}, "outputs": [ { @@ -698,6 +713,7 @@ { "cell_type": "code", "execution_count": 23, + "id": "0fb0a083", "metadata": {}, "outputs": [ { @@ -776,6 +792,7 @@ { "cell_type": "code", "execution_count": 24, + "id": "6f8152c0", "metadata": {}, "outputs": [ { @@ -853,6 +870,7 @@ }, { "cell_type": "markdown", + "id": "11170d49", "metadata": {}, "source": [ "While summing the data along a series, `NA` values will be treated as `0`." 
@@ -861,6 +879,7 @@ { "cell_type": "code", "execution_count": 25, + "id": "45081790", "metadata": {}, "outputs": [ { @@ -886,6 +905,7 @@ { "cell_type": "code", "execution_count": 26, + "id": "39922658", "metadata": {}, "outputs": [ { @@ -905,6 +925,7 @@ }, { "cell_type": "markdown", + "id": "6e99afe0", "metadata": {}, "source": [ "Since `NA` values are treated as `0`, the mean would result to 2 in this case `(1 + 0 + 2 + 3 + 0)/5 = 2`" @@ -913,6 +934,7 @@ { "cell_type": "code", "execution_count": 27, + "id": "b2f16ddb", "metadata": {}, "outputs": [ { @@ -932,6 +954,7 @@ }, { "cell_type": "markdown", + "id": "07f2ec5a", "metadata": {}, "source": [ "To preserve `NA` values in the above calculations, `sum` & `mean` support `skipna` parameter.\n", @@ -942,6 +965,7 @@ { "cell_type": "code", "execution_count": 28, + "id": "d4a463a0", "metadata": {}, "outputs": [ { @@ -962,6 +986,7 @@ { "cell_type": "code", "execution_count": 29, + "id": "a944c42e", "metadata": {}, "outputs": [ { @@ -981,6 +1006,7 @@ }, { "cell_type": "markdown", + "id": "fb8c8f18", "metadata": {}, "source": [ "Cumulative methods like `cumsum` and `cumprod` ignore `NA` values by default." @@ -989,6 +1015,7 @@ { "cell_type": "code", "execution_count": 30, + "id": "4f2a7306", "metadata": {}, "outputs": [ { @@ -1013,6 +1040,7 @@ }, { "cell_type": "markdown", + "id": "c8f6054b", "metadata": {}, "source": [ "To preserve `NA` values in cumulative methods, provide `skipna=False`." @@ -1021,6 +1049,7 @@ { "cell_type": "code", "execution_count": 31, + "id": "d4c46776", "metadata": {}, "outputs": [ { @@ -1045,6 +1074,7 @@ }, { "cell_type": "markdown", + "id": "67077d65", "metadata": {}, "source": [ "## Sum/product of Null/nans" @@ -1052,6 +1082,7 @@ }, { "cell_type": "markdown", + "id": "ffbb9ca1", "metadata": {}, "source": [ "The sum of an empty or all-NA Series of a DataFrame is 0." @@ -1060,6 +1091,7 @@ { "cell_type": "code", "execution_count": 32, + "id": "f430c9ce", "metadata": {}, "outputs": [ { @@ -1080,6 +1112,7 @@ { "cell_type": "code", "execution_count": 33, + "id": "7fde514b", "metadata": {}, "outputs": [ { @@ -1100,6 +1133,7 @@ { "cell_type": "code", "execution_count": 34, + "id": "56cedd17", "metadata": {}, "outputs": [ { @@ -1119,6 +1153,7 @@ }, { "cell_type": "markdown", + "id": "cb188adb", "metadata": {}, "source": [ "The product of an empty or all-NA Series of a DataFrame is 1." @@ -1127,6 +1162,7 @@ { "cell_type": "code", "execution_count": 35, + "id": "d20bbbef", "metadata": {}, "outputs": [ { @@ -1147,6 +1183,7 @@ { "cell_type": "code", "execution_count": 36, + "id": "75abbcfa", "metadata": {}, "outputs": [ { @@ -1167,6 +1204,7 @@ { "cell_type": "code", "execution_count": 37, + "id": "becce0cc", "metadata": {}, "outputs": [ { @@ -1186,6 +1224,7 @@ }, { "cell_type": "markdown", + "id": "0e899e03", "metadata": {}, "source": [ "## NA values in GroupBy" @@ -1193,6 +1232,7 @@ }, { "cell_type": "markdown", + "id": "7fb20874", "metadata": {}, "source": [ "`NA` groups in GroupBy are automatically excluded. 
For example:" @@ -1201,6 +1241,7 @@ { "cell_type": "code", "execution_count": 38, + "id": "1379037c", "metadata": {}, "outputs": [ { @@ -1279,6 +1320,7 @@ { "cell_type": "code", "execution_count": 39, + "id": "d6b91e6f", "metadata": {}, "outputs": [ { @@ -1345,6 +1387,7 @@ }, { "cell_type": "markdown", + "id": "cb83fb11", "metadata": {}, "source": [ "It is also possible to include `NA` in groups by passing `dropna=False`" @@ -1353,9 +1396,8 @@ { "cell_type": "code", "execution_count": 40, - "metadata": { - "scrolled": true - }, + "id": "768c3e50", + "metadata": {}, "outputs": [ { "data": { @@ -1426,6 +1468,7 @@ }, { "cell_type": "markdown", + "id": "133816b4", "metadata": {}, "source": [ "## Inserting missing data" @@ -1433,6 +1476,7 @@ }, { "cell_type": "markdown", + "id": "306082ad", "metadata": {}, "source": [ "All dtypes support insertion of missing value by assignment. Any specific location in series can made null by assigning it to `None`." @@ -1441,6 +1485,7 @@ { "cell_type": "code", "execution_count": 41, + "id": "7ddde1fe", "metadata": {}, "outputs": [], "source": [ @@ -1450,6 +1495,7 @@ { "cell_type": "code", "execution_count": 42, + "id": "16e54597", "metadata": {}, "outputs": [ { @@ -1474,6 +1520,7 @@ { "cell_type": "code", "execution_count": 43, + "id": "f628f94d", "metadata": {}, "outputs": [], "source": [ @@ -1483,9 +1530,8 @@ { "cell_type": "code", "execution_count": 44, - "metadata": { - "scrolled": true - }, + "id": "b30590b7", + "metadata": {}, "outputs": [ { "data": { @@ -1508,6 +1554,7 @@ }, { "cell_type": "markdown", + "id": "a1b123d0", "metadata": {}, "source": [ "## Filling missing values: fillna" @@ -1515,6 +1562,7 @@ }, { "cell_type": "markdown", + "id": "114aa23a", "metadata": {}, "source": [ "`fillna()` can fill in `NA` & `NaN` values with non-NA data." @@ -1523,6 +1571,7 @@ { "cell_type": "code", "execution_count": 45, + "id": "59e22668", "metadata": {}, "outputs": [ { @@ -1601,6 +1650,7 @@ { "cell_type": "code", "execution_count": 46, + "id": "05c221ee", "metadata": {}, "outputs": [ { @@ -1625,6 +1675,7 @@ }, { "cell_type": "markdown", + "id": "401f91b2", "metadata": {}, "source": [ "## Filling with cudf Object" @@ -1632,6 +1683,7 @@ }, { "cell_type": "markdown", + "id": "e79346d6", "metadata": {}, "source": [ "You can also fillna using a dict or Series that is alignable. The labels of the dict or index of the Series must match the columns of the frame you wish to fill. The use case of this is to fill a DataFrame with the mean of that column." @@ -1640,6 +1692,7 @@ { "cell_type": "code", "execution_count": 47, + "id": "f52c5d8f", "metadata": {}, "outputs": [], "source": [ @@ -1650,6 +1703,7 @@ { "cell_type": "code", "execution_count": 48, + "id": "6affebe9", "metadata": {}, "outputs": [], "source": [ @@ -1659,6 +1713,7 @@ { "cell_type": "code", "execution_count": 49, + "id": "1ce1b96f", "metadata": {}, "outputs": [], "source": [ @@ -1668,6 +1723,7 @@ { "cell_type": "code", "execution_count": 50, + "id": "90829195", "metadata": {}, "outputs": [], "source": [ @@ -1677,6 +1733,7 @@ { "cell_type": "code", "execution_count": 51, + "id": "c0feac14", "metadata": {}, "outputs": [ { @@ -1708,63 +1765,63 @@ "
\n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", "
\n", - " Comm: tcp://127.0.0.1:44033\n", + " Comm: tcp://127.0.0.1:40519\n", " \n", " Total threads: 1\n", @@ -6201,7 +6355,7 @@ "
\n", - " Dashboard: http://127.0.0.1:45225/status\n", + " Dashboard: http://127.0.0.1:40951/status\n", " \n", " Memory: 62.82 GiB\n", @@ -6209,13 +6363,13 @@ "
\n", - " Nanny: tcp://127.0.0.1:46529\n", + " Nanny: tcp://127.0.0.1:39133\n", "
\n", - " Local directory: /home/mmccarty/sandbox/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-zlsacw8_\n", + " Local directory: /home/ashwin/workspace/rapids/cudf/docs/cudf/source/user_guide/dask-worker-space/worker-3v0c20ux\n", "
00.0000000.00.00.0000000.09.374760.0000000.00.00.00.0000006.2378590.00.00.0000000.00.00.00.000000.0000000.00.00.0000000.00.011.308953
10.0000000.00.00.0000000.00.000000.0000000.00.00.0000000.00.0000000.00.00.065878-5.2412970.00.00.017.584760.0000000.00.012.357050.00.00.000000
23.2327510.00.00.0000000.00.000008.3419150.00.00.0000000.00.0000000.00.00.0000000.00.00.00.000000.0000000.00.00.03.1103620.00.000000
30.0000000.00.00.0000000.00.000000.0000000.00.00.0000000.00.0000000.00.00.0000000.00.00.00.0000010.8692790.00.00.00.00.000000
40.0000000.00.07.7430240.00.000000.0000000.00.05.9870980.0000000.02.5262740.00.00.0000000.00.00.00.000000.0000000.00.00.00.00.000000
[HTML table residue (tags stripped, old/new cell values fused); the same before/after values appear readably in the text/plain diff below]
\n", @@ -1772,16 +1829,16 @@ ], "text/plain": [ " A B C\n", - "0 0.771245 0.051024 1.199239\n", - "1 -1.168041 0.702664 -0.270806\n", - "2 -1.467009 -0.143080 -0.806151\n", - "3 NaN -0.610798 -0.272895\n", - "4 NaN NaN 1.396784\n", - "5 -0.439343 NaN NaN\n", - "6 1.093102 -0.764758 NaN\n", - "7 0.003098 -0.722648 NaN\n", - "8 -0.095899 -1.285156 -0.300566\n", - "9 0.109465 2.497843 -1.199856" + "0 -0.408268 -0.676643 -1.274743\n", + "1 -0.029322 -0.873593 -1.214105\n", + "2 -0.866371 1.081735 -0.226840\n", + "3 NaN 0.812278 1.074973\n", + "4 NaN NaN -0.366725\n", + "5 -1.016239 NaN NaN\n", + "6 0.675123 1.067536 NaN\n", + "7 0.221568 2.025961 NaN\n", + "8 -0.317241 1.011275 0.674891\n", + "9 -0.877041 -1.919394 -1.029201" ] }, "execution_count": 51, @@ -1796,6 +1853,7 @@ { "cell_type": "code", "execution_count": 52, + "id": "a07c1260", "metadata": {}, "outputs": [ { @@ -1827,63 +1885,63 @@ "
[HTML table residue elided; the before/after values are in the text/plain diff below]
\n", @@ -1891,16 +1949,16 @@ ], "text/plain": [ " A B C\n", - "0 0.771245 0.051024 1.199239\n", - "1 -1.168041 0.702664 -0.270806\n", - "2 -1.467009 -0.143080 -0.806151\n", - "3 -0.149173 -0.610798 -0.272895\n", - "4 -0.149173 -0.034364 1.396784\n", - "5 -0.439343 -0.034364 -0.036322\n", - "6 1.093102 -0.764758 -0.036322\n", - "7 0.003098 -0.722648 -0.036322\n", - "8 -0.095899 -1.285156 -0.300566\n", - "9 0.109465 2.497843 -1.199856" + "0 -0.408268 -0.676643 -1.274743\n", + "1 -0.029322 -0.873593 -1.214105\n", + "2 -0.866371 1.081735 -0.226840\n", + "3 -0.327224 0.812278 1.074973\n", + "4 -0.327224 0.316145 -0.366725\n", + "5 -1.016239 0.316145 -0.337393\n", + "6 0.675123 1.067536 -0.337393\n", + "7 0.221568 2.025961 -0.337393\n", + "8 -0.317241 1.011275 0.674891\n", + "9 -0.877041 -1.919394 -1.029201" ] }, "execution_count": 52, @@ -1915,6 +1973,7 @@ { "cell_type": "code", "execution_count": 53, + "id": "9e70d61a", "metadata": {}, "outputs": [ { @@ -1946,63 +2005,63 @@ "
[HTML table residue elided; see the text/plain diff below for the values]
\n", @@ -2010,16 +2069,16 @@ ], "text/plain": [ " A B C\n", - "0 0.771245 0.051024 1.199239\n", - "1 -1.168041 0.702664 -0.270806\n", - "2 -1.467009 -0.143080 -0.806151\n", - "3 NaN -0.610798 -0.272895\n", - "4 NaN -0.034364 1.396784\n", - "5 -0.439343 -0.034364 -0.036322\n", - "6 1.093102 -0.764758 -0.036322\n", - "7 0.003098 -0.722648 -0.036322\n", - "8 -0.095899 -1.285156 -0.300566\n", - "9 0.109465 2.497843 -1.199856" + "0 -0.408268 -0.676643 -1.274743\n", + "1 -0.029322 -0.873593 -1.214105\n", + "2 -0.866371 1.081735 -0.226840\n", + "3 NaN 0.812278 1.074973\n", + "4 NaN 0.316145 -0.366725\n", + "5 -1.016239 0.316145 -0.337393\n", + "6 0.675123 1.067536 -0.337393\n", + "7 0.221568 2.025961 -0.337393\n", + "8 -0.317241 1.011275 0.674891\n", + "9 -0.877041 -1.919394 -1.029201" ] }, "execution_count": 53, @@ -2033,6 +2092,7 @@ }, { "cell_type": "markdown", + "id": "0ace728d", "metadata": {}, "source": [ "## Dropping axis labels with missing data: dropna" @@ -2040,15 +2100,16 @@ }, { "cell_type": "markdown", + "id": "2ccd7115", "metadata": {}, "source": [ - "Missing data can be excluded using `dropna()`:\n", - "\n" + "Missing data can be excluded using `dropna()`:" ] }, { "cell_type": "code", "execution_count": 54, + "id": "98c57be7", "metadata": {}, "outputs": [ { @@ -2127,6 +2188,7 @@ { "cell_type": "code", "execution_count": 55, + "id": "bc3f273a", "metadata": {}, "outputs": [ { @@ -2187,6 +2249,7 @@ { "cell_type": "code", "execution_count": 56, + "id": "a48d4de0", "metadata": {}, "outputs": [ { @@ -2249,14 +2312,16 @@ }, { "cell_type": "markdown", + "id": "0b1954f9", "metadata": {}, "source": [ - "An equivalent `dropna()` is available for Series. " + "An equivalent `dropna()` is available for Series." ] }, { "cell_type": "code", "execution_count": 57, + "id": "2dd8f660", "metadata": {}, "outputs": [ { @@ -2279,6 +2344,7 @@ }, { "cell_type": "markdown", + "id": "121eb6d7", "metadata": {}, "source": [ "## Replacing generic values" @@ -2286,6 +2352,7 @@ }, { "cell_type": "markdown", + "id": "3cc4c5f1", "metadata": {}, "source": [ "Often times we want to replace arbitrary values with other values.\n", @@ -2296,6 +2363,7 @@ { "cell_type": "code", "execution_count": 58, + "id": "e6c14e8a", "metadata": {}, "outputs": [], "source": [ @@ -2305,6 +2373,7 @@ { "cell_type": "code", "execution_count": 59, + "id": "a852f0cb", "metadata": {}, "outputs": [ { @@ -2330,6 +2399,7 @@ { "cell_type": "code", "execution_count": 60, + "id": "f6ac12eb", "metadata": {}, "outputs": [ { @@ -2354,6 +2424,7 @@ }, { "cell_type": "markdown", + "id": "a6e1b6d7", "metadata": {}, "source": [ "We can also replace any value with a `` value." 
@@ -2362,6 +2433,7 @@ { "cell_type": "code", "execution_count": 61, + "id": "f0156bff", "metadata": {}, "outputs": [ { @@ -2386,6 +2458,7 @@ }, { "cell_type": "markdown", + "id": "6673eefb", "metadata": {}, "source": [ "You can replace a list of values by a list of other values:" @@ -2394,6 +2467,7 @@ { "cell_type": "code", "execution_count": 62, + "id": "f3110f5b", "metadata": {}, "outputs": [ { @@ -2418,6 +2492,7 @@ }, { "cell_type": "markdown", + "id": "61521e8b", "metadata": {}, "source": [ "You can also specify a mapping dict:" @@ -2426,6 +2501,7 @@ { "cell_type": "code", "execution_count": 63, + "id": "45862d05", "metadata": {}, "outputs": [ { @@ -2450,6 +2526,7 @@ }, { "cell_type": "markdown", + "id": "04a34549", "metadata": {}, "source": [ "For a DataFrame, you can specify individual values by column:" @@ -2458,6 +2535,7 @@ { "cell_type": "code", "execution_count": 64, + "id": "348caa64", "metadata": {}, "outputs": [], "source": [ @@ -2467,6 +2545,7 @@ { "cell_type": "code", "execution_count": 65, + "id": "cca41ec4", "metadata": {}, "outputs": [ { @@ -2545,6 +2624,7 @@ { "cell_type": "code", "execution_count": 66, + "id": "64334693", "metadata": {}, "outputs": [ { @@ -2622,6 +2702,7 @@ }, { "cell_type": "markdown", + "id": "2f0ceec7", "metadata": {}, "source": [ "## String/regular expression replacement" @@ -2629,6 +2710,7 @@ }, { "cell_type": "markdown", + "id": "c6f44740", "metadata": {}, "source": [ "cudf supports replacing string values using `replace` API:" @@ -2637,6 +2719,7 @@ { "cell_type": "code", "execution_count": 67, + "id": "031d3533", "metadata": {}, "outputs": [], "source": [ @@ -2646,6 +2729,7 @@ { "cell_type": "code", "execution_count": 68, + "id": "12b41efb", "metadata": {}, "outputs": [], "source": [ @@ -2655,6 +2739,7 @@ { "cell_type": "code", "execution_count": 69, + "id": "d450df49", "metadata": {}, "outputs": [ { @@ -2732,6 +2817,7 @@ { "cell_type": "code", "execution_count": 70, + "id": "f823bc46", "metadata": {}, "outputs": [ { @@ -2809,6 +2895,7 @@ { "cell_type": "code", "execution_count": 71, + "id": "bc52f6e9", "metadata": {}, "outputs": [ { @@ -2885,14 +2972,16 @@ }, { "cell_type": "markdown", + "id": "7c1087be", "metadata": {}, "source": [ - "Replace a few different values (list -> list):\n" + "Replace a few different values (list -> list):" ] }, { "cell_type": "code", "execution_count": 72, + "id": "7e23eba9", "metadata": {}, "outputs": [ { @@ -2969,6 +3058,7 @@ }, { "cell_type": "markdown", + "id": "42845a9c", "metadata": {}, "source": [ "Only search in column 'b' (dict -> dict):" @@ -2977,6 +3067,7 @@ { "cell_type": "code", "execution_count": 73, + "id": "d2e79805", "metadata": {}, "outputs": [ { @@ -3053,6 +3144,7 @@ }, { "cell_type": "markdown", + "id": "774b42a6", "metadata": {}, "source": [ "## Numeric replacement" @@ -3060,6 +3152,7 @@ }, { "cell_type": "markdown", + "id": "1c1926ac", "metadata": {}, "source": [ "`replace()` can also be used similar to `fillna()`." 
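For instance, a sketch with a hypothetical sentinel value, treating `replace()` as a `fillna()`-style substitution:

```python
import cudf

s = cudf.Series([1.0, -999.0, 3.0, -999.0])
# treat the sentinel -999.0 as "missing" and substitute it, fillna-style
print(s.replace(-999.0, 0.0))
```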
@@ -3068,6 +3161,7 @@ { "cell_type": "code", "execution_count": 74, + "id": "355a2f0d", "metadata": {}, "outputs": [], "source": [ @@ -3077,6 +3171,7 @@ { "cell_type": "code", "execution_count": 75, + "id": "d9eed372", "metadata": {}, "outputs": [], "source": [ @@ -3086,6 +3181,7 @@ { "cell_type": "code", "execution_count": 76, + "id": "ae944244", "metadata": {}, "outputs": [ { @@ -3116,70 +3212,70 @@ " \n", " \n", " 0\n", - " <NA>\n", - " <NA>\n", + " -0.089358787\n", + " -0.728419386\n", " \n", " \n", " 1\n", - " <NA>\n", - " <NA>\n", + " -2.141612003\n", + " -0.574415182\n", " \n", " \n", " 2\n", - " 0.123160746\n", - " 1.09464783\n", + " <NA>\n", + " <NA>\n", " \n", " \n", " 3\n", - " <NA>\n", - " <NA>\n", + " 0.774643462\n", + " 2.07287721\n", " \n", " \n", " 4\n", - " <NA>\n", - " <NA>\n", + " 0.93799853\n", + " -1.054129436\n", " \n", " \n", " 5\n", - " 0.68137677\n", - " -0.357346253\n", + " <NA>\n", + " <NA>\n", " \n", " \n", " 6\n", - " <NA>\n", - " <NA>\n", + " -0.435293012\n", + " 1.163009584\n", " \n", " \n", " 7\n", - " <NA>\n", - " <NA>\n", + " 1.346623287\n", + " 0.31961371\n", " \n", " \n", " 8\n", - " 1.173285961\n", - " -0.968616065\n", + " <NA>\n", + " <NA>\n", " \n", " \n", " 9\n", - " 0.147922362\n", - " -0.154880098\n", + " <NA>\n", + " <NA>\n", " \n", " \n", "\n", "" ], "text/plain": [ - " 0 1\n", - "0 \n", - "1 \n", - "2 0.123160746 1.09464783\n", - "3 \n", - "4 \n", - "5 0.68137677 -0.357346253\n", - "6 \n", - "7 \n", - "8 1.173285961 -0.968616065\n", - "9 0.147922362 -0.154880098" + " 0 1\n", + "0 -0.089358787 -0.728419386\n", + "1 -2.141612003 -0.574415182\n", + "2 \n", + "3 0.774643462 2.07287721\n", + "4 0.93799853 -1.054129436\n", + "5 \n", + "6 -0.435293012 1.163009584\n", + "7 1.346623287 0.31961371\n", + "8 \n", + "9 " ] }, "execution_count": 76, @@ -3193,15 +3289,16 @@ }, { "cell_type": "markdown", + "id": "0f32607c", "metadata": {}, "source": [ - "Replacing more than one value is possible by passing a list.\n", - "\n" + "Replacing more than one value is possible by passing a list." 
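A minimal sketch with hypothetical data; old values and their replacements pair up positionally:

```python
import cudf

df = cudf.DataFrame({"x": [0.0, 1.0, 2.0], "y": [1.0, 0.0, 3.0]})
# 0.0 -> 5.0 and 1.0 -> 10.0, applied across all columns
print(df.replace([0.0, 1.0], [5.0, 10.0]))
```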
] }, { "cell_type": "code", "execution_count": 77, + "id": "59b81c60", "metadata": {}, "outputs": [], "source": [ @@ -3211,6 +3308,7 @@ { "cell_type": "code", "execution_count": 78, + "id": "01a71d4c", "metadata": {}, "outputs": [ { @@ -3241,70 +3339,70 @@ " \n", " \n", " 0\n", - " 5.000000\n", - " 5.000000\n", + " 10.000000\n", + " -0.728419\n", " \n", " \n", " 1\n", - " 5.000000\n", - " 5.000000\n", + " -2.141612\n", + " -0.574415\n", " \n", " \n", " 2\n", - " 0.123161\n", - " 1.094648\n", + " 5.000000\n", + " 5.000000\n", " \n", " \n", " 3\n", - " 5.000000\n", - " 5.000000\n", + " 0.774643\n", + " 2.072877\n", " \n", " \n", " 4\n", - " 5.000000\n", - " 5.000000\n", + " 0.937999\n", + " -1.054129\n", " \n", " \n", " 5\n", - " 0.681377\n", - " -0.357346\n", + " 5.000000\n", + " 5.000000\n", " \n", " \n", " 6\n", - " 5.000000\n", - " 5.000000\n", + " -0.435293\n", + " 1.163010\n", " \n", " \n", " 7\n", - " 5.000000\n", - " 5.000000\n", + " 1.346623\n", + " 0.319614\n", " \n", " \n", " 8\n", - " 1.173286\n", - " -0.968616\n", + " 5.000000\n", + " 5.000000\n", " \n", " \n", " 9\n", - " 0.147922\n", - " -0.154880\n", + " 5.000000\n", + " 5.000000\n", " \n", " \n", "\n", "" ], "text/plain": [ - " 0 1\n", - "0 5.000000 5.000000\n", - "1 5.000000 5.000000\n", - "2 0.123161 1.094648\n", - "3 5.000000 5.000000\n", - "4 5.000000 5.000000\n", - "5 0.681377 -0.357346\n", - "6 5.000000 5.000000\n", - "7 5.000000 5.000000\n", - "8 1.173286 -0.968616\n", - "9 0.147922 -0.154880" + " 0 1\n", + "0 10.000000 -0.728419\n", + "1 -2.141612 -0.574415\n", + "2 5.000000 5.000000\n", + "3 0.774643 2.072877\n", + "4 0.937999 -1.054129\n", + "5 5.000000 5.000000\n", + "6 -0.435293 1.163010\n", + "7 1.346623 0.319614\n", + "8 5.000000 5.000000\n", + "9 5.000000 5.000000" ] }, "execution_count": 78, @@ -3318,15 +3416,16 @@ }, { "cell_type": "markdown", + "id": "1080e97b", "metadata": {}, "source": [ - "You can also operate on the DataFrame in place:\n", - "\n" + "You can also operate on the DataFrame in place:" ] }, { "cell_type": "code", "execution_count": 79, + "id": "5f0859d7", "metadata": {}, "outputs": [], "source": [ @@ -3336,6 +3435,7 @@ { "cell_type": "code", "execution_count": 80, + "id": "5cf28369", "metadata": {}, "outputs": [ { @@ -3366,70 +3466,70 @@ " \n", " \n", " 0\n", - " <NA>\n", - " <NA>\n", + " -0.089358787\n", + " -0.728419386\n", " \n", " \n", " 1\n", - " <NA>\n", - " <NA>\n", + " -2.141612003\n", + " -0.574415182\n", " \n", " \n", " 2\n", - " 0.123160746\n", - " 1.09464783\n", + " <NA>\n", + " <NA>\n", " \n", " \n", " 3\n", - " <NA>\n", - " <NA>\n", + " 0.774643462\n", + " 2.07287721\n", " \n", " \n", " 4\n", - " <NA>\n", - " <NA>\n", + " 0.93799853\n", + " -1.054129436\n", " \n", " \n", " 5\n", - " 0.68137677\n", - " -0.357346253\n", + " <NA>\n", + " <NA>\n", " \n", " \n", " 6\n", - " <NA>\n", - " <NA>\n", + " -0.435293012\n", + " 1.163009584\n", " \n", " \n", " 7\n", - " <NA>\n", - " <NA>\n", + " 1.346623287\n", + " 0.31961371\n", " \n", " \n", " 8\n", - " 1.173285961\n", - " -0.968616065\n", + " <NA>\n", + " <NA>\n", " \n", " \n", " 9\n", - " 0.147922362\n", - " -0.154880098\n", + " <NA>\n", + " <NA>\n", " \n", " \n", "\n", "" ], "text/plain": [ - " 0 1\n", - "0 \n", - "1 \n", - "2 0.123160746 1.09464783\n", - "3 \n", - "4 \n", - "5 0.68137677 -0.357346253\n", - "6 \n", - "7 \n", - "8 1.173285961 -0.968616065\n", - "9 0.147922362 -0.154880098" + " 0 1\n", + "0 -0.089358787 -0.728419386\n", + "1 -2.141612003 -0.574415182\n", + "2 \n", + "3 0.774643462 2.07287721\n", + "4 0.93799853 
-1.054129436\n", + "5 \n", + "6 -0.435293012 1.163009584\n", + "7 1.346623287 0.31961371\n", + "8 \n", + "9 " ] }, "execution_count": 80, @@ -3444,7 +3544,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -3458,9 +3558,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.9" + "version": "3.8.13" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } diff --git a/java/ci/Dockerfile.centos7 b/java/ci/Dockerfile.centos7 index dc8c0e4a95b..7993804554d 100644 --- a/java/ci/Dockerfile.centos7 +++ b/java/ci/Dockerfile.centos7 @@ -26,8 +26,9 @@ ARG CUDA_VERSION=11.5.0 FROM gpuci/cuda:$CUDA_VERSION-devel-centos7 ### Install basic requirements +ARG DEVTOOLSET_VERSION=9 RUN yum install -y centos-release-scl -RUN yum install -y devtoolset-9 epel-release +RUN yum install -y devtoolset-${DEVTOOLSET_VERSION} epel-release RUN yum install -y git zlib-devel maven tar wget patch ninja-build ## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins @@ -37,4 +38,21 @@ ARG CMAKE_VERSION=3.22.3 RUN cd /usr/local/ && wget --quiet https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \ tar zxf cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \ rm cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz + ENV PATH /usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:$PATH + +ARG CCACHE_VERSION=4.6 +RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}.tar.gz && \ + tar zxf ccache-${CCACHE_VERSION}.tar.gz && \ + rm ccache-${CCACHE_VERSION}.tar.gz && \ + cd ccache-${CCACHE_VERSION} && \ + mkdir build && \ + cd build && \ + scl enable devtoolset-${DEVTOOLSET_VERSION} \ + "cmake .. \ + -DCMAKE_BUILD_TYPE=Release \ + -DZSTD_FROM_INTERNET=ON \ + -DREDIS_STORAGE_BACKEND=OFF && \ + cmake --build . --parallel ${PARALLEL_LEVEL} --target install" && \ + cd ../.. && \ + rm -rf ccache-${CCACHE_VERSION} diff --git a/java/pom.xml b/java/pom.xml index 50b6ca59440..31a79ec9801 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -173,6 +173,7 @@ 0.15.1 4 + @@ -382,6 +383,7 @@ failonerror="true" executable="cmake"> + diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index cc1bc35f951..e871da18966 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -233,8 +233,10 @@ public final ColumnView getChildColumnView(int childIndex) { /** * Get a ColumnView that is the offsets for this list. + * Please note that it is the responsibility of the caller to close this view, and the parent + * column must out live this view. 
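+   * <p>
+   * Usage sketch (the list column {@code listCol} is hypothetical). ColumnView
+   * is AutoCloseable, so try-with-resources closes the returned view; the
+   * caller must still keep {@code listCol} open while the view is in use:
+   * <pre>
+   *   try (ColumnView offsets = listCol.getListOffsetsView()) {
+   *     // read offsets here; listCol must outlive this block
+   *   }
+   * </pre>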
*/ - ColumnView getListOffsetsView() { + public ColumnView getListOffsetsView() { assert(getType().equals(DType.LIST)); return new ColumnView(getListOffsetCvPointer(viewHandle)); } diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java index 763ecc763a5..8b1a9a63131 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java @@ -448,11 +448,8 @@ public HostColumnVector.StructData getStruct(int rowIndex) { * @return true if null else false */ public boolean isNull(long rowIndex) { - assert (rowIndex >= 0 && rowIndex < rows) : "index is out of range 0 <= " + rowIndex + " < " + rows; - if (hasValidityVector()) { - return BitVectorHelper.isNull(offHeap.valid, rowIndex); - } - return false; + return rowIndex < 0 || rowIndex >= rows // unknown, hence NULL + || hasValidityVector() && BitVectorHelper.isNull(offHeap.valid, rowIndex); } /** diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8c459e855c1..036ef890696 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -111,6 +111,14 @@ } +def _shape_mismatch_error(x, y): + raise ValueError( + f"shape mismatch: value array of shape {x} " + f"could not be broadcast to indexing result of " + f"shape {y}" + ) + + class _DataFrameIndexer(_FrameIndexer): def __getitem__(self, arg): if ( @@ -342,28 +350,58 @@ def _setitem_tuple_arg(self, key, value): ) self._frame._data.insert(key[1], new_col) else: - if isinstance(value, (cupy.ndarray, np.ndarray)): - value_df = DataFrame(value) - if value_df.shape[1] != columns_df.shape[1]: - if value_df.shape[1] == 1: - value_cols = ( - value_df._data.columns * columns_df.shape[1] - ) - else: - raise ValueError( - f"shape mismatch: value array of shape " - f"{value_df.shape} could not be " - f"broadcast to indexing result of shape " - f"{columns_df.shape}" - ) - else: - value_cols = value_df._data.columns - for i, col in enumerate(columns_df._column_names): - self._frame[col].loc[key[0]] = value_cols[i] - else: + if is_scalar(value): for col in columns_df._column_names: self._frame[col].loc[key[0]] = value + elif isinstance(value, cudf.DataFrame): + if value.shape != self._frame.loc[key[0]].shape: + _shape_mismatch_error( + value.shape, + self._frame.loc[key[0]].shape, + ) + value_column_names = set(value._column_names) + scatter_map = _indices_from_labels(self._frame, key[0]) + for col in columns_df._column_names: + columns_df[col][scatter_map] = ( + value._data[col] + if col in value_column_names + else cudf.NA + ) + + else: + value = cupy.asarray(value) + if cupy.ndim(value) == 2: + # If the inner dimension is 1, it's broadcastable to + # all columns of the dataframe. + indexed_shape = columns_df.loc[key[0]].shape + if value.shape[1] == 1: + if value.shape[0] != indexed_shape[0]: + _shape_mismatch_error(value.shape, indexed_shape) + for i, col in enumerate(columns_df._column_names): + self._frame[col].loc[key[0]] = value[:, 0] + else: + if value.shape != indexed_shape: + _shape_mismatch_error(value.shape, indexed_shape) + for i, col in enumerate(columns_df._column_names): + self._frame[col].loc[key[0]] = value[:, i] + else: + # handle cases where value is 1d object: + # If the key on column axis is a scalar, we indexed + # a single column; The 1d value should assign along + # the columns. 
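+                    # For example, df.loc[[0, 1], "x"] = [10, 20] (with a
+                    # hypothetical frame df) writes 10 and 20 into rows 0
+                    # and 1 of the single selected column "x".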
+ if is_scalar(key[1]): + for col in columns_df._column_names: + self._frame[col].loc[key[0]] = value + # Otherwise, there are two situations. The key on row axis + # can be a scalar or 1d. In either of the situation, the + # ith element in value corresponds to the ith row in + # the indexed object. + # If the key is 1d, a broadcast will happen. + else: + for i, col in enumerate(columns_df._column_names): + self._frame[col].loc[key[0]] = value[i] + class _DataFrameIlocIndexer(_DataFrameIndexer): """ @@ -424,10 +462,49 @@ def _getitem_tuple_arg(self, arg): @_cudf_nvtx_annotate def _setitem_tuple_arg(self, key, value): - # TODO: Determine if this usage is prevalent enough to expose this - # selection logic at a higher level than ColumnAccessor. - for col in self._frame._data.get_labels_by_index(key[1]): - self._frame[col].iloc[key[0]] = value + columns_df = self._frame._from_data( + self._frame._data.select_by_index(key[1]), self._frame._index + ) + + if is_scalar(value): + for col in columns_df._column_names: + self._frame[col].iloc[key[0]] = value + + elif isinstance(value, cudf.DataFrame): + if value.shape != self._frame.iloc[key[0]].shape: + _shape_mismatch_error( + value.shape, + self._frame.loc[key[0]].shape, + ) + value_column_names = set(value._column_names) + for col in columns_df._column_names: + columns_df[col][key[0]] = ( + value._data[col] if col in value_column_names else cudf.NA + ) + + else: + # TODO: consolidate code path with identical counterpart + # in `_DataFrameLocIndexer._setitem_tuple_arg` + value = cupy.asarray(value) + if cupy.ndim(value) == 2: + indexed_shape = columns_df.iloc[key[0]].shape + if value.shape[1] == 1: + if value.shape[0] != indexed_shape[0]: + _shape_mismatch_error(value.shape, indexed_shape) + for i, col in enumerate(columns_df._column_names): + self._frame[col].iloc[key[0]] = value[:, 0] + else: + if value.shape != indexed_shape: + _shape_mismatch_error(value.shape, indexed_shape) + for i, col in enumerate(columns_df._column_names): + self._frame._data[col][key[0]] = value[:, i] + else: + if is_scalar(key[1]): + for col in columns_df._column_names: + self._frame[col].iloc[key[0]] = value + else: + for i, col in enumerate(columns_df._column_names): + self._frame[col].iloc[key[0]] = value[i] def _getitem_scalar(self, arg): col = self._frame.columns[arg[1]] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d0e9e6d94c1..e75cf47bb7c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -538,7 +538,7 @@ def to_cupy( Parameters ---------- dtype : str or numpy.dtype, optional - The dtype to pass to :meth:`numpy.asarray`. + The dtype to pass to :func:`numpy.asarray`. copy : bool, default False Whether to ensure that the returned value is not a view on another array. Note that ``copy=False`` does not *ensure* that @@ -573,7 +573,7 @@ def to_numpy( Parameters ---------- dtype : str or numpy.dtype, optional - The dtype to pass to :meth:`numpy.asarray`. + The dtype to pass to :func:`numpy.asarray`. copy : bool, default True Whether to ensure that the returned value is not a view on another array. 
This parameter must be ``True`` since cuDF must copy diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 1361fc56fa0..f4dcf9f59ca 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -144,7 +144,6 @@ def _drop_columns(f: Frame, columns: abc.Iterable, errors: str): def _indices_from_labels(obj, labels): - if not isinstance(labels, cudf.MultiIndex): labels = cudf.core.column.as_column(labels) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index e9f836d9702..679edefcc83 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -311,7 +311,11 @@ def gen_rand(dtype, size, **kwargs): np.random.randint(low=low, high=high, size=size), unit=time_unit ) elif dtype.kind in ("O", "U"): - return pd._testing.rands_array(10, size) + low = kwargs.get("low", 10) + high = kwargs.get("high", 11) + return pd._testing.rands_array( + np.random.randint(low=low, high=high, size=1)[0], size + ) raise NotImplementedError(f"dtype.kind={dtype.kind}") diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 9f2a3d45778..7f482c0e776 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8697,43 +8697,6 @@ def test_frame_series_where(): assert_eq(expected, actual) -@pytest.mark.parametrize( - "array,is_error", - [ - (cupy.arange(20, 40).reshape(-1, 2), False), - (cupy.arange(20, 50).reshape(-1, 3), True), - (np.arange(20, 40).reshape(-1, 2), False), - (np.arange(20, 30).reshape(-1, 1), False), - (cupy.arange(20, 30).reshape(-1, 1), False), - ], -) -def test_dataframe_indexing_setitem_np_cp_array(array, is_error): - gdf = cudf.DataFrame({"a": range(10), "b": range(10)}) - pdf = gdf.to_pandas() - if not is_error: - gdf.loc[:, ["a", "b"]] = array - pdf.loc[:, ["a", "b"]] = cupy.asnumpy(array) - - assert_eq(gdf, pdf) - else: - assert_exceptions_equal( - lfunc=pdf.loc.__setitem__, - rfunc=gdf.loc.__setitem__, - lfunc_args_and_kwargs=( - [(slice(None, None, None), ["a", "b"]), cupy.asnumpy(array)], - {}, - ), - rfunc_args_and_kwargs=( - [(slice(None, None, None), ["a", "b"]), array], - {}, - ), - compare_error_message=False, - expected_error_message="shape mismatch: value array of shape " - "(10, 3) could not be broadcast to indexing " - "result of shape (10, 2)", - ) - - @pytest.mark.parametrize( "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}], diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 225aa0cd6bc..790fbd0d3f8 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1486,3 +1486,189 @@ def test_iloc_decimal(): ["4.00", "3.00", "2.00", "1.00"], ).astype(cudf.Decimal64Dtype(scale=2, precision=3)) assert_eq(expect.reset_index(drop=True), got.reset_index(drop=True)) + + +@pytest.mark.parametrize( + ("key, value"), + [ + ( + ([0], ["x", "y"]), + [10, 20], + ), + ( + ([0, 2], ["x", "y"]), + [[10, 30], [20, 40]], + ), + ( + (0, ["x", "y"]), + [10, 20], + ), + ( + ([0, 2], "x"), + [10, 20], + ), + ], +) +def test_dataframe_loc_inplace_update(key, value): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + pdf = gdf.to_pandas() + + actual = gdf.loc[key] = value + expected = pdf.loc[key] = value + + assert_eq(expected, actual) + + +def test_dataframe_loc_inplace_update_string_index(): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=list("abc")) + pdf = 
gdf.to_pandas() + + actual = gdf.loc[["a"], ["x", "y"]] = [10, 20] + expected = pdf.loc[["a"], ["x", "y"]] = [10, 20] + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + ("key, value"), + [ + ([0], [10, 20]), + ([0, 2], [[10, 30], [20, 40]]), + (([0, 2], [0, 1]), [[10, 30], [20, 40]]), + (([0, 2], 0), [10, 30]), + ((0, [0, 1]), [20, 40]), + ], +) +def test_dataframe_iloc_inplace_update(key, value): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + pdf = gdf.to_pandas() + + actual = gdf.iloc[key] = value + expected = pdf.iloc[key] = value + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "loc_key", + [([0, 2], ["x", "y"])], +) +@pytest.mark.parametrize( + "iloc_key", + [[0, 2]], +) +@pytest.mark.parametrize( + ("data, index"), + [ + ( + {"x": [10, 20], "y": [30, 40]}, + [0, 2], + ) + ], +) +def test_dataframe_loc_iloc_inplace_update_with_RHS_dataframe( + loc_key, iloc_key, data, index +): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + pdf = gdf.to_pandas() + + actual = gdf.loc[loc_key] = cudf.DataFrame(data, index=cudf.Index(index)) + expected = pdf.loc[loc_key] = pd.DataFrame(data, index=pd.Index(index)) + assert_eq(expected, actual) + + actual = gdf.iloc[iloc_key] = cudf.DataFrame(data, index=cudf.Index(index)) + expected = pdf.iloc[iloc_key] = pd.DataFrame(data, index=pd.Index(index)) + assert_eq(expected, actual) + + +def test_dataframe_loc_inplace_update_with_invalid_RHS_df_columns(): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + pdf = gdf.to_pandas() + + actual = gdf.loc[[0, 2], ["x", "y"]] = cudf.DataFrame( + {"b": [10, 20], "y": [30, 40]}, index=cudf.Index([0, 2]) + ) + expected = pdf.loc[[0, 2], ["x", "y"]] = pd.DataFrame( + {"b": [10, 20], "y": [30, 40]}, index=pd.Index([0, 2]) + ) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + ("key, value"), + [ + (([0, 2], ["x", "y"]), [[10, 30, 50], [20, 40, 60]]), + (([0], ["x", "y"]), [[10], [20]]), + ], +) +def test_dataframe_loc_inplace_update_shape_mismatch(key, value): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + with pytest.raises(ValueError, match="shape mismatch:"): + gdf.loc[key] = value + + +@pytest.mark.parametrize( + ("key, value"), + [ + ([0, 2], [[10, 30, 50], [20, 40, 60]]), + ([0], [[10], [20]]), + ], +) +def test_dataframe_iloc_inplace_update_shape_mismatch(key, value): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + with pytest.raises(ValueError, match="shape mismatch:"): + gdf.iloc[key] = value + + +def test_dataframe_loc_inplace_update_shape_mismatch_RHS_df(): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + with pytest.raises(ValueError, match="shape mismatch:"): + gdf.loc[([0, 2], ["x", "y"])] = cudf.DataFrame( + {"x": [10, 20]}, index=cudf.Index([0, 2]) + ) + + +def test_dataframe_iloc_inplace_update_shape_mismatch_RHS_df(): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + with pytest.raises(ValueError, match="shape mismatch:"): + gdf.iloc[[0, 2]] = cudf.DataFrame( + {"x": [10, 20]}, index=cudf.Index([0, 2]) + ) + + +@pytest.mark.parametrize( + "array,is_error", + [ + (cupy.arange(20, 40).reshape(-1, 2), False), + (cupy.arange(20, 50).reshape(-1, 3), True), + (np.arange(20, 40).reshape(-1, 2), False), + (np.arange(20, 30).reshape(-1, 1), False), + (cupy.arange(20, 30).reshape(-1, 1), False), + ], +) +def test_dataframe_indexing_setitem_np_cp_array(array, is_error): + gdf = cudf.DataFrame({"a": range(10), "b": range(10)}) + pdf = gdf.to_pandas() + if not is_error: + gdf.loc[:, ["a", "b"]] = array + 
pdf.loc[:, ["a", "b"]] = cupy.asnumpy(array) + + assert_eq(gdf, pdf) + else: + assert_exceptions_equal( + lfunc=pdf.loc.__setitem__, + rfunc=gdf.loc.__setitem__, + lfunc_args_and_kwargs=( + [(slice(None, None, None), ["a", "b"]), cupy.asnumpy(array)], + {}, + ), + rfunc_args_and_kwargs=( + [(slice(None, None, None), ["a", "b"]), array], + {}, + ), + compare_error_message=False, + expected_error_message="shape mismatch: value array of shape " + "(10, 3) could not be broadcast to indexing " + "result of shape (10, 2)", + ) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index c28358f5fa0..c547c80e48b 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -301,27 +301,36 @@ def test_orc_read_rows(datadir, skiprows, num_rows): assert_eq(pdf, gdf) -def test_orc_read_skiprows(tmpdir): +def test_orc_read_skiprows(): buff = BytesIO() - df = pd.DataFrame( - {"a": [1, 0, 1, 0, None, 1, 1, 1, 0, None, 0, 0, 1, 1, 1, 1]}, - dtype=pd.BooleanDtype(), - ) + data = [ + True, + False, + True, + False, + None, + True, + True, + True, + False, + None, + False, + False, + True, + True, + True, + True, + ] writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean())) - tuples = list( - map( - lambda x: (None,) if x[0] is pd.NA else (bool(x[0]),), - list(df.itertuples(index=False, name=None)), - ) - ) - writer.writerows(tuples) + writer.writerows([(d,) for d in data]) writer.close() + # testing 10 skiprows due to a boolean specific bug fix that didn't + # repro for other sizes of data skiprows = 10 - expected = cudf.read_orc(buff)[skiprows::].reset_index(drop=True) + expected = cudf.read_orc(buff)[skiprows:].reset_index(drop=True) got = cudf.read_orc(buff, skiprows=skiprows) - assert_eq(expected, got) @@ -724,6 +733,105 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): assert stats_num_vals == actual_num_vals +@pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) +@pytest.mark.parametrize("nrows", [2, 100, 6000000]) +def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): + supported_stat_types = supported_numpy_dtypes + ["str"] + # Can't write random bool columns until issue #6763 is fixed + if nrows == 6000000: + supported_stat_types.remove("bool") + + gdf_fname = tmpdir.join("chunked_stats.orc") + writer = ORCWriter(gdf_fname) + + max_char_length = 1000 if nrows < 10000 else 100 + + # Make a dataframe + gdf = cudf.DataFrame( + { + "col_" + + str(dtype): gen_rand_series( + dtype, + int(nrows / 2), + has_nulls=True, + low=0, + high=max_char_length, + ) + for dtype in supported_stat_types + } + ) + + pdf1 = gdf.to_pandas() + writer.write_table(gdf) + # gdf is specifically being reused here to ensure the data is destroyed + # before the next write_table call to ensure the data is persisted inside + # write and no pointers are saved into the original table + gdf = cudf.DataFrame( + { + "col_" + + str(dtype): gen_rand_series( + dtype, + int(nrows / 2), + has_nulls=True, + low=0, + high=max_char_length, + ) + for dtype in supported_stat_types + } + ) + pdf2 = gdf.to_pandas() + writer.write_table(gdf) + writer.close() + + # pandas is unable to handle min/max of string col with nulls + expect = cudf.DataFrame(pd.concat([pdf1, pdf2]).reset_index(drop=True)) + + # Read back written ORC's statistics + orc_file = pa.orc.ORCFile(gdf_fname) + ( + file_stats, + stripes_stats, + ) = cudf.io.orc.read_orc_statistics([gdf_fname]) + + # check file stats + for col in expect: + if "minimum" in 
file_stats[0][col]: + stats_min = file_stats[0][col]["minimum"] + actual_min = expect[col].min() + assert normalized_equals(actual_min, stats_min) + if "maximum" in file_stats[0][col]: + stats_max = file_stats[0][col]["maximum"] + actual_max = expect[col].max() + assert normalized_equals(actual_max, stats_max) + if "number_of_values" in file_stats[0][col]: + stats_num_vals = file_stats[0][col]["number_of_values"] + actual_num_vals = expect[col].count() + assert stats_num_vals == actual_num_vals + + # compare stripe statistics with actual min/max + for stripe_idx in range(0, orc_file.nstripes): + stripe = orc_file.read_stripe(stripe_idx) + # pandas is unable to handle min/max of string col with nulls + stripe_df = cudf.DataFrame(stripe.to_pandas()) + for col in stripe_df: + if "minimum" in stripes_stats[stripe_idx][col]: + actual_min = stripe_df[col].min() + stats_min = stripes_stats[stripe_idx][col]["minimum"] + assert normalized_equals(actual_min, stats_min) + + if "maximum" in stripes_stats[stripe_idx][col]: + actual_max = stripe_df[col].max() + stats_max = stripes_stats[stripe_idx][col]["maximum"] + assert normalized_equals(actual_max, stats_max) + + if "number_of_values" in stripes_stats[stripe_idx][col]: + stats_num_vals = stripes_stats[stripe_idx][col][ + "number_of_values" + ] + actual_num_vals = stripe_df[col].count() + assert stats_num_vals == actual_num_vals + + @pytest.mark.parametrize("nrows", [1, 100, 6000000]) def test_orc_write_bool_statistics(tmpdir, datadir, nrows): # Make a dataframe