diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 92e31ea9c4b..c87033238c7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -59,6 +59,7 @@ repos: hooks: - id: mypy args: ["--config-file=python/cudf/setup.cfg", "python/cudf/cudf"] + pass_filenames: false - repo: https://github.com/pycqa/pydocstyle rev: 6.0.0 hooks: diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh index ed2484814fb..a9bc1f4c605 100755 --- a/ci/cpu/prebuild.sh +++ b/ci/cpu/prebuild.sh @@ -3,6 +3,16 @@ # Copyright (c) 2020, NVIDIA CORPORATION. set -e +ARCH=$(arch) +if [ "${ARCH}" = "x86_64" ]; then + DEFAULT_CUDA_VER="11.0" +elif [ "${ARCH}" = "aarch64" ]; then + DEFAULT_CUDA_VER="11.2" +else + echo "Unsupported arch ${ARCH}" + exit 1 +fi + #Always upload cudf Python package export UPLOAD_CUDF=1 @@ -14,14 +24,14 @@ else fi # upload cudf_kafka for all versions of Python -if [[ "$CUDA" == "11.0" ]]; then +if [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then export UPLOAD_CUDF_KAFKA=1 else export UPLOAD_CUDF_KAFKA=0 fi #We only want to upload libcudf_kafka once per python/CUDA combo -if [[ "$PYTHON" == "3.7" ]] && [[ "$CUDA" == "11.0" ]]; then +if [[ "$PYTHON" == "3.7" ]] && [[ "$CUDA" == "${DEFAULT_CUDA_VER}" ]]; then export UPLOAD_LIBCUDF_KAFKA=1 else export UPLOAD_LIBCUDF_KAFKA=0 @@ -31,4 +41,4 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then #If project flash is not activate, always build both export BUILD_LIBCUDF=1 export BUILD_CUDF=1 -fi \ No newline at end of file +fi diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index b8ae1120b30..5e839589811 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -56,7 +56,7 @@ dependencies: - protobuf - nvtx>=0.2.1 - cachetools - - transformers + - transformers<=4.10.3 - pydata-sphinx-theme - pip: - git+https://github.com/dask/dask.git@main diff --git a/conda/environments/cudf_dev_cuda11.2.yml 
b/conda/environments/cudf_dev_cuda11.2.yml index 048d5181aa8..28266b6af87 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -56,7 +56,7 @@ dependencies: - protobuf - nvtx>=0.2.1 - cachetools - - transformers + - transformers<=4.10.3 - pydata-sphinx-theme - pip: - git+https://github.com/dask/dask.git@main diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 982fee640d9..82bc5bfba93 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -473,8 +473,7 @@ target_include_directories(cudf "$" PRIVATE "$" INTERFACE "$" - "$" - "$") + "$") target_compile_definitions(cudf PUBLIC "$<$:${CUDF_CXX_DEFINITIONS}>" @@ -511,7 +510,7 @@ target_link_libraries(cudf cudf::Thrust rmm::rmm PRIVATE cuco::cuco - ZLIB::ZLIB + ZLIB::ZLIB nvcomp::nvcomp) # Add Conda library, and include paths if specified @@ -692,6 +691,40 @@ following IMPORTED GLOBAL targets: ]=]) +set(common_code_string + [=[ +if(NOT TARGET cudf::Thrust) + thrust_create_target(cudf::Thrust FROM_OPTIONS) +endif() + +# nvcc automatically adds the CUDA Toolkit system include paths before any +# system include paths that CMake adds. +# +# CMake implicitly treats all includes on import targets as 'SYSTEM' includes. +# +# To get the cudacxx shipped with cudf to be picked up by consumers instead of the +# version shipped with the CUDA Toolkit we need to make sure it is a non-SYSTEM +# include on the CMake side. 
+# +# To do this currently, we move the includes from the cudf::cudf target to a +# non-import target to ensure they are `-I` instead of `-isystem` + +add_library(cudf_non_system_includes INTERFACE) +target_link_libraries(cudf::cudf INTERFACE cudf_non_system_includes) + +get_target_property(all_includes cudf::cudf INTERFACE_INCLUDE_DIRECTORIES) +set(system_includes ) +set(normal_includes ) +foreach(include IN LISTS all_includes) + if(include MATCHES "/include/libcudf/") + list(APPEND normal_includes "${include}") + else() + list(APPEND system_includes "${include}") + endif() +endforeach() +set_target_properties(cudf::cudf PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${system_includes}") +set_target_properties(cudf_non_system_includes PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${normal_includes}") +]=]) set(install_code_string [=[ set(ArrowCUDA_DIR "${Arrow_DIR}") @@ -705,11 +738,8 @@ if(testing IN_LIST cudf_FIND_COMPONENTS) include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") endif() endif() - -if(NOT TARGET cudf::Thrust) - thrust_create_target(cudf::Thrust FROM_OPTIONS) -endif() ]=]) +string(APPEND install_code_string "${common_code_string}") rapids_export(INSTALL cudf EXPORT_SET cudf-exports @@ -728,11 +758,8 @@ endif() if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") endif() - -if(NOT TARGET cudf::Thrust) - thrust_create_target(cudf::Thrust FROM_OPTIONS) -endif() ]=]) +string(APPEND build_code_string "${common_code_string}") rapids_export(BUILD cudf EXPORT_SET cudf-exports diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 3309be62dc3..41751c7dd50 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -157,17 +157,40 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() if(Arrow_ADDED) + set(arrow_code_string + [=[ + if (TARGET cudf::arrow_shared AND (NOT 
TARGET arrow_shared)) + add_library(arrow_shared ALIAS cudf::arrow_shared) + endif() + if (TARGET cudf::arrow_static AND (NOT TARGET arrow_static)) + add_library(arrow_static ALIAS cudf::arrow_static) + endif() + ]=] + ) + set(arrow_cuda_code_string + [=[ + if (TARGET cudf::arrow_cuda_shared AND (NOT TARGET arrow_cuda_shared)) + add_library(arrow_cuda_shared ALIAS cudf::arrow_cuda_shared) + endif() + if (TARGET cudf::arrow_cuda_static AND (NOT TARGET arrow_cuda_static)) + add_library(arrow_cuda_static ALIAS cudf::arrow_cuda_static) + endif() + ]=] + ) + rapids_export(BUILD Arrow VERSION ${VERSION} EXPORT_SET arrow_targets - GLOBAL_TARGETS arrow_shared arrow_static - NAMESPACE cudf::) + GLOBAL_TARGETS arrow_shared arrow_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_code_string) rapids_export(BUILD ArrowCUDA VERSION ${VERSION} EXPORT_SET arrow_cuda_targets GLOBAL_TARGETS arrow_cuda_shared arrow_cuda_static - NAMESPACE cudf::) + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_cuda_code_string) endif() # We generate the arrow-config and arrowcuda-config files # when we built arrow locally, so always do `find_dependency` diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 2512597b28f..47dbc037334 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -21,7 +21,7 @@ function(find_and_configure_cucollections) GLOBAL_TARGETS cuco::cuco CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 0d602ae21ea4f38d23ed816aa948453d97b2ee4e + GIT_TAG 729857a5698a0e8d8f812e0464f65f37854ae17b OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 501071cafb7..bb5cfa5c6e0 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -36,12 +36,15 @@ #include #include #include +#include #include #include -namespace { +namespace cudf { +namespace detail { + // Compute the count of elements that pass the mask within each block template __global__ void compute_block_counts(cudf::size_type* __restrict__ block_counts, @@ -293,9 +296,9 @@ struct scatter_gather_functor { filter); auto output_table = cudf::detail::gather(cudf::table_view{{input}}, - indices.begin(), - indices.end(), + indices, cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr); @@ -304,10 +307,6 @@ struct scatter_gather_functor { } }; -} // namespace - -namespace cudf { -namespace detail { /** * @brief Filters `input` using a Filter function object * diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 74a94f34ad8..d794adceec2 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -152,8 +152,8 @@ __launch_bounds__(block_size) __global__ * @param filter Function of type `FilterFn` which determines for index `i` where to get the * corresponding output value from * @param out_type `cudf::data_type` of the returned column - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory * @return A new column that contains the values from either `lhs` or `rhs` as determined * by `filter[i]` */ diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 06de9ff2716..5d649e55389 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -640,8 +640,8 @@ void gather_bitmask(table_view const& source, * use `DONT_CHECK` when they are certain that the gather_map contains only valid indices for * better performance. In case there are out-of-bound indices in the gather map, the behavior * is undefined. Defaults to `DONT_CHECK`. - * @param[in] mr Device memory resource used to allocate the returned table's device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate the returned table's device memory * @return cudf::table Result of the gather */ template diff --git a/cpp/include/cudf/detail/gather.hpp b/cpp/include/cudf/detail/gather.hpp index 268c4878444..01d9c64ba30 100644 --- a/cpp/include/cudf/detail/gather.hpp +++ b/cpp/include/cudf/detail/gather.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,10 @@ #pragma once #include -#include - #include #include +#include +#include #include @@ -55,10 +55,10 @@ enum class negative_index_policy : bool { ALLOWED, NOT_ALLOWED }; * indices. If `policy` is set to `DONT_CHECK` and there are out-of-bounds indices in `gather_map`, * the behavior is undefined. * @param[in] negative_index_policy Interpret each negative index `i` in the - * gathermap as the positive index `i+num_source_rows`. 
- * @param[in] mr Device memory resource used to allocate the returned table's device memory + * `gather_map` as the positive index `i+num_source_rows`. * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @return cudf::table Result of the gather + * @param[in] mr Device memory resource used to allocate the returned table's device memory + * @return Result of the gather */ std::unique_ptr gather( table_view const& source_table, @@ -67,5 +67,21 @@ std::unique_ptr
gather( negative_index_policy neg_indices, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::detail::gather(table_view const&,column_view const&,table_view + * const&,cudf::out_of_bounds_policy,cudf::detail::negative_index_policy,rmm::cuda_stream_view, + * rmm::mr::device_memory_resource*) + * + * @throws cudf::logic_error if `gather_map` span size is larger than max of `size_type`. + */ +std::unique_ptr
gather( + table_view const& source_table, + device_span const gather_map, + out_of_bounds_policy bounds_policy, + negative_index_policy neg_indices, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index d546162fc7a..d0fa4e02440 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -502,17 +502,32 @@ struct indexalator_factory { iter = make_input_iterator(col); } + __device__ thrust::pair operator()(size_type i) const + { + return {iter[i], (has_nulls ? bit_is_set(null_mask, i + offset) : true)}; + } + }; + + /** + * @brief An index accessor that returns a validity flag along with the index value. + * + * This is suitable as a `pair_iterator`. + */ + struct scalar_nullable_index_accessor { + input_indexalator iter; + bool const is_null; + /** * @brief Create an accessor from a scalar. */ - nullable_index_accessor(scalar const& input) : has_nulls{!input.is_valid()} + scalar_nullable_index_accessor(scalar const& input) : is_null{!input.is_valid()} { iter = indexalator_factory::make_input_iterator(input); } - __device__ thrust::pair operator()(size_type i) const + __device__ thrust::pair operator()(size_type) const { - return {iter[i], (has_nulls ? bit_is_set(null_mask, i + offset) : true)}; + return {*iter, is_null}; } }; @@ -530,7 +545,75 @@ struct indexalator_factory { static auto make_input_pair_iterator(scalar const& input) { return thrust::make_transform_iterator(thrust::make_constant_iterator(0), - nullable_index_accessor{input}); + scalar_nullable_index_accessor{input}); + } + + /** + * @brief An index accessor that returns an index value if corresponding validity flag is true. + * + * This is suitable as an `optional_iterator`. 
+ */ + struct optional_index_accessor { + input_indexalator iter; + bitmask_type const* null_mask{}; + size_type const offset{}; + bool const has_nulls{}; + + /** + * @brief Create an accessor from a column_view. + */ + optional_index_accessor(column_view const& col, bool has_nulls = false) + : null_mask{col.null_mask()}, offset{col.offset()}, has_nulls{has_nulls} + { + if (has_nulls) { CUDF_EXPECTS(col.nullable(), "Unexpected non-nullable column."); } + iter = make_input_iterator(col); + } + + __device__ thrust::optional operator()(size_type i) const + { + return has_nulls && !bit_is_set(null_mask, i + offset) ? thrust::nullopt + : thrust::make_optional(iter[i]); + } + }; + + /** + * @brief An index accessor that returns an index value if corresponding validity flag is true. + * + * This is suitable as an `optional_iterator`. + */ + struct scalar_optional_index_accessor { + input_indexalator iter; + bool const is_null; + + /** + * @brief Create an accessor from a scalar. + */ + scalar_optional_index_accessor(scalar const& input) : is_null{!input.is_valid()} + { + iter = indexalator_factory::make_input_iterator(input); + } + + __device__ thrust::optional operator()(size_type) const + { + return is_null ? thrust::nullopt : thrust::make_optional(*iter); + } + }; + + /** + * @brief Create an index iterator with a nullable index accessor. + */ + static auto make_input_optional_iterator(column_view const& col) + { + return make_counting_transform_iterator(0, optional_index_accessor{col, col.has_nulls()}); + } + + /** + * @brief Create an index iterator with a nullable index accessor for a scalar. 
+ */ + static auto make_input_optional_iterator(scalar const& input) + { + return thrust::make_transform_iterator(thrust::make_constant_iterator(0), + scalar_optional_index_accessor{input}); } }; diff --git a/cpp/include/cudf/detail/reduction_functions.hpp b/cpp/include/cudf/detail/reduction_functions.hpp index 01df55dea05..0565f332b48 100644 --- a/cpp/include/cudf/detail/reduction_functions.hpp +++ b/cpp/include/cudf/detail/reduction_functions.hpp @@ -33,8 +33,8 @@ namespace reduction { * * @param col input column to compute sum * @param output_dtype data type of return type and typecast elements of input column - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Sum as scalar of type `output_dtype`. */ std::unique_ptr sum( @@ -52,8 +52,8 @@ std::unique_ptr sum( * * @param col input column to compute minimum. * @param output_dtype data type of return type and typecast elements of input column - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Minimum element as scalar of type `output_dtype`. */ std::unique_ptr min( @@ -71,8 +71,8 @@ std::unique_ptr min( * * @param col input column to compute maximum. * @param output_dtype data type of return type and typecast elements of input column - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Maximum element as scalar of type `output_dtype`. 
*/ std::unique_ptr max( @@ -91,8 +91,8 @@ std::unique_ptr max( * * @param col input column to compute any_of. * @param output_dtype data type of return type and typecast elements of input column - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return bool scalar if any of elements is true when typecasted to bool */ std::unique_ptr any( @@ -111,8 +111,8 @@ std::unique_ptr any( * * @param col input column to compute all_of. * @param output_dtype data type of return type and typecast elements of input column - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return bool scalar if all of elements is true when typecasted to bool */ std::unique_ptr all( @@ -131,8 +131,8 @@ std::unique_ptr all( * * @param col input column to compute product. * @param output_dtype data type of return type and typecast elements of input column - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Product as scalar of type `output_dtype`. */ std::unique_ptr product( @@ -151,8 +151,8 @@ std::unique_ptr product( * * @param col input column to compute sum of squares. * @param output_dtype data type of return type and typecast elements of input column - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Sum of squares as scalar of type `output_dtype`. */ std::unique_ptr sum_of_squares( @@ -171,8 +171,8 @@ std::unique_ptr sum_of_squares( * * @param col input column to compute mean. * @param output_dtype data type of return type and typecast elements of input column. - * @param mr Device memory resource used to allocate the returned scalar's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory. * @return Mean as scalar of type `output_dtype`. */ std::unique_ptr mean( @@ -191,8 +191,8 @@ std::unique_ptr mean( * * @param col input column to compute variance. * @param output_dtype data type of return type and typecast elements of input column. - * @param mr Device memory resource used to allocate the returned scalar's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory. * @return Variance as scalar of type `output_dtype`. */ std::unique_ptr variance( @@ -212,8 +212,8 @@ std::unique_ptr variance( * * @param col input column to compute standard deviation. * @param output_dtype data type of return type and typecast elements of input column. - * @param mr Device memory resource used to allocate the returned scalar's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory. * @return Standard deviation as scalar of type `output_dtype`. */ std::unique_ptr standard_deviation( @@ -243,8 +243,8 @@ std::unique_ptr standard_deviation( * @param col input column to get nth element from. 
* @param n index of element to get * @param null_handling Indicates if null values will be counted while indexing. - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return nth element as scalar */ std::unique_ptr nth_element( diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index a3b1f95ca0a..e43f8495d07 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include @@ -70,6 +70,20 @@ std::unique_ptr
scatter( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::detail::scatter(table_view const&,column_view const&,table_view + * const&,bool,rmm::cuda_stream_view,rmm::mr::device_memory_resource*) + * + * @throws cudf::logic_error if `scatter_map` span size is larger than max of `size_type`. + */ +std::unique_ptr
scatter( + table_view const& source, + device_span const scatter_map, + table_view const& target, + bool check_bounds = false, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Scatters a row of scalar values into a copy of the target table * according to a scatter map. diff --git a/cpp/include/cudf/dictionary/detail/concatenate.hpp b/cpp/include/cudf/dictionary/detail/concatenate.hpp index c2fe2dce1fe..00d2c820b61 100644 --- a/cpp/include/cudf/dictionary/detail/concatenate.hpp +++ b/cpp/include/cudf/dictionary/detail/concatenate.hpp @@ -32,8 +32,8 @@ namespace detail { * @throw cudf::logic_error if dictionary column keys are not all the same type. * * @param columns Vector of dictionary columns to concatenate. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ std::unique_ptr concatenate( diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp index 8c037406e45..e1d5f0367e0 100644 --- a/cpp/include/cudf/dictionary/detail/update_keys.hpp +++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp @@ -98,8 +98,8 @@ std::vector> match_dictionaries( * Any null rows are left unchanged. * * @param input Vector of cudf::table_views that include dictionary columns to be matched. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary columns and updated cudf::table_views. 
*/ std::pair>, std::vector> match_dictionaries( diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index d62b54208d5..7c2979c56cd 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -283,6 +283,7 @@ std::unique_ptr gather_list_leaf( /** * @copydoc cudf::lists::segmented_gather(lists_column_view const& source_column, * lists_column_view const& gather_map_list, + * out_of_bounds_policy bounds_policy, * rmm::mr::device_memory_resource* mr) * * @param stream CUDA stream on which to execute kernels @@ -290,6 +291,7 @@ std::unique_ptr gather_list_leaf( std::unique_ptr segmented_gather( lists_column_view const& source_column, lists_column_view const& gather_map_list, + out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/lists/gather.hpp b/cpp/include/cudf/lists/gather.hpp index 66d7fb137b2..23054b91592 100644 --- a/cpp/include/cudf/lists/gather.hpp +++ b/cpp/include/cudf/lists/gather.hpp @@ -17,6 +17,7 @@ #include #include +#include #include namespace cudf { @@ -32,7 +33,7 @@ namespace lists { * * `source_column` with any depth and `gather_map_list` with depth 1 are only supported. * - * * @code{.pseudo} + * @code{.pseudo} * source_column : [{"a", "b", "c", "d"}, {"1", "2", "3", "4"}, {"x", "y", "z"}] * gather_map_list : [{0, 1, 3, 2}, {1, 3, 2}, {}] * @@ -44,11 +45,24 @@ namespace lists { * @throws cudf::logic_error if gather_map is not list column of an index type. * * If indices in `gather_map_list` are outside the range `[-n, n)`, where `n` is the number of - * elements in corresponding row of the source column, the behavior is undefined. + * elements in corresponding row of the source column, the behaviour is as follows: + * 1. 
If `bounds_policy` is set to `DONT_CHECK`, the behaviour is undefined. + * 2. If `bounds_policy` is set to `NULLIFY`, the corresponding element in the list row + * is set to null in the output column. + * + * @code{.pseudo} + * source_column : [{"a", "b", "c", "d"}, {"1", "2", "3", "4"}, {"x", "y", "z"}] + * gather_map_list : [{0, -1, 4, -5}, {1, 3, 5}, {}] + * + * result_with_nullify : [{"a", "d", null, null}, {"2", "4", null}, {}] + * @endcode * * @param source_column View into the list column to gather from * @param gather_map_list View into a non-nullable list column of integral indices that maps the * element in list of each row in the source columns to rows of lists in the destination columns. + * @param bounds_policy Can be `DONT_CHECK` or `NULLIFY`. Selects whether or not to nullify the + * output list row's element, when the gather index falls outside the range `[-n, n)`, + * where `n` is the number of elements in list row corresponding to the gather-map row. * @param mr Device memory resource to allocate any returned objects * @return column with elements in list of rows gathered based on `gather_map_list` * @@ -56,6 +70,7 @@ namespace lists { std::unique_ptr segmented_gather( lists_column_view const& source_column, lists_column_view const& gather_map_list, + out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/strings/detail/concatenate.hpp b/cpp/include/cudf/strings/detail/concatenate.hpp index 0740039e896..3512c05a586 100644 --- a/cpp/include/cudf/strings/detail/concatenate.hpp +++ b/cpp/include/cudf/strings/detail/concatenate.hpp @@ -37,8 +37,8 @@ namespace detail { * ``` * * @param columns List of string columns to concatenate. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ std::unique_ptr concatenate( diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 72f9ac11ceb..b6d34f8d89a 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -47,8 +47,8 @@ namespace detail { * @param rhs_begin Strings of second set of data. Used when filter_fn returns false. * @param filter_fn Called to determine which iterator (lhs or rhs) to retrieve an entry for a * specific row. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ template diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index 7e8ab6a0b68..05dbdf18b64 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -93,8 +93,8 @@ namespace detail { * @param target_begin The starting index of the target range (inclusive) * @param target_end The index of the last element in the target range * (exclusive) - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
* @return std::unique_ptr The result target column */ template diff --git a/cpp/include/cudf/strings/detail/fill.hpp b/cpp/include/cudf/strings/detail/fill.hpp index 1ddf0ad5cdf..040175af9e5 100644 --- a/cpp/include/cudf/strings/detail/fill.hpp +++ b/cpp/include/cudf/strings/detail/fill.hpp @@ -37,8 +37,8 @@ namespace detail { * @param begin First row index to include the new string. * @param end Last row index (exclusive). * @param value String to use when filling the range. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ std::unique_ptr fill( diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index faae9a50be2..16955b3251b 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -213,8 +213,8 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin, * @param map_end End of index iterator. * @param offsets The offset values to be associated with the output chars column. * @param chars_bytes The total number of bytes for the output chars column. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New chars column fit for a strings column. */ template @@ -278,8 +278,8 @@ std::unique_ptr gather_chars(StringIterator strings_begin, * @param strings Strings instance for this operation. * @param begin Start of index iterator. * @param end End of index iterator. - * @param mr Device memory resource used to allocate the returned column's device memory. 
* @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column containing the gathered strings. */ template @@ -361,8 +361,8 @@ std::unique_ptr gather( * @param begin Start of index iterator. * @param end End of index iterator. * @param nullify_out_of_bounds If true, indices outside the column's range are nullified. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column containing the gathered strings. */ template diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index d632e6c24cb..4657f6c83bd 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -39,8 +39,8 @@ namespace detail { * @param lhs First column. * @param rhs Second column. * @param row_order Indexes for each column. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ template diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index eb7493f4895..0a53c930bb3 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -48,8 +48,8 @@ namespace detail { * @param scatter_map Iterator of indices into the output column. * @param target The set of columns into which values from the source column * are to be scattered. 
- * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory * @return New strings column. */ template diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index 6894c34a077..efd03d882e6 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -41,8 +41,8 @@ namespace detail { * @tparam Iterator Used as input to scan to set the offset values. * @param begin The beginning of the input sequence * @param end The end of the input sequence - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return offsets child column for strings column */ template @@ -74,8 +74,8 @@ std::unique_ptr make_offsets_child_column( * @tparam Iter Iterator type that returns string_view instances * @param strings_begin Iterator to the beginning of the string_view sequence * @param num_strings The number of string_view instances in the sequence - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return Child offsets column */ template @@ -131,8 +131,8 @@ __device__ inline char* copy_string(char* buffer, const string_view& d_string) * chars memory. * @param exec_size Number of rows for executing the `size_and_exec_fn` function. * @param strings_count Number of strings. - * @param mr Device memory resource used to allocate the returned columns' device memory. 
* @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned columns' device memory. * @return offsets child column and chars child column for a strings column */ template @@ -190,8 +190,8 @@ auto make_strings_children( * After that, the d_offsets and d_chars are set and this is called again to fill in the * chars memory. * @param strings_count Number of strings. - * @param mr Device memory resource used to allocate the returned columns' device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned columns' device memory. * @return offsets child column and chars child column for a strings column */ template diff --git a/cpp/include/cudf/wrappers/durations.hpp b/cpp/include/cudf/wrappers/durations.hpp index 8bc8b7a7e6e..3a27d798487 100644 --- a/cpp/include/cudf/wrappers/durations.hpp +++ b/cpp/include/cudf/wrappers/durations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -59,6 +59,8 @@ using duration_us = cuda::std::chrono::duration; static_assert(sizeof(duration_D) == sizeof(typename duration_D::rep), ""); +static_assert(sizeof(duration_h) == sizeof(typename duration_h::rep), ""); +static_assert(sizeof(duration_m) == sizeof(typename duration_m::rep), ""); static_assert(sizeof(duration_s) == sizeof(typename duration_s::rep), ""); static_assert(sizeof(duration_ms) == sizeof(typename duration_ms::rep), ""); static_assert(sizeof(duration_us) == sizeof(typename duration_us::rep), ""); @@ -85,6 +87,8 @@ namespace std { } DURATION_LIMITS(cudf::duration_D); +DURATION_LIMITS(cudf::duration_h); +DURATION_LIMITS(cudf::duration_m); DURATION_LIMITS(cudf::duration_s); DURATION_LIMITS(cudf::duration_ms); DURATION_LIMITS(cudf::duration_us); diff --git a/cpp/include/cudf/wrappers/timestamps.hpp b/cpp/include/cudf/wrappers/timestamps.hpp index ac13dae6a74..8481068ca05 100644 --- a/cpp/include/cudf/wrappers/timestamps.hpp +++ b/cpp/include/cudf/wrappers/timestamps.hpp @@ -16,9 +16,7 @@ #pragma once -#include - -#include +#include /** * @file timestamps.hpp @@ -42,33 +40,37 @@ using timestamp = time_point; */ /** - * @brief Type alias representing an int32_t duration of days since the unix - * epoch. + * @brief Type alias representing a cudf::duration_D (int32_t) since the unix epoch. + */ +using timestamp_D = detail::timestamp; +/** + * @brief Type alias representing a cudf::duration_h (int32_t) since the unix epoch. + */ +using timestamp_h = detail::timestamp; +/** + * @brief Type alias representing a cudf::duration_m (int32_t) since the unix epoch. */ -using timestamp_D = - detail::timestamp>>; +using timestamp_m = detail::timestamp; /** - * @brief Type alias representing an int64_t duration of seconds since the - * unix epoch. + * @brief Type alias representing a cudf::duration_s (int64_t) since the unix epoch. 
*/ -using timestamp_s = detail::timestamp>>; +using timestamp_s = detail::timestamp; /** - * @brief Type alias representing an int64_t duration of milliseconds since - * the unix epoch. + * @brief Type alias representing a cudf::duration_ms (int64_t) since the unix epoch. */ -using timestamp_ms = detail::timestamp>; +using timestamp_ms = detail::timestamp; /** - * @brief Type alias representing an int64_t duration of microseconds since - * the unix epoch. + * @brief Type alias representing a cudf::duration_us (int64_t) since the unix epoch. */ -using timestamp_us = detail::timestamp>; +using timestamp_us = detail::timestamp; /** - * @brief Type alias representing an int64_t duration of nanoseconds since - * the unix epoch. + * @brief Type alias representing a cudf::duration_ns (int64_t) since the unix epoch. */ -using timestamp_ns = detail::timestamp>; +using timestamp_ns = detail::timestamp; static_assert(sizeof(timestamp_D) == sizeof(typename timestamp_D::rep), ""); +static_assert(sizeof(timestamp_h) == sizeof(typename timestamp_h::rep), ""); +static_assert(sizeof(timestamp_m) == sizeof(typename timestamp_m::rep), ""); static_assert(sizeof(timestamp_s) == sizeof(typename timestamp_s::rep), ""); static_assert(sizeof(timestamp_ms) == sizeof(typename timestamp_ms::rep), ""); static_assert(sizeof(timestamp_us) == sizeof(typename timestamp_us::rep), ""); @@ -95,6 +97,8 @@ namespace std { } TIMESTAMP_LIMITS(cudf::timestamp_D); +TIMESTAMP_LIMITS(cudf::timestamp_h); +TIMESTAMP_LIMITS(cudf::timestamp_m); TIMESTAMP_LIMITS(cudf::timestamp_s); TIMESTAMP_LIMITS(cudf::timestamp_ms); TIMESTAMP_LIMITS(cudf::timestamp_us); diff --git a/cpp/include/nvtext/detail/tokenize.hpp b/cpp/include/nvtext/detail/tokenize.hpp index 8b74c9cde94..faddfef8be2 100644 --- a/cpp/include/nvtext/detail/tokenize.hpp +++ b/cpp/include/nvtext/detail/tokenize.hpp @@ -30,8 +30,8 @@ namespace detail { * @param strings Strings column tokenize. 
* @param delimiter UTF-8 characters used to separate each string into tokens. * The default of empty string will separate tokens using whitespace. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings columns of tokens. */ std::unique_ptr tokenize( @@ -46,8 +46,8 @@ std::unique_ptr tokenize( * * @param strings Strings column to tokenize. * @param delimiters Strings used to separate individual strings into tokens. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings columns of tokens. */ std::unique_ptr tokenize( @@ -63,8 +63,8 @@ std::unique_ptr tokenize( * @param strings Strings column to use for this operation. * @param delimiter Strings used to separate each string into tokens. * The default of empty string will separate tokens using whitespace. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New INT32 column of token counts. */ std::unique_ptr count_tokens( @@ -79,8 +79,8 @@ std::unique_ptr count_tokens( * * @param strings Strings column to use for this operation. * @param delimiters Strings used to separate each string into tokens. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory. * @return New INT32 column of token counts. */ std::unique_ptr count_tokens( diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index a1b00a4cd6b..6b03a97c59b 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -324,8 +324,8 @@ namespace detail { * @param rhs Right-hand side `column_view` used in the binary operation * @param op `binary_operator` to be used to combine `lhs` and `rhs` * @param output_type `data_type` of the output column - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource to use for device memory allocation * @return std::unique_ptr Output column used for binary operation */ std::unique_ptr make_fixed_width_column_for_output(scalar const& lhs, @@ -351,8 +351,8 @@ std::unique_ptr make_fixed_width_column_for_output(scalar const& lhs, * @param rhs Right-hand side `scalar` used in the binary operation * @param op `binary_operator` to be used to combine `lhs` and `rhs` * @param output_type `data_type` of the output column - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource to use for device memory allocation * @return std::unique_ptr Output column used for binary operation */ std::unique_ptr make_fixed_width_column_for_output(column_view const& lhs, @@ -378,8 +378,8 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh * @param rhs Right-hand side `column_view` used in the binary operation * @param op `binary_operator` to be used to combine `lhs` and `rhs` * @param output_type `data_type` of the output column - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations + * @param mr 
Device memory resource to use for device memory allocation * @return std::unique_ptr Output column used for binary operation */ std::unique_ptr make_fixed_width_column_for_output(column_view const& lhs, @@ -426,8 +426,8 @@ namespace jit { * @param lhs Left-hand side `scalar` used in the binary operation * @param rhs Right-hand side `column_view` used in the binary operation * @param op `binary_operator` to be used to combine `lhs` and `rhs` - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource to use for device memory allocation * @return std::unique_ptr Resulting output column from the binary operation */ std::unique_ptr fixed_point_binary_operation(scalar const& lhs, @@ -496,8 +496,8 @@ std::unique_ptr fixed_point_binary_operation(scalar const& lhs, * @param lhs Left-hand side `column_view` used in the binary operation * @param rhs Right-hand side `scalar` used in the binary operation * @param op `binary_operator` to be used to combine `lhs` and `rhs` - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource to use for device memory allocation * @return std::unique_ptr Resulting output column from the binary operation */ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, @@ -566,8 +566,8 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, * @param lhs Left-hand side `column_view` used in the binary operation * @param rhs Right-hand side `column_view` used in the binary operation * @param op `binary_operator` to be used to combine `lhs` and `rhs` - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource to use for device memory allocation * @return std::unique_ptr Resulting output column 
from the binary operation */ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp index cf3a6025847..26a0f26b59c 100644 --- a/cpp/src/binaryop/compiled/binary_ops.hpp +++ b/cpp/src/binaryop/compiled/binary_ops.hpp @@ -68,9 +68,10 @@ std::unique_ptr string_null_min_max( * * @param lhs The left operand string scalar * @param rhs The right operand string column + * @param op The binary operator * @param output_type The desired data type of the output column - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory * @return std::unique_ptr Output column */ std::unique_ptr binary_operation( @@ -94,9 +95,10 @@ std::unique_ptr binary_operation( * * @param lhs The left operand string column * @param rhs The right operand string scalar + * @param op The binary operator * @param output_type The desired data type of the output column - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory * @return std::unique_ptr Output column */ std::unique_ptr binary_operation( @@ -119,9 +121,10 @@ std::unique_ptr binary_operation( * * @param lhs The left operand string column * @param rhs The right operand string column + * @param op The binary operator enum * @param output_type The desired data type of the output column - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory * @return std::unique_ptr Output column */ std::unique_ptr binary_operation( diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index df1ce6c0e67..21a27ff8c3d 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -14,13 +14,12 @@ * limitations under the License. */ -#include #include #include -#include +#include #include #include -#include +#include #include #include @@ -181,23 +180,24 @@ std::unique_ptr scatter_gather_based_if_else(cudf::column_view const& lh rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto scatter_map = rmm::device_uvector{static_cast(size), stream}; - auto const scatter_map_end = thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(size_type{0}), - thrust::make_counting_iterator(size_type{size}), - scatter_map.begin(), - is_left); + auto gather_map = rmm::device_uvector{static_cast(size), stream}; + auto const gather_map_end = thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(size_type{size}), + gather_map.begin(), + is_left); + + gather_map.resize(thrust::distance(gather_map.begin(), gather_map_end), stream); auto const scatter_src_lhs = cudf::detail::gather(table_view{std::vector{lhs}}, - scatter_map.begin(), - scatter_map_end, + gather_map, out_of_bounds_policy::DONT_CHECK, + negative_index_policy::NOT_ALLOWED, stream); auto result = cudf::detail::scatter( table_view{std::vector{scatter_src_lhs->get_column(0).view()}}, - scatter_map.begin(), - scatter_map_end, + gather_map, table_view{std::vector{rhs}}, false, stream, @@ -227,8 +227,12 @@ std::unique_ptr scatter_gather_based_if_else(cudf::scalar const& lhs, static_cast(scatter_map_size), scatter_map.begin()}; - auto result = cudf::scatter( - scatter_source, scatter_map_column_view, table_view{std::vector{rhs}}, false, mr); + auto result = 
cudf::detail::scatter(scatter_source, + scatter_map_column_view, + table_view{std::vector{rhs}}, + false, + stream, + mr); return std::move(result->release()[0]); } diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu index 181752d18e8..5c66f67ff0f 100644 --- a/cpp/src/copying/gather.cu +++ b/cpp/src/copying/gather.cu @@ -54,6 +54,21 @@ std::unique_ptr
gather(table_view const& source_table, return gather(source_table, map_begin, map_end, bounds_policy, stream, mr); } +std::unique_ptr
gather(table_view const& source_table, + device_span const gather_map, + out_of_bounds_policy bounds_policy, + negative_index_policy neg_indices, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(gather_map.size() <= std::numeric_limits::max(), + "invalid gather map size"); + auto map_col = column_view(data_type{type_to_id()}, + static_cast(gather_map.size()), + gather_map.data()); + return gather(source_table, map_col, bounds_policy, neg_indices, stream, mr); +} + } // namespace detail std::unique_ptr
gather(table_view const& source_table, diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 3312316f548..211bc0e1ebe 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -16,8 +16,6 @@ #include #include #include -#include -#include #include #include #include @@ -305,6 +303,21 @@ std::unique_ptr
scatter(table_view const& source, return detail::scatter(source, map_begin, map_end, target, check_bounds, stream, mr); } +std::unique_ptr
scatter(table_view const& source, + device_span const scatter_map, + table_view const& target, + bool check_bounds, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(scatter_map.size() <= std::numeric_limits::max(), + "invalid scatter map size"); + auto map_col = column_view(data_type{type_to_id()}, + static_cast(scatter_map.size()), + scatter_map.data()); + return scatter(source, map_col, target, check_bounds, stream, mr); +} + std::unique_ptr
scatter(std::vector> const& source, column_view const& indices, table_view const& target, diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 16b86177b8e..7e2a82a683c 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -49,8 +49,8 @@ namespace { * and returns true if that key is to be used in the output dictionary. * @param dictionary_column The column to use for creating the new dictionary. * @param keys_to_keep_fn Called to determine which keys in `dictionary_column` to keep. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. */ template std::unique_ptr remove_keys_fn( diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 37118779248..11c81ee434b 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -44,8 +44,8 @@ namespace { * * @param input lhs for `copy_if_else` * @param replacement_iter rhs for `copy_if_else` - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
* @return Always returns column of type INT32 (size_type) */ template diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index b64199d7f0c..b8150f7fd14 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -26,13 +26,14 @@ #include #include #include -#include #include #include +#include #include #include #include #include +#include #include #include #include @@ -166,7 +167,6 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final cudf::detail::result_cache* sparse_results; cudf::detail::result_cache* dense_results; device_span gather_map; - size_type const map_size; Map const& map; bitmask_type const* __restrict__ row_bitmask; rmm::cuda_stream_view stream; @@ -179,7 +179,6 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, - size_type map_size, Map const& map, bitmask_type const* row_bitmask, rmm::cuda_stream_view stream, @@ -188,7 +187,6 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final sparse_results(sparse_results), dense_results(dense_results), gather_map(gather_map), - map_size(map_size), map(map), row_bitmask(row_bitmask), stream(stream), @@ -202,9 +200,9 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final { auto s = sparse_results->get_result(col, agg); auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), - gather_map.begin(), - gather_map.begin() + map_size, + gather_map, out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr); return std::move(dense_result_table->release()[0]); @@ -374,7 +372,7 @@ void sparse_to_dense_results(table_view const& keys, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, - size_type map_size, + // size_type 
map_size, Map const& map, bool keys_have_nulls, null_policy include_null_keys, @@ -393,7 +391,7 @@ void sparse_to_dense_results(table_view const& keys, // Given an aggregation, this will get the result from sparse_results and // convert and return dense, compacted result auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, map_size, map, row_bitmask_ptr, stream, mr); + col, sparse_results, dense_results, gather_map, map, row_bitmask_ptr, stream, mr); for (auto&& agg : agg_v) { agg->finalize(finalizer); } @@ -595,15 +593,18 @@ std::unique_ptr
groupby_null_templated(table_view const& keys, &sparse_results, cache, gather_map, - gather_map.size(), *map, keys_have_nulls, include_null_keys, stream, mr); - return cudf::detail::gather( - keys, gather_map.begin(), gather_map.end(), out_of_bounds_policy::DONT_CHECK, stream, mr); + return cudf::detail::gather(keys, + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); } } // namespace diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu index a30d4639af8..43a1674d97f 100644 --- a/cpp/src/groupby/sort/group_collect.cu +++ b/cpp/src/groupby/sort/group_collect.cu @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/groupby/sort/group_nth_element.cu b/cpp/src/groupby/sort/group_nth_element.cu index e7dc57f6c93..f2c57abf54e 100644 --- a/cpp/src/groupby/sort/group_nth_element.cu +++ b/cpp/src/groupby/sort/group_nth_element.cu @@ -20,12 +20,13 @@ #include #include #include -#include +#include #include #include #include #include +#include #include #include @@ -113,10 +114,11 @@ std::unique_ptr group_nth_element(column_view const& values, return (bitmask_iterator[i] && intra_group_index[i] == nth); }); } + auto output_table = cudf::detail::gather(table_view{{values}}, - nth_index.begin(), - nth_index.end(), + nth_index, out_of_bounds_policy::NULLIFY, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr); if (!output_table->get_column(0).has_nulls()) output_table->get_column(0).set_null_mask({}, 0); diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index cb01ee8e053..f5060a6ed4e 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -45,8 +45,8 @@ namespace detail { * @param values Grouped values to get sum of * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value 
belongs to - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_sum(column_view const& values, size_type num_groups, @@ -68,8 +68,8 @@ std::unique_ptr group_sum(column_view const& values, * @param values Grouped values to get product of * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value belongs to - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_product(column_view const& values, size_type num_groups, @@ -91,8 +91,8 @@ std::unique_ptr group_product(column_view const& values, * @param values Grouped values to get minimum from * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value belongs to - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_min(column_view const& values, size_type num_groups, @@ -114,8 +114,8 @@ std::unique_ptr group_min(column_view const& values, * @param values Grouped values to get maximum from * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value belongs to - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_max(column_view const& values, size_type num_groups, @@ -138,8 +138,8 @@ std::unique_ptr group_max(column_view const& values, * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value belongs to * @param key_sort_order Indices indicating sort order of groupby keys - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_argmax(column_view const& values, size_type num_groups, @@ -163,8 +163,8 @@ std::unique_ptr group_argmax(column_view const& values, * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value belongs to * @param key_sort_order Indices indicating sort order of groupby keys - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_argmin(column_view const& values, size_type num_groups, @@ -188,8 +188,8 @@ std::unique_ptr group_argmin(column_view const& values, * @param values Grouped values to get valid count of * @param group_labels ID of group that the corresponding value belongs to * @param num_groups Number of groups ( unique values in @p group_labels ) - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_count_valid(column_view const& values, cudf::device_span group_labels, @@ -209,8 +209,8 @@ std::unique_ptr group_count_valid(column_view const& values, * * @param group_offsets Offsets of groups' starting points within @p values * @param num_groups Number of groups ( unique values in @p group_labels ) - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_count_all(cudf::device_span group_offsets, size_type num_groups, @@ -232,8 +232,8 @@ std::unique_ptr group_count_all(cudf::device_span group * @param values Grouped values to compute M2 values * @param group_means Pre-computed groupwise MEAN * @param group_labels ID of group corresponding value in @p values belongs to - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_m2(column_view const& values, column_view const& group_means, @@ -260,8 +260,8 @@ std::unique_ptr group_m2(column_view const& values, * @param group_labels ID of group corresponding value in @p values belongs to * @param ddof Delta degrees of freedom. The divisor used in calculation of * `var` is `N - ddof`, where `N` is the group size. - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_var(column_view const& values, column_view const& group_means, @@ -289,8 +289,8 @@ std::unique_ptr group_var(column_view const& values, * @param group_offsets Offsets of groups' starting points within @p values * @param quantiles List of quantiles q where q lies in [0,1] * @param interp Method to use when desired value lies between data points - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_quantiles(column_view const& values, column_view const& group_sizes, @@ -322,8 +322,8 @@ std::unique_ptr group_quantiles(column_view const& values, * @param null_handling Exclude nulls while counting if null_policy::EXCLUDE, * Include nulls if null_policy::INCLUDE. * Nulls are treated equal. - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_nunique(column_view const& values, cudf::device_span group_labels, @@ -355,8 +355,8 @@ std::unique_ptr group_nunique(column_view const& values, * @param n nth element to choose from each group of @p values * @param null_handling Exclude nulls while counting if null_policy::EXCLUDE, * Include nulls if null_policy::INCLUDE. - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_nth_element(column_view const& values, column_view const& group_sizes, @@ -433,8 +433,8 @@ std::unique_ptr group_merge_lists(column_view const& values, * @param values Grouped values (tuples of values `(valid_count, mean, M2)`) to merge. * @param group_offsets Offsets of groups' starting points within @p values. * @param num_groups Number of groups. - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_merge_m2(column_view const& values, cudf::device_span group_offsets, diff --git a/cpp/src/groupby/sort/group_replace_nulls.cu b/cpp/src/groupby/sort/group_replace_nulls.cu index 86590fc1734..cb954eb7ce5 100644 --- a/cpp/src/groupby/sort/group_replace_nulls.cu +++ b/cpp/src/groupby/sort/group_replace_nulls.cu @@ -14,8 +14,9 @@ * limitations under the License. 
*/ #include -#include +#include #include +#include #include #include @@ -68,9 +69,9 @@ std::unique_ptr group_replace_nulls(cudf::column_view const& grouped_val } auto output = cudf::detail::gather(cudf::table_view({grouped_value}), - gather_map.begin(), - gather_map.end(), + gather_map, cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr); diff --git a/cpp/src/groupby/sort/group_tdigest.cu b/cpp/src/groupby/sort/group_tdigest.cu index 5b4252a9063..0738e4c5730 100644 --- a/cpp/src/groupby/sort/group_tdigest.cu +++ b/cpp/src/groupby/sort/group_tdigest.cu @@ -299,6 +299,8 @@ __global__ void generate_cluster_limits_kernel(int delta_, nearest_w_index = last_inserted_index + 1; auto [r, i, adjusted] = cumulative_weight(nearest_w_index); adjusted_next_limit = max(next_limit, adjusted); + (void)r; + (void)i; } cluster_wl[group_num_clusters[group_index]] = adjusted_next_limit; last_inserted_index = nearest_w_index; @@ -469,6 +471,7 @@ std::unique_ptr compute_tdigests(int delta, group_cumulative_weight] __device__(size_type value_index) -> size_type { auto [group_index, relative_value_index, cumulative_weight] = group_cumulative_weight(value_index); + (void)relative_value_index; // compute start of cluster weight limits for this group double const* weight_limits = group_cluster_wl + group_cluster_offsets[group_index]; @@ -532,10 +535,12 @@ struct get_scalar_minmax { __device__ thrust::tuple operator()(size_type group_index) { - // note: .element() is taking care of fixed-point conversions for us. - return {static_cast(col.element(group_offsets[group_index])), - static_cast( - col.element(group_offsets[group_index] + (group_valid_counts[group_index] - 1)))}; + auto const valid_count = group_valid_counts[group_index]; + return valid_count > 0 + ? 
thrust::make_tuple( + static_cast(col.element(group_offsets[group_index])), + static_cast(col.element(group_offsets[group_index] + valid_count - 1))) + : thrust::make_tuple(0.0, 0.0); } }; @@ -601,15 +606,9 @@ struct typed_group_tdigest { template < typename T, + typename... Args, typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> - std::unique_ptr operator()(column_view const& col, - cudf::device_span group_offsets, - cudf::device_span group_labels, - cudf::device_span group_valid_counts, - size_type num_groups, - int delta, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + std::unique_ptr operator()(Args&&...) { CUDF_FAIL("Non-numeric type in group_tdigest"); } diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 4d3736a41f0..3b4549c135f 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -299,7 +299,8 @@ __inline__ __device__ T decode_value(char const* begin, return cudf::io::parse_numeric(begin, end, opts); } -template +template () and !cudf::is_duration()>* = nullptr> __inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const& opts) @@ -307,81 +308,22 @@ __inline__ __device__ T decode_value(char const* begin, return cudf::io::parse_numeric(begin, end, opts); } -template <> -__inline__ __device__ cudf::timestamp_D decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return timestamp_D{cudf::duration_D{to_date(begin, end, opts.dayfirst)}}; -} - -template <> -__inline__ __device__ cudf::timestamp_s decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - auto milli = to_date_time(begin, end, opts.dayfirst); - if (milli == -1) { - return timestamp_s{cudf::duration_s{to_non_negative_integer(begin, end)}}; - } else { - return timestamp_s{cudf::duration_s{milli / 1000}}; - } -} - -template <> -__inline__ __device__ cudf::timestamp_ms decode_value(char const* begin, - char const* 
end, - parse_options_view const& opts) -{ - auto milli = to_date_time(begin, end, opts.dayfirst); - if (milli == -1) { - return timestamp_ms{cudf::duration_ms{to_non_negative_integer(begin, end)}}; - } else { - return timestamp_ms{cudf::duration_ms{milli}}; - } -} - -template <> -__inline__ __device__ cudf::timestamp_us decode_value(char const* begin, - char const* end, - parse_options_view const& opts) +template ()>* = nullptr> +__inline__ __device__ T decode_value(char const* begin, + char const* end, + parse_options_view const& opts) { - auto milli = to_date_time(begin, end, opts.dayfirst); - if (milli == -1) { - return timestamp_us{cudf::duration_us{to_non_negative_integer(begin, end)}}; - } else { - return timestamp_us{cudf::duration_us{milli * 1000}}; - } + return to_timestamp(begin, end, opts.dayfirst); } -template <> -__inline__ __device__ cudf::timestamp_ns decode_value(char const* begin, - char const* end, - parse_options_view const& opts) +template ()>* = nullptr> +__inline__ __device__ T decode_value(char const* begin, + char const* end, + parse_options_view const& opts) { - auto milli = to_date_time(begin, end, opts.dayfirst); - if (milli == -1) { - return timestamp_ns{cudf::duration_ns{to_non_negative_integer(begin, end)}}; - } else { - return timestamp_ns{cudf::duration_ns{milli * 1000000}}; - } + return to_duration(begin, end); } -#ifndef DURATION_DECODE_VALUE -#define DURATION_DECODE_VALUE(Type) \ - template <> \ - __inline__ __device__ Type decode_value( \ - const char* begin, const char* end, parse_options_view const& opts) \ - { \ - return Type{to_time_delta(begin, end)}; \ - } -#endif -DURATION_DECODE_VALUE(duration_D) -DURATION_DECODE_VALUE(duration_s) -DURATION_DECODE_VALUE(duration_ms) -DURATION_DECODE_VALUE(duration_us) -DURATION_DECODE_VALUE(duration_ns) - // The purpose of this is merely to allow compilation ONLY // TODO : make this work for csv template <> diff --git a/cpp/src/io/csv/datetime.cuh b/cpp/src/io/csv/datetime.cuh index 
3e785259476..18758bd5309 100644 --- a/cpp/src/io/csv/datetime.cuh +++ b/cpp/src/io/csv/datetime.cuh @@ -18,8 +18,10 @@ #include -#include +#include + #include +#include namespace cudf { namespace io { @@ -49,101 +51,24 @@ __inline__ __device__ T to_non_negative_integer(char const* begin, char const* e return value; } -// User-defined literals to clarify numbers and units for time calculation -__inline__ __device__ constexpr uint32_t operator"" _days(unsigned long long int days) -{ - return days; -} -__inline__ __device__ constexpr uint32_t operator"" _erasInDays(unsigned long long int eras) -{ - return eras * 146097_days; // multiply by days within an era (400 year span) -} -__inline__ __device__ constexpr uint32_t operator"" _years(unsigned long long int years) -{ - return years; -} -__inline__ __device__ constexpr uint32_t operator"" _erasInYears(unsigned long long int eras) -{ - return (eras * 1_erasInDays) / 365_days; -} - -/** - * @brief Computes the number of days since "March 1, 0000", given a date. - * - * This function takes year, month, and day and returns the number of days since the baseline which - * is taken as 0000-03-01. This value is chosen as the origin for ease of calculation (now February - * becomes the last month). - * - * @return days since March 1, 0000 - */ -__inline__ __device__ constexpr int32_t days_since_baseline(int year, int month, int day) -{ - // More details of this formula are located in cuDF datetime_ops - // In brief, the calculation is split over several components: - // era: a 400 year range, where the date cycle repeats exactly - // yoe: year within the 400 range of an era - // doy: day within the 364 range of a year - // doe: exact day within the whole era - // The months are shifted so that March is the starting month and February - // (possible leap day in it) is the last month for the linear calculation - year -= (month <= 2) ? 1 : 0; - - const int32_t era = (year >= 0 ? 
year : year - 399_years) / 1_erasInYears; - const int32_t yoe = year - era * 1_erasInYears; - const int32_t doy = (153_days * (month + (month > 2 ? -3 : 9)) + 2) / 5 + day - 1; - const int32_t doe = (yoe * 365_days) + (yoe / 4_years) - (yoe / 100_years) + doy; - - return (era * 1_erasInDays) + doe; -} - -/** - * @brief Computes the number of days since epoch, given a date. - * - * This function takes year, month, and day and returns the number of days since epoch (1970-01-01). - * - * @return days since epoch - */ -__inline__ __device__ constexpr int32_t days_since_epoch(int year, int month, int day) -{ - // Shift the start date to epoch to match unix time - static_assert(static_cast(days_since_baseline(1970, 1, 1)) == 719468_days, - "Baseline to epoch returns incorrect number of days"); - - return days_since_baseline(year, month, day) - days_since_baseline(1970, 1, 1); -} - /** - * @brief Computes the number of seconds since epoch, given a date and time. + * @brief Extracts the Day, Month, and Year from a string. * - * This function takes year, month, day, hour, minute and second and returns - * the number of seconds since epoch (1970-01-01), + * This function takes a string and produces a `year_month_day` representation. + * Acceptable formats are a combination of `YYYY`, `M`, `MM`, `D` and `DD` with + * `/` or `-` as separators. Data with only year and month (no day) is also valid. 
* - * @return seconds since epoch + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param dayfirst Flag indicating that first field is the day + * @return Extracted year, month and day in `cuda::std::chrono::year_month_day` format */ -__inline__ __device__ constexpr int64_t seconds_since_epoch( - int year, int month, int day, int hour, int minute, int second) +__inline__ __device__ cuda::std::chrono::year_month_day extract_date(char const* begin, + char const* end, + bool dayfirst) { - // Leverage the function to find the days since epoch - const int64_t days = days_since_epoch(year, month, day); + using namespace cuda::std::chrono; - // Return sum total seconds from each time portion - return (days * 24 * 60 * 60) + (hour * 60 * 60) + (minute * 60) + second; -} - -/** - * @brief Extracts the Day, Month, and Year from a string. - * - * @param[in] begin Pointer to the first element of the string - * @param[in] end Pointer to the first element after the string - * @param[in] dayfirst Flag indicating that first field is the day - * @param[out] year - * @param[out] month - * @param[out] day - * @return true if successful, false otherwise - */ -__inline__ __device__ bool extract_date( - char const* begin, char const* end, bool dayfirst, int* year, int* month, int* day) -{ char sep = '/'; auto sep_pos = thrust::find(thrust::seq, begin, end, sep); @@ -153,11 +78,13 @@ __inline__ __device__ bool extract_date( sep_pos = thrust::find(thrust::seq, begin, end, sep); } - if (sep_pos == end) return false; + year y; + month m; + day d; //--- is year the first filed? 
if ((sep_pos - begin) == 4) { - *year = to_non_negative_integer(begin, sep_pos); + y = year{to_non_negative_integer(begin, sep_pos)}; // year is signed // Month auto s2 = sep_pos + 1; @@ -165,72 +92,70 @@ __inline__ __device__ bool extract_date( if (sep_pos == end) { //--- Data is just Year and Month - no day - *month = to_non_negative_integer(s2, end); - *day = 1; + m = month{to_non_negative_integer(s2, end)}; // month and day are unsigned + d = day{1}; } else { - *month = to_non_negative_integer(s2, sep_pos); - *day = to_non_negative_integer((sep_pos + 1), end); + m = month{to_non_negative_integer(s2, sep_pos)}; + d = day{to_non_negative_integer((sep_pos + 1), end)}; } } else { //--- if the dayfirst flag is set, then restricts the format options if (dayfirst) { - *day = to_non_negative_integer(begin, sep_pos); + d = day{to_non_negative_integer(begin, sep_pos)}; auto s2 = sep_pos + 1; sep_pos = thrust::find(thrust::seq, s2, end, sep); - *month = to_non_negative_integer(s2, sep_pos); - *year = to_non_negative_integer((sep_pos + 1), end); + m = month{to_non_negative_integer(s2, sep_pos)}; + y = year{to_non_negative_integer((sep_pos + 1), end)}; } else { - *month = to_non_negative_integer(begin, sep_pos); + m = month{to_non_negative_integer(begin, sep_pos)}; auto s2 = sep_pos + 1; sep_pos = thrust::find(thrust::seq, s2, end, sep); if (sep_pos == end) { //--- Data is just Year and Month - no day - *year = to_non_negative_integer(s2, end); - *day = 1; + y = year{to_non_negative_integer(s2, end)}; + d = day{1}; } else { - *day = to_non_negative_integer(s2, sep_pos); - *year = to_non_negative_integer((sep_pos + 1), end); + d = day{to_non_negative_integer(s2, sep_pos)}; + y = year{to_non_negative_integer((sep_pos + 1), end)}; } } } - return true; + return year_month_day{y, m, d}; } /** * @brief Parses a string to extract the hour, minute, second and millisecond time field - * values. + * values of a day. 
* * Incoming format is expected to be `HH:MM:SS.MS`, with the latter second and millisecond fields * optional. Each time field can be a single, double, or triple (in the case of milliseconds) * digits. 12-hr and 24-hr time format is detected via the absence or presence of AM/PM characters * at the end. * - * @param[in] begin Pointer to the first element of the string - * @param[in] end Pointer to the first element after the string - * @param[out] hour The hour value - * @param[out] minute The minute value - * @param[out] second The second value (0 if not present) - * @param[out] millisecond The millisecond (0 if not present) + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @return Extracted hours, minutes, seconds and milliseconds of `chrono::hh_mm_ss` type with a + * precision of milliseconds */ -__inline__ __device__ void extract_time( - char const* begin, char const* end, int* hour, int* minute, int* second, int* millisecond) +__inline__ __device__ cuda::std::chrono::hh_mm_ss extract_time_of_day( + char const* begin, char const* end) { constexpr char sep = ':'; // Adjust for AM/PM and any whitespace before - int hour_adjust = 0; - auto last = end - 1; + duration_h d_h{0}; + auto last = end - 1; if (*last == 'M' || *last == 'm') { - if (*(last - 1) == 'P' || *(last - 1) == 'p') { hour_adjust = 12; } + if (*(last - 1) == 'P' || *(last - 1) == 'p') { d_h = duration_h{12}; } last = last - 2; while (*last == ' ') { --last; @@ -240,94 +165,89 @@ __inline__ __device__ void extract_time( // Find hour-minute separator const auto hm_sep = thrust::find(thrust::seq, begin, end, sep); - *hour = to_non_negative_integer(begin, hm_sep) + hour_adjust; + // Extract hours + d_h += cudf::duration_h{to_non_negative_integer(begin, hm_sep)}; + + duration_m d_m{0}; + duration_s d_s{0}; + duration_ms d_ms{0}; // Find minute-second separator (if present) const auto ms_sep = thrust::find(thrust::seq, hm_sep + 1, 
end, sep); if (ms_sep == end) { - *minute = to_non_negative_integer(hm_sep + 1, end); - *second = 0; - *millisecond = 0; + d_m = duration_m{to_non_negative_integer(hm_sep + 1, end)}; } else { - *minute = to_non_negative_integer(hm_sep + 1, ms_sep); + d_m = duration_m{to_non_negative_integer(hm_sep + 1, ms_sep)}; // Find second-millisecond separator (if present) const auto sms_sep = thrust::find(thrust::seq, ms_sep + 1, end, '.'); if (sms_sep == end) { - *second = to_non_negative_integer(ms_sep + 1, end); - *millisecond = 0; + d_s = duration_s{to_non_negative_integer(ms_sep + 1, end)}; } else { - *second = to_non_negative_integer(ms_sep + 1, sms_sep); - *millisecond = to_non_negative_integer(sms_sep + 1, end); + d_s = duration_s{to_non_negative_integer(ms_sep + 1, sms_sep)}; + d_ms = duration_ms{to_non_negative_integer(sms_sep + 1, end)}; } } + return cuda::std::chrono::hh_mm_ss{d_h + d_m + d_s + d_ms}; } /** - * @brief Parses a date string into a `date32`, days since epoch. - * - * This function takes a string and produces a `date32` representation. - * Acceptable formats are a combination of `MM/YYYY` and `MM/DD/YYYY`. - * - * @param[in] begin Pointer to the first element of the string - * @param[in] end Pointer to the first element after the string - * @param[in] dayfirst Flag to indicate that day is the first field - `DD/MM/YYYY` - * @return Number of days since epoch + * @brief Checks whether `c` is decimal digit */ -__inline__ __device__ int32_t to_date(char const* begin, char const* end, bool dayfirst) -{ - int day, month, year; - - return extract_date(begin, end, dayfirst, &year, &month, &day) - ? days_since_epoch(year, month, day) - : -1; -} +constexpr bool is_digit(char c) { return c >= '0' and c <= '9'; } /** - * @brief Parses a datetime string and computes the number of milliseconds since epoch. + * @brief Parses a datetime string and computes the corresponding timestamp. * - * This function takes a string and produces a `date32` representation. 
- * Acceptable formats are a combination of `MM/YYYY` and `MM/DD/YYYY`. + * Acceptable date formats are a combination of `YYYY`, `M`, `MM`, `D` and `DD` with `/` or `-` as + * separators. Input with only year and month (no day) is also valid. Character `T` or blank space + * is expected to be the separator between date and time of day. Optional time of day information + * like hours, minutes, seconds and milliseconds are expected to be `HH:MM:SS.MS`. Each time field + * can be a single, double, or triple (in the case of milliseconds) digits. 12-hr and 24-hr time + * format is detected via the absence or presence of AM/PM characters at the end. * + * @tparam timestamp_type Type of output timestamp * @param begin Pointer to the first element of the string * @param end Pointer to the first element after the string * @param dayfirst Flag to indicate day/month or month/day order - * @return Milliseconds since epoch + * @return Timestamp converted to `timestamp_type` */ -__inline__ __device__ int64_t to_date_time(char const* begin, char const* end, bool dayfirst) +template +__inline__ __device__ timestamp_type to_timestamp(char const* begin, char const* end, bool dayfirst) { - int day, month, year; - int hour, minute, second, millisecond = 0; - int64_t answer = -1; + using duration_type = typename timestamp_type::duration; + + auto sep_pos = end; // Find end of the date portion - // TODO: Refactor all the date/time parsing to remove multiple passes over each character because - // of find() then convert(); that can also avoid the ugliness below. 
- auto sep_pos = thrust::find(thrust::seq, begin, end, 'T'); - if (sep_pos == end) { - // Attempt to locate the position between date and time, ignore premature space separators - // around the day/month/year portions - int count = 0; - for (auto i = begin; i < end; ++i) { - if (count == 3 && *i == ' ') { - sep_pos = i; - break; - } else if ((*i == '/' || *i == '-') || (count == 2 && *i != ' ')) { - count++; - } + int count = 0; + bool digits_only = true; + for (auto i = begin; i < end; ++i) { + digits_only = digits_only and is_digit(*i); + if (*i == 'T') { + sep_pos = i; + break; + } else if (count == 3 && *i == ' ') { + sep_pos = i; + break; + } else if ((*i == '/' || *i == '-') || (count == 2 && *i != ' ')) { + count++; } } - // There is only date if there's no separator, otherwise it's malformed + // Exit if the input string is digit-only + if (digits_only) { + return timestamp_type{ + duration_type{to_non_negative_integer(begin, end)}}; + } + + auto ymd = extract_date(begin, sep_pos, dayfirst); + timestamp_type answer{cuda::std::chrono::sys_days{ymd}}; + + // Extract time only if separator is present if (sep_pos != end) { - if (extract_date(begin, sep_pos, dayfirst, &year, &month, &day)) { - extract_time(sep_pos + 1, end, &hour, &minute, &second, &millisecond); - answer = seconds_since_epoch(year, month, day, hour, minute, second) * 1000 + millisecond; - } - } else { - if (extract_date(begin, end, dayfirst, &year, &month, &day)) { - answer = seconds_since_epoch(year, month, day, 0, 0, 0) * 1000; - } + auto t = extract_time_of_day(sep_pos + 1, end); + answer += cuda::std::chrono::duration_cast(t.to_duration()); } return answer; @@ -382,58 +302,65 @@ __inline__ __device__ T parse_optional_integer(char const** begin, char const* e } /** - * @brief Parses the input string into a duration of the given type. + * @brief Parses the input string into a duration of `duration_type`. 
* + * The expected format can be one of the following: `DD days`, `DD days +HH:MM:SS.NS`, `DD days + * HH:MM::SS.NS`, `HH:MM::SS.NS` and digits-only string. Note `DD` and optional `NS` field can + * contain arbitrary number of digits while `HH`, `MM` and `SS` can be single or double digits. + * + * @tparam duration_type Type of the parsed duration * @param begin Pointer to the first element of the string * @param end Pointer to the first element after the string - * @return The parsed duration + * @return The parsed duration in `duration_type` */ -template -__inline__ __device__ int64_t to_time_delta(char const* begin, char const* end) +template +__inline__ __device__ duration_type to_duration(char const* begin, char const* end) { + using cuda::std::chrono::duration_cast; + // %d days [+]%H:%M:%S.n => %d days, %d days [+]%H:%M:%S, %H:%M:%S.n, %H:%M:%S, %value. constexpr char sep = ':'; - int32_t days{0}; - int8_t hour{0}; // single pass to parse days, hour, minute, seconds, nanosecond auto cur = begin; auto const value = parse_integer(&cur, end); cur = skip_spaces(cur, end); - if (std::is_same_v || cur >= end) { // %value - return value; + if (std::is_same_v || cur >= end) { + return duration_type{static_cast(value)}; } + // " days [+]" auto const after_days_sep = skip_if_starts_with(cur, end, "days"); auto const has_days_seperator = (after_days_sep != cur); cur = skip_spaces(after_days_sep, end); cur += (*cur == '+'); + + duration_D d_d{0}; + duration_h d_h{0}; if (has_days_seperator) { - days = value; - hour = parse_integer(&cur, end); + d_d = duration_D{value}; + d_h = duration_h{parse_integer(&cur, end)}; } else { - hour = value; + d_h = duration_h{value}; } - auto const minute = parse_optional_integer(&cur, end, sep); - auto const second = parse_optional_integer(&cur, end, sep); - - int nanosecond = 0; - if (std::is_same_v) { - return ((days * 24L + hour) * 60L + minute) * 60L + second; - } else if (*cur == '.') { //.n - auto const start_subsecond = ++cur; - 
nanosecond = parse_integer(&cur, end); - int8_t const num_digits = min(9L, cur - start_subsecond); - constexpr int64_t powers_of_ten[] = { - 1L, 10L, 100L, 1000L, 10000L, 100000L, 1000000L, 10000000L, 100000000L, 1000000000L}; - nanosecond *= powers_of_ten[9 - num_digits]; - } + duration_m d_m{parse_optional_integer(&cur, end, sep)}; + duration_s d_s{parse_optional_integer(&cur, end, sep)}; + + // Convert all durations to the given type + auto output_d = duration_cast(d_d + d_h + d_m + d_s); + + if constexpr (std::is_same_v) { return output_d; } + + auto const d_ns = (*cur != '.') ? duration_ns{0} : [&]() { + auto const start_subsecond = ++cur; + auto const unscaled_subseconds = parse_integer(&cur, end); + auto const scale = min(9L, cur - start_subsecond) - 9; + auto const rescaled = numeric::decimal64{unscaled_subseconds, numeric::scale_type{scale}}; + return duration_ns{rescaled.value()}; + }(); - return cuda::std::chrono::duration_cast( - cudf::duration_s{((days * 24L + hour) * 60L + minute) * 60L + second}) - .count() + - cuda::std::chrono::duration_cast(cudf::duration_ns{nanosecond}).count(); + return output_d + duration_cast(d_ns); } } // namespace io diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index de75ea6a51d..9a0c701ea49 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -135,9 +135,9 @@ struct column_to_strings_fn { // instead of column-wise; might be faster // // Note: Cannot pass `stream` to detail:: version of calls below, because they are - // not exposed in header (see, for example, detail::concatenate(tbl_view, separator, na_rep, mr, - // stream) is declared and defined in combine.cu); Possible solution: declare `extern`, or just - // declare a prototype inside `namespace cudf::strings::detail`; + // not exposed in header (see, for example, detail::concatenate(tbl_view, separator, na_rep, + // stream, mr) is declared and defined in combine.cu); Possible solution: declare `extern`, or + 
// just declare a prototype inside `namespace cudf::strings::detail`; // bools: // diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index ba6bc30e0d4..673d9054631 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -22,7 +22,6 @@ #include #include -#include #include #include #include @@ -125,7 +124,8 @@ __inline__ __device__ T decode_value(const char* begin, * * @return The parsed numeric value */ -template +template () and !cudf::is_duration()>* = nullptr> __inline__ __device__ T decode_value(const char* begin, const char* end, parse_options_view const& opts) @@ -133,110 +133,22 @@ __inline__ __device__ T decode_value(const char* begin, return cudf::io::parse_numeric(begin, end, opts); } -/** - * @brief Decodes a timestamp_D - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed timestamp_D - */ -template <> -__inline__ __device__ cudf::timestamp_D decode_value(const char* begin, - const char* end, - parse_options_view const& opts) -{ - return cudf::timestamp_D{cudf::duration_D{to_date(begin, end, opts.dayfirst)}}; -} - -/** - * @brief Decodes a timestamp_s - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed timestamp_s - */ -template <> -__inline__ __device__ cudf::timestamp_s decode_value(const char* begin, - const char* end, - parse_options_view const& opts) -{ - auto milli = to_date_time(begin, end, opts.dayfirst); - return cudf::timestamp_s{cudf::duration_s{milli / 1000}}; -} - -/** - * @brief Decodes a timestamp_ms - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed timestamp_ms - */ -template <> -__inline__ __device__ 
cudf::timestamp_ms decode_value(const char* begin, - const char* end, - parse_options_view const& opts) -{ - auto milli = to_date_time(begin, end, opts.dayfirst); - return cudf::timestamp_ms{cudf::duration_ms{milli}}; -} - -/** - * @brief Decodes a timestamp_us - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed timestamp_us - */ -template <> -__inline__ __device__ cudf::timestamp_us decode_value(const char* begin, - const char* end, - parse_options_view const& opts) +template ()>* = nullptr> +__inline__ __device__ T decode_value(char const* begin, + char const* end, + parse_options_view const& opts) { - auto milli = to_date_time(begin, end, opts.dayfirst); - return cudf::timestamp_us{cudf::duration_us{milli * 1000}}; + return to_timestamp(begin, end, opts.dayfirst); } -/** - * @brief Decodes a timestamp_ns - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed timestamp_ns - */ -template <> -__inline__ __device__ cudf::timestamp_ns decode_value(const char* begin, - const char* end, - parse_options_view const& opts) +template ()>* = nullptr> +__inline__ __device__ T decode_value(char const* begin, + char const* end, + parse_options_view const& opts) { - auto milli = to_date_time(begin, end, opts.dayfirst); - return cudf::timestamp_ns{cudf::duration_ns{milli * 1000000}}; + return to_duration(begin, end); } -#ifndef DURATION_DECODE_VALUE -#define DURATION_DECODE_VALUE(Type) \ - template <> \ - __inline__ __device__ Type decode_value( \ - const char* begin, const char* end, parse_options_view const&) \ - { \ - return Type{to_time_delta(begin, end)}; \ - } -#endif -DURATION_DECODE_VALUE(duration_D) -DURATION_DECODE_VALUE(duration_s) -DURATION_DECODE_VALUE(duration_ms) -DURATION_DECODE_VALUE(duration_us) 
-DURATION_DECODE_VALUE(duration_ns) - // The purpose of these is merely to allow compilation ONLY template <> __inline__ __device__ cudf::string_view decode_value(const char*, diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index b15c5a0941d..d05bec92166 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -24,6 +24,7 @@ #include "timezone.cuh" #include +#include #include "orc.h" #include @@ -91,20 +92,6 @@ constexpr type_id to_type_id(const orc::SchemaType& schema, return type_id::EMPTY; } -/** - * @brief Function that translates cuDF time unit to ORC clock frequency - */ -constexpr int32_t to_clockrate(type_id timestamp_type_id) -{ - switch (timestamp_type_id) { - case type_id::TIMESTAMP_SECONDS: return 1; - case type_id::TIMESTAMP_MILLISECONDS: return 1000; - case type_id::TIMESTAMP_MICROSECONDS: return 1000000; - case type_id::TIMESTAMP_NANOSECONDS: return 1000000000; - default: return 0; - } -} - constexpr std::pair get_index_type_and_pos( const orc::StreamKind kind, uint32_t skip_count, bool non_child) { @@ -822,8 +809,8 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks } } - thrust::counting_iterator col_idx_it(0); - thrust::counting_iterator stripe_idx_it(0); + thrust::counting_iterator col_idx_it(0); + thrust::counting_iterator stripe_idx_it(0); if (is_mask_updated) { // Update chunks with pointers to column data which might have been changed. 
@@ -892,8 +879,8 @@ void reader::impl::decode_stream_data(cudf::detail::hostdevice_2dvector col_idx_it(0); - thrust::counting_iterator stripe_idx_it(0); + thrust::counting_iterator col_idx_it(0); + thrust::counting_iterator stripe_idx_it(0); // Update chunks with pointers to column data std::for_each(stripe_idx_it, stripe_idx_it + num_stripes, [&](auto stripe_idx) { diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index f566a4b53b5..c932cda80d5 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -17,6 +17,7 @@ #include #include #include + #include "orc_common.h" #include "orc_gpu.h" @@ -1744,9 +1745,10 @@ __global__ void __launch_bounds__(block_size) break; case DATE: if (s->chunk.dtype_len == 8) { - // Convert from days to milliseconds by multiplying by 24*3600*1000 + cudf::duration_D days{s->vals.i32[t + vals_skipped]}; + // Convert from days to milliseconds static_cast(data_out)[row] = - 86400000ll * (int64_t)s->vals.i32[t + vals_skipped]; + cuda::std::chrono::duration_cast(days).count(); } else { static_cast(data_out)[row] = s->vals.u32[t + vals_skipped]; } @@ -1787,13 +1789,17 @@ __global__ void __launch_bounds__(block_size) seconds += get_gmt_offset(tz_table.ttimes, tz_table.offsets, seconds); } if (seconds < 0 && nanos != 0) { seconds -= 1; } - if (s->chunk.ts_clock_rate) + if (s->chunk.ts_clock_rate) { + duration_ns d_ns{nanos}; + d_ns += duration_s{seconds}; static_cast(data_out)[row] = - seconds * s->chunk.ts_clock_rate + - (nanos + (499999999 / s->chunk.ts_clock_rate)) / - (1000000000 / s->chunk.ts_clock_rate); // Output to desired clock rate - else - static_cast(data_out)[row] = seconds * 1000000000 + nanos; + d_ns.count() * s->chunk.ts_clock_rate / + duration_ns::period::den; // Output to desired clock rate + } else { + cudf::duration_s d{seconds}; + static_cast(data_out)[row] = + cuda::std::chrono::duration_cast(d).count() + nanos; + } break; } } diff --git a/cpp/src/io/orc/stripe_enc.cu 
b/cpp/src/io/orc/stripe_enc.cu index cc7e22f2042..c8ed0e36966 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -614,12 +615,6 @@ inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, } } -/** - * @brief Timestamp scale table (powers of 10) - */ -static const __device__ __constant__ int32_t kTimeScale[10] = { - 1000000000, 100000000, 10000000, 1000000, 100000, 10000, 1000, 100, 10, 1}; - template static __device__ void encode_null_mask(orcenc_state_s* s, bitmask_type const* pushdown_mask, @@ -808,7 +803,7 @@ __global__ void __launch_bounds__(block_size) case BYTE: s->vals.u8[nz_idx] = column.element(row); break; case TIMESTAMP: { int64_t ts = column.element(row); - int32_t ts_scale = kTimeScale[min(s->chunk.scale, 9)]; + int32_t ts_scale = powers_of_ten[9 - min(s->chunk.scale, 9)]; int64_t seconds = ts / ts_scale; int64_t nanos = (ts - seconds * ts_scale); // There is a bug in the ORC spec such that for negative timestamps, it is understood @@ -822,7 +817,7 @@ __global__ void __launch_bounds__(block_size) if (nanos != 0) { // Trailing zeroes are encoded in the lower 3-bits uint32_t zeroes = 0; - nanos *= kTimeScale[9 - min(s->chunk.scale, 9)]; + nanos *= powers_of_ten[min(s->chunk.scale, 9)]; if (!(nanos % 100)) { nanos /= 100; zeroes = 1; diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index f5bda3401c0..77fde0d1e75 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -315,14 +315,6 @@ dst_transition_s posix_parser::parse_transition() return {0, 0, 0, 0, time}; } -/** - * @brief Checks if a given year is a leap year. - */ -static bool is_leap_year(uint32_t year) -{ - return ((year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0))); -} - /** * @brief Returns the number of days in a month. 
*/ @@ -346,10 +338,11 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) { auto day = trans.day; + auto const is_leap = cuda::std::chrono::year{year}.is_leap(); + if (trans.type == 'M') { - auto const is_leap = is_leap_year(year); - auto const month = std::min(std::max(trans.month, 1), 12); - auto week = std::min(std::max(trans.week, 1), 52); + auto const month = std::min(std::max(trans.month, 1), 12); + auto week = std::min(std::max(trans.week, 1), 52); // Year-to-year day adjustment auto const adjusted_month = (month + 9) % 12 + 1; @@ -372,10 +365,10 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) } } else if (trans.type == 'J') { // Account for 29th of February on leap years - day += (day > 31 + 29 && is_leap_year(year)); + day += (day > 31 + 29 && is_leap); } - return trans.time + day * day_seconds; + return trans.time + cuda::std::chrono::duration_cast(duration_D{day}).count(); } timezone_table build_timezone_transition_table(std::string const& timezone_name, @@ -445,7 +438,7 @@ timezone_table build_timezone_transition_table(std::string const& timezone_name, // Add entries to fill the transition cycle int64_t year_timestamp = 0; - for (uint32_t year = 1970; year < 1970 + cycle_years; ++year) { + for (int32_t year = 1970; year < 1970 + cycle_years; ++year) { auto const dst_start_time = get_transition_time(dst_start, year); auto const dst_end_time = get_transition_time(dst_end, year); @@ -461,7 +454,9 @@ timezone_table build_timezone_transition_table(std::string const& timezone_name, std::swap(offsets.rbegin()[0], offsets.rbegin()[1]); } - year_timestamp += (365 + is_leap_year(year)) * day_seconds; + year_timestamp += cuda::std::chrono::duration_cast( + duration_D{365 + cuda::std::chrono::year{year}.is_leap()}) + .count(); } rmm::device_uvector d_ttimes{ttimes.size(), stream}; diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index f4a2ef4fc49..e091efef072 100644 --- 
a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -36,11 +37,11 @@ struct timezone_table_view { cudf::device_span offsets; }; -static constexpr int64_t day_seconds = 24 * 60 * 60; // Cycle in which the time offsets repeat -static constexpr uint32_t cycle_years = 400; +static constexpr int32_t cycle_years = 400; // Number of seconds in 400 years -static constexpr int64_t cycle_seconds = (365 * 400 + (100 - 3)) * day_seconds; +static constexpr int64_t cycle_seconds = + cuda::std::chrono::duration_cast(duration_D{365 * cycle_years + (100 - 3)}).count(); // Two entries per year, over the length of the cycle static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index f8158eaa6e9..ebc655578f7 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -627,6 +627,8 @@ inline __device__ void gpuStoreOutput(uint2* dst, */ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, int src_pos, int64_t* dst) { + using cuda::std::chrono::duration_cast; + const uint8_t* src8; uint32_t dict_pos, dict_size = s->dict_size, ofs; int64_t ts; @@ -646,7 +648,7 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, int src ofs <<= 3; // bytes -> bits if (dict_pos + 4 < dict_size) { uint3 v; - int64_t nanos, secs, days; + int64_t nanos, days; v.x = *reinterpret_cast(src8 + dict_pos + 0); v.y = *reinterpret_cast(src8 + dict_pos + 4); v.z = *reinterpret_cast(src8 + dict_pos + 8); @@ -661,13 +663,15 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, int src nanos |= v.x; // Convert from Julian day at noon to UTC seconds days = static_cast(v.z); - secs = (days - 2440588) * - (24 * 60 * 60); // TBD: Should be noon instead of midnight, but this matches pyarrow - if (s->col.ts_clock_rate) - ts = (secs * s->col.ts_clock_rate) + - nanos / (1000000000 / s->col.ts_clock_rate); // Output to desired clock rate - else - ts = (secs * 1000000000) + nanos; + cudf::duration_D d{ + days - 2440588}; // TBD: Should be noon instead of midnight, but this matches pyarrow + if (s->col.ts_clock_rate) { + int64_t secs = duration_cast(d).count() + + duration_cast(cudf::duration_ns{nanos}).count(); + ts = secs * s->col.ts_clock_rate; // Output to desired clock rate + } else { + ts = duration_cast(d).count() + nanos; + } } else { ts = 0; } @@ -999,11 +1003,14 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, case INT64: if (s->col.ts_clock_rate) { int32_t units = 0; - if (s->col.converted_type == TIME_MICROS || s->col.converted_type == TIMESTAMP_MICROS) - units = 1000000; + if (s->col.converted_type == TIME_MICROS || s->col.converted_type == TIMESTAMP_MICROS) { + units = cudf::timestamp_us::period::den; + } + else if (s->col.converted_type == TIME_MILLIS || - 
s->col.converted_type == TIMESTAMP_MILLIS) - units = 1000; + s->col.converted_type == TIMESTAMP_MILLIS) { + units = cudf::timestamp_ms::period::den; + } if (units && units != s->col.ts_clock_rate) s->ts_scale = (s->col.ts_clock_rate < units) ? -(units / s->col.ts_clock_rate) : (s->col.ts_clock_rate / units); diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 5b1d8c846bf..e79a19fc2e9 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -712,6 +712,13 @@ static __device__ void PlainBoolEncode(page_enc_state_s* s, } } +/** + * @brief Determines the difference between the Proleptic Gregorian Calendar epoch (1970-01-01 + * 00:00:00 UTC) and the Julian date epoch (-4713-11-24 12:00:00 UTC). + * + * @return The difference between two epochs in `cuda::std::chrono::duration` format with a period + * of hours. + */ constexpr auto julian_calendar_epoch_diff() { using namespace cuda::std::chrono; @@ -720,22 +727,21 @@ constexpr auto julian_calendar_epoch_diff() } /** - * @brief Converts a sys_time into a pair with nanoseconds since midnight and number of - * Julian days. Does not deal with time zones. Used by INT96 code. + * @brief Converts a timestamp_ns into a pair with nanoseconds since midnight and number of Julian + * days. Does not deal with time zones. Used by INT96 code. * * @param ns number of nanoseconds since epoch * @return std::pair where nanoseconds is the number of nanoseconds * elapsed in the day and days is the number of days from Julian epoch. 
*/ -static __device__ std::pair -convert_nanoseconds(cuda::std::chrono::sys_time const ns) +static __device__ std::pair convert_nanoseconds(timestamp_ns const ns) { using namespace cuda::std::chrono; auto const nanosecond_ticks = ns.time_since_epoch(); auto const gregorian_days = floor(nanosecond_ticks); auto const julian_days = gregorian_days + ceil(julian_calendar_epoch_diff()); - auto const last_day_ticks = nanosecond_ticks - duration_cast(gregorian_days); + auto const last_day_ticks = nanosecond_ticks - gregorian_days; return {last_day_ticks, julian_days}; } @@ -1038,19 +1044,17 @@ __global__ void __launch_bounds__(128, 8) } auto const ret = convert_nanoseconds([&]() { - using namespace cuda::std::chrono; - switch (s->col.leaf_column->type().id()) { case type_id::TIMESTAMP_SECONDS: case type_id::TIMESTAMP_MILLISECONDS: { - return sys_time{milliseconds{v}}; + return timestamp_ns{duration_ms{v}}; } break; case type_id::TIMESTAMP_MICROSECONDS: case type_id::TIMESTAMP_NANOSECONDS: { - return sys_time{microseconds{v}}; + return timestamp_ns{duration_us{v}}; } break; } - return sys_time{microseconds{0}}; + return timestamp_ns{duration_ns{0}}; }()); // the 12 bytes of fixed length data. 
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index a8dfabd9514..06a696d6751 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -22,6 +22,7 @@ #include "reader_impl.hpp" #include +#include #include #include @@ -180,24 +181,6 @@ type_id to_type_id(SchemaElement const& schema, return type_id::EMPTY; } -/** - * @brief Function that translates cuDF time unit to Parquet clock frequency - */ -constexpr int32_t to_clockrate(type_id timestamp_type_id) -{ - switch (timestamp_type_id) { - case type_id::DURATION_SECONDS: return 1; - case type_id::DURATION_MILLISECONDS: return 1000; - case type_id::DURATION_MICROSECONDS: return 1000000; - case type_id::DURATION_NANOSECONDS: return 1000000000; - case type_id::TIMESTAMP_SECONDS: return 1; - case type_id::TIMESTAMP_MILLISECONDS: return 1000; - case type_id::TIMESTAMP_MICROSECONDS: return 1000000; - case type_id::TIMESTAMP_NANOSECONDS: return 1000000000; - default: return 0; - } -} - /** * @brief Function that returns the required the number of bits to store a value */ @@ -207,6 +190,11 @@ T required_bits(uint32_t max_level) return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); } +/** + * @brief Converts cuDF units to Parquet units. + * + * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. 
+ */ std::tuple conversion_info(type_id column_type_id, type_id timestamp_type_id, parquet::Type physical, diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 99b6410c44d..e71cd063e70 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -85,8 +85,8 @@ class writer::impl { * @param filepath Filepath if storing dataset to a file * @param options Settings for controlling behavior * @param mode Option to write at once or in chunks - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation */ explicit impl(std::unique_ptr sink, chunked_parquet_writer_options const& options, diff --git a/cpp/src/io/utilities/time_utils.cuh b/cpp/src/io/utilities/time_utils.cuh new file mode 100644 index 00000000000..687766c1bcc --- /dev/null +++ b/cpp/src/io/utilities/time_utils.cuh @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace cudf { +namespace io { + +/** + * @brief Lookup table to compute power of ten + */ +static const __device__ __constant__ int32_t powers_of_ten[10] = { + 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000}; + +struct get_period { + template + constexpr int32_t operator()() + { + if constexpr (is_chrono()) { return T::period::den; } + CUDF_FAIL("Invalid, non chrono type"); + } +}; + +/** + * @brief Function that translates cuDF time unit to clock frequency + */ +constexpr int32_t to_clockrate(type_id timestamp_type_id) +{ + return timestamp_type_id == type_id::EMPTY + ? 0 + : type_dispatcher(data_type{timestamp_type_id}, get_period{}); +} + +} // namespace io +} // namespace cudf diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 636729a735e..07ad2e052f1 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -17,11 +17,12 @@ #include #include +#include #include -#include -#include +#include #include +#include #include #include diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index dd21a22803b..dfb1af3cef1 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -16,8 +16,6 @@ #pragma once #include -#include -#include #include #include #include diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 740431b8563..db79075d864 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index 4b84d80f6a0..4bef312b396 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -202,9 +202,9 @@ std::unique_ptr gather_list_entries(column_view const& 
input, }); auto result = cudf::detail::gather(table_view{{entry_col}}, - gather_map.begin(), - gather_map.end(), + gather_map, out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr); return std::move(result->release()[0]); diff --git a/cpp/src/lists/copying/copying.cu b/cpp/src/lists/copying/copying.cu index ff4649f4945..d4a3d5555a6 100644 --- a/cpp/src/lists/copying/copying.cu +++ b/cpp/src/lists/copying/copying.cu @@ -13,8 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include +#include #include -#include +#include +#include #include #include diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index da20cabdd8f..77d41c5ddc9 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -21,6 +21,7 @@ #include #include + #include namespace cudf { @@ -29,6 +30,7 @@ namespace detail { std::unique_ptr segmented_gather(lists_column_view const& value_column, lists_column_view const& gather_map, + out_of_bounds_policy bounds_policy, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -38,27 +40,38 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, CUDF_EXPECTS(value_column.size() == gather_map.size(), "Gather map and list column should be same size"); - auto gather_map_sliced_child = gather_map.get_sliced_child(stream); - auto const gather_map_size = gather_map_sliced_child.size(); - auto gather_index_begin = gather_map.offsets().begin() + 1 + gather_map.offset(); - auto gather_index_end = gather_index_begin + gather_map.size(); - auto value_offsets = value_column.offsets().begin() + value_column.offset(); - auto map_begin = cudf::detail::indexalator_factory::make_input_iterator(gather_map_sliced_child); + auto const gather_map_sliced_child = gather_map.get_sliced_child(stream); + auto const gather_map_size = 
gather_map_sliced_child.size(); + auto const gather_index_begin = gather_map.offsets_begin() + 1; + auto const gather_index_end = gather_map.offsets_end(); + auto const value_offsets = value_column.offsets_begin(); + auto const map_begin = + cudf::detail::indexalator_factory::make_input_iterator(gather_map_sliced_child); + auto const out_of_bounds = [] __device__(auto const index, auto const list_size) { + return index >= list_size || (index < 0 && -index > list_size); + }; // Calculate Flattened gather indices (value_offset[row]+sub_index - auto transformer = [value_offsets, map_begin, gather_index_begin, gather_index_end] __device__( - size_type index) -> size_type { + auto transformer = [value_offsets, + map_begin, + gather_index_begin, + gather_index_end, + bounds_policy, + out_of_bounds] __device__(size_type index) -> size_type { // Get each row's offset. (Each row is a list). auto offset_idx = thrust::upper_bound( thrust::seq, gather_index_begin, gather_index_end, gather_index_begin[-1] + index) - gather_index_begin; // Get each sub_index in list in each row of gather_map. - auto sub_index = map_begin[index]; - auto list_size = value_offsets[offset_idx + 1] - value_offsets[offset_idx]; - auto wrapped_sub_index = (sub_index % list_size + list_size) % list_size; + auto sub_index = map_begin[index]; + auto list_size = value_offsets[offset_idx + 1] - value_offsets[offset_idx]; + auto wrapped_sub_index = sub_index < 0 ? sub_index + list_size : sub_index; + auto constexpr null_idx = cuda::std::numeric_limits::max(); // Add sub_index to value_column offsets, to get gather indices of child of value_column - return value_offsets[offset_idx] + wrapped_sub_index - value_offsets[0]; + return (bounds_policy == out_of_bounds_policy::NULLIFY && out_of_bounds(sub_index, list_size)) + ? 
null_idx + : value_offsets[offset_idx] + wrapped_sub_index - value_offsets[0]; }; auto child_gather_index_begin = cudf::detail::make_counting_transform_iterator(0, transformer); @@ -66,7 +79,7 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, auto child_table = cudf::detail::gather(table_view({value_column.get_sliced_child(stream)}), child_gather_index_begin, child_gather_index_begin + gather_map_size, - out_of_bounds_policy::DONT_CHECK, + bounds_policy, stream, mr); auto child = std::move(child_table->release().front()); @@ -94,9 +107,11 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, std::unique_ptr segmented_gather(lists_column_view const& source_column, lists_column_view const& gather_map_list, + out_of_bounds_policy bounds_policy, rmm::mr::device_memory_resource* mr) { - return detail::segmented_gather(source_column, gather_map_list, rmm::cuda_stream_default, mr); + return detail::segmented_gather( + source_column, gather_map_list, bounds_policy, rmm::cuda_stream_default, mr); } } // namespace lists diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index e53ae4ff0c1..c547ca14f2d 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -18,13 +18,14 @@ #include #include -#include +#include #include #include #include #include #include #include +#include #include #include @@ -33,6 +34,7 @@ #include #include +#include #include namespace cudf { @@ -376,10 +378,10 @@ struct column_row_comparator_dispatch { } template ()>* = nullptr> - bool operator()(size_type i, size_type j) const + bool operator()(size_type, size_type) const { CUDF_FAIL( - "`column_row_comparator_dispatch` cannot operate on types that are not equally comparable."); + "column_row_comparator_dispatch cannot operate on types that are not equally comparable."); } }; @@ -543,13 +545,17 @@ std::vector> get_unique_entries_and_list_offsets( all_lists_entries.has_nulls(), 
stream); + auto gather_map = column_view(data_type{type_to_id()}, + static_cast(thrust::distance(output_begin, output_end)), + unique_indices.data()); + // Collect unique entries and entry list offsets. // The new null_count and bitmask of the unique entries will also be generated // by the gather function. return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, - output_begin, - output_end, + gather_map, cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr) ->release(); diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index 94b86b670b1..c8ef4912392 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -16,10 +16,11 @@ #include #include #include -#include +#include #include #include +#include #include @@ -97,9 +98,9 @@ std::unique_ptr extract_list_element(lists_column_view lists_column, // call gather on the child column auto result = cudf::detail::gather(table_view({child_column}), - d_gather_map, - d_gather_map + gather_map->size(), + gather_map->view(), out_of_bounds_policy::NULLIFY, // nullify-out-of-bounds + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr) ->release(); diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 67d71b7a39a..e8c56cdafd8 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu index 80beb6e715c..98ab713bfa2 100644 --- a/cpp/src/partitioning/round_robin.cu +++ b/cpp/src/partitioning/round_robin.cu @@ -68,8 +68,8 @@ namespace { * @Param[in] input The input table to be round-robin partitioned * @Param[in] num_partitions Number of partitions for the table * @Param[in] start_partition Index of the 1st partition - * @Param[in] mr Device memory resource used to allocate the 
returned table's device memory * @Param[in] stream CUDA stream used for device memory operations and kernel launches. + * @Param[in] mr Device memory resource used to allocate the returned table's device memory * * @Returns A std::pair consisting of a unique_ptr to the partitioned table and the partition * offsets for each partition within the table diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 25bf4a436ad..073b318b879 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -16,12 +16,13 @@ #include +#include #include -#include #include #include #include #include +#include #include #include #include @@ -30,6 +31,9 @@ #include #include +#include + +#include #include #include diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index bf515342afb..59a614664c9 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -57,15 +57,14 @@ struct minmax_pair { * * @tparam Op Binary operator functor * @tparam InputIterator Input iterator Type + * @tparam OutputType Output scalar type * @param d_in input iterator * @param num_items number of items to reduce * @param binary_op binary operator used to reduce - * @param mr Device resource used for result allocation * @param stream CUDA stream to run kernels on. 
* @return rmm::device_scalar */ -template ::type> rmm::device_scalar reduce_device(InputIterator d_in, @@ -155,16 +154,19 @@ struct minmax_functor { if (col.has_nulls()) { auto pair_to_minmax = thrust::make_transform_iterator( make_pair_iterator(*device_col), create_minmax_with_nulls{}); - return reduce_device(pair_to_minmax, col.size(), minmax_binary_op{}, stream); + return reduce_device(pair_to_minmax, col.size(), minmax_binary_op{}, stream); } else { auto col_to_minmax = thrust::make_transform_iterator(device_col->begin(), create_minmax{}); - return reduce_device(col_to_minmax, col.size(), minmax_binary_op{}, stream); + return reduce_device(col_to_minmax, col.size(), minmax_binary_op{}, stream); } } /** * @brief Functor to copy a minmax_pair result to individual scalar instances. + * + * @tparam T type of the data + * @tparam ResultType result type to assign min, max to minmax_pair */ template > struct assign_min_max { @@ -246,6 +248,11 @@ struct minmax_functor { } // namespace +/** + * @copydoc cudf::minmax + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ std::pair, std::unique_ptr> minmax( cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -260,9 +267,6 @@ std::pair, std::unique_ptr> minmax( } } // namespace detail -/** - * @copydoc cudf::minmax - */ std::pair, std::unique_ptr> minmax( const column_view& col, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 699494c49c5..0d3ac2d366f 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -67,17 +67,17 @@ struct reduce_dispatch_functor { return reduction::standard_deviation(col, output_dtype, var_agg->_ddof, stream, mr); } break; case aggregation::MEDIAN: { - auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream, mr); + auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream); auto valid_sorted_indices = split(*sorted_indices, {col.size() - col.null_count()})[0]; auto col_ptr = - quantile(col, {0.5}, interpolation::LINEAR, valid_sorted_indices, true, stream, mr); + quantile(col, {0.5}, interpolation::LINEAR, valid_sorted_indices, true, stream); return get_element(*col_ptr, 0, stream, mr); } break; case aggregation::QUANTILE: { auto quantile_agg = dynamic_cast(agg.get()); CUDF_EXPECTS(quantile_agg->_quantiles.size() == 1, "Reduction quantile accepts only one quantile value"); - auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream, mr); + auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream); auto valid_sorted_indices = split(*sorted_indices, {col.size() - col.null_count()})[0]; auto col_ptr = quantile(col, @@ -85,8 +85,7 @@ struct reduce_dispatch_functor { quantile_agg->_interpolation, valid_sorted_indices, true, - stream, - mr); + stream); return get_element(*col_ptr, 0, stream, mr); } break; case aggregation::NUNIQUE: { diff --git a/cpp/src/reductions/simple.cuh 
b/cpp/src/reductions/simple.cuh index e0dc219b767..c8345a30f79 100644 --- a/cpp/src/reductions/simple.cuh +++ b/cpp/src/reductions/simple.cuh @@ -41,8 +41,8 @@ namespace simple { * @tparam Op the operator of cudf::reduction::op:: * @param col Input column of data to reduce - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream Used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Output scalar in device memory */ template @@ -76,10 +76,10 @@ std::unique_ptr simple_reduction(column_view const& col, * * @tparam DecimalXX The `decimal32` or `decimal64` type * @tparam Op The operator of cudf::reduction::op:: + * * @param col Input column of data to reduce - - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream Used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Output scalar in device memory */ template @@ -124,10 +124,10 @@ std::unique_ptr fixed_point_reduction(column_view const& col, * @tparam ElementType The key type of the input dictionary column. * @tparam ResultType The output data-type for the resulting scalar * @tparam Op The operator of cudf::reduction::op:: - + * * @param col Input dictionary column of data to reduce - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream Used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Output scalar in device memory */ template @@ -376,8 +376,8 @@ struct element_type_dispatcher { * @tparam ElementType The input column type or key type. 
* @param col Input column (must be numeric) * @param output_type Requested type of the scalar result - * @param mr Device memory resource used to allocate the returned scalar's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory */ template ()>* = nullptr> diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 4fa42021bd2..2145dcc6b91 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -387,9 +387,9 @@ std::unique_ptr replace_nulls_policy_impl(cudf::column_view const& } auto output = cudf::detail::gather(cudf::table_view({input}), - gather_map.begin(), - gather_map.end(), + gather_map, cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream, mr); diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index 30ff7b0549e..4cc8a84c868 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -15,11 +15,13 @@ */ #include -#include +#include #include #include +#include #include #include +#include #include #include diff --git a/cpp/src/rolling/lead_lag_nested_detail.cuh b/cpp/src/rolling/lead_lag_nested_detail.cuh index 4cff3053aa2..bde7101b9a9 100644 --- a/cpp/src/rolling/lead_lag_nested_detail.cuh +++ b/cpp/src/rolling/lead_lag_nested_detail.cuh @@ -18,11 +18,18 @@ #include #include +#include #include #include -#include -#include +#include +#include #include + +#include + +#include +#include + #include namespace cudf::detail { @@ -151,13 +158,12 @@ std::unique_ptr compute_lead_lag_for_nested(aggregation::Kind op, }); } - auto output_with_nulls = - cudf::detail::gather(table_view{std::vector{input}}, - gather_map_column->view().template begin(), - gather_map_column->view().end(), - 
out_of_bounds_policy::NULLIFY, - stream, - mr); + auto output_with_nulls = cudf::detail::gather(table_view{std::vector{input}}, + gather_map_column->view(), + out_of_bounds_policy::NULLIFY, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); if (default_outputs.is_empty()) { return std::move(output_with_nulls->release()[0]); } @@ -172,22 +178,22 @@ std::unique_ptr compute_lead_lag_for_nested(aggregation::Kind op, scatter_map.begin(), is_null_index_predicate(input.size(), gather_map.begin())); + scatter_map.resize(thrust::distance(scatter_map.begin(), scatter_map_end), stream); // Bail early, if all LEAD/LAG computations succeeded. No defaults need be substituted. if (scatter_map.is_empty()) { return std::move(output_with_nulls->release()[0]); } // Gather only those default values that are to be substituted. auto gathered_defaults = cudf::detail::gather(table_view{std::vector{default_outputs}}, - scatter_map.begin(), - scatter_map_end, + scatter_map, out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, stream); // Scatter defaults into locations where LEAD/LAG computed nulls. auto scattered_results = cudf::detail::scatter( table_view{std::vector{gathered_defaults->release()[0]->view()}}, - scatter_map.begin(), - scatter_map_end, + scatter_map, table_view{std::vector{output_with_nulls->release()[0]->view()}}, false, stream, diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/find.cu index 57d5d6afc75..8e40b7ce7e8 100644 --- a/cpp/src/strings/find.cu +++ b/cpp/src/strings/find.cu @@ -47,8 +47,8 @@ namespace { * @param start First character position to start the search. * @param stop Last character position (exclusive) to end the search. * @param pfn Functor used for locating `target` in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory. * @return New integer column with character position values. */ template @@ -172,8 +172,8 @@ namespace { * @param strings Column of strings to check for target. * @param target UTF-8 encoded string to check in strings column. * @param pfn Returns bool value if target is found in the given string. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New BOOL column. */ template @@ -233,8 +233,8 @@ std::unique_ptr contains_fn(strings_column_view const& strings, * @param strings Column of strings to check for `targets[i]`. * @param targets Column of strings to be checked in `strings[i]``. * @param pfn Returns bool value if target is found in the given string. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New BOOL column. */ template diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index 72359ada8c9..82759a6c73f 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -192,8 +192,8 @@ struct substring_from_fn { * @param null_count Number of nulls for the output column. * @param starts Start positions index iterator. * @param stops Stop positions index iterator. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
*/ std::unique_ptr compute_substrings_from_fn(column_device_view const& d_column, size_type null_count, diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index d01d0a8cbbc..c316b2c6f4e 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -163,8 +163,8 @@ struct device_cast { * @tparam T Type of the `fixed_point` column_view (`decimal32` or `decimal64`) * @param input Input `column_view` * @param scale `scale` of the returned `column` - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * * @return std::unique_ptr Returned column with new @p scale */ diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 088b0b747fb..d3475cbbed2 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -266,6 +266,7 @@ ConfigureTest(ITERATOR_TEST iterator/scalar_iterator_test.cu iterator/optional_iterator_test_chrono.cu iterator/optional_iterator_test_numeric.cu + iterator/indexalator_test.cu ) ################################################################################################### diff --git a/cpp/tests/copying/segmented_gather_list_tests.cpp b/cpp/tests/copying/segmented_gather_list_tests.cpp index b02d0ad387d..528986e2a8d 100644 --- a/cpp/tests/copying/segmented_gather_list_tests.cpp +++ b/cpp/tests/copying/segmented_gather_list_tests.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include template @@ -31,7 +32,7 @@ using FixedWidthTypesNotBool = cudf::test::Concat; -TYPED_TEST_CASE(SegmentedGatherTest, FixedWidthTypesNotBool); +TYPED_TEST_SUITE(SegmentedGatherTest, FixedWidthTypesNotBool); class SegmentedGatherTestList : public cudf::test::BaseFixture { }; @@ -42,6 +43,11 @@ class SegmentedGatherTestList : public cudf::test::BaseFixture { template using LCW = 
cudf::test::lists_column_wrapper; using cudf::lists_column_view; +using cudf::lists::detail::segmented_gather; +using cudf::test::iterators::no_nulls; +using cudf::test::iterators::null_at; +using cudf::test::iterators::nulls_at; +auto constexpr NULLIFY = cudf::out_of_bounds_policy::NULLIFY; TYPED_TEST(SegmentedGatherTest, Gather) { @@ -49,13 +55,23 @@ TYPED_TEST(SegmentedGatherTest, Gather) // List LCW list{{1, 2, 3, 4}, {5}, {6, 7}, {8, 9, 10}}; - LCW gather_map{{3, 2, 1, 0}, {0}, {0, 1}, {0, 2, 1}}; - LCW expected{{4, 3, 2, 1}, {5}, {6, 7}, {8, 10, 9}}; - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + { + // Straight-line case. + auto const gather_map = LCW{{3, 2, 1, 0}, {0}, {0, 1}, {0, 2, 1}}; + auto const expected = LCW{{4, 3, 2, 1}, {5}, {6, 7}, {8, 10, 9}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + } - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + { + // Nullify out-of-bounds values. 
+ auto const gather_map = LCW{{3, 2, 4, 0}, {0}, {0, -3}, {0, 2, 1}}; + auto const expected = LCW{{{4, 3, 2, 1}, null_at(2)}, {5}, {{6, 7}, null_at(1)}, {8, 10, 9}}; + auto const results = + segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + } } TYPED_TEST(SegmentedGatherTest, GatherNothing) @@ -65,41 +81,31 @@ TYPED_TEST(SegmentedGatherTest, GatherNothing) // List { - LCW list{{1, 2, 3, 4}, {5}, {6, 7}, {8, 9, 10}}; - LCW gather_map{LCW{}, LCW{}, LCW{}, LCW{}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - - LCW expected{LCW{}, LCW{}, LCW{}, LCW{}}; + auto const list = LCW{{1, 2, 3, 4}, {5}, {6, 7}, {8, 9, 10}}; + auto const gather_map = LCW{LCW{}, LCW{}, LCW{}, LCW{}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = LCW{LCW{}, LCW{}, LCW{}, LCW{}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } // List> { - LCW list{{{1, 2, 3, 4}, {5}}, {{6, 7}}, {{}, {8, 9, 10}}}; - LCW gather_map{LCW{}, LCW{}, LCW{}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const list = LCW{{{1, 2, 3, 4}, {5}}, {{6, 7}}, {{}, {8, 9, 10}}}; + auto const gather_map = LCW{LCW{}, LCW{}, LCW{}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); // hack to get column of empty list of list - LCW expected_dummy{{{1, 2, 3, 4}, {5}}, LCW{}, LCW{}, LCW{}}; - auto expected = cudf::split(expected_dummy, {1})[1]; + auto const expected_dummy = LCW{{{1, 2, 3, 4}, {5}}, LCW{}, LCW{}, LCW{}}; + auto const expected = cudf::split(expected_dummy, {1})[1]; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } - // List>> { - LCW list{{{{1, 2, 3, 4}, {5}}}, {{{6, 7}, {8, 9, 10}}}}; - LCW gather_map{LCW{}, LCW{}}; - - auto results = - 
cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - - LCW expected_dummy{{{{1, 2, 3, 4}}}, // hack to get column of empty list of list of list - LCW{}, - LCW{}}; - auto expected = cudf::split(expected_dummy, {1})[1]; + auto const list = LCW{{{{1, 2, 3, 4}, {5}}}, {{{6, 7}, {8, 9, 10}}}}; + auto const gather_map = LCW{LCW{}, LCW{}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + // hack to get column of empty list of list of list + auto const expected_dummy = LCW{{{{1, 2, 3, 4}}}, LCW{}, LCW{}}; + auto const expected = cudf::split(expected_dummy, {1})[1]; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); // the result should preserve the full List>> hierarchy @@ -120,18 +126,29 @@ TYPED_TEST(SegmentedGatherTest, GatherNulls) { using T = TypeParam; - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? true : false; }); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); // List - LCW list{{{1, 2, 3, 4}, valids}, {5}, {{6, 7}, valids}, {{8, 9, 10}, valids}}; - LCW gather_map{{0, 1}, LCW{}, {1}, {2, 1, 0}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const list = LCW{{{1, 2, 3, 4}, valids}, {5}, {{6, 7}, valids}, {{8, 9, 10}, valids}}; - LCW expected{{{1, 2}, valids}, LCW{}, {{7}, valids + 1}, {{10, 9, 8}, valids}}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + { + // Test gathering on lists that contain nulls. 
+ auto const gather_map = LCW{{0, 1}, LCW{}, {1}, {2, 1, 0}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = + LCW{{{1, 2}, valids}, LCW{}, {{7}, valids + 1}, {{10, 9, 8}, valids}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + } + { + // Test gathering on lists that contain nulls, with out-of-bounds indices. + auto const gather_map = LCW{{10, -10}, LCW{}, {1}, {2, -10, 0}}; + auto const results = + segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + auto const expected = + LCW{{{0, 0}, nulls_at({0, 1})}, LCW{}, {{7}, valids + 1}, {{10, 0, 8}, null_at(1)}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + } } TYPED_TEST(SegmentedGatherTest, GatherNested) @@ -140,39 +157,76 @@ TYPED_TEST(SegmentedGatherTest, GatherNested) // List> { - LCW list{{{2, 3}, {4, 5}}, - {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, - {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {-17, -18}}}; - LCW gather_map{{0, 2, -2}, {1}, {1, 0, -1, 5}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + // clang-format off + auto const list = LCW{{{2, 3}, {4, 5}}, + {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {-17, -18}}}; + auto const gather_map = LCW{{0, -2, -2}, {1}, {1, 0, -1, -5}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = LCW{{{2, 3}, {2, 3}, {2, 3}}, + {{9, 10, 11}}, + {{17, 18}, {15, 16}, {-17, -18}, {15, 16}}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on + } - LCW expected{ - {{2, 3}, {2, 3}, {2, 3}}, {{9, 10, 11}}, {{17, 18}, {15, 16}, {-17, -18}, {15, 16}}}; + // List>, with out-of-bounds gather indices. 
+ { + // clang-format off + auto const list = LCW{{{2, 3}, {4, 5}}, + {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {-17, -18}}}; + auto const gather_map = LCW{{0, 2, -2}, {1}, {1, 0, -1, -6}}; + auto const results = + segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + auto const expected = LCW{{{{2, 3}, LCW{}, {2, 3}}, null_at(1)}, + {{9, 10, 11}}, + {{{17, 18}, {15, 16}, {-17, -18}, LCW{}}, null_at(3)}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on } // List>> { - LCW list{{{{2, 3}, {4, 5}}, {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, - {{{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}, - {{LCW{0}}}, - {{{10}, {20, 30, 40, 50}, {60, 70, 80}}, - {{0, 1, 3}, {5}}, - {{11, 12, 13, 14, 15}, {16, 17}, {0}}}, - {{{10, 20}}, {LCW{30}}, {{40, 50}, {60, 70, 80}}}}; - LCW gather_map{{1}, LCW{}, {0}, {1}, {0, -1, 1}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + // clang-format off + auto const list = LCW{{{{2, 3}, {4, 5}}, {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, + {{{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}, + {{LCW{0}}}, + {{{10}, {20, 30, 40, 50}, {60, 70, 80}}, + {{0, 1, 3}, {5}}, + {{11, 12, 13, 14, 15}, {16, 17}, {0}}}, + {{{10, 20}}, {LCW{30}}, {{40, 50}, {60, 70, 80}}}}; + auto const gather_map = LCW{{1}, LCW{}, {0}, {1}, {0, -1, 1}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = LCW{{{{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, + LCW{}, + {{LCW{0}}}, + {{{0, 1, 3}, {5}}}, + {{{10, 20}}, {{40, 50}, {60, 70, 80}}, {LCW{30}}}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on + } - LCW expected{{{{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, - LCW{}, - {{LCW{0}}}, - {{{0, 1, 3}, {5}}}, - {{{10, 20}}, {{40, 50}, {60, 70, 80}}, {LCW{30}}}}; + // List>>, with out-of-bounds gather indices. 
+ { + auto const list = LCW{{{{2, 3}, {4, 5}}, {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, + {{{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}, + {{LCW{0}}}, + {{{10}, {20, 30, 40, 50}, {60, 70, 80}}, + {{0, 1, 3}, {5}}, + {{11, 12, 13, 14, 15}, {16, 17}, {0}}}, + {{{10, 20}}, {LCW{30}}, {{40, 50}, {60, 70, 80}}}}; + auto const gather_map = LCW{{1}, LCW{}, {0}, {1}, {0, -1, 3, -4}}; + auto const results = + segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + auto const expected = + LCW{{{{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, + LCW{}, + {{LCW{0}}}, + {{{0, 1, 3}, {5}}}, + {{{{10, 20}}, {{40, 50}, {60, 70, 80}}, LCW{}, LCW{}}, nulls_at({2, 3})}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on } } @@ -182,19 +236,32 @@ TYPED_TEST(SegmentedGatherTest, GatherOutOfOrder) // List> { - LCW list{{{2, 3}, {4, 5}}, - {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, - {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}; - LCW gather_map{{1, 0}, {1, 2, 0}, {5, 4, 3, 2, 1, 0}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - - LCW expected{{{4, 5}, {2, 3}}, - {{9, 10, 11}, {12, 13, 14}, {6, 7, 8}}, - {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}, {15, 16}}}; + // clang-format off + auto const list = LCW{{{2, 3}, {4, 5}}, + {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}; + auto const gather_map = LCW{{1, 0}, {1, 2, 0}, {4, 3, 2, 1, 0}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = LCW{{{4, 5}, {2, 3}}, + {{9, 10, 11}, {12, 13, 14}, {6, 7, 8}}, + {{17, 18}, {17, 18}, {17, 18}, {17, 18}, {15, 16}}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on + } + // List>, with out-of-bounds gather indices. 
+ { + // clang-format off + auto const list = LCW{{{2, 3}, {4, 5}}, + {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}; + auto const gather_map = LCW{{1, 0}, {3, -1, -4}, {5, 4, 3, 2, 1, 0}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + auto const expected = LCW{{{4, 5}, {2, 3}}, + {{LCW{}, {12, 13, 14}, LCW{}}, nulls_at({0, 2})}, + {{LCW{}, {17, 18}, {17, 18}, {17, 18}, {17, 18}, {15, 16}}, null_at(0)}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on } } @@ -204,19 +271,32 @@ TYPED_TEST(SegmentedGatherTest, GatherNegatives) // List> { - LCW list{{{2, 3}, {4, 5}}, - {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, - {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}; - LCW gather_map{{-1, 0}, {-2, -1, 0}, {-5, -4, -3, -2, -1, 0}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - - LCW expected{{{4, 5}, {2, 3}}, - {{9, 10, 11}, {12, 13, 14}, {6, 7, 8}}, - {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}, {15, 16}}}; - + // clang-format off + auto const list = LCW{{{2, 3}, {4, 5}}, + {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}; + auto const gather_map = LCW{{-1, 0}, {-2, -1, 0}, {-5, -4, -3, -2, -1, 0}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = LCW{{{4, 5}, {2, 3}}, + {{9, 10, 11}, {12, 13, 14}, {6, 7, 8}}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}, {15, 16}}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on + } + // List>, with out-of-bounds gather indices. 
+ { + // clang-format off + auto const list = LCW{{{2, 3}, {4, 5}}, + {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}}; + auto const gather_map = LCW{{-1, 0}, {-2, -1, -4}, {-6, -4, -3, -2, -1, 0}}; + auto const results = + segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + auto const expected = LCW{{{4, 5}, {2, 3}}, + {{{9, 10, 11}, {12, 13, 14}, LCW{}}, null_at(2)}, + {{LCW{}, {17, 18}, {17, 18}, {17, 18}, {17, 18}, {15, 16}}, null_at(0)}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on } } @@ -224,51 +304,43 @@ TYPED_TEST(SegmentedGatherTest, GatherNestedNulls) { using T = TypeParam; - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 2 == 0 ? true : false; }); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); // List> { - LCW list{{{{2, 3}, valids}, {4, 5}}, - {{{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, valids}, - {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}, - {{{{25, 26}, valids}, {27, 28}, {{29, 30}, valids}, {31, 32}, {33, 34}}, valids}}; - - LCW gather_map{{0, 1}, {0, 2}, LCW{}, {0, 1, 4}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - - auto trues = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - - LCW expected{{{{2, 3}, valids}, {4, 5}}, - {{{6, 7, 8}, {12, 13, 14}}, trues}, - LCW{}, - {{{{25, 26}, valids}, {27, 28}, {33, 34}}, valids}}; - + // clang-format off + auto const list = LCW{{{{2, 3}, valids}, {4, 5}}, + {{{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, valids}, + {{15, 16}, {17, 18}, {17, 18}, {17, 18}, {17, 18}}, + {{{{25, 26}, valids}, {27, 28}, {{29, 30}, valids}, {31, 32}, {33, 34}}, valids}}; + auto const gather_map = LCW{{0, 1}, {0, 2}, LCW{}, {0, 1, 4}}; + auto const results = segmented_gather(lists_column_view{list}, 
lists_column_view{gather_map}); + auto const expected = LCW{{{{2, 3}, valids}, {4, 5}}, + {{{6, 7, 8}, {12, 13, 14}}, no_nulls()}, + LCW{}, + {{{{25, 26}, valids}, {27, 28}, {33, 34}}, valids}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on } // List>>> { - LCW list{{{{{2, 3}, {4, 5}}, {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, - {{{15, 16}, {{27, 28}, valids}, {{37, 38}, valids}, {47, 48}, {57, 58}}}, - {{LCW{0}}}, - {{{10}, {20, 30, 40, 50}, {60, 70, 80}}, - {{0, 1, 3}, {5}}, - {{11, 12, 13, 14, 15}, {16, 17}, {0}}}, - {{{{{10, 20}, valids}}, {LCW{30}}, {{40, 50}, {60, 70, 80}}}, valids}}}; - - LCW gather_map{{1, 2, 4}}; - - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - - LCW expected{{{{{15, 16}, {{27, 28}, valids}, {{37, 38}, valids}, {47, 48}, {57, 58}}}, - {{LCW{0}}}, - {{{{{10, 20}, valids}}, {LCW{30}}, {{40, 50}, {60, 70, 80}}}, valids}}}; - + // clang-format off + auto const list = LCW{{{{{2, 3}, {4, 5}}, {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}}, + {{{15, 16}, {{27, 28}, valids}, {{37, 38}, valids}, {47, 48}, {57, 58}}}, + {{LCW{0}}}, + {{{10}, {20, 30, 40, 50}, {60, 70, 80}}, + {{0, 1, 3}, {5}}, + {{11, 12, 13, 14, 15}, {16, 17}, {0}}}, + {{{{{10, 20}, valids}}, {LCW{30}}, {{40, 50}, {60, 70, 80}}}, valids}}}; + auto const gather_map = LCW{{1, 2, 4}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = LCW{{{{{15, 16}, {{27, 28}, valids}, {{37, 38}, valids}, {47, 48}, {57, 58}}}, + {{LCW{0}}}, + {{{{{10, 20}, valids}}, {LCW{30}}, {{40, 50}, {60, 70, 80}}}, valids}}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + // clang-format on } } @@ -276,15 +348,11 @@ TYPED_TEST(SegmentedGatherTest, GatherNestedWithEmpties) { using T = TypeParam; - LCW list{{{2, 3}, LCW{}}, {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, {LCW{}}}; - LCW gather_map{LCW{0}, LCW{0}, LCW{0}}; - - auto results = - 
cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - - // skip one null, gather one null. - LCW expected{{{2, 3}}, {{6, 7, 8}}, {LCW{}}}; - + auto const list = LCW{{{2, 3}, LCW{}}, {{6, 7, 8}, {9, 10, 11}, {12, 13, 14}}, {LCW{}}}; + auto const gather_map = LCW{LCW{0}, LCW{0}, LCW{0}}; + auto results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const expected = + LCW{{{2, 3}}, {{6, 7, 8}}, {LCW{}}}; // skip one null, gather one null. CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } @@ -292,7 +360,7 @@ TYPED_TEST(SegmentedGatherTest, GatherSliced) { using T = TypeParam; { - LCW a{ + auto const a = LCW{ {{1, 1, 1}, {2, 2}, {3, 3}}, {{4, 4, 4}, {5, 5}, {6, 6}}, {{7, 7, 7}, {8, 8}, {9, 9}}, @@ -302,23 +370,27 @@ TYPED_TEST(SegmentedGatherTest, GatherSliced) {{50, 50, 50, 50}, {6, 13}}, {{70, 70, 70, 70}, {80}}, }; - auto split_a = cudf::split(a, {3}); - - auto result0 = cudf::lists::detail::segmented_gather( - lists_column_view{split_a[0]}, lists_column_view{LCW{{1, 2}, {0, 2}, {0, 1}}}); - LCW expected0{ - {{2, 2}, {3, 3}}, - {{4, 4, 4}, {6, 6}}, - {{7, 7, 7}, {8, 8}}, - }; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected0, result0->view()); - - auto result1 = cudf::lists::detail::segmented_gather( - lists_column_view{split_a[1]}, - lists_column_view{LCW{{0, 1}, LCW{}, LCW{}, {0, 1}, LCW{}}}); - LCW expected1{ - {{10, 10, 10}, {11, 11}}, LCW{}, LCW{}, {{50, 50, 50, 50}, {6, 13}}, LCW{}}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view()); + auto const split_a = cudf::split(a, {3}); + + { + auto const gather_map = lists_column_view{LCW{{1, 2}, {0, 2}, {0, 1}}}; + auto const result = segmented_gather(lists_column_view{split_a[0]}, gather_map); + auto const expected = LCW{ + {{2, 2}, {3, 3}}, + {{4, 4, 4}, {6, 6}}, + {{7, 7, 7}, {8, 8}}, + }; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const gather_map = + lists_column_view{LCW{{0, 1}, LCW{}, 
LCW{}, {0, 1}, LCW{}}}; + auto const result = segmented_gather(lists_column_view{split_a[1]}, gather_map); + auto const expected = + LCW{{{10, 10, 10}, {11, 11}}, LCW{}, LCW{}, {{50, 50, 50, 50}, {6, 13}}, LCW{}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } } auto valids = @@ -404,13 +476,24 @@ TEST_F(SegmentedGatherTestString, StringGather) { using T = cudf::string_view; // List - LCW list{{"a", "b", "c", "d"}, {"1", "22", "333", "4"}, {"x", "y", "z"}}; - LCW gather_map{{0, 1, 3, 2}, {1, 0, 3, 2}, LCW{}}; - LCW expected{{"a", "b", "d", "c"}, {"22", "1", "4", "333"}, LCW{}}; + { + auto const list = LCW{{"a", "b", "c", "d"}, {"1", "22", "333", "4"}, {"x", "y", "z"}}; + auto const gather_map = LCW{{0, 1, 3, 2}, {1, 0, 3, 2}, LCW{}}; + auto const expected = LCW{{"a", "b", "d", "c"}, {"22", "1", "4", "333"}, LCW{}}; + auto const result = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } - auto result = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + // List, with out-of-order gather indices. 
+ { + auto const list = LCW{{"a", "b", "c", "d"}, {"1", "22", "333", "4"}, {"x", "y", "z"}}; + auto const gather_map = LCW{{0, 1, 3, 4}, {1, -5, 3, 2}, LCW{}}; + auto const expected = LCW{{{"a", "b", "d", "c"}, cudf::test::iterators::null_at(3)}, + {{"22", "1", "4", "333"}, cudf::test::iterators::null_at(1)}, + LCW{}}; + auto result = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } } using SegmentedGatherTestFloat = SegmentedGatherTest; @@ -419,28 +502,51 @@ TEST_F(SegmentedGatherTestFloat, GatherMapSliced) using T = float; // List - LCW list{{1, 2, 3, 4}, {5}, {6, 7}, {8, 9, 10}, {11, 12}, {13, 14, 15, 16}}; - LCW gather_map{{3, 2, 1, 0}, {0}, {0, 1}, {0, 2, 1}, {0}, {1}}; - // gather_map.offset: 0, 4, 5, 7, 10, 11, 12 - LCW expected{{4, 3, 2, 1}, {5}, {6, 7}, {8, 10, 9}, {11}, {14}}; + { + auto const list = LCW{{1, 2, 3, 4}, {5}, {6, 7}, {8, 9, 10}, {11, 12}, {13, 14, 15, 16}}; + auto const gather_map = LCW{{3, 2, 1, 0}, {0}, {0, 1}, {0, 2, 1}, {0}, {1}}; + // gather_map.offset: 0, 4, 5, 7, 10, 11, 12 + auto const expected = LCW{{4, 3, 2, 1}, {5}, {6, 7}, {8, 10, 9}, {11}, {14}}; + auto const results = segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); - auto results = - cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{gather_map}); + auto const sliced = cudf::split(list, {1, 4}); + auto const split_m = cudf::split(gather_map, {1, 4}); + auto const split_e = cudf::split(expected, {1, 4}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); - auto sliced = cudf::split(list, {1, 4}); - auto split_m = cudf::split(gather_map, {1, 4}); - auto split_e = cudf::split(expected, {1, 4}); - - auto result0 = cudf::lists::detail::segmented_gather(lists_column_view{sliced[0]}, - lists_column_view{split_m[0]}); - 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[0], result0->view()); - auto result1 = cudf::lists::detail::segmented_gather(lists_column_view{sliced[1]}, - lists_column_view{split_m[1]}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[1], result1->view()); - auto result2 = cudf::lists::detail::segmented_gather(lists_column_view{sliced[2]}, - lists_column_view{split_m[2]}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[2], result2->view()); + auto result0 = segmented_gather(lists_column_view{sliced[0]}, lists_column_view{split_m[0]}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[0], result0->view()); + auto result1 = segmented_gather(lists_column_view{sliced[1]}, lists_column_view{split_m[1]}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[1], result1->view()); + auto result2 = segmented_gather(lists_column_view{sliced[2]}, lists_column_view{split_m[2]}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[2], result2->view()); + } + + // List, with out-of-bounds gather indices. + { + auto const list = LCW{{1, 2, 3, 4}, {5}, {6, 7}, {8, 9, 10}, {11, 12}, {13, 14, 15, 16}}; + auto const gather_map = LCW{{3, -5, 1, 0}, {0}, {0, 1}, {0, 2, 3}, {0}, {1}}; + // gather_map.offset: 0, 4, 5, 7, 10, 11, 12 + auto const expected = + LCW{{{4, 0, 2, 1}, null_at(1)}, {5}, {6, 7}, {{8, 10, 9}, null_at(2)}, {11}, {14}}; + auto results = + segmented_gather(lists_column_view{list}, lists_column_view{gather_map}, NULLIFY); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto const sliced = cudf::split(list, {1, 4}); + auto const split_m = cudf::split(gather_map, {1, 4}); + auto const split_e = cudf::split(expected, {1, 4}); + + auto const result0 = + segmented_gather(lists_column_view{sliced[0]}, lists_column_view{split_m[0]}, NULLIFY); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[0], result0->view()); + auto const result1 = + segmented_gather(lists_column_view{sliced[1]}, lists_column_view{split_m[1]}, NULLIFY); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[1], 
result1->view()); + auto const result2 = + segmented_gather(lists_column_view{sliced[2]}, lists_column_view{split_m[2]}, NULLIFY); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(split_e[2], result2->view()); + } } TEST_F(SegmentedGatherTestFloat, Fails) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index e83592a028a..121bd1e2c99 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -365,6 +365,7 @@ TEST_F(JsonReaderTest, Durations) std::ofstream outfile(filepath, std::ofstream::out); outfile << "[-2]\n[-1]\n[0]\n"; outfile << "[1 days]\n[0 days 23:01:00]\n[0 days 00:00:00.000000123]\n"; + outfile << "[0:0:0.000123]\n[0:0:0.000123000]\n[00:00:00.100000001]\n"; outfile << "[-2147483648]\n[2147483647]\n"; } @@ -388,6 +389,9 @@ TEST_F(JsonReaderTest, Durations) 1L * 60 * 60 * 24 * 1000000000L, (23 * 60 + 1) * 60 * 1000000000L, 123L, + 123000L, + 123000L, + 100000001L, -2147483648L, 2147483647L}, validity}); diff --git a/cpp/tests/iterator/indexalator_test.cu b/cpp/tests/iterator/indexalator_test.cu new file mode 100644 index 00000000000..d5379b6dd30 --- /dev/null +++ b/cpp/tests/iterator/indexalator_test.cu @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ */ + +#include + +#include +#include + +#include + +using TestingTypes = cudf::test::IntegralTypesNotBool; + +template +struct IndexalatorTest : public IteratorTest { +}; + +TYPED_TEST_CASE(IndexalatorTest, TestingTypes); + +TYPED_TEST(IndexalatorTest, input_iterator) +{ + using T = TypeParam; + + auto host_values = cudf::test::make_type_param_vector({0, 6, 0, -14, 13, 64, -13, -20, 45}); + + auto d_col = cudf::test::fixed_width_column_wrapper(host_values.begin(), host_values.end()); + + auto expected_values = thrust::host_vector(host_values.size()); + std::transform(host_values.begin(), host_values.end(), expected_values.begin(), [](auto v) { + return static_cast(v); + }); + + auto it_dev = cudf::detail::indexalator_factory::make_input_iterator(d_col); + this->iterator_test_thrust(expected_values, it_dev, host_values.size()); +} + +TYPED_TEST(IndexalatorTest, pair_iterator) +{ + using T = TypeParam; + + auto host_values = cudf::test::make_type_param_vector({0, 6, 0, -14, 13, 64, -13, -120, 115}); + auto validity = std::vector({0, 1, 1, 1, 1, 1, 0, 1, 1}); + + auto d_col = cudf::test::fixed_width_column_wrapper( + host_values.begin(), host_values.end(), validity.begin()); + + auto expected_values = + thrust::host_vector>(host_values.size()); + std::transform(host_values.begin(), + host_values.end(), + validity.begin(), + expected_values.begin(), + [](T v, bool b) { return thrust::make_pair(static_cast(v), b); }); + + auto it_dev = cudf::detail::indexalator_factory::make_input_pair_iterator(d_col); + this->iterator_test_thrust(expected_values, it_dev, host_values.size()); +} + +TYPED_TEST(IndexalatorTest, optional_iterator) +{ + using T = TypeParam; + + auto host_values = cudf::test::make_type_param_vector({0, 6, 0, -104, 103, 64, -13, -20, 45}); + auto validity = std::vector({0, 1, 1, 1, 1, 1, 0, 1, 1}); + + auto d_col = cudf::test::fixed_width_column_wrapper( + host_values.begin(), host_values.end(), validity.begin()); + + auto expected_values = 
thrust::host_vector>(host_values.size()); + + std::transform(host_values.begin(), + host_values.end(), + validity.begin(), + expected_values.begin(), + [](T v, bool b) { + return (b) ? thrust::make_optional(static_cast(v)) + : thrust::nullopt; + }); + + auto it_dev = cudf::detail::indexalator_factory::make_input_optional_iterator(d_col); + this->iterator_test_thrust(expected_values, it_dev, host_values.size()); +} diff --git a/docs/cudf/source/api_docs/index.rst b/docs/cudf/source/api_docs/index.rst index 960608d8f3c..0bf1d11bff4 100644 --- a/docs/cudf/source/api_docs/index.rst +++ b/docs/cudf/source/api_docs/index.rst @@ -17,4 +17,5 @@ This page provides a list of all publicly accessible modules, methods and classe general_utilities window io + subword_tokenize diff --git a/docs/cudf/source/api_docs/subword_tokenize.rst b/docs/cudf/source/api_docs/subword_tokenize.rst new file mode 100644 index 00000000000..e8737a9ee0a --- /dev/null +++ b/docs/cudf/source/api_docs/subword_tokenize.rst @@ -0,0 +1,12 @@ +================ +SubwordTokenizer +================ +.. currentmodule:: cudf.core.subword_tokenizer + +Constructor +~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + :template: autosummary/class_with_autosummary.rst + + SubwordTokenizer diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java index dde97b65215..e4fb71033af 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -557,9 +557,15 @@ protected synchronized boolean cleanImpl(boolean logErrorIfNotClean) { boolean neededCleanup = false; if (data != null || valid != null || offsets != null) { try { - ColumnVector.closeBuffers(data); - ColumnVector.closeBuffers(offsets); - ColumnVector.closeBuffers(valid); + if (data != null) { + data.close(); + } + if (offsets != null) { + offsets.close(); + } + if (valid != null) { + valid.close(); + } } finally { // Always mark the resource as freed even if an exception is thrown. // We cannot know how far it progressed before the exception, and diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_datetime.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_datetime.pxd index 5a9228608e5..5e7380c1d4e 100644 --- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_datetime.pxd @@ -17,7 +17,8 @@ cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \ cdef unique_ptr[column] from_timestamps( column_view input_col, - string format) except + + string format, + column_view input_strings_names) except + cdef unique_ptr[column] is_timestamp( column_view input_col, diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 70bdb6e2e60..2ae3c53cb1b 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -160,6 +160,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, # Access the Parquet user_data json to find the index index_col = None + is_range_index = False cdef map[string, string] user_data = c_out_table.metadata.user_data json_str = user_data[b'pandas'].decode('utf-8') meta = None @@ -171,7 +172,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, index_col[0]['kind'] == 'range': is_range_index = True else: - is_range_index = False index_col_names = OrderedDict() for idx_col in index_col: for c in meta['columns']: diff --git 
a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index c6307d6cdb9..ab53a242db2 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -100,11 +100,8 @@ def scan(scan_op, Column incol, inclusive, **kwargs): cdef unique_ptr[column] c_result cdef Aggregation cython_agg = make_aggregation(scan_op, kwargs) - cdef scan_type c_inclusive - if inclusive is True: - c_inclusive = scan_type.INCLUSIVE - elif inclusive is False: - c_inclusive = scan_type.EXCLUSIVE + cdef scan_type c_inclusive = \ + scan_type.INCLUSIVE if inclusive else scan_type.EXCLUSIVE with nogil: c_result = move(cpp_scan( diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 74490d6bb19..f9e98efbbd9 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -509,7 +509,8 @@ def from_booleans(Column input_col): def int2timestamp( Column input_col, - format): + str format, + Column names): """ Converting/Casting input date-time column to string column with specified format @@ -517,6 +518,9 @@ def int2timestamp( Parameters ---------- input_col : input column of type timestamp in integer format + format : The string specifying output format + names : The string names to use for weekdays ("%a", "%A") and + months ("%b", "%B") Returns ------- @@ -525,12 +529,15 @@ def int2timestamp( """ cdef column_view input_column_view = input_col.view() cdef string c_timestamp_format = format.encode("UTF-8") + cdef column_view input_strings_names = names.view() + cdef unique_ptr[column] c_result with nogil: c_result = move( cpp_from_timestamps( input_column_view, - c_timestamp_format)) + c_timestamp_format, + input_strings_names)) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 810cdd51df5..dc08f767c26 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -234,7 +234,7 
@@ cdef data_from_unique_ptr( cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release()) cdef vector[unique_ptr[column]].iterator it = c_columns.begin() - cdef int i + cdef size_t i columns = [Column.from_unique_ptr(move(dereference(it+i))) for i in range(c_columns.size())] diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index b2f3274faab..72f902889f0 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -59,19 +59,6 @@ def get_loc(self, key, method=None, tolerance=None): def __getitem__(self, key): raise NotImplementedError() - def serialize(self): - header = {} - header["index_column"] = {} - # store metadata values of index separately - # Indexes: Numerical/DateTime/String are often GPU backed - header["index_column"], frames = self._values.serialize() - - header["name"] = pickle.dumps(self.name) - header["dtype"] = pickle.dumps(self.dtype) - header["type-serialized"] = pickle.dumps(type(self)) - header["frame_count"] = len(frames) - return header, frames - def __contains__(self, item): return item in self._values @@ -122,13 +109,10 @@ def get_level_values(self, level): @classmethod def deserialize(cls, header, frames): - h = header["index_column"] - idx_typ = pickle.loads(header["type-serialized"]) - name = pickle.loads(header["name"]) - - col_typ = pickle.loads(h["type-serialized"]) - index = col_typ.deserialize(h, frames[: header["frame_count"]]) - return idx_typ(index, name=name) + # Dispatch deserialization to the appropriate index type in case + # deserialization is ever attempted with the base class directly. + idx_type = pickle.loads(header["type-serialized"]) + return idx_type.deserialize(header, frames) @property def names(self): @@ -814,22 +798,8 @@ def astype(self, dtype, copy=False): self.copy(deep=copy)._values.astype(dtype), name=self.name ) + # TODO: This method is deprecated and can be removed. 
def to_array(self, fillna=None): - """Get a dense numpy array for the data. - - Parameters - ---------- - fillna : str or None - Defaults to None, which will skip null values. - If it equals "pandas", null values are filled with NaNs. - Non integral dtype is promoted to np.float64. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - """ return self._values.to_array(fillna=fillna) def to_series(self, index=None, name=None): diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c057b729fd1..d2da594fa3b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -426,7 +426,7 @@ def remove_categories( # ensure all the removals are in the current categories # list. If not, raise an error to match Pandas behavior if not removals_mask.all(): - vals = removals[~removals_mask].to_array() + vals = removals[~removals_mask].to_numpy() raise ValueError(f"removals must all be in old categories: {vals}") new_categories = cats[~cats.isin(removals)]._column @@ -1012,11 +1012,11 @@ def _encode(self, value) -> ScalarLike: return self.categories.find_first_value(value) def _decode(self, value: int) -> ScalarLike: - if value == self.default_na_value(): + if value == self._default_na_value(): return None return self.categories.element_indexing(value) - def default_na_value(self) -> ScalarLike: + def _default_na_value(self) -> ScalarLike: return -1 def find_and_replace( @@ -1175,7 +1175,7 @@ def fillna( fill_is_scalar = np.isscalar(fill_value) if fill_is_scalar: - if fill_value == self.default_na_value(): + if fill_value == self._default_na_value(): fill_value = self.codes.dtype.type(fill_value) else: try: @@ -1578,7 +1578,7 @@ def _create_empty_categorical_column( categories=column.as_column(dtype.categories), codes=column.as_column( cudf.utils.utils.scalar_broadcast_to( - categorical_column.default_na_value(), + 
categorical_column._default_na_value(), categorical_column.size, categorical_column.codes.dtype, ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index de278db919d..b6b4ef65cab 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -130,6 +130,12 @@ def values_host(self) -> "np.ndarray": """ Return a numpy representation of the Column. """ + if len(self) == 0: + return np.array([], dtype=self.dtype) + + if self.has_nulls: + raise ValueError("Column must have no nulls.") + return self.data_array_view.copy_to_host() @property @@ -138,7 +144,7 @@ def values(self) -> "cupy.ndarray": Return a CuPy representation of the Column. """ if len(self) == 0: - return cupy.asarray([], dtype=self.dtype) + return cupy.array([], dtype=self.dtype) if self.has_nulls: raise ValueError("Column must have no nulls.") @@ -319,9 +325,11 @@ def _get_mask_as_column(self) -> ColumnBase: def _memory_usage(self, **kwargs) -> int: return self.__sizeof__() - def default_na_value(self) -> Any: + def _default_na_value(self) -> Any: raise NotImplementedError() + # TODO: This method is decpreated and can be removed when the associated + # Frame methods are removed. def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": """Get a dense numba device array for the data. @@ -337,10 +345,12 @@ def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": output size could be smaller. """ if fillna: - return self.fillna(self.default_na_value()).data_array_view + return self.fillna(self._default_na_value()).data_array_view else: return self.dropna(drop_nan=False).data_array_view + # TODO: This method is decpreated and can be removed when the associated + # Frame methods are removed. def to_array(self, fillna=None) -> np.ndarray: """Get a dense numpy array for the data. 
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 0d4edbf0113..eba6764e83d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -4,7 +4,9 @@ import builtins import datetime as dt +import locale import re +from locale import nl_langinfo from numbers import Number from types import SimpleNamespace from typing import Any, Mapping, Sequence, Union, cast @@ -50,6 +52,56 @@ "datetime64[s]": "%Y-%m-%d %H:%M:%S", } +_DATETIME_SPECIAL_FORMATS = { + "%b", + "%B", + "%A", + "%a", +} + +_DATETIME_NAMES = [ + nl_langinfo(locale.AM_STR), # type: ignore + nl_langinfo(locale.PM_STR), # type: ignore + nl_langinfo(locale.DAY_1), + nl_langinfo(locale.DAY_2), + nl_langinfo(locale.DAY_3), + nl_langinfo(locale.DAY_4), + nl_langinfo(locale.DAY_5), + nl_langinfo(locale.DAY_6), + nl_langinfo(locale.DAY_7), + nl_langinfo(locale.ABDAY_1), + nl_langinfo(locale.ABDAY_2), + nl_langinfo(locale.ABDAY_3), + nl_langinfo(locale.ABDAY_4), + nl_langinfo(locale.ABDAY_5), + nl_langinfo(locale.ABDAY_6), + nl_langinfo(locale.ABDAY_7), + nl_langinfo(locale.MON_1), + nl_langinfo(locale.MON_2), + nl_langinfo(locale.MON_3), + nl_langinfo(locale.MON_4), + nl_langinfo(locale.MON_5), + nl_langinfo(locale.MON_6), + nl_langinfo(locale.MON_7), + nl_langinfo(locale.MON_8), + nl_langinfo(locale.MON_9), + nl_langinfo(locale.MON_10), + nl_langinfo(locale.MON_11), + nl_langinfo(locale.MON_12), + nl_langinfo(locale.ABMON_1), + nl_langinfo(locale.ABMON_2), + nl_langinfo(locale.ABMON_3), + nl_langinfo(locale.ABMON_4), + nl_langinfo(locale.ABMON_5), + nl_langinfo(locale.ABMON_6), + nl_langinfo(locale.ABMON_7), + nl_langinfo(locale.ABMON_8), + nl_langinfo(locale.ABMON_9), + nl_langinfo(locale.ABMON_10), + nl_langinfo(locale.ABMON_11), + nl_langinfo(locale.ABMON_12), +] + class DatetimeColumn(column.ColumnBase): """ @@ -278,17 +330,23 @@ def as_string_column( format = _dtype_to_format_conversion.get( self.dtype.name, 
"%Y-%m-%d %H:%M:%S" ) + if format in _DATETIME_SPECIAL_FORMATS: + names = as_column(_DATETIME_NAMES) + else: + names = cudf.core.column.column_empty( + 0, dtype="object", masked=False + ) if len(self) > 0: return string._datetime_to_str_typecast_functions[ cudf.dtype(self.dtype) - ](self, format) + ](self, format, names) else: return cast( "cudf.core.column.StringColumn", column.column_empty(0, dtype="object", masked=False), ) - def default_na_value(self) -> DatetimeLikeScalar: + def _default_na_value(self) -> DatetimeLikeScalar: """Returns the default NA value for this column """ return np.datetime64("nat", self.time_unit) @@ -433,14 +491,11 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: def _make_copy_with_na_as_null(self): """Return a copy with NaN values replaced with nulls.""" null = column_empty_like(self, masked=True, newsize=1) + na_value = np.datetime64("nat", self.time_unit) out_col = cudf._lib.replace.replace( self, as_column( - Buffer( - np.array([self.default_na_value()], dtype=self.dtype).view( - "|u1" - ) - ), + Buffer(np.array([na_value], dtype=self.dtype).view("|u1")), dtype=self.dtype, ), null, diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 736aa5b5a7b..27ff5da5505 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -288,7 +288,7 @@ def _process_values_for_isin( return lhs, rhs - def default_na_value(self) -> ScalarLike: + def _default_na_value(self) -> ScalarLike: """Returns the default NA value for this column """ dkind = self.dtype.kind diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c59081e4b59..07dc8743c00 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5210,10 +5210,10 @@ def values(self) -> cupy.ndarray: """ Return a CuPy representation of the StringColumn. 
""" - raise NotImplementedError( - "String Arrays is not yet implemented in cudf" - ) + raise TypeError("String Arrays is not yet implemented in cudf") + # TODO: This method is deprecated and should be removed when the associated + # Frame methods are removed. def to_array(self, fillna: bool = None) -> np.ndarray: """Get a dense numpy array for the data. @@ -5409,7 +5409,7 @@ def normalize_binop_value(self, other) -> "column.ColumnBase": else: raise TypeError(f"cannot broadcast {type(other)}") - def default_na_value(self) -> ScalarLike: + def _default_na_value(self) -> ScalarLike: return None def binary_operator( diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index a3888d30f30..c7b13903751 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -304,7 +304,7 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": ), ) - def default_na_value(self) -> ScalarLike: + def _default_na_value(self) -> ScalarLike: """Returns the default NA value for this column """ return np.timedelta64("nat", self.time_unit) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bdbd94ef754..0287e83a99a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -356,7 +356,7 @@ def _init_from_series_list(self, data, columns, index): # Setting `final_columns` to self._index so # that the resulting `transpose` will be have # columns set to `final_columns` - self._index = final_columns + self._index = as_index(final_columns) transpose = self.T else: @@ -529,38 +529,28 @@ def _constructor_expanddim(self): ) def serialize(self): - header = {} - frames = [] - header["type-serialized"] = pickle.dumps(type(self)) + header, frames = super().serialize() + header["index"], index_frames = self._index.serialize() header["index_frame_count"] = len(index_frames) - frames.extend(index_frames) - - # Use the column directly to 
avoid duplicating the index - # need to pickle column names to handle numpy integer columns - header["columns"], column_frames = column.serialize_columns( - self._columns - ) - frames.extend(column_frames) + # For backwards compatibility with older versions of cuDF, index + # columns are placed before data columns. + frames = index_frames + frames - header["column_names"] = pickle.dumps(tuple(self._data.names)) return header, frames @classmethod def deserialize(cls, header, frames): - # Reconstruct the index - index_frames = frames[: header["index_frame_count"]] + index_nframes = header["index_frame_count"] + obj = super().deserialize( + header, frames[header["index_frame_count"] :] + ) idx_typ = pickle.loads(header["index"]["type-serialized"]) - index = idx_typ.deserialize(header["index"], index_frames) - - # Reconstruct the columns - column_frames = frames[header["index_frame_count"] :] - - column_names = pickle.loads(header["column_names"]) - columns = column.deserialize_columns(header["columns"], column_frames) + index = idx_typ.deserialize(header["index"], frames[:index_nframes]) + obj._index = index - return cls._from_data(dict(zip(column_names, columns)), index=index,) + return obj @property def dtypes(self): @@ -978,36 +968,6 @@ def __array_function__(self, func, types, args, kwargs): else: return NotImplemented - @property - def values(self): - """ - Return a CuPy representation of the DataFrame. - - Only the values in the DataFrame will be returned, the axes labels will - be removed. - - Returns - ------- - out: cupy.ndarray - The values of the DataFrame. 
- """ - return cupy.asarray(self.as_gpu_matrix()) - - def __array__(self, dtype=None): - raise TypeError( - "Implicit conversion to a host NumPy array via __array__ is not " - "allowed, To explicitly construct a GPU matrix, consider using " - ".as_gpu_matrix()\nTo explicitly construct a host " - "matrix, consider using .as_matrix()" - ) - - def __arrow_array__(self, type=None): - raise TypeError( - "Implicit conversion to a host PyArrow Table via __arrow_array__ " - "is not allowed, To explicitly construct a PyArrow Table, " - "consider using .to_arrow()" - ) - def _get_numeric_data(self): """ Return a dataframe with only numeric data types """ columns = [ @@ -2750,7 +2710,7 @@ def columns(self, columns): if isinstance( columns, (Series, cudf.Index, cudf.core.column.ColumnBase) ): - columns = pd.Index(columns.to_array(), tupleize_cols=is_multiindex) + columns = pd.Index(columns.to_numpy(), tupleize_cols=is_multiindex) elif not isinstance(columns, pd.Index): columns = pd.Index(columns, tupleize_cols=is_multiindex) @@ -2816,6 +2776,87 @@ def index(self, value): idx = as_index(value) self._index = idx + def _reindex( + self, columns, dtypes=None, deep=False, index=None, inplace=False + ): + """ + Helper for `.reindex` + + Parameters + ---------- + columns : array-like + The list of columns to select from the Frame, + if ``columns`` is a superset of ``Frame.columns`` new + columns are created. + dtypes : dict + Mapping of dtypes for the empty columns being created. + deep : boolean, optional, default False + Whether to make deep copy or shallow copy of the columns. + index : Index or array-like, default None + The ``index`` to be used to reindex the Frame with. + inplace : bool, default False + Whether to perform the operation in place on the data. 
+ + Returns + ------- + DataFrame + """ + if dtypes is None: + dtypes = {} + + df = self + if index is not None: + index = cudf.core.index.as_index(index) + + if isinstance(index, cudf.MultiIndex): + idx_dtype_match = all( + left_dtype == right_dtype + for left_dtype, right_dtype in zip( + (col.dtype for col in df.index._data.columns), + (col.dtype for col in index._data.columns), + ) + ) + else: + idx_dtype_match = df.index.dtype == index.dtype + + if not idx_dtype_match: + columns = ( + columns if columns is not None else list(df._column_names) + ) + df = cudf.DataFrame() + else: + df = cudf.DataFrame(None, index).join( + df, how="left", sort=True + ) + # double-argsort to map back from sorted to unsorted positions + df = df.take(index.argsort(ascending=True).argsort()) + + index = index if index is not None else df.index + names = columns if columns is not None else list(df._data.names) + cols = { + name: ( + df._data[name].copy(deep=deep) + if name in df._data + else column_empty( + dtype=dtypes.get(name, np.float64), + masked=True, + row_count=len(index), + ) + ) + for name in names + } + + result = self.__class__._from_data( + data=cudf.core.column_accessor.ColumnAccessor( + cols, + multiindex=self._data.multiindex, + level_names=self._data.level_names, + ), + index=index, + ) + + return self._mimic_inplace(result, inplace=inplace) + def reindex( self, labels=None, axis=0, index=None, columns=None, copy=True ): @@ -2866,12 +2907,16 @@ def reindex( if labels is None and index is None and columns is None: return self.copy(deep=copy) - df = self - cols = columns - dtypes = dict(df.dtypes) + dtypes = dict(self.dtypes) idx = labels if index is None and axis in (0, "index") else index - cols = labels if cols is None and axis in (1, "columns") else cols - df = df if cols is None else df[list(set(df.columns) & set(cols))] + cols = ( + labels if columns is None and axis in (1, "columns") else columns + ) + df = ( + self + if cols is None + else 
self[list(set(self._column_names) & set(cols))] + ) result = df._reindex( columns=cols, dtypes=dtypes, deep=copy, index=idx, inplace=False @@ -2879,36 +2924,6 @@ def reindex( return result - def _set_index( - self, index, to_drop=None, inplace=False, verify_integrity=False, - ): - """Helper for `.set_index` - - Parameters - ---------- - index : Index - The new index to set. - to_drop : list optional, default None - A list of labels indicating columns to drop. - inplace : boolean, default False - Modify the DataFrame in place (do not create a new object). - verify_integrity : boolean, default False - Check for duplicates in the new index. - """ - if not isinstance(index, BaseIndex): - raise ValueError("Parameter index should be type `Index`.") - - df = self if inplace else self.copy(deep=True) - - if verify_integrity and not index.is_unique: - raise ValueError(f"Values in Index are not unique: {index}") - - if to_drop: - df.drop(columns=to_drop, inplace=True) - - df.index = index - return df if not inplace else None - def set_index( self, keys, @@ -3015,7 +3030,7 @@ def set_index( columns_to_add = [] names = [] to_drop = [] - for i, col in enumerate(keys): + for col in keys: # Is column label if is_scalar(col) or isinstance(col, tuple): if col in self.columns: @@ -3073,17 +3088,24 @@ def set_index( elif len(columns_to_add) == 1: idx = cudf.Index(columns_to_add[0], name=names[0]) else: - idf = cudf.DataFrame() - for i, col in enumerate(columns_to_add): - idf[i] = col - idx = cudf.MultiIndex.from_frame(idf, names=names) - - return self._set_index( - index=idx, - to_drop=to_drop, - inplace=inplace, - verify_integrity=verify_integrity, - ) + idx = cudf.MultiIndex._from_data( + {i: col for i, col in enumerate(columns_to_add)} + ) + idx.names = names + + if not isinstance(idx, BaseIndex): + raise ValueError("Parameter index should be type `Index`.") + + df = self if inplace else self.copy(deep=True) + + if verify_integrity and not idx.is_unique: + raise 
ValueError(f"Values in Index are not unique: {idx}") + + if to_drop: + df.drop(columns=to_drop, inplace=True) + + df.index = idx + return df if not inplace else None def reset_index( self, level=None, drop=False, inplace=False, col_level=0, col_fill="" @@ -3146,10 +3168,7 @@ class max_speed "col_fill parameter is not supported yet." ) - if inplace: - result = self - else: - result = self.copy() + result = self if inplace else self.copy() if not drop: if isinstance(self.index, cudf.MultiIndex): @@ -3172,9 +3191,7 @@ class max_speed ): result.insert(0, name, index_column) result.index = RangeIndex(len(self)) - if inplace: - return - else: + if not inplace: return result def take(self, positions, keep_index=True): @@ -3227,12 +3244,12 @@ def insert(self, loc, name, value): name or label of column to be inserted value : Series or array-like """ - num_cols = len(self._data) if name in self._data: raise NameError(f"duplicated column name {name}") + num_cols = len(self._data) if loc < 0: - loc = num_cols + loc + 1 + loc += num_cols + 1 if not (0 <= loc <= num_cols): raise ValueError( @@ -3660,7 +3677,7 @@ def rename( out = DataFrame( index=self.index.replace( to_replace=list(index.keys()), - replacement=list(index.values()), + value=list(index.values()), ) ) else: @@ -3677,21 +3694,11 @@ def rename( return out.copy(deep=copy) def as_gpu_matrix(self, columns=None, order="F"): - """Convert to a matrix in device memory. - - Parameters - ---------- - columns : sequence of str - List of a column names to be extracted. The order is preserved. - If None is specified, all columns are used. - order : 'F' or 'C' - Optional argument to determine whether to return a column major - (Fortran) matrix or a row major (C) matrix. - - Returns - ------- - A (nrow x ncol) numba device ndarray - """ + warnings.warn( + "The as_gpu_matrix method will be removed in a future cuDF " + "release. 
Consider using `to_cupy` instead.", + DeprecationWarning, + ) if columns is None: columns = self._data.names @@ -3735,18 +3742,11 @@ def as_gpu_matrix(self, columns=None, order="F"): return cuda.as_cuda_array(matrix).view(dtype) def as_matrix(self, columns=None): - """Convert to a matrix in host memory. - - Parameters - ---------- - columns : sequence of str - List of a column names to be extracted. The order is preserved. - If None is specified, all columns are used. - - Returns - ------- - A (nrow x ncol) numpy ndarray in "F" order. - """ + warnings.warn( + "The as_matrix method will be removed in a future cuDF " + "release. Consider using `to_numpy` instead.", + DeprecationWarning, + ) return self.as_gpu_matrix(columns=columns).copy_to_host() def one_hot_encoding( @@ -3914,71 +3914,6 @@ def argsort(self, ascending=True, na_position="last"): ) return cudf.Series(inds_col) - @annotate("SORT_INDEX", color="red", domain="cudf_python") - def sort_index( - self, - axis=0, - level=None, - ascending=True, - inplace=False, - kind=None, - na_position="last", - sort_remaining=True, - ignore_index=False, - ): - """Sort object by labels (along an axis). - - Parameters - ---------- - axis : {0 or ‘index’, 1 or ‘columns’}, default 0 - The axis along which to sort. The value 0 identifies the rows, - and 1 identifies the columns. - level : int or level name or list of ints or list of level names - If not None, sort on values in specified index level(s). - This is only useful in the case of MultiIndex. - ascending : bool, default True - Sort ascending vs. descending. - inplace : bool, default False - If True, perform operation in-place. - kind : sorting method such as `quick sort` and others. - Not yet supported. - na_position : {‘first’, ‘last’}, default ‘last’ - Puts NaNs at the beginning if first; last puts NaNs at the end. - sort_remaining : bool, default True - Not yet supported - ignore_index : bool, default False - if True, index will be replaced with RangeIndex. 
- - Returns - ------- - DataFrame or None - - Examples - -------- - >>> df = cudf.DataFrame( - ... {"b":[3, 2, 1], "a":[2, 1, 3]}, index=[1, 3, 2]) - >>> df.sort_index(axis=0) - b a - 1 3 2 - 2 1 3 - 3 2 1 - >>> df.sort_index(axis=1) - a b - 1 2 3 - 3 1 2 - 2 3 1 - """ - return super()._sort_index( - axis=axis, - level=level, - ascending=ascending, - inplace=inplace, - kind=kind, - na_position=na_position, - sort_remaining=sort_remaining, - ignore_index=ignore_index, - ) - def sort_values( self, by, @@ -3990,7 +3925,6 @@ def sort_values( ignore_index=False, ): """ - Sort by the values row-wise. Parameters @@ -4418,7 +4352,7 @@ def merge( sort=False, lsuffix=None, rsuffix=None, - method="hash", + method=None, indicator=False, suffixes=("_x", "_y"), ): @@ -4463,8 +4397,9 @@ def merge( suffixes: Tuple[str, str], defaults to ('_x', '_y') Suffixes applied to overlapping column names on the left and right sides - method : {‘hash’, ‘sort’}, default ‘hash’ - The implementation method to be used for the operation. + method : + This parameter is unused. It is deprecated and will be removed in a + future version. Returns ------- @@ -4526,6 +4461,13 @@ def merge( else: lsuffix, rsuffix = suffixes + if method is not None: + warnings.warn( + "The 'method' argument is deprecated and will be removed " + "in a future version of cudf.", + FutureWarning, + ) + # Compute merge gdf_result = super()._merge( right, @@ -4536,7 +4478,6 @@ def merge( right_index=right_index, how=how, sort=sort, - method=method, indicator=indicator, suffixes=suffixes, ) @@ -4551,7 +4492,7 @@ def join( lsuffix="", rsuffix="", sort=False, - method="hash", + method=None, ): """Join columns with other DataFrame on index or on a key column. @@ -4565,6 +4506,9 @@ def join( column names when avoiding conflicts. sort : bool Set to True to ensure sorted ordering. + method : + This parameter is unused. It is deprecated and will be removed in a + future version. 
Returns ------- @@ -4578,6 +4522,13 @@ def join( - *on* is not supported yet due to lack of multi-index support. """ + if method is not None: + warnings.warn( + "The 'method' argument is deprecated and will be removed " + "in a future version of cudf.", + FutureWarning, + ) + lhs = self rhs = other @@ -4587,7 +4538,6 @@ def join( right_index=True, how=how, suffixes=(lsuffix, rsuffix), - method=method, sort=sort, ) df.index.name = ( @@ -4885,12 +4835,6 @@ def apply( runtime compilation features """ - # libcudacxx tuples are not compatible with nvrtc 11.0 - runtime = cuda.cudadrv.runtime.Runtime() - mjr, mnr = runtime.get_version() - if mjr < 11 or (mjr == 11 and mnr < 1): - raise RuntimeError("DataFrame.apply requires CUDA 11.1+") - for dtype in self.dtypes: if ( isinstance(dtype, cudf.core.dtypes._BaseDtype) @@ -5103,140 +5047,6 @@ def partition_by_hash(self, columns, nparts, keep_index=True): # Slice into partition return [outdf[s:e] for s, e in zip(offsets, offsets[1:] + [None])] - def replace( - self, - to_replace=None, - value=None, - inplace=False, - limit=None, - regex=False, - method=None, - ): - """ - Replace values given in *to_replace* with *replacement*. - - Parameters - ---------- - to_replace : numeric, str, list-like or dict - Value(s) that will be replaced. - - * numeric or str: - - values equal to *to_replace* will be replaced - with *replacement* - * list of numeric or str: - - If *replacement* is also list-like, - *to_replace* and *replacement* must be of same length. - * dict: - - Dicts can be used to replace different values in different - columns. For example, `{'a': 1, 'z': 2}` specifies that the - value 1 in column `a` and the value 2 in column `z` should be - replaced with replacement*. - - Dicts can be used to specify different replacement values for - different existing values. For example, {'a': 'b', 'y': 'z'} - replaces the value ‘a’ with ‘b’ and ‘y’ with ‘z’. - To use a dict in this way the value parameter should be None. 
- - value : numeric, str, list-like, or dict - Value(s) to replace `to_replace` with. If a dict is provided, then - its keys must match the keys in *to_replace*, and corresponding - values must be compatible (e.g., if they are lists, then they must - match in length). - inplace : bool, default False - If True, in place. - - Raises - ------ - TypeError - - If ``to_replace`` is not a scalar, array-like, dict, or None - - If ``to_replace`` is a dict and value is not a list, dict, - or Series - ValueError - - If a list is passed to ``to_replace`` and ``value`` but they - are not the same length. - - Returns - ------- - result : DataFrame - DataFrame after replacement. - - Examples - -------- - - Scalar ``to_replace`` and ``value`` - - >>> import cudf - >>> df = cudf.DataFrame({'A': [0, 1, 2, 3, 4], - ... 'B': [5, 6, 7, 8, 9], - ... 'C': ['a', 'b', 'c', 'd', 'e']}) - >>> df - A B C - 0 0 5 a - 1 1 6 b - 2 2 7 c - 3 3 8 d - 4 4 9 e - >>> df.replace(0, 5) - A B C - 0 5 5 a - 1 1 6 b - 2 2 7 c - 3 3 8 d - 4 4 9 e - - List-like ``to_replace`` - - >>> df.replace([0, 1, 2, 3], 4) - A B C - 0 4 5 a - 1 4 6 b - 2 4 7 c - 3 4 8 d - 4 4 9 e - >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1]) - A B C - 0 4 5 a - 1 3 6 b - 2 2 7 c - 3 1 8 d - 4 4 9 e - - dict-like ``to_replace`` - - >>> df.replace({0: 10, 1: 100}) - A B C - 0 10 5 a - 1 100 6 b - 2 2 7 c - 3 3 8 d - 4 4 9 e - >>> df.replace({'A': 0, 'B': 5}, 100) - A B C - 0 100 100 a - 1 1 6 b - 2 2 7 c - 3 3 8 d - 4 4 9 e - - Notes - ----- - Parameters that are currently not supported are: `limit`, `regex`, - `method` - """ - if limit is not None: - raise NotImplementedError("limit parameter is not implemented yet") - - if regex: - raise NotImplementedError("regex parameter is not implemented yet") - - if method not in ("pad", None): - raise NotImplementedError( - "method parameter is not implemented yet" - ) - - outdf = super().replace(to_replace=to_replace, replacement=value) - - return self._mimic_inplace(outdf, inplace=inplace) - 
def info( self, verbose=None, @@ -5897,9 +5707,9 @@ def to_records(self, index=True): dtype = np.dtype(members) ret = np.recarray(len(self), dtype=dtype) if index: - ret["index"] = self.index.to_array() + ret["index"] = self.index.to_numpy() for col in self._data.names: - ret[col] = self[col].to_array() + ret[col] = self[col].to_numpy() return ret @classmethod diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 28080cbc4c1..e63ba4a4499 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3,10 +3,12 @@ from __future__ import annotations import copy +import pickle import warnings from collections import abc from typing import ( Any, + Callable, Dict, MutableMapping, Optional, @@ -24,13 +26,14 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import ColumnLike, DataFrameOrSeries +from cudf._typing import ColumnLike, DataFrameOrSeries, Dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, is_decimal_dtype, is_dict_like, is_integer_dtype, + is_list_like, is_scalar, issubdtype, ) @@ -39,13 +42,16 @@ as_column, build_categorical_column, column_empty, + deserialize_columns, + serialize_columns, ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import merge +from cudf.core.udf.pipeline import compile_or_get from cudf.core.window import Rolling from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import is_column_like +from cudf.utils.dtypes import find_common_type, is_column_like T = TypeVar("T", bound="Frame") @@ -64,6 +70,21 @@ class Frame(libcudf.table.Table): _data: "ColumnAccessor" + def serialize(self): + header = { + "type-serialized": pickle.dumps(type(self)), + "column_names": pickle.dumps(tuple(self._data.names)), + } + header["columns"], frames = serialize_columns(self._columns) + return header, frames + + @classmethod + def deserialize(cls, header, frames): + 
cls_deserialize = pickle.loads(header["type-serialized"]) + column_names = pickle.loads(header["column_names"]) + columns = deserialize_columns(header["columns"], frames) + return cls_deserialize._from_data(dict(zip(column_names, columns))) + @classmethod def _from_data( cls, @@ -591,6 +612,151 @@ def _empty_like(self, keep_index=True): result._copy_type_metadata(self, include_index=keep_index) return result + @property + def values(self): + """ + Return a CuPy representation of the DataFrame. + + Only the values in the DataFrame will be returned, the axes labels will + be removed. + + Returns + ------- + cupy.ndarray + The values of the DataFrame. + """ + return self.to_cupy() + + @property + def values_host(self): + """ + Return a NumPy representation of the data. + + Only the values in the DataFrame will be returned, the axes labels will + be removed. + + Returns + ------- + numpy.ndarray + A host representation of the underlying data. + """ + return self.to_numpy() + + def __array__(self, dtype=None): + raise TypeError( + "Implicit conversion to a host NumPy array via __array__ is not " + "allowed, To explicitly construct a GPU matrix, consider using " + ".to_cupy()\nTo explicitly construct a host matrix, consider " + "using .to_numpy()." + ) + + def __arrow_array__(self, type=None): + raise TypeError( + "Implicit conversion to a host PyArrow object via __arrow_array__ " + "is not allowed. Consider using .to_arrow()" + ) + + def _to_array( + self, + get_column_values: Callable, + make_empty_matrix: Callable, + dtype: Union[Dtype, None] = None, + na_value=None, + ) -> Union[cupy.ndarray, np.ndarray]: + # Internal function to implement to_cupy and to_numpy, which are nearly + # identical except for the attribute they access to generate values. + + def get_column_values_na(col): + if na_value is not None: + col = col.fillna(na_value) + return get_column_values(col) + + # Early exit for an empty Frame. 
+ ncol = self._num_columns + if ncol == 0: + return make_empty_matrix(shape=(0, 0), dtype=np.dtype("float64")) + + if dtype is None: + dtype = find_common_type( + [col.dtype for col in self._data.values()] + ) + + matrix = make_empty_matrix(shape=(len(self), ncol), dtype=dtype) + for i, col in enumerate(self._data.values()): + # TODO: col.values may fail if there is nullable data or an + # unsupported dtype. We may want to catch and provide a more + # suitable error. + matrix[:, i] = get_column_values_na(col) + return matrix + + def to_cupy( + self, + dtype: Union[Dtype, None] = None, + copy: bool = False, + na_value=None, + ) -> cupy.ndarray: + """Convert the Frame to a CuPy array. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. + copy : bool, default False + Whether to ensure that the returned value is not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_cupy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. + na_value : Any, default None + The value to use for missing values. The default value depends on + dtype and the dtypes of the DataFrame columns. + + Returns + ------- + cupy.ndarray + """ + return self._to_array( + (lambda col: col.values.copy()) + if copy + else (lambda col: col.values), + cupy.empty, + dtype, + na_value, + ) + + def to_numpy( + self, + dtype: Union[Dtype, None] = None, + copy: bool = True, + na_value=None, + ) -> np.ndarray: + """Convert the Frame to a NumPy array. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. + copy : bool, default True + Whether to ensure that the returned value is not a view on + another array. This parameter must be ``True`` since cuDF must copy + device memory to host to provide a numpy array. + na_value : Any, default None + The value to use for missing values. 
The default value depends on + dtype and the dtypes of the DataFrame columns. + + Returns + ------- + numpy.ndarray + """ + if not copy: + raise ValueError( + "copy=False is not supported because conversion to a numpy " + "array always copies the data." + ) + + return self._to_array( + (lambda col: col.values_host), np.empty, dtype, na_value + ) + def clip(self, lower=None, upper=None, inplace=False, axis=1): """ Trim values at input threshold(s). @@ -1436,10 +1602,29 @@ def _apply(self, func): """ Apply `func` across the rows of the frame. """ - output_dtype, ptx = cudf.core.udf.pipeline.compile_masked_udf( - func, self.dtypes + kernel, retty = compile_or_get(self, func) + + # Mask and data column preallocated + ans_col = cupy.empty(len(self), dtype=retty) + ans_mask = cudf.core.column.column_empty(len(self), dtype="bool") + launch_args = [(ans_col, ans_mask)] + offsets = [] + for col in self._data.values(): + data = col.data + mask = col.mask + if mask is None: + launch_args.append(data) + else: + launch_args.append((data, mask)) + offsets.append(col.offset) + launch_args += offsets + launch_args.append(len(self)) # size + kernel.forall(len(self))(*launch_args) + + result = cudf.Series(ans_col).set_mask( + libcudf.transform.bools_to_mask(ans_mask) ) - result = cudf._lib.transform.masked_udf(self, ptx, output_dtype) + return result def rank( @@ -1599,6 +1784,147 @@ def repeat(self, repeats, axis=None): return self._repeat(repeats) + @annotate("SORT_INDEX", color="red", domain="cudf_python") + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind=None, + na_position="last", + sort_remaining=True, + ignore_index=False, + key=None, + ): + """Sort object by labels (along an axis). + + Parameters + ---------- + axis : {0 or ‘index’, 1 or ‘columns’}, default 0 + The axis along which to sort. The value 0 identifies the rows, + and 1 identifies the columns. 
+ level : int or level name or list of ints or list of level names + If not None, sort on values in specified index level(s). + This is only useful in the case of MultiIndex. + ascending : bool, default True + Sort ascending vs. descending. + inplace : bool, default False + If True, perform operation in-place. + kind : sorting method such as `quick sort` and others. + Not yet supported. + na_position : {‘first’, ‘last’}, default ‘last’ + Puts NaNs at the beginning if first; last puts NaNs at the end. + sort_remaining : bool, default True + Not yet supported + ignore_index : bool, default False + if True, index will be replaced with RangeIndex. + key : callable, optional + If not None, apply the key function to the index values before + sorting. This is similar to the key argument in the builtin + sorted() function, with the notable difference that this key + function should be vectorized. It should expect an Index and return + an Index of the same shape. For MultiIndex inputs, the key is + applied per level. + + Returns + ------- + Frame or None + + Notes + ----- + Difference from pandas: + * Not supporting: kind, sort_remaining=False + + Examples + -------- + **Series** + >>> import cudf + >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) + >>> series + 3 a + 2 b + 1 c + 4 d + dtype: object + >>> series.sort_index() + 1 c + 2 b + 3 a + 4 d + dtype: object + + Sort Descending + + >>> series.sort_index(ascending=False) + 4 d + 3 a + 2 b + 1 c + dtype: object + + **DataFrame** + >>> df = cudf.DataFrame( + ... 
{"b":[3, 2, 1], "a":[2, 1, 3]}, index=[1, 3, 2]) + >>> df.sort_index(axis=0) + b a + 1 3 2 + 2 1 3 + 3 2 1 + >>> df.sort_index(axis=1) + a b + 1 2 3 + 3 1 2 + 2 3 1 + """ + if kind is not None: + raise NotImplementedError("kind is not yet supported") + + if not sort_remaining: + raise NotImplementedError( + "sort_remaining == False is not yet supported" + ) + + if key is not None: + raise NotImplementedError("key is not yet supported.") + + if axis in (0, "index"): + idx = self.index + if isinstance(idx, cudf.MultiIndex): + if level is None: + midx_data = idx.to_frame(index=False) + else: + # Pandas doesn't handle na_position in case of MultiIndex. + na_position = "first" if ascending is True else "last" + labels = [ + idx._get_level_label(lvl) + for lvl in (level if is_list_like(level) else (level,)) + ] + midx_data = cudf.DataFrame._from_data( + idx._data.select_by_label(labels) + ) + + inds = midx_data.argsort( + ascending=ascending, na_position=na_position + ) + out = self.take(inds) + elif (ascending and idx.is_monotonic_increasing) or ( + not ascending and idx.is_monotonic_decreasing + ): + out = self.copy() + else: + inds = idx.argsort( + ascending=ascending, na_position=na_position + ) + out = self.take(inds) + else: + labels = sorted(self._data.names, reverse=not ascending) + out = self[labels] + + if ignore_index is True: + out = out.reset_index(drop=True) + return self._mimic_inplace(out, inplace=inplace) + def _repeat(self, count): if not is_scalar(count): count = as_column(count) @@ -1639,35 +1965,21 @@ def _shift(self, offset, fill_value=None): zip(self._column_names, data_columns), self._index ) - def __array__(self, dtype=None): - raise TypeError( - "Implicit conversion to a host NumPy array via __array__ is not " - "allowed, To explicitly construct a GPU array, consider using " - "cupy.asarray(...)\nTo explicitly construct a " - "host array, consider using .to_array()" - ) - - def __arrow_array__(self, type=None): - raise TypeError( - "Implicit 
conversion to a host PyArrow Array via __arrow_array__ " - "is not allowed, To explicitly construct a PyArrow Array, " - "consider using .to_arrow()" - ) - def round(self, decimals=0, how="half_even"): """ - Round a DataFrame to a variable number of decimal places. + Round to a variable number of decimal places. Parameters ---------- decimals : int, dict, Series - Number of decimal places to round each column to. If an int is - given, round each column to the same number of places. - Otherwise dict and Series round to variable numbers of places. - Column names should be in the keys if `decimals` is a - dict-like, or in the index if `decimals` is a Series. Any - columns not included in `decimals` will be left as is. Elements - of `decimals` which are not columns of the input will be + Number of decimal places to round each column to. This parameter + must be an int for a Series. For a DataFrame, a dict or a Series + are also valid inputs. If an int is given, round each column to the + same number of places. Otherwise dict and Series round to variable + numbers of places. Column names should be in the keys if + `decimals` is a dict-like, or in the index if `decimals` is a + Series. Any columns not included in `decimals` will be left as is. + Elements of `decimals` which are not columns of the input will be ignored. how : str, optional Type of rounding. Can be either "half_even" (default) @@ -1675,12 +1987,23 @@ def round(self, decimals=0, how="half_even"): Returns ------- - DataFrame - A DataFrame with the affected columns rounded to the specified - number of decimal places. + Series or DataFrame + A Series or DataFrame with the affected columns rounded to the + specified number of decimal places. Examples -------- + **Series** + + >>> s = cudf.Series([0.1, 1.4, 2.9]) + >>> s.round() + 0 0.0 + 1 1.0 + 2 3.0 + dtype: float64 + + **DataFrame** + >>> df = cudf.DataFrame( [(.21, .32), (.01, .67), (.66, .03), (.21, .18)], ... 
columns=['dogs', 'cats'] @@ -1729,34 +2052,24 @@ def round(self, decimals=0, how="half_even"): if isinstance(decimals, cudf.Series): decimals = decimals.to_pandas() - if isinstance(decimals, (dict, pd.Series)): - if ( - isinstance(decimals, pd.Series) - and not decimals.index.is_unique - ): + if isinstance(decimals, pd.Series): + if not decimals.index.is_unique: raise ValueError("Index of decimals must be unique") - - cols = { - name: col.round(decimals[name], how=how) - if ( - name in decimals.keys() - and _is_non_decimal_numeric_dtype(col.dtype) - ) - else col.copy(deep=True) - for name, col in self._data.items() - } + decimals = decimals.to_dict() elif isinstance(decimals, int): - cols = { - name: col.round(decimals, how=how) - if _is_non_decimal_numeric_dtype(col.dtype) - else col.copy(deep=True) - for name, col in self._data.items() - } - else: + decimals = {name: decimals for name in self._column_names} + elif not isinstance(decimals, abc.Mapping): raise TypeError( "decimals must be an integer, a dict-like or a Series" ) + cols = { + name: col.round(decimals[name], how=how) + if (name in decimals and _is_non_decimal_numeric_dtype(col.dtype)) + else col.copy(deep=True) + for name, col in self._data.items() + } + return self.__class__._from_data( data=cudf.core.column_accessor.ColumnAccessor( cols, @@ -2170,8 +2483,219 @@ def drop_duplicates( result._copy_type_metadata(self) return result - def replace(self, to_replace: Any, replacement: Any) -> Frame: - if not (to_replace is None and replacement is None): + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method=None, + ): + """Replace values given in ``to_replace`` with ``value``. + + Parameters + ---------- + to_replace : numeric, str or list-like + Value(s) to replace. 
+ + * numeric or str: + - values equal to ``to_replace`` will be replaced + with ``value`` + * list of numeric or str: + - If ``value`` is also list-like, ``to_replace`` and + ``value`` must be of same length. + * dict: + - Dicts can be used to specify different replacement values + for different existing values. For example, {'a': 'b', + 'y': 'z'} replaces the value ‘a’ with ‘b’ and + ‘y’ with ‘z’. + To use a dict in this way the ``value`` parameter should + be ``None``. + value : scalar, dict, list-like, str, default None + Value to replace any values matching ``to_replace`` with. + inplace : bool, default False + If True, in place. + + See also + -------- + Series.fillna + + Raises + ------ + TypeError + - If ``to_replace`` is not a scalar, array-like, dict, or None + - If ``to_replace`` is a dict and value is not a list, dict, + or Series + ValueError + - If a list is passed to ``to_replace`` and ``value`` but they + are not the same length. + + Returns + ------- + result : Series + Series after replacement. The mask and index are preserved. 
+ + Notes + ----- + Parameters that are currently not supported are: `limit`, `regex`, + `method` + + Examples + -------- + **Series** + + Scalar ``to_replace`` and ``value`` + + >>> import cudf + >>> s = cudf.Series([0, 1, 2, 3, 4]) + >>> s + 0 0 + 1 1 + 2 2 + 3 3 + 4 4 + dtype: int64 + >>> s.replace(0, 5) + 0 5 + 1 1 + 2 2 + 3 3 + 4 4 + dtype: int64 + + List-like ``to_replace`` + + >>> s.replace([1, 2], 10) + 0 0 + 1 10 + 2 10 + 3 3 + 4 4 + dtype: int64 + + dict-like ``to_replace`` + + >>> s.replace({1:5, 3:50}) + 0 0 + 1 5 + 2 2 + 3 50 + 4 4 + dtype: int64 + >>> s = cudf.Series(['b', 'a', 'a', 'b', 'a']) + >>> s + 0 b + 1 a + 2 a + 3 b + 4 a + dtype: object + >>> s.replace({'a': None}) + 0 b + 1 + 2 + 3 b + 4 + dtype: object + + If there is a mimatch in types of the values in + ``to_replace`` & ``value`` with the actual series, then + cudf exhibits different behaviour with respect to pandas + and the pairs are ignored silently: + + >>> s = cudf.Series(['b', 'a', 'a', 'b', 'a']) + >>> s + 0 b + 1 a + 2 a + 3 b + 4 a + dtype: object + >>> s.replace('a', 1) + 0 b + 1 a + 2 a + 3 b + 4 a + dtype: object + >>> s.replace(['a', 'c'], [1, 2]) + 0 b + 1 a + 2 a + 3 b + 4 a + dtype: object + + **DataFrame** + + Scalar ``to_replace`` and ``value`` + + >>> import cudf + >>> df = cudf.DataFrame({'A': [0, 1, 2, 3, 4], + ... 'B': [5, 6, 7, 8, 9], + ... 
'C': ['a', 'b', 'c', 'd', 'e']}) + >>> df + A B C + 0 0 5 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + >>> df.replace(0, 5) + A B C + 0 5 5 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + + List-like ``to_replace`` + + >>> df.replace([0, 1, 2, 3], 4) + A B C + 0 4 5 a + 1 4 6 b + 2 4 7 c + 3 4 8 d + 4 4 9 e + >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1]) + A B C + 0 4 5 a + 1 3 6 b + 2 2 7 c + 3 1 8 d + 4 4 9 e + + dict-like ``to_replace`` + + >>> df.replace({0: 10, 1: 100}) + A B C + 0 10 5 a + 1 100 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + >>> df.replace({'A': 0, 'B': 5}, 100) + A B C + 0 100 100 a + 1 1 6 b + 2 2 7 c + 3 3 8 d + 4 4 9 e + """ + if limit is not None: + raise NotImplementedError("limit parameter is not implemented yet") + + if regex: + raise NotImplementedError("regex parameter is not implemented yet") + + if method not in ("pad", None): + raise NotImplementedError( + "method parameter is not implemented yet" + ) + + if not (to_replace is None and value is None): copy_data = self._data.copy(deep=False) ( all_na_per_column, @@ -2179,7 +2703,7 @@ def replace(self, to_replace: Any, replacement: Any) -> Frame: replacements_per_column, ) = _get_replacement_values_for_columns( to_replace=to_replace, - value=replacement, + value=value, columns_dtype_map={ col: copy_data._data[col].dtype for col in copy_data._data }, @@ -2193,7 +2717,7 @@ def replace(self, to_replace: Any, replacement: Any) -> Frame: all_na_per_column[name], ) except (KeyError, OverflowError): - # We need to create a deep copy if : + # We need to create a deep copy if: # i. 
`find_and_replace` was not successful or any of # `to_replace_per_column`, `replacements_per_column`, # `all_na_per_column` don't contain the `name` @@ -2206,7 +2730,7 @@ def replace(self, to_replace: Any, replacement: Any) -> Frame: result = self._from_data(copy_data, self._index) - return result + return self._mimic_inplace(result, inplace=inplace) def _copy_type_metadata( self, other: Frame, include_index: bool = True @@ -3159,7 +3683,6 @@ def _merge( right_index=False, how="inner", sort=False, - method="hash", indicator=False, suffixes=("_x", "_y"), ): @@ -3182,7 +3705,6 @@ def _merge( right_index=right_index, how=how, sort=sort, - method=method, indicator=indicator, suffixes=suffixes, ) @@ -3230,85 +3752,6 @@ def _encode(self): keys = self.__class__._from_data(data, index) return keys, indices - def _reindex( - self, columns, dtypes=None, deep=False, index=None, inplace=False - ): - """ - Helper for `.reindex` - - Parameters - ---------- - columns : array-like - The list of columns to select from the Frame, - if ``columns`` is a superset of ``Frame.columns`` new - columns are created. - dtypes : dict - Mapping of dtypes for the empty columns being created. - deep : boolean, optional, default False - Whether to make deep copy or shallow copy of the columns. - index : Index or array-like, default None - The ``index`` to be used to reindex the Frame with. - inplace : bool, default False - Whether to perform the operation in place on the data. 
- - Returns - ------- - DataFrame - """ - if dtypes is None: - dtypes = {} - - df = self - if index is not None: - index = cudf.core.index.as_index(index) - - if isinstance(index, cudf.MultiIndex): - idx_dtype_match = all( - left_dtype == right_dtype - for left_dtype, right_dtype in zip( - (col.dtype for col in df.index._data.columns), - (col.dtype for col in index._data.columns), - ) - ) - else: - idx_dtype_match = df.index.dtype == index.dtype - - if not idx_dtype_match: - columns = columns if columns is not None else list(df.columns) - df = cudf.DataFrame() - else: - df = cudf.DataFrame(None, index).join( - df, how="left", sort=True - ) - # double-argsort to map back from sorted to unsorted positions - df = df.take(index.argsort(ascending=True).argsort()) - - index = index if index is not None else df.index - names = columns if columns is not None else list(df.columns) - cols = { - name: ( - df._data[name].copy(deep=deep) - if name in df._data - else column_empty( - dtype=dtypes.get(name, np.float64), - masked=True, - row_count=len(index), - ) - ) - for name in names - } - - result = self.__class__._from_data( - data=cudf.core.column_accessor.ColumnAccessor( - cols, - multiindex=self._data.multiindex, - level_names=self._data.level_names, - ), - index=index, - ) - - return self._mimic_inplace(result, inplace=inplace) - def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) return self.__class__._from_data( @@ -4846,55 +5289,27 @@ def _column(self, value): @property def values(self): - """ - Return a CuPy representation of the data. - - Returns - ------- - out : cupy.ndarray - A device representation of the underlying data. 
- - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, -10, 100, 20]) - >>> ser.values - array([ 1, -10, 100, 20]) - >>> type(ser.values) - - >>> index = cudf.Index([1, -10, 100, 20]) - >>> index.values - array([ 1, -10, 100, 20]) - >>> type(index.values) - - """ return self._column.values @property def values_host(self): - """ - Return a NumPy representation of the data. + return self._column.values_host - Returns - ------- - out : numpy.ndarray - A host representation of the underlying data. + def to_cupy( + self, + dtype: Union[Dtype, None] = None, + copy: bool = True, + na_value=None, + ) -> cupy.ndarray: + return super().to_cupy(dtype, copy, na_value).flatten() - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, -10, 100, 20]) - >>> ser.values_host - array([ 1, -10, 100, 20]) - >>> type(ser.values_host) - - >>> index = cudf.Index([1, -10, 100, 20]) - >>> index.values_host - array([ 1, -10, 100, 20]) - >>> type(index.values_host) - - """ - return self._column.values_host + def to_numpy( + self, + dtype: Union[Dtype, None] = None, + copy: bool = True, + na_value=None, + ) -> np.ndarray: + return super().to_numpy(dtype, copy, na_value).flatten() def tolist(self): @@ -4906,38 +5321,14 @@ def tolist(self): to_list = tolist + # TODO: When this method is removed we can also remove + # ColumnBase.to_gpu_array. def to_gpu_array(self, fillna=None): - """Get a dense numba device array for the data. - - Parameters - ---------- - fillna : str or None - See *fillna* in ``.to_array``. - - Notes - ----- - - if ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - - Returns - ------- - numba.DeviceNDArray - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([10, 20, 30, 40, 50]) - >>> s - 0 10 - 1 20 - 2 30 - 3 40 - 4 50 - dtype: int64 - >>> s.to_gpu_array() - - """ + warnings.warn( + "The to_gpu_array method will be removed in a future cuDF " + "release. 
Consider using `to_cupy` instead.", + DeprecationWarning, + ) return self._column.to_gpu_array(fillna=fillna) @classmethod diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6414d4a7e84..49a5e5e2143 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -4,6 +4,7 @@ import math import pickle +import warnings from numbers import Number from typing import ( Any, @@ -601,6 +602,23 @@ def __init__(self, data, **kwargs): name = kwargs.get("name") super().__init__({name: data}) + @classmethod + def deserialize(cls, header, frames): + if "index_column" in header: + warnings.warn( + "Index objects serialized in cudf version " + "21.10 or older will no longer be deserializable " + "after version 21.12. Please load and resave any " + "pickles before upgrading to version 22.02.", + DeprecationWarning, + ) + header["columns"] = [header.pop("index_column")] + header["column_names"] = pickle.dumps( + [pickle.loads(header["name"])] + ) + + return super().deserialize(header, frames) + def drop_duplicates(self, keep="first"): """ Return Index with duplicate values removed @@ -852,8 +870,8 @@ def get_loc(self, key, method=None, tolerance=None): # Not sorted and not unique. Return a boolean mask mask = cupy.full(self._data.nrows, False) - true_inds = sort_inds.slice(lower_bound, upper_bound).to_gpu_array() - mask[cupy.array(true_inds)] = True + true_inds = sort_inds.slice(lower_bound, upper_bound).values + mask[true_inds] = True return mask def __sizeof__(self): @@ -1597,6 +1615,27 @@ def quarter(self): res = extract_quarter(self._values) return Int8Index(res, dtype="int8") + def isocalendar(self): + """ + Returns a DataFrame with the year, week, and day + calculated according to the ISO 8601 standard. + + Returns + ------- + DataFrame + with columns year, week and day + + Examples + -------- + >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00", + ... 
"1999-12-31 18:40:00"]) + >>> gIndex.isocalendar() + year week day + 2020-05-31 08:00:00 2020 22 7 + 1999-12-31 18:40:00 1999 52 5 + """ + return cudf.core.tools.datetimes._to_iso_calendar(self) + def to_pandas(self): nanos = self._values.astype("datetime64[ns]") return pd.DatetimeIndex(nanos.to_pandas(), name=self.name) @@ -2108,7 +2147,9 @@ def __init__(self, values, copy=False, **kwargs): super().__init__(values, **kwargs) def to_pandas(self): - return pd.Index(self.to_array(), name=self.name, dtype="object") + return pd.Index( + self.to_numpy(na_value=None), name=self.name, dtype="object" + ) def take(self, indices): return self._values[indices] diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 1d1f661779f..cc9c0fb66da 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -125,7 +125,7 @@ def _match_join_keys( else: warnings.warn( f"Can't safely cast column from {rtype} to {ltype}, " - "upcasting to {common_type}." + f"upcasting to {common_type}." 
) return lcol.astype(common_type), rcol.astype(common_type) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 276038146e1..55540d362ac 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -29,7 +29,6 @@ def merge( right_index, how, sort, - method, indicator, suffixes, ): @@ -47,7 +46,6 @@ def merge( right_index=right_index, how=how, sort=sort, - method=method, indicator=indicator, suffixes=suffixes, ) @@ -87,7 +85,6 @@ def __init__( right_index, how, sort, - method, indicator, suffixes, ): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index bc97c72db88..941783ecee7 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -854,17 +854,23 @@ def take(self, indices): return result def serialize(self): - header = {} - header["type-serialized"] = pickle.dumps(type(self)) - header["names"] = pickle.dumps(self.names) - - header["columns"], frames = column.serialize_columns(self._columns) - + header, frames = super().serialize() + # Overwrite the names in _data with the true names. + header["column_names"] = pickle.dumps(self.names) return header, frames @classmethod def deserialize(cls, header, frames): - names = pickle.loads(header["names"]) + if "names" in header: + warnings.warn( + "MultiIndex objects serialized in cudf version " + "21.10 or older will no longer be deserializable " + "after version 21.12. 
Please load and resave any " + "pickles before upgrading to version 22.02.", + DeprecationWarning, + ) + header["column_names"] = header["names"] + column_names = pickle.loads(header["column_names"]) if "source_data" in header: warnings.warn( "MultiIndex objects serialized in cudf version " @@ -874,11 +880,12 @@ def deserialize(cls, header, frames): DeprecationWarning, ) df = cudf.DataFrame.deserialize(header["source_data"], frames) - obj = cls.from_frame(df) - return obj._set_names(names) - columns = column.deserialize_columns(header["columns"], frames) - obj = cls._from_data(dict(zip(range(0, len(names)), columns))) - return obj._set_names(names) + return cls.from_frame(df)._set_names(column_names) + + # Spoof the column names to construct the frame, then set manually. + header["column_names"] = pickle.dumps(range(0, len(column_names))) + obj = super().deserialize(header, frames) + return obj._set_names(column_names) def __getitem__(self, index): if isinstance(index, int): @@ -1633,9 +1640,7 @@ def get_loc(self, key, method=None, tolerance=None): # the range is returned. 
return slice(lower_bound, upper_bound) - true_inds = cupy.array( - sort_inds.slice(lower_bound, upper_bound).to_gpu_array() - ) + true_inds = sort_inds.slice(lower_bound, upper_bound).values true_inds = _maybe_indices_to_slice(true_inds) if isinstance(true_inds, slice): return true_inds diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 594f9fc42d0..590ac077e97 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -331,21 +331,6 @@ def from_pandas(cls, s, nan_as_null=None): """ return cls(s, nan_as_null=nan_as_null) - def serialize(self): - header = {} - frames = [] - header["type-serialized"] = pickle.dumps(type(self)) - header["index"], index_frames = self._index.serialize() - header["index_frame_count"] = len(index_frames) - frames.extend(index_frames) - - header["column"], column_frames = self._column.serialize() - header["column_frame_count"] = len(column_frames) - frames.extend(column_frames) - - header["name"] = pickle.dumps(self.name) - return header, frames - @property def dt(self): """ @@ -374,20 +359,42 @@ def dt(self): "Can only use .dt accessor with datetimelike values" ) + def serialize(self): + header, frames = super().serialize() + + header["index"], index_frames = self._index.serialize() + header["index_frame_count"] = len(index_frames) + # For backwards compatibility with older versions of cuDF, index + # columns are placed before data columns. + frames = index_frames + frames + + return header, frames + @classmethod def deserialize(cls, header, frames): + if "column" in header: + warnings.warn( + "Series objects serialized in cudf version " + "21.10 or older will no longer be deserializable " + "after version 21.12. 
Please load and resave any " + "pickles before upgrading to version 22.02.", + DeprecationWarning, + ) + header["columns"] = [header.pop("column")] + header["column_names"] = pickle.dumps( + [pickle.loads(header["name"])] + ) + index_nframes = header["index_frame_count"] + obj = super().deserialize( + header, frames[header["index_frame_count"] :] + ) + idx_typ = pickle.loads(header["index"]["type-serialized"]) index = idx_typ.deserialize(header["index"], frames[:index_nframes]) - name = pickle.loads(header["name"]) + obj._index = index - frames = frames[index_nframes:] - - column_nframes = header["column_frame_count"] - col_typ = pickle.loads(header["column"]["type-serialized"]) - column = col_typ.deserialize(header["column"], frames[:column_nframes]) - - return cls._from_data({name: column}, index=index) + return obj def _get_columns_by_label(self, labels, downcast=False): """Return the column specified by `labels` @@ -738,25 +745,6 @@ def set_index(self, index): index = index if isinstance(index, BaseIndex) else as_index(index) return self._from_data(self._data, index, self.name) - def as_index(self): - """Returns a new Series with a RangeIndex. - - Examples - ---------- - >>> s = cudf.Series([1,2,3], index=['a','b','c']) - >>> s - a 1 - b 2 - c 3 - dtype: int64 - >>> s.as_index() - 0 1 - 1 2 - 2 3 - dtype: int64 - """ - return self.set_index(RangeIndex(len(self))) - def to_frame(self, name=None): """Convert Series into a DataFrame @@ -800,57 +788,8 @@ def to_frame(self, name=None): return cudf.DataFrame({col: self._column}, index=self.index) def set_mask(self, mask, null_count=None): - """Create new Series by setting a mask array. - - This will override the existing mask. The returned Series will - reference the same data buffer as this Series. - - Parameters - ---------- - mask : 1D array-like - The null-mask. Valid values are marked as ``1``; otherwise ``0``. 
- The mask bit given the data index ``idx`` is computed as:: - - (mask[idx // 8] >> (idx % 8)) & 1 - null_count : int, optional - The number of null values. - If None, it is calculated automatically. - - Returns - ------- - Series - A new series with the applied mask. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([1, 2, 3, 4, 5]) - >>> ref_array = cudf.Series([10, None, 11, None, 16]) - >>> series - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - dtype: int64 - >>> ref_array - 0 10 - 1 - 2 11 - 3 - 4 16 - dtype: int64 - >>> series.set_mask(ref_array._column.mask) - 0 1 - 1 - 2 3 - 3 - 4 5 - dtype: int64 - """ warnings.warn( - "Series.set_mask is deprecated and will be removed " - "in the future.", + "Series.set_mask is deprecated and will be removed in the future.", DeprecationWarning, ) return self._from_data( @@ -2540,43 +2479,13 @@ def fillna( value=value, method=method, axis=axis, inplace=inplace, limit=limit ) + # TODO: When this method is removed we can also remove ColumnBase.to_array. def to_array(self, fillna=None): - """Get a dense numpy array for the data. - - Parameters - ---------- - fillna : str or None - Defaults to None, which will skip null values. - If it equals "pandas", null values are filled with NaNs. - Non integral dtype is promoted to np.float64. - - Returns - ------- - numpy.ndarray - A numpy array representation of the elements in the Series. - - Notes - ----- - If ``fillna`` is ``None``, null values are skipped. Therefore, the - output size could be smaller. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 11, 12, 13, 14]) - >>> series - 0 10 - 1 11 - 2 12 - 3 13 - 4 14 - dtype: int64 - >>> array = series.to_array() - >>> array - array([10, 11, 12, 13, 14]) - >>> type(array) - - """ + warnings.warn( + "The to_array method will be removed in a future cuDF " + "release. 
Consider using `to_numpy` instead.", + DeprecationWarning, + ) return self._column.to_array(fillna=fillna) def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): @@ -2912,81 +2821,10 @@ def argsort(self, ascending=True, na_position="last"): """ return self._sort(ascending=ascending, na_position=na_position)[1] - def sort_index( - self, - axis=0, - level=None, - ascending=True, - inplace=False, - kind=None, - na_position="last", - sort_remaining=True, - ignore_index=False, - ): - """ - Sort by the index. - - Parameters - ---------- - axis : {0 or ‘index’, 1 or ‘columns’}, default 0 - Axis to direct sorting. This can only be 0 for Series. - level : int or level name or list of ints or list of level names - If not None, sort on values in specified index level(s). - This is only useful in the case of MultiIndex. - ascending : bool, default True - Sort ascending vs. descending. - inplace : bool, default False - If True, perform operation in-place. - kind : sorting method such as `quick sort` and others. - Not yet supported. - na_position : {‘first’, ‘last’}, default ‘last’ - Puts NaNs at the beginning if first; last puts NaNs at the end. - sort_remaining : bool, default True - Not yet supported - ignore_index : bool, default False - if True, index will be replaced with RangeIndex. - - Returns - ------- - Series - The original Series sorted by the labels. 
- - Examples - -------- - >>> import cudf - >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) - >>> series - 3 a - 2 b - 1 c - 4 d - dtype: object - >>> series.sort_index() - 1 c - 2 b - 3 a - 4 d - dtype: object - - Sort Descending - - >>> series.sort_index(ascending=False) - 4 d - 3 a - 2 b - 1 c - dtype: object - """ - return super()._sort_index( - axis=axis, - level=level, - ascending=ascending, - inplace=inplace, - kind=kind, - na_position=na_position, - sort_remaining=sort_remaining, - ignore_index=ignore_index, - ) + def sort_index(self, axis=0, *args, **kwargs): + if axis not in (0, "index"): + raise ValueError("Only axis=0 is valid for Series.") + return super().sort_index(axis=axis, *args, **kwargs) def sort_values( self, @@ -3217,170 +3055,14 @@ def _sort(self, ascending=True, na_position="last"): sr_inds = self._from_data({self.name: col_inds}, self._index) return sr_keys, sr_inds - def replace( - self, - to_replace=None, - value=None, - inplace=False, - limit=None, - regex=False, - method=None, - ): - """ - Replace values given in ``to_replace`` with ``value``. - - Parameters - ---------- - to_replace : numeric, str or list-like - Value(s) to replace. - - * numeric or str: - - values equal to ``to_replace`` will be replaced - with ``value`` - * list of numeric or str: - - If ``value`` is also list-like, ``to_replace`` and - ``value`` must be of same length. - * dict: - - Dicts can be used to specify different replacement values - for different existing values. For example, {'a': 'b', - 'y': 'z'} replaces the value ‘a’ with ‘b’ and - ‘y’ with ‘z’. - To use a dict in this way the ``value`` parameter should - be ``None``. - value : scalar, dict, list-like, str, default None - Value to replace any values matching ``to_replace`` with. - inplace : bool, default False - If True, in place. 
- - See also - -------- - Series.fillna - - Raises - ------ - TypeError - - If ``to_replace`` is not a scalar, array-like, dict, or None - - If ``to_replace`` is a dict and value is not a list, dict, - or Series - ValueError - - If a list is passed to ``to_replace`` and ``value`` but they - are not the same length. - - Returns - ------- - result : Series - Series after replacement. The mask and index are preserved. - - Notes - ----- - Parameters that are currently not supported are: `limit`, `regex`, - `method` - - Examples - -------- - - Scalar ``to_replace`` and ``value`` - - >>> import cudf - >>> s = cudf.Series([0, 1, 2, 3, 4]) - >>> s - 0 0 - 1 1 - 2 2 - 3 3 - 4 4 - dtype: int64 - >>> s.replace(0, 5) - 0 5 - 1 1 - 2 2 - 3 3 - 4 4 - dtype: int64 - - List-like ``to_replace`` - - >>> s.replace([1, 2], 10) - 0 0 - 1 10 - 2 10 - 3 3 - 4 4 - dtype: int64 - - dict-like ``to_replace`` - - >>> s.replace({1:5, 3:50}) - 0 0 - 1 5 - 2 2 - 3 50 - 4 4 - dtype: int64 - >>> s = cudf.Series(['b', 'a', 'a', 'b', 'a']) - >>> s - 0 b - 1 a - 2 a - 3 b - 4 a - dtype: object - >>> s.replace({'a': None}) - 0 b - 1 - 2 - 3 b - 4 - dtype: object - - If there is a mimatch in types of the values in - ``to_replace`` & ``value`` with the actual series, then - cudf exhibits different behaviour with respect to pandas - and the pairs are ignored silently: - - >>> s = cudf.Series(['b', 'a', 'a', 'b', 'a']) - >>> s - 0 b - 1 a - 2 a - 3 b - 4 a - dtype: object - >>> s.replace('a', 1) - 0 b - 1 a - 2 a - 3 b - 4 a - dtype: object - >>> s.replace(['a', 'c'], [1, 2]) - 0 b - 1 a - 2 a - 3 b - 4 a - dtype: object - """ - if limit is not None: - raise NotImplementedError("limit parameter is not implemented yet") - - if regex: - raise NotImplementedError("regex parameter is not implemented yet") - - if method not in ("pad", None): - raise NotImplementedError( - "method parameter is not implemented yet" - ) - + def replace(self, to_replace=None, value=None, *args, **kwargs): if 
is_dict_like(to_replace) and value is not None: raise ValueError( "Series.replace cannot use dict-like to_replace and non-None " "value" ) - result = super().replace(to_replace=to_replace, replacement=value) - - return self._mimic_inplace(result, inplace=inplace) + return super().replace(to_replace, value, *args, **kwargs) def update(self, other): """ @@ -3487,35 +3169,10 @@ def update(self, other): self.mask(mask, other, inplace=True) def reverse(self): - """ - Reverse the Series - - Returns - ------- - Series - A reversed Series. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([1, 2, 3, 4, 5, 6]) - >>> series - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - 5 6 - dtype: int64 - >>> series.reverse() - 5 6 - 4 5 - 3 4 - 2 3 - 1 2 - 0 1 - dtype: int64 - """ + warnings.warn( + "Series.reverse is deprecated and will be removed in the future.", + DeprecationWarning, + ) rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32) return self._from_data( {self.name: self._column[rinds]}, self.index._values[rinds] @@ -3684,6 +3341,94 @@ def _return_sentinel_series(): return codes # UDF related + def apply(self, func, convert_dtype=True, args=(), **kwargs): + """ + Apply a scalar function to the values of a Series. + + Similar to `pandas.Series.apply. Applies a user + defined function elementwise over a series. + + Parameters + ---------- + func : function + Scalar Python function to apply. + convert_dtype : bool, default True + In cuDF, this parameter is always True. Because + cuDF does not support arbitrary object dtypes, + the result will always be the common type as determined + by numba based on the function logic and argument types. + See examples for details. + args : tuple + Not supported + **kwargs + Not supported + + Notes + ----- + UDFs are cached in memory to avoid recompilation. The first + call to the UDF will incur compilation overhead. 
+ + Examples + -------- + + Apply a basic function to a series + >>> sr = cudf.Series([1,2,3]) + >>> def f(x): + ... return x + 1 + >>> sr.apply(f) + 0 2 + 1 3 + 2 4 + dtype: int64 + + Apply a basic function to a series with nulls + >>> sr = cudf.Series([1,cudf.NA,3]) + >>> def f(x): + ... return x + 1 + >>> sr.apply(f) + 0 2 + 1 + 2 4 + dtype: int64 + + Use a function that does something conditionally, + based on if the value is or is not null + >>> sr = cudf.Series([1,cudf.NA,3]) + >>> def f(x): + ... if x is cudf.NA: + ... return 42 + ... else: + ... return x - 1 + >>> sr.apply(f) + 0 0 + 1 42 + 2 2 + dtype: int64 + + Results will be upcast to the common dtype required + as derived from the UDFs logic. Note that this means + the common type will be returned even if such data + is passed that would not result in any values of that + dtype. + + >>> sr = cudf.Series([1,cudf.NA,3]) + >>> def f(x): + ... return x + 1.5 + >>> sr.apply(f) + 0 2.5 + 1 + 2 4.5 + dtype: float64 + + + + """ + if args or kwargs: + raise ValueError( + "UDFs using *args or **kwargs are not yet supported." + ) + + return super()._apply(func) def applymap(self, udf, out_dtype=None): """Apply an elementwise function to transform the values in the Column. @@ -3895,39 +3640,9 @@ def mode(self, dropna=True): return Series(val_counts.index.sort_values(), name=self.name) def round(self, decimals=0, how="half_even"): - """ - Round each value in a Series to the given number of decimals. - - Parameters - ---------- - decimals : int, default 0 - Number of decimal places to round to. If decimals is negative, - it specifies the number of positions to the left of the decimal - point. - how : str, optional - Type of rounding. Can be either "half_even" (default) - of "half_up" rounding. - - Returns - ------- - Series - Rounded values of the Series. 
- - Examples - -------- - >>> s = cudf.Series([0.1, 1.4, 2.9]) - >>> s.round() - 0 0.0 - 1 1.0 - 2 3.0 - dtype: float64 - """ - return Series( - self._column.round(decimals=decimals, how=how), - name=self.name, - index=self.index, - dtype=self.dtype, - ) + if not isinstance(decimals, int): + raise ValueError("decimals must be an int") + return super().round(decimals, how) def cov(self, other, min_periods=None): """ @@ -4554,7 +4269,7 @@ def _describe_numeric(self): ) data = ( [self.count(), self.mean(), self.std(), self.min()] - + self.quantile(percentiles).to_array(fillna="pandas").tolist() + + self.quantile(percentiles).to_numpy(na_value=np.nan).tolist() + [self.max()] ) data = _format_stats_values(data) @@ -4580,7 +4295,7 @@ def _describe_timedelta(self): ] + self.quantile(percentiles) .astype("str") - .to_array(fillna="pandas") + .to_numpy(na_value=None) .tolist() + [str(pd.Timedelta(self.max()))] ) @@ -4632,7 +4347,7 @@ def _describe_timestamp(self): ] + self.quantile(percentiles) .astype("str") - .to_array(fillna="pandas") + .to_numpy(na_value=None) .tolist() + [str(pd.Timestamp((self.max()).astype("datetime64[ns]")))] ) @@ -4946,7 +4661,6 @@ def merge( right_index=right_index, how=how, sort=sort, - method=method, indicator=False, suffixes=suffixes, ) @@ -5509,6 +5223,49 @@ def quarter(self): {None: res}, index=self.series._index, name=self.series.name, ) + def isocalendar(self): + """ + Returns a DataFrame with the year, week, and day + calculated according to the ISO 8601 standard. + + Returns + ------- + DataFrame + with columns year, week and day + + Examples + -------- + >>> ser = cudf.Series(pd.date_range(start="2021-07-25", + ... 
end="2021-07-30")) + >>> ser.dt.isocalendar() + year week day + 0 2021 29 7 + 1 2021 30 1 + 2 2021 30 2 + 3 2021 30 3 + 4 2021 30 4 + 5 2021 30 5 + >>> ser.dt.isocalendar().week + 0 29 + 1 30 + 2 30 + 3 30 + 4 30 + 5 30 + Name: week, dtype: object + + >>> serIndex = cudf.to_datetime(pd.Series(["2010-01-01", pd.NaT])) + >>> serIndex.dt.isocalendar() + year week day + 0 2009 53 5 + 1 + >>> serIndex.dt.isocalendar().year + 0 2009 + 1 + Name: year, dtype: object + """ + return cudf.core.tools.datetimes._to_iso_calendar(self) + @property def is_month_start(self): """ @@ -5812,9 +5569,8 @@ def strftime(self, date_format, *args, **kwargs): Notes ----- - The following date format identifiers are not yet supported: ``%a``, - ``%A``, ``%w``, ``%b``, ``%B``, ``%U``, ``%W``, ``%c``, ``%x``, - ``%X``, ``%G``, ``%u``, ``%V`` + The following date format identifiers are not yet + supported: ``%c``, ``%x``,``%X`` Examples -------- @@ -5853,19 +5609,9 @@ def strftime(self, date_format, *args, **kwargs): # once https://github.com/rapidsai/cudf/issues/5991 # is implemented not_implemented_formats = { - "%a", - "%A", - "%w", - "%b", - "%B", - "%U", - "%W", "%c", "%x", "%X", - "%G", - "%u", - "%V", } for d_format in not_implemented_formats: if d_format in date_format: @@ -5875,7 +5621,6 @@ def strftime(self, date_format, *args, **kwargs): f"https://github.com/rapidsai/cudf/issues/5991 " f"for tracking purposes." 
) - str_col = self.series._column.as_string_column( dtype="str", format=date_format ) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index b0fb2fb4274..e17c58d1db7 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -4,6 +4,7 @@ from typing import Sequence, Union import numpy as np +import pandas as pd from pandas.core.tools.datetimes import _unit_map import cudf @@ -221,8 +222,8 @@ def to_datetime( format=format, ) return as_index(col, name=arg.name) - elif isinstance(arg, cudf.Series): - col = arg._column + elif isinstance(arg, (cudf.Series, pd.Series)): + col = column.as_column(arg) col = _process_col( col=col, unit=unit, @@ -652,3 +653,23 @@ def _isin_datetimelike( res = lhs._obtain_isin_result(rhs) return res + + +def _to_iso_calendar(arg): + formats = ["%G", "%V", "%u"] + if not isinstance(arg, (cudf.Index, cudf.core.series.DatetimeProperties)): + raise AttributeError( + "Can only use .isocalendar accessor with series or index" + ) + if isinstance(arg, cudf.Index): + iso_params = [ + arg._column.as_string_column(arg._values.dtype, fmt) + for fmt in formats + ] + index = arg._column + elif isinstance(arg.series, cudf.Series): + iso_params = [arg.strftime(fmt) for fmt in formats] + index = arg.series.index + + data = dict(zip(["year", "week", "day"], iso_params)) + return cudf.DataFrame(data, index=index, dtype=np.int32) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 776b9e0a47c..8c69b94cc84 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -165,7 +165,9 @@ def to_numeric(arg, errors="raise", downcast=None): if isinstance(arg, (cudf.Series, pd.Series)): return cudf.Series(col) else: - col = col.fillna(col.default_na_value()) + if col.has_nulls: + # To match pandas, always return a floating type filled with nan. 
+ col = col.astype(float).fillna(np.nan) return col.values diff --git a/python/cudf/cudf/core/udf/classes.py b/python/cudf/cudf/core/udf/api.py similarity index 69% rename from python/cudf/cudf/core/udf/classes.py rename to python/cudf/cudf/core/udf/api.py index fe2fbd9daad..23b4d02c57d 100644 --- a/python/cudf/cudf/core/udf/classes.py +++ b/python/cudf/cudf/core/udf/api.py @@ -14,3 +14,11 @@ class Masked: def __init__(self, value, valid): self.value = value self.valid = valid + + +def pack_return(masked_or_scalar): + # Blank function to give us something for the typing and + # lowering to grab onto. Just a dummy function for us to + # call within kernels that will get replaced later by the + # lowered implementation + pass diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py index 1467a61f215..3986abc2bf0 100644 --- a/python/cudf/cudf/core/udf/lowering.py +++ b/python/cudf/cudf/core/udf/lowering.py @@ -9,11 +9,10 @@ ) from numba.extending import lower_builtin, types +from cudf.core.udf import api +from cudf.core.udf._ops import arith_ops, comparison_ops from cudf.core.udf.typing import MaskedType, NAType -from . import classes -from ._ops import arith_ops, comparison_ops - @cuda_lowering_registry.lower_constant(NAType) def constant_na(context, builder, ty, pyval): @@ -154,9 +153,8 @@ def register_const_op(op): to_lower_op = make_const_op(op) cuda_lower(op, MaskedType, types.Number)(to_lower_op) cuda_lower(op, types.Number, MaskedType)(to_lower_op) - - # to_lower_op_reflected = make_reflected_const_op(op) - # cuda_lower(op, types.Number, MaskedType)(to_lower_op_reflected) + cuda_lower(op, MaskedType, types.Boolean)(to_lower_op) + cuda_lower(op, types.Boolean, MaskedType)(to_lower_op) # register all lowering at init @@ -194,6 +192,24 @@ def masked_scalar_is_null_impl(context, builder, sig, args): return builder.load(result) +# Main kernel always calls `pack_return` on whatever the user defined +# function returned. 
This returns the same data if its already a `Masked` +# else packs it up into a new one that is valid from the get go +@cuda_lower(api.pack_return, MaskedType) +def pack_return_masked_impl(context, builder, sig, args): + return args[0] + + +@cuda_lower(api.pack_return, types.Boolean) +@cuda_lower(api.pack_return, types.Number) +def pack_return_scalar_impl(context, builder, sig, args): + outdata = cgutils.create_struct_proxy(sig.return_type)(context, builder) + outdata.value = args[0] + outdata.valid = context.get_constant(types.boolean, 1) + + return outdata._getvalue() + + @cuda_lower(operator.truth, MaskedType) def masked_scalar_truth_impl(context, builder, sig, args): indata = cgutils.create_struct_proxy(MaskedType(types.boolean))( @@ -253,7 +269,7 @@ def cast_masked_to_masked(context, builder, fromty, toty, val): # Masked constructor for use in a kernel for testing -@lower_builtin(classes.Masked, types.Number, types.boolean) +@lower_builtin(api.Masked, types.Number, types.boolean) def masked_constructor(context, builder, sig, args): ty = sig.return_type value, valid = args diff --git a/python/cudf/cudf/core/udf/pipeline.py b/python/cudf/cudf/core/udf/pipeline.py index c7b8be92c00..7f3aa7baa93 100644 --- a/python/cudf/cudf/core/udf/pipeline.py +++ b/python/cudf/cudf/core/udf/pipeline.py @@ -1,28 +1,40 @@ +import cachetools +import numpy as np +from numba import cuda from numba.np import numpy_support +from numba.types import Tuple, boolean, int64, void from nvtx import annotate +from cudf.core.udf.api import Masked, pack_return from cudf.core.udf.typing import MaskedType from cudf.utils import cudautils +libcudf_bitmask_type = numpy_support.from_dtype(np.dtype("int32")) +MASK_BITSIZE = np.dtype("int32").itemsize * 8 +precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) + @annotate("NUMBA JIT", color="green", domain="cudf_python") -def compile_masked_udf(func, dtypes): +def get_udf_return_type(func, dtypes): """ - Generate an inlineable PTX function 
that will be injected into - a variadic kernel inside libcudf - - assume all input types are `MaskedType(input_col.dtype)` and then - compile the requestied PTX function as a function over those types + Get the return type of a masked UDF for a given set of argument dtypes. It + is assumed that a `MaskedType(dtype)` is passed to the function for each + input dtype. """ to_compiler_sig = tuple( MaskedType(arg) for arg in (numpy_support.from_dtype(np_type) for np_type in dtypes) ) - # Get the inlineable PTX function - ptx, numba_output_type = cudautils.compile_udf(func, to_compiler_sig) - numpy_output_type = numpy_support.as_dtype(numba_output_type.value_type) + # Get the return type. The PTX is also returned by compile_udf, but is not + # needed here. + ptx, output_type = cudautils.compile_udf(func, to_compiler_sig) + + if not isinstance(output_type, MaskedType): + numba_output_type = numpy_support.from_dtype(np.dtype(output_type)) + else: + numba_output_type = output_type - return numpy_output_type, ptx + return numba_output_type def nulludf(func): @@ -50,3 +62,159 @@ def wrapper(*args): return to_udf_table._apply(func) return wrapper + + +def masked_array_type_from_col(col): + """ + Return a type representing a tuple of arrays, + the first element an array of the numba type + corresponding to `dtype`, and the second an + array of bools representing a mask. 
+ """ + nb_scalar_ty = numpy_support.from_dtype(col.dtype) + if col.mask is None: + return nb_scalar_ty[::1] + else: + return Tuple((nb_scalar_ty[::1], libcudf_bitmask_type[::1])) + + +def construct_signature(df, return_type): + """ + Build the signature of numba types that will be used to + actually JIT the kernel itself later, accounting for types + and offsets + """ + + # Tuple of arrays, first the output data array, then the mask + return_type = Tuple((return_type[::1], boolean[::1])) + offsets = [] + sig = [return_type] + for col in df._data.values(): + sig.append(masked_array_type_from_col(col)) + offsets.append(int64) + + # return_type + data,masks + offsets + size + sig = void(*(sig + offsets + [int64])) + + return sig + + +@cuda.jit(device=True) +def mask_get(mask, pos): + return (mask[pos // MASK_BITSIZE] >> (pos % MASK_BITSIZE)) & 1 + + +kernel_template = """\ +def _kernel(retval, {input_columns}, {input_offsets}, size): + i = cuda.grid(1) + ret_data_arr, ret_mask_arr = retval + if i < size: +{masked_input_initializers} + ret = {user_udf_call} + ret_masked = pack_return(ret) + ret_data_arr[i] = ret_masked.value + ret_mask_arr[i] = ret_masked.valid +""" + +unmasked_input_initializer_template = """\ + d_{idx} = input_col_{idx} + masked_{idx} = Masked(d_{idx}[i], True) +""" + +masked_input_initializer_template = """\ + d_{idx}, m_{idx} = input_col_{idx} + masked_{idx} = Masked(d_{idx}[i], mask_get(m_{idx}, i + offset_{idx})) +""" + + +def _define_function(df, scalar_return=False): + # Create argument list for kernel + input_columns = ", ".join([f"input_col_{i}" for i in range(len(df._data))]) + input_offsets = ", ".join([f"offset_{i}" for i in range(len(df._data))]) + + # Create argument list to pass to device function + args = ", ".join([f"masked_{i}" for i in range(len(df._data))]) + user_udf_call = f"f_({args})" + + # Generate the initializers for each device function argument + initializers = [] + for i, col in enumerate(df._data.values()): + idx = 
str(i) + if col.mask is not None: + template = masked_input_initializer_template + else: + template = unmasked_input_initializer_template + + initializer = template.format(idx=idx) + + initializers.append(initializer) + + masked_input_initializers = "\n".join(initializers) + + # Incorporate all of the above into the kernel code template + d = { + "input_columns": input_columns, + "input_offsets": input_offsets, + "masked_input_initializers": masked_input_initializers, + "user_udf_call": user_udf_call, + } + + return kernel_template.format(**d) + + +@annotate("UDF COMPILATION", color="darkgreen", domain="cudf_python") +def compile_or_get(df, f): + """ + Return a compiled kernel in terms of MaskedTypes that launches a + kernel equivalent of `f` for the dtypes of `df`. The kernel uses + a thread for each row and calls `f` using that rows data / mask + to produce an output value and output valdity for each row. + + If the UDF has already been compiled for this requested dtypes, + a cached version will be returned instead of running compilation. 
+ + """ + + # check to see if we already compiled this function + frame_dtypes = tuple(col.dtype for col in df._data.values()) + cache_key = ( + *cudautils.make_cache_key(f, frame_dtypes), + *(col.mask is None for col in df._data.values()), + ) + if precompiled.get(cache_key) is not None: + kernel, scalar_return_type = precompiled[cache_key] + return kernel, scalar_return_type + + numba_return_type = get_udf_return_type(f, frame_dtypes) + + _is_scalar_return = not isinstance(numba_return_type, MaskedType) + scalar_return_type = ( + numba_return_type + if _is_scalar_return + else numba_return_type.value_type + ) + + sig = construct_signature(df, scalar_return_type) + f_ = cuda.jit(device=True)(f) + + # Dict of 'local' variables into which `_kernel` is defined + local_exec_context = {} + global_exec_context = { + "f_": f_, + "cuda": cuda, + "Masked": Masked, + "mask_get": mask_get, + "pack_return": pack_return, + } + exec( + _define_function(df, scalar_return=_is_scalar_return), + global_exec_context, + local_exec_context, + ) + # The python function definition representing the kernel + _kernel = local_exec_context["_kernel"] + kernel = cuda.jit(sig)(_kernel) + scalar_return_type = numpy_support.as_dtype(scalar_return_type) + precompiled[cache_key] = (kernel, scalar_return_type) + + return kernel, scalar_return_type diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py index 6e026412f24..042d97db838 100644 --- a/python/cudf/cudf/core/udf/typing.py +++ b/python/cudf/cudf/core/udf/typing.py @@ -17,8 +17,8 @@ from numba.cuda.cudadecl import registry as cuda_decl_registry from pandas._libs.missing import NAType as _NAType -from . import classes -from ._ops import arith_ops, comparison_ops +from cudf.core.udf import api +from cudf.core.udf._ops import arith_ops, comparison_ops class MaskedType(types.Type): @@ -101,7 +101,7 @@ def __eq__(self, other): # For typing a Masked constant value defined outside a kernel (e.g. 
captured in # a closure). -@typeof_impl.register(classes.Masked) +@typeof_impl.register(api.Masked) def typeof_masked(val, c): return MaskedType(typeof(val.value)) @@ -110,7 +110,7 @@ def typeof_masked(val, c): # type in a kernel. @cuda_decl_registry.register class MaskedConstructor(ConcreteTemplate): - key = classes.Masked + key = api.Masked cases = [ nb_signature(MaskedType(t), t, types.boolean) @@ -123,20 +123,20 @@ class MaskedConstructor(ConcreteTemplate): make_attribute_wrapper(MaskedType, "valid", "valid") -# Typing for `classes.Masked` +# Typing for `api.Masked` @cuda_decl_registry.register_attr class ClassesTemplate(AttributeTemplate): - key = types.Module(classes) + key = types.Module(api) def resolve_Masked(self, mod): return types.Function(MaskedConstructor) -# Registration of the global is also needed for Numba to type classes.Masked -cuda_decl_registry.register_global(classes, types.Module(classes)) -# For typing bare Masked (as in `from .classes import Masked` +# Registration of the global is also needed for Numba to type api.Masked +cuda_decl_registry.register_global(api, types.Module(api)) +# For typing bare Masked (as in `from .api import Masked` cuda_decl_registry.register_global( - classes.Masked, types.Function(MaskedConstructor) + api.Masked, types.Function(MaskedConstructor) ) @@ -247,10 +247,10 @@ def generic(self, args, kws): # In the case of op(Masked, scalar), we resolve the type between # the Masked value_type and the scalar's type directly if isinstance(args[0], MaskedType) and isinstance( - args[1], types.Number + args[1], (types.Number, types.Boolean) ): to_resolve_types = (args[0].value_type, args[1]) - elif isinstance(args[0], types.Number) and isinstance( + elif isinstance(args[0], (types.Number, types.Boolean)) and isinstance( args[1], MaskedType ): to_resolve_types = (args[1].value_type, args[0]) @@ -287,6 +287,23 @@ def generic(self, args, kws): return nb_signature(types.boolean, MaskedType(types.boolean)) 
+@cuda_decl_registry.register_global(api.pack_return) +class UnpackReturnToMasked(AbstractTemplate): + """ + Turn a returned MaskedType into its value and validity + or turn a scalar into the tuple (scalar, True). + """ + + def generic(self, args, kws): + if isinstance(args[0], MaskedType): + # MaskedType(dtype, valid) -> MaskedType(dtype, valid) + return nb_signature(args[0], args[0]) + elif isinstance(args[0], (types.Number, types.Boolean)): + # scalar_type -> MaskedType(scalar_type, True) + return_type = MaskedType(args[0]) + return nb_signature(return_type, args[0]) + + for op in arith_ops + comparison_ops: # Every op shares the same typing class cuda_decl_registry.register_global(op)(MaskedScalarArithOp) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index d5e9142934b..9562fca7399 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -222,8 +222,8 @@ def assert_column_equal( left = left.astype(left.categories.dtype) right = right.astype(right.categories.dtype) if not columns_equal: - msg1 = f"{left.to_array()}" - msg2 = f"{right.to_array()}" + msg1 = f"{left.values_host}" + msg2 = f"{right.values_host}" try: diff = left.apply_boolean_mask(left != right).size diff = diff * 100.0 / left.size diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py index fa3c88a3551..925c9ef720c 100644 --- a/python/cudf/cudf/tests/test_applymap.py +++ b/python/cudf/cudf/tests/test_applymap.py @@ -43,7 +43,7 @@ def test_applymap_round(nelem, masked): # Check expect = np.round(data) - got = out.to_array() + got = out.to_numpy() np.testing.assert_array_almost_equal(expect, got) @@ -58,5 +58,5 @@ def test_applymap_change_out_dtype(): # Check expect = np.array(data, dtype=float) - got = out.to_array() + got = out.to_numpy() np.testing.assert_array_equal(expect, got) diff --git a/python/cudf/cudf/tests/test_array_function.py 
b/python/cudf/cudf/tests/test_array_function.py index ecd13b57ca4..29654fb9556 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -34,7 +34,7 @@ def test_array_func_cudf_series(np_ar, func): if np.isscalar(expect): assert_eq(expect, got) else: - assert_eq(expect, got.to_array()) + assert_eq(expect, got.to_numpy()) @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 8cfcf4d2b6d..3fe0321ec54 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -31,7 +31,7 @@ def test_ufunc_cudf_non_nullseries(np_ar_tup, func): s_1, s_2 = cudf.Series(x), cudf.Series(y) expect = func(x, y) got = func(s_1, s_2) - assert_eq(expect, got.to_array()) + assert_eq(expect, got.to_numpy()) @pytest.mark.parametrize( @@ -45,7 +45,7 @@ def test_ufunc_cudf_series_bitwise(func): s_1, s_2 = cudf.Series(x), cudf.Series(y) expect = func(x, y) got = func(s_1, s_2) - assert_eq(expect, got.to_array()) + assert_eq(expect, got.to_numpy()) @pytest.mark.parametrize( @@ -67,16 +67,16 @@ def test_ufunc_cudf_null_series(np_ar_tup, func): s_1, s_2 = cudf.Series(x), cudf.Series(y) expect = func(x, y) got = func(s_1, s_2) - assert_eq(expect, got.fillna(np.nan).to_array()) + assert_eq(expect, got.fillna(np.nan).to_numpy()) scalar = 0.5 expect = func(x, scalar) got = func(s_1, scalar) - assert_eq(expect, got.fillna(np.nan).to_array()) + assert_eq(expect, got.fillna(np.nan).to_numpy()) expect = func(scalar, x) got = func(scalar, s_1) - assert_eq(expect, got.fillna(np.nan).to_array()) + assert_eq(expect, got.fillna(np.nan).to_numpy()) @pytest.mark.xfail( @@ -93,16 +93,16 @@ def test_ufunc_cudf_null_series_comparison_ops(np_ar_tup, func): s_1, s_2 = cudf.Series(x), cudf.Series(y) expect = func(x, y) got = func(s_1, s_2) - assert_eq(expect, got.fillna(np.nan).to_array()) + 
assert_eq(expect, got.fillna(np.nan).to_numpy()) scalar = 0.5 expect = func(x, scalar) got = func(s_1, scalar) - assert_eq(expect, got.fillna(np.nan).to_array()) + assert_eq(expect, got.fillna(np.nan).to_numpy()) expect = func(scalar, x) got = func(scalar, s_1) - assert_eq(expect, got.fillna(np.nan).to_array()) + assert_eq(expect, got.fillna(np.nan).to_numpy()) @pytest.mark.parametrize( @@ -115,7 +115,7 @@ def test_ufunc_cudf_series_cupy_array(np_ar_tup, func): cudf_s = cudf.Series(x) cupy_ar = cp.array(y) got = func(cudf_s, cupy_ar) - assert_eq(expect, got.to_array()) + assert_eq(expect, got.to_numpy()) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index f8063408e28..50fd27f2752 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -80,7 +80,7 @@ def func(index): result = binop(sr.astype("int32"), sr) expect = binop(arr.astype("int32"), arr) - np.testing.assert_almost_equal(result.to_array(), expect, decimal=5) + np.testing.assert_almost_equal(result.to_numpy(), expect, decimal=5) from concurrent.futures import ThreadPoolExecutor @@ -108,7 +108,7 @@ def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar): if obj_class == "Index": result = Series(result) - np.testing.assert_almost_equal(result.to_array(), binop(arr, rhs)) + np.testing.assert_almost_equal(result.to_numpy(), binop(arr, rhs)) _bitwise_binops = [operator.and_, operator.or_, operator.xor] @@ -146,7 +146,7 @@ def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype): if obj_class == "Index": result = Series(result) - np.testing.assert_almost_equal(result.to_array(), binop(arr1, arr2)) + np.testing.assert_almost_equal(result.to_numpy(), binop(arr1, arr2)) _logical_binops = [ @@ -211,9 +211,9 @@ def test_series_compare(cmpop, obj_class, dtype): result2 = Series(result2) result3 = Series(result3) - np.testing.assert_equal(result1.to_array(), cmpop(arr1, arr1)) - 
np.testing.assert_equal(result2.to_array(), cmpop(arr2, arr2)) - np.testing.assert_equal(result3.to_array(), cmpop(arr1, arr2)) + np.testing.assert_equal(result1.to_numpy(), cmpop(arr1, arr1)) + np.testing.assert_equal(result2.to_numpy(), cmpop(arr2, arr2)) + np.testing.assert_equal(result3.to_numpy(), cmpop(arr1, arr2)) def _series_compare_nulls_typegen(): @@ -298,8 +298,8 @@ def test_series_compare_scalar( result1 = Series(result1) result2 = Series(result2) - np.testing.assert_equal(result1.to_array(), cmpop(arr1, rhs)) - np.testing.assert_equal(result2.to_array(), cmpop(rhs, arr1)) + np.testing.assert_equal(result1.to_numpy(), cmpop(arr1, rhs)) + np.testing.assert_equal(result2.to_numpy(), cmpop(rhs, arr1)) _nulls = ["none", "some"] @@ -347,7 +347,7 @@ def test_validity_add(nelem, lhs_nulls, rhs_nulls): )[:nelem] # Fill NA values na_value = -10000 - got = res.fillna(na_value).to_array() + got = res.fillna(na_value).to_numpy() expect = lhs_data + rhs_data if lhs_nulls == "some" or rhs_nulls == "some": expect[~res_mask] = na_value @@ -383,7 +383,7 @@ def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class): if obj_class == "Index": result = Series(result) - np.testing.assert_almost_equal(result.to_array(), binop(lhs, rhs)) + np.testing.assert_almost_equal(result.to_numpy(), binop(lhs, rhs)) @pytest.mark.parametrize("obj_class", ["Series", "Index"]) @@ -408,7 +408,7 @@ def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class): if obj_class == "Index": result = Series(result) - np.testing.assert_array_equal(result.to_array(), cmpop(lhs, rhs)) + np.testing.assert_array_equal(result.to_numpy(), cmpop(lhs, rhs)) _reflected_ops = [ @@ -468,7 +468,7 @@ def test_reflected_ops_scalar(func, dtype, obj_class): ps_result = func(random_series) # verify - np.testing.assert_allclose(ps_result, gs_result.to_array()) + np.testing.assert_allclose(ps_result, gs_result.to_numpy()) _cudf_scalar_reflected_ops = [ @@ -536,7 +536,7 @@ def 
test_reflected_ops_cudf_scalar(funcs, dtype, obj_class): ps_result = cpu_func(random_series) # verify - np.testing.assert_allclose(ps_result, gs_result.to_array()) + np.testing.assert_allclose(ps_result, gs_result.to_numpy()) @pytest.mark.parametrize("binop", _binops) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 8d6c551761d..bc3ae721554 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -36,7 +36,7 @@ def test_categorical_basic(): assert pdsr.cat.ordered == sr.cat.ordered np.testing.assert_array_equal( - pdsr.cat.codes.values, sr.cat.codes.to_array() + pdsr.cat.codes.values, sr.cat.codes.to_numpy() ) string = str(sr) @@ -48,7 +48,7 @@ def test_categorical_basic(): t a """ assert all(x == y for x, y in zip(string.split(), expect_str.split())) - assert_eq(cat.codes, cudf_cat.codes.to_array()) + assert_eq(cat.codes, cudf_cat.codes.to_numpy()) def test_categorical_integer(): @@ -58,13 +58,13 @@ def test_categorical_integer(): pdsr = pd.Series(cat) sr = cudf.Series(cat) np.testing.assert_array_equal( - cat.codes, sr.cat.codes.astype(cat.codes.dtype).fillna(-1).to_array() + cat.codes, sr.cat.codes.astype(cat.codes.dtype).fillna(-1).to_numpy() ) assert sr.null_count == 2 np.testing.assert_array_equal( pdsr.cat.codes.values, - sr.cat.codes.astype(pdsr.cat.codes.dtype).fillna(-1).to_array(), + sr.cat.codes.astype(pdsr.cat.codes.dtype).fillna(-1).to_numpy(), ) string = str(sr) @@ -90,12 +90,12 @@ def test_categorical_compare_unordered(): out = sr == sr assert out.dtype == np.bool_ assert type(out[0]) == np.bool_ - assert np.all(out.to_array()) + assert np.all(out.to_numpy()) assert np.all(pdsr == pdsr) # test inequality out = sr != sr - assert not np.any(out.to_array()) + assert not np.any(out.to_numpy()) assert not np.any(pdsr != pdsr) assert not pdsr.cat.ordered @@ -126,20 +126,20 @@ def test_categorical_compare_ordered(): out = sr1 == sr1 assert out.dtype 
== np.bool_ assert type(out[0]) == np.bool_ - assert np.all(out.to_array()) + assert np.all(out.to_numpy()) assert np.all(pdsr1 == pdsr1) # test inequality out = sr1 != sr1 - assert not np.any(out.to_array()) + assert not np.any(out.to_numpy()) assert not np.any(pdsr1 != pdsr1) assert pdsr1.cat.ordered assert sr1.cat.ordered # test using ordered operators - np.testing.assert_array_equal(pdsr1 < pdsr2, (sr1 < sr2).to_array()) - np.testing.assert_array_equal(pdsr1 > pdsr2, (sr1 > sr2).to_array()) + np.testing.assert_array_equal(pdsr1 < pdsr2, (sr1 < sr2).to_numpy()) + np.testing.assert_array_equal(pdsr1 > pdsr2, (sr1 > sr2).to_numpy()) def test_categorical_binary_add(): @@ -198,7 +198,7 @@ def test_categorical_masking(): got_matches = sr == "a" np.testing.assert_array_equal( - expect_matches.values, got_matches.to_array() + expect_matches.values, got_matches.to_numpy() ) # mask series @@ -320,14 +320,14 @@ def test_categorical_empty(): cat = pd.Categorical([]) pdsr = pd.Series(cat) sr = cudf.Series(cat) - np.testing.assert_array_equal(cat.codes, sr.cat.codes.to_array()) + np.testing.assert_array_equal(cat.codes, sr.cat.codes.to_numpy()) # Test attributes assert_eq(pdsr.cat.categories, sr.cat.categories) assert pdsr.cat.ordered == sr.cat.ordered np.testing.assert_array_equal( - pdsr.cat.codes.values, sr.cat.codes.to_array() + pdsr.cat.codes.values, sr.cat.codes.to_numpy() ) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index b82f736fe89..d2c7c073aa1 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -102,7 +102,7 @@ def column_slicing_test(col, offset, size, cast_to_float=False): sliced_series.reset_index(drop=True), ) else: - assert_eq(np.asarray(pd_series[sl]), sliced_series.to_array()) + assert_eq(np.asarray(pd_series[sl]), sliced_series.to_numpy()) @pytest.mark.parametrize("offset", [0, 1, 15]) @@ -161,11 +161,17 @@ def test_as_column_scalar_with_nan(nan_as_null): size = 10 
scalar = np.nan - expected = cudf.Series([np.nan] * size, nan_as_null=nan_as_null).to_array() + expected = ( + cudf.Series([np.nan] * size, nan_as_null=nan_as_null) + .dropna() + .to_numpy() + ) - got = cudf.Series( - as_column(scalar, length=size, nan_as_null=nan_as_null) - ).to_array() + got = ( + cudf.Series(as_column(scalar, length=size, nan_as_null=nan_as_null)) + .dropna() + .to_numpy() + ) np.testing.assert_equal(expected, got) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 2eb59616253..0b8b6dd565f 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -393,9 +393,9 @@ def test_csv_reader_negative_vals(tmpdir): df = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1) - np.testing.assert_allclose(zero, df["0"].to_array()) - np.testing.assert_allclose(one, df["1"].to_array()) - np.testing.assert_allclose(two, df["2"].to_array()) + np.testing.assert_allclose(zero, df["0"].to_numpy()) + np.testing.assert_allclose(one, df["1"].to_numpy()) + np.testing.assert_allclose(two, df["2"].to_numpy()) def test_csv_reader_strings(tmpdir): @@ -483,7 +483,7 @@ def test_csv_reader_mangle_dupe_cols(tmpdir): cu_df = read_csv(StringIO(buffer), mangle_dupe_cols=False) # check that the dupe columns were removed assert len(cu_df.columns) == 3 - np.testing.assert_array_equal(cu_df["abc"].to_array(), [1]) + np.testing.assert_array_equal(cu_df["abc"].to_numpy(), [1]) def test_csv_reader_float_decimal(tmpdir): @@ -516,11 +516,11 @@ def test_csv_reader_float_decimal(tmpdir): decimal=",", ) - np.testing.assert_allclose(basic_32_ref, df["basic_32"].to_array()) - np.testing.assert_allclose(basic_64_ref, df["basic_64"].to_array()) - np.testing.assert_allclose(round_ref, df["round"].to_array()) - np.testing.assert_allclose(decimal_only_ref, df["decimal_only"].to_array()) - np.testing.assert_allclose(precision_ref, df["precision"].to_array()) + np.testing.assert_allclose(basic_32_ref, 
df["basic_32"].to_numpy()) + np.testing.assert_allclose(basic_64_ref, df["basic_64"].to_numpy()) + np.testing.assert_allclose(round_ref, df["round"].to_numpy()) + np.testing.assert_allclose(decimal_only_ref, df["decimal_only"].to_numpy()) + np.testing.assert_allclose(precision_ref, df["precision"].to_numpy()) def test_csv_reader_NaN_values(): @@ -620,12 +620,12 @@ def test_csv_reader_thousands(tmpdir): str(fname), names=names, dtype=dtypes, skiprows=1, thousands="'" ) - np.testing.assert_allclose(f32_ref, df["float32"].to_array()) - np.testing.assert_allclose(f64_ref, df["float64"].to_array()) - np.testing.assert_allclose(int32_ref, df["int32"].to_array()) - np.testing.assert_allclose(int64_ref, df["int64"].to_array()) - np.testing.assert_allclose(uint32_ref, df["uint32"].to_array()) - np.testing.assert_allclose(uint64_ref, df["uint64"].to_array()) + np.testing.assert_allclose(f32_ref, df["float32"].to_numpy()) + np.testing.assert_allclose(f64_ref, df["float64"].to_numpy()) + np.testing.assert_allclose(int32_ref, df["int32"].to_numpy()) + np.testing.assert_allclose(int64_ref, df["int64"].to_numpy()) + np.testing.assert_allclose(uint32_ref, df["uint32"].to_numpy()) + np.testing.assert_allclose(uint64_ref, df["uint64"].to_numpy()) def test_csv_reader_buffer_strings(): @@ -768,10 +768,10 @@ def test_csv_quotednumbers(tmpdir): df2 = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1) assert len(df2.columns) == 2 - np.testing.assert_allclose(integer_ref, df1["integer"].to_array()) - np.testing.assert_allclose(decimal_ref, df1["decimal"].to_array()) - np.testing.assert_allclose(integer_ref, df2["integer"].to_array()) - np.testing.assert_allclose(decimal_ref, df2["decimal"].to_array()) + np.testing.assert_allclose(integer_ref, df1["integer"].to_numpy()) + np.testing.assert_allclose(decimal_ref, df1["decimal"].to_numpy()) + np.testing.assert_allclose(integer_ref, df2["integer"].to_numpy()) + np.testing.assert_allclose(decimal_ref, df2["decimal"].to_numpy()) def 
test_csv_reader_nrows(tmpdir): @@ -1033,8 +1033,8 @@ def test_csv_reader_tabs(): "1970-12-12T00:00:00.000000000", "2018-06-15T00:00:00.000000000", ] - np.testing.assert_allclose(floats, df["float_point"].to_array()) - np.testing.assert_allclose(ints, df["integer"].to_array()) + np.testing.assert_allclose(floats, df["float_point"].to_numpy()) + np.testing.assert_allclose(ints, df["integer"].to_numpy()) for row in range(4): assert str(df["date"][row]) == dates[row] @@ -1280,7 +1280,7 @@ def test_csv_reader_bools_false_positives(tmpdir): df = read_csv(StringIO(buffer), header=None, dtype=["int32"]) - np.testing.assert_array_equal(items, df["0"].to_array()) + np.testing.assert_array_equal(items, df["0"].to_numpy()) def test_csv_reader_aligned_byte_range(tmpdir): @@ -1312,7 +1312,7 @@ def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype): pdf = pd.DataFrame(data=values, dtype=pdf_dtype, columns=["hex_int"]) gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"]) np.testing.assert_array_equal( - pdf["hex_int"], gdf["hex_int"].to_array() + pdf["hex_int"], gdf["hex_int"].to_numpy() ) else: # otherwise, dtype inference returns as object (string) diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py index 2604030097b..a00dbbba5f0 100644 --- a/python/cudf/cudf/tests/test_cuda_apply.py +++ b/python/cudf/cudf/tests/test_cuda_apply.py @@ -37,8 +37,8 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): kwargs=dict(extra1=extra1, extra2=extra2), ) - got_out1 = outdf["out1"].to_array() - got_out2 = outdf["out2"].to_array() + got_out1 = outdf["out1"].to_numpy() + got_out2 = outdf["out2"].to_numpy() np.testing.assert_array_almost_equal(got_out1, expect_out1) np.testing.assert_array_almost_equal(got_out2, expect_out2) @@ -74,8 +74,8 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): got_out1 = outdf["out1"] got_out2 = outdf["out2"] - np.testing.assert_array_almost_equal(got_out1.to_array(), expect_out1) - 
np.testing.assert_array_almost_equal(got_out2.to_array(), expect_out2) + np.testing.assert_array_almost_equal(got_out1.to_numpy(), expect_out1) + np.testing.assert_array_almost_equal(got_out2.to_numpy(), expect_out2) @pytest.mark.parametrize("nelem", [1, 15, 30, 64, 128, 129]) @@ -112,8 +112,8 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): got_out1 = outdf["out1"] got_out2 = outdf["out2"] - np.testing.assert_array_almost_equal(got_out1.to_array(), expect_out1) - np.testing.assert_array_almost_equal(got_out2.to_array(), expect_out2) + np.testing.assert_array_almost_equal(got_out1.to_numpy(), expect_out1) + np.testing.assert_array_almost_equal(got_out2.to_numpy(), expect_out2) @pytest.mark.parametrize("nelem", [1, 15, 30, 64, 128, 129]) @@ -158,8 +158,8 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): got_out1 = outdf["out1"] got_out2 = outdf["out2"] - np.testing.assert_array_almost_equal(got_out1.to_array(), expect_out1) - np.testing.assert_array_almost_equal(got_out2.to_array(), expect_out2) + np.testing.assert_array_almost_equal(got_out1.to_numpy(), expect_out1) + np.testing.assert_array_almost_equal(got_out2.to_numpy(), expect_out2) @pytest.mark.parametrize("nelem", [1, 2, 64, 128, 1000, 5000]) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 376fc3e6b88..5a839507182 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -201,7 +201,7 @@ def test_series_basic(): a1 = np.arange(10, dtype=np.float64) series = cudf.Series(a1) assert len(series) == 10 - np.testing.assert_equal(series.to_array(), np.hstack([a1])) + np.testing.assert_equal(series.to_numpy(), np.hstack([a1])) def test_series_from_cupy_scalars(): @@ -265,13 +265,13 @@ def test_dataframe_basic(): # Populate with cuda memory df["keys"] = np.arange(10, dtype=np.float64) - np.testing.assert_equal(df["keys"].to_array(), np.arange(10)) + np.testing.assert_equal(df["keys"].to_numpy(), 
np.arange(10)) assert len(df) == 10 # Populate with numpy array rnd_vals = np.random.random(10) df["vals"] = rnd_vals - np.testing.assert_equal(df["vals"].to_array(), rnd_vals) + np.testing.assert_equal(df["vals"].to_numpy(), rnd_vals) assert len(df) == 10 assert tuple(df.columns) == ("keys", "vals") @@ -287,11 +287,11 @@ def test_dataframe_basic(): hkeys = np.asarray(np.arange(10, dtype=np.float64).tolist() + [123]) hvals = np.asarray(rnd_vals.tolist() + [321]) - np.testing.assert_equal(df["keys"].to_array(), hkeys) - np.testing.assert_equal(df["vals"].to_array(), hvals) + np.testing.assert_equal(df["keys"].to_numpy(), hkeys) + np.testing.assert_equal(df["vals"].to_numpy(), hvals) # As matrix - mat = df.as_matrix() + mat = df.values_host expect = np.vstack([hkeys, hvals]).T @@ -301,7 +301,7 @@ def test_dataframe_basic(): df_tup = cudf.DataFrame() data = np.arange(10) df_tup[(1, "foobar")] = data - np.testing.assert_equal(data, df_tup[(1, "foobar")].to_array()) + np.testing.assert_equal(data, df_tup[(1, "foobar")].to_numpy()) df = cudf.DataFrame(pd.DataFrame({"a": [1, 2, 3], "c": ["a", "b", "c"]})) pdf = pd.DataFrame(pd.DataFrame({"a": [1, 2, 3], "c": ["a", "b", "c"]})) @@ -715,7 +715,7 @@ def test_dataframe_astype(nelem): assert df["a"].dtype is np.dtype(np.int32) df["b"] = df["a"].astype(np.float32) assert df["b"].dtype is np.dtype(np.float32) - np.testing.assert_equal(df["a"].to_array(), df["b"].to_array()) + np.testing.assert_equal(df["a"].to_numpy(), df["b"].to_numpy()) def test_astype_dict(): @@ -738,12 +738,12 @@ def test_index_astype(nelem): df.index = df.index.astype(np.float32) assert df.index.dtype is np.dtype(np.float32) df["a"] = df["a"].astype(np.float32) - np.testing.assert_equal(df.index.to_array(), df["a"].to_array()) + np.testing.assert_equal(df.index.to_numpy(), df["a"].to_numpy()) df["b"] = df["a"] df = df.set_index("b") df["a"] = df["a"].astype(np.int16) df.index = df.index.astype(np.int16) - np.testing.assert_equal(df.index.to_array(), 
df["a"].to_array()) + np.testing.assert_equal(df.index.to_numpy(), df["a"].to_numpy()) def test_dataframe_to_string(): @@ -786,9 +786,9 @@ def test_dataframe_to_string(): # check data values = masked.copy() validids = [0, 2, 3, 5] - densearray = masked.to_array() + densearray = masked.dropna().to_numpy() np.testing.assert_equal(data[validids], densearray) - # valid position is corret + # valid position is correct for i in validids: assert data[i] == values[i] @@ -941,12 +941,13 @@ def test_dataframe_dir_and_getattr(): df.not_a_column -@pytest.mark.parametrize("order", ["C", "F"]) -def test_empty_dataframe_as_gpu_matrix(order): +def test_empty_dataframe_to_array(): df = cudf.DataFrame() # Check fully empty dataframe. - mat = df.as_gpu_matrix(order=order).copy_to_host() + mat = df.to_cupy() + assert mat.shape == (0, 0) + mat = df.to_numpy() assert mat.shape == (0, 0) df = cudf.DataFrame() @@ -955,12 +956,11 @@ def test_empty_dataframe_as_gpu_matrix(order): df[k] = np.random.random(nelem) # Check all columns in empty dataframe. 
- mat = df.head(0).as_gpu_matrix(order=order).copy_to_host() + mat = df.head(0).to_cupy() assert mat.shape == (0, 3) -@pytest.mark.parametrize("order", ["C", "F"]) -def test_dataframe_as_gpu_matrix(order): +def test_dataframe_to_cupy(): df = cudf.DataFrame() nelem = 123 @@ -968,20 +968,20 @@ def test_dataframe_as_gpu_matrix(order): df[k] = np.random.random(nelem) # Check all columns - mat = df.as_gpu_matrix(order=order).copy_to_host() + mat = df.to_numpy() assert mat.shape == (nelem, 4) for i, k in enumerate(df.columns): - np.testing.assert_array_equal(df[k].to_array(), mat[:, i]) + np.testing.assert_array_equal(df[k].to_numpy(), mat[:, i]) # Check column subset - mat = df.as_gpu_matrix(order=order, columns=["a", "c"]).copy_to_host() + mat = df[["a", "c"]].to_cupy().get() assert mat.shape == (nelem, 2) for i, k in enumerate("ac"): - np.testing.assert_array_equal(df[k].to_array(), mat[:, i]) + np.testing.assert_array_equal(df[k].to_numpy(), mat[:, i]) -def test_dataframe_as_gpu_matrix_null_values(): +def test_dataframe_to_cupy_null_values(): df = cudf.DataFrame() nelem = 123 @@ -999,14 +999,15 @@ def test_dataframe_as_gpu_matrix_null_values(): refvalues[k] = data # Check null value causes error - with pytest.raises(ValueError) as raises: - df.as_gpu_matrix() - raises.match("column 'a' has null values") + with pytest.raises(ValueError): + df.to_cupy() + with pytest.raises(ValueError): + df.to_numpy() for k in df.columns: df[k] = df[k].fillna(na) - mat = df.as_gpu_matrix().copy_to_host() + mat = df.to_numpy() for i, k in enumerate(df.columns): np.testing.assert_array_equal(refvalues[k], mat[:, i]) @@ -1082,7 +1083,7 @@ def test_dataframe_setitem_index_len1(): gdf["a"] = [1] gdf["b"] = gdf.index._values - np.testing.assert_equal(gdf.b.to_array(), [0]) + np.testing.assert_equal(gdf.b.to_numpy(), [0]) def test_empty_dataframe_setitem_df(): @@ -1098,7 +1099,7 @@ def test_assign(): assert list(gdf.columns) == ["x"] assert list(gdf2.columns) == ["x", "y"] - 
np.testing.assert_equal(gdf2.y.to_array(), [2, 3, 4]) + np.testing.assert_equal(gdf2.y.to_numpy(), [2, 3, 4]) @pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) @@ -1150,7 +1151,7 @@ def test_dataframe_hash_partition(nrows, nparts, nkeys): for p in got: if len(p): # Take rows of the keycolumns and build a set of the key-values - unique_keys = set(map(tuple, p.as_matrix(columns=keycols))) + unique_keys = set(map(tuple, p[keycols].values_host)) # Ensure that none of the key-values have occurred in other groups assert not (unique_keys & part_unique_keys) part_unique_keys |= unique_keys @@ -1593,7 +1594,7 @@ def test_from_arrow(nelem, data_type): # For some reason PyArrow to_pandas() converts to numpy array and has # better type compatibility - np.testing.assert_array_equal(s.to_pandas(), gs.to_array()) + np.testing.assert_array_equal(s.to_pandas(), gs.to_numpy()) @pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000]) @@ -1733,7 +1734,7 @@ def test_from_python_array(data_type): gs = cudf.Series(data) - np.testing.assert_equal(gs.to_array(), np_arr) + np.testing.assert_equal(gs.to_numpy(), np_arr) def test_series_shape(): @@ -2200,11 +2201,11 @@ def test_series_hash_encode(nrows): encoded_series = s.hash_encode(num_features) assert isinstance(encoded_series, cudf.Series) - enc_arr = encoded_series.to_array() + enc_arr = encoded_series.to_numpy() assert np.all(enc_arr >= 0) assert np.max(enc_arr) < num_features - enc_with_name_arr = s.hash_encode(num_features, use_name=True).to_array() + enc_with_name_arr = s.hash_encode(num_features, use_name=True).to_numpy() assert enc_with_name_arr[0] != enc_arr[0] @@ -2391,7 +2392,7 @@ def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): else: indices = s.digitize(bins, right) np.testing.assert_array_equal( - np.digitize(data, bins, right), indices.to_array() + np.digitize(data, bins, right), indices.to_numpy() ) @@ -3228,15 +3229,6 @@ def test_array_ufunc(): assert_eq(np.sqrt(gdf.x), np.sqrt(pdf.x)) 
-@pytest.mark.parametrize("nan_value", [-5, -5.0, 0, 5, 5.0, None, "pandas"]) -def test_series_to_gpu_array(nan_value): - - s = cudf.Series([0, 1, None, 3]) - np.testing.assert_array_equal( - s.to_array(nan_value), s.to_gpu_array(nan_value).copy_to_host() - ) - - def test_dataframe_describe_exclude(): np.random.seed(12) data_length = 10000 @@ -3474,8 +3466,6 @@ def test_dataframe_round(decimals): expected = pdf.round(pdecimals) assert_eq(result, expected) - for c in gdf.columns: - np.array_equal(gdf[c].nullmask.to_array(), result[c].to_array()) @pytest.mark.parametrize( @@ -4061,8 +4051,7 @@ def test_series_values_host_property(data): marks=pytest.mark.xfail(raises=NotImplementedError), ), pytest.param( - ["m", "a", "d", "v"], - marks=pytest.mark.xfail(raises=NotImplementedError), + ["m", "a", "d", "v"], marks=pytest.mark.xfail(raises=TypeError), ), ], ) @@ -5719,7 +5708,7 @@ def test_df_sr_mask_where(data, condition, other, error, inplace): expect_where.cat.codes, got_where.cat.codes.astype(expect_where.cat.codes.dtype) .fillna(-1) - .to_array(), + .to_numpy(), ) assert_eq(expect_where.cat.categories, got_where.cat.categories) @@ -5727,7 +5716,7 @@ def test_df_sr_mask_where(data, condition, other, error, inplace): expect_mask.cat.codes, got_mask.cat.codes.astype(expect_mask.cat.codes.dtype) .fillna(-1) - .to_array(), + .to_numpy(), ) assert_eq(expect_mask.cat.categories, got_mask.cat.categories) else: @@ -7275,9 +7264,8 @@ def test_cudf_arrow_array_error(): with pytest.raises( TypeError, - match="Implicit conversion to a host PyArrow Table via __arrow_array__" - " is not allowed, To explicitly construct a PyArrow Table, consider " - "using .to_arrow()", + match="Implicit conversion to a host PyArrow object via " + "__arrow_array__ is not allowed. 
Consider using .to_arrow()", ): df.__arrow_array__() @@ -7285,18 +7273,16 @@ def test_cudf_arrow_array_error(): with pytest.raises( TypeError, - match="Implicit conversion to a host PyArrow Array via __arrow_array__" - " is not allowed, To explicitly construct a PyArrow Array, consider " - "using .to_arrow()", + match="Implicit conversion to a host PyArrow object via " + "__arrow_array__ is not allowed. Consider using .to_arrow()", ): sr.__arrow_array__() sr = cudf.Series(["a", "b", "c"]) with pytest.raises( TypeError, - match="Implicit conversion to a host PyArrow Array via __arrow_array__" - " is not allowed, To explicitly construct a PyArrow Array, consider " - "using .to_arrow()", + match="Implicit conversion to a host PyArrow object via " + "__arrow_array__ is not allowed. Consider using .to_arrow()", ): sr.__arrow_array__() diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py index 5b258c760b3..1a9098c70db 100644 --- a/python/cudf/cudf/tests/test_dataframe_copy.py +++ b/python/cudf/cudf/tests/test_dataframe_copy.py @@ -41,7 +41,7 @@ def test_dataframe_deep_copy(copy_parameters): copy_gdf["b"] = [0, 0, 0] pdf_is_equal = np.array_equal(pdf["b"].values, copy_pdf["b"].values) gdf_is_equal = np.array_equal( - gdf["b"].to_array(), copy_gdf["b"].to_array() + gdf["b"].to_numpy(), copy_gdf["b"].to_numpy() ) assert pdf_is_equal == copy_parameters["expected_equality"] assert gdf_is_equal == copy_parameters["expected_equality"] @@ -67,7 +67,7 @@ def test_dataframe_deep_copy_and_insert(copy_parameters): copy_gdf["b"] = [0, 0, 0] pdf_is_equal = np.array_equal(pdf["b"].values, copy_pdf["b"].values) gdf_is_equal = np.array_equal( - gdf["b"].to_array(), copy_gdf["b"].to_array() + gdf["b"].to_numpy(), copy_gdf["b"].to_numpy() ) assert pdf_is_equal == copy_parameters["expected_equality"] assert gdf_is_equal == copy_parameters["expected_equality"] @@ -160,8 +160,6 @@ def test_kernel_deep_copy(): cdf = gdf.copy(deep=True) sr = 
gdf["b"] - # column.to_gpu_array calls to_dense_buffer which returns a copy - # need to access buffer directly and then call gpu_array add_one[1, len(sr)](sr._column.data_array_view) assert not gdf.to_string().split() == cdf.to_string().split() @@ -173,7 +171,7 @@ def test_kernel_shallow_copy(): gdf = DataFrame.from_pandas(pdf) cdf = gdf.copy(deep=False) sr = gdf["a"] - add_one[1, len(sr)](sr.to_gpu_array()) + add_one[1, len(sr)](sr.to_cupy()) assert_eq(gdf, cdf) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 6e5b3c39dc4..3bbac217283 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -140,24 +140,24 @@ def test_datetime_series_binops_numpy(lhs_dtype, rhs_dtype): gdf_data_2 = Series(pd_data_2).astype(rhs_dtype) np_data_1 = np.array(pd_data_1).astype(lhs_dtype) np_data_2 = np.array(pd_data_2).astype(rhs_dtype) - np.testing.assert_equal(np_data_1, gdf_data_1.to_array()) - np.testing.assert_equal(np_data_2, gdf_data_2.to_array()) + np.testing.assert_equal(np_data_1, gdf_data_1.to_numpy()) + np.testing.assert_equal(np_data_2, gdf_data_2.to_numpy()) np.testing.assert_equal( - np.less(np_data_1, np_data_2), (gdf_data_1 < gdf_data_2).to_array() + np.less(np_data_1, np_data_2), (gdf_data_1 < gdf_data_2).to_numpy() ) np.testing.assert_equal( - np.greater(np_data_1, np_data_2), (gdf_data_1 > gdf_data_2).to_array() + np.greater(np_data_1, np_data_2), (gdf_data_1 > gdf_data_2).to_numpy() ) np.testing.assert_equal( - np.equal(np_data_1, np_data_2), (gdf_data_1 == gdf_data_2).to_array() + np.equal(np_data_1, np_data_2), (gdf_data_1 == gdf_data_2).to_numpy() ) np.testing.assert_equal( np.less_equal(np_data_1, np_data_2), - (gdf_data_1 <= gdf_data_2).to_array(), + (gdf_data_1 <= gdf_data_2).to_numpy(), ) np.testing.assert_equal( np.greater_equal(np_data_1, np_data_2), - (gdf_data_1 >= gdf_data_2).to_array(), + (gdf_data_1 >= gdf_data_2).to_numpy(), ) @@ -268,7 +268,7 @@ def 
test_typecast_from_datetime(data, dtype): np_casted = np_data.astype(dtype) gdf_casted = gdf_data.astype(dtype) - np.testing.assert_equal(np_casted, gdf_casted.to_array()) + np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) @pytest.mark.parametrize("data", [data1(), data2()]) @@ -284,7 +284,7 @@ def test_typecast_from_datetime_to_int64_to_datetime(data, dtype): np_casted = np_data.astype(np.int64).astype(dtype) gdf_casted = gdf_data.astype(np.int64).astype(dtype) - np.testing.assert_equal(np_casted, gdf_casted.to_array()) + np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) @pytest.mark.parametrize("data", [timeseries_us_data()]) @@ -296,7 +296,7 @@ def test_typecast_to_different_datetime_resolutions(data, dtype): pd_data = pd.Series(data.copy()) np_data = np.array(pd_data).astype(dtype) gdf_series = Series(pd_data).astype(dtype) - np.testing.assert_equal(np_data, gdf_series.to_array()) + np.testing.assert_equal(np_data, gdf_series.to_numpy()) @pytest.mark.parametrize( @@ -331,7 +331,7 @@ def test_typecast_to_datetime(data, from_dtype, to_dtype): np_casted = np_data.astype(to_dtype) gdf_casted = gdf_data.astype(to_dtype) - np.testing.assert_equal(np_casted, gdf_casted.to_array()) + np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) @pytest.mark.parametrize("data", [numerical_data()]) @@ -347,7 +347,7 @@ def test_typecast_to_from_datetime(data, from_dtype, to_dtype): np_casted = np_data.astype(to_dtype).astype(from_dtype) gdf_casted = gdf_data.astype(to_dtype).astype(from_dtype) - np.testing.assert_equal(np_casted, gdf_casted.to_array()) + np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) @pytest.mark.parametrize("data", [numerical_data()]) @@ -361,12 +361,12 @@ def test_typecast_to_from_datetime(data, from_dtype, to_dtype): ) def test_typecast_from_datetime_to_datetime(data, from_dtype, to_dtype): np_data = data.astype(from_dtype) - gdf_col = Series(np_data)._column + ser = Series(np_data) np_casted = np_data.astype(to_dtype) - 
gdf_casted = gdf_col.astype(to_dtype) + ser_casted = ser.astype(to_dtype) - np.testing.assert_equal(np_casted, gdf_casted.to_array()) + np.testing.assert_equal(np_casted, ser_casted.to_numpy()) @pytest.mark.parametrize("data", [numerical_data()]) @@ -1131,7 +1131,26 @@ def test_datetime_fillna(data, dtype, fill_value): ) @pytest.mark.parametrize("dtype", DATETIME_TYPES) @pytest.mark.parametrize( - "date_format", ["%d - %m", "%y/%H", "%Y", "%I - %M / %S", "%f", "%j", "%p"] + "date_format", + [ + "%d - %m", + "%y/%H", + "%Y", + "%I - %M / %S", + "%f", + "%j", + "%p", + "%w", + "%U", + "%W", + "%G", + "%u", + "%V", + "%b", + "%B", + "%a", + "%A", + ], ) def test_datetime_strftime(data, dtype, date_format): gsr = cudf.Series(data, dtype=dtype) @@ -1143,24 +1162,7 @@ def test_datetime_strftime(data, dtype, date_format): assert_eq(expected, actual) -@pytest.mark.parametrize( - "date_format", - [ - "%a", - "%A", - "%w", - "%b", - "%B", - "%U", - "%W", - "%c", - "%x", - "%X", - "%G", - "%u", - "%V", - ], -) +@pytest.mark.parametrize("date_format", ["%c", "%x", "%X"]) def test_datetime_strftime_not_implemented_formats(date_format): gsr = cudf.Series([1, 2, 3], dtype="datetime64[ms]") @@ -1334,6 +1336,55 @@ def test_quarter(): assert_eq(expect2.values, got2.values, check_dtype=False) +@pytest.mark.parametrize( + "data", + [ + pd.Series([], dtype="datetime64[ns]"), + pd.Series(pd.date_range("2010-01-01", "2010-02-01")), + pd.Series([None, None], dtype="datetime64[ns]"), + pd.Series("2020-05-31 08:00:00", dtype="datetime64[s]"), + pd.Series( + pd.date_range(start="2021-07-25", end="2021-07-30"), + index=["a", "b", "c", "d", "e", "f"], + ), + ], +) +def test_isocalendar_series(data): + ps = data.copy() + gs = cudf.from_pandas(ps) + + expect = ps.dt.isocalendar() + got = gs.dt.isocalendar() + + assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + pd.DatetimeIndex([], dtype="datetime64[ns]"), + pd.DatetimeIndex([None, None], 
dtype="datetime64[ns]"), + pd.DatetimeIndex( + [ + "2020-05-31 08:00:00", + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + ], + dtype="datetime64[ns]", + ), + pd.DatetimeIndex(["2100-03-14 07:30:00"], dtype="datetime64[ns]"), + ], +) +def test_isocalendar_index(data): + ps = data.copy() + gs = cudf.from_pandas(ps) + + expect = ps.isocalendar() + got = gs.isocalendar() + + assert_eq(expect, got, check_dtype=False) + + @pytest.mark.parametrize("dtype", DATETIME_TYPES) def test_days_in_months(dtype): nrows = 1000 diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index 4b2fca0d12d..4b26e2c13bc 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -126,7 +126,7 @@ def test_to_dlpack_cupy_1d(data_1d): expectation = data_size_expectation_builder(data_1d, False) with expectation: gs = cudf.Series(data_1d, nan_as_null=False) - cudf_host_array = gs.to_array(fillna="pandas") + cudf_host_array = gs.to_numpy(na_value=np.nan) dlt = gs._column.to_dlpack() cupy_array = cupy.fromDlpack(dlt) @@ -155,7 +155,7 @@ def test_from_dlpack_cupy_1d(data_1d): dlt = cupy_array.toDlpack() gs = cudf.from_dlpack(dlt) - cudf_host_array = gs.to_array(fillna="pandas") + cudf_host_array = gs.to_numpy(na_value=np.nan) assert_eq(cudf_host_array, cupy_host_array) @@ -190,7 +190,7 @@ def test_to_dlpack_cupy_1d_null(data_1d): with expectation: gs = cudf.Series(data_1d) - cudf_host_array = gs.to_array(fillna="pandas") + cudf_host_array = gs.to_numpy(na_value=np.nan) dlt = gs._column.to_dlpack() cupy_array = cupy.fromDlpack(dlt) diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index f464ac1a6c2..bc43c82729b 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -24,11 +24,11 @@ def assert_df(g, p): def assert_df2(g, p): assert g.index.dtype == p.index.dtype - np.testing.assert_equal(g.index.to_array(), p.index) + 
np.testing.assert_equal(g.index.to_numpy(), p.index) assert tuple(g.columns) == tuple(p.columns) for k in g.columns: assert g[k].dtype == p[k].dtype - np.testing.assert_equal(g[k].to_array(), p[k]) + np.testing.assert_equal(g[k].to_numpy(), p[k]) # most tests are similar to pandas drop_duplicates diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py index e527fd0af17..39fa7b11ce2 100644 --- a/python/cudf/cudf/tests/test_extension_compilation.py +++ b/python/cudf/cudf/tests/test_extension_compilation.py @@ -5,7 +5,7 @@ from numba.cuda import compile_ptx from cudf import NA -from cudf.core.udf.classes import Masked +from cudf.core.udf.api import Masked from cudf.core.udf.typing import MaskedType arith_ops = ( diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index 46cbc9d2b52..1f16686a6a6 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -19,7 +19,7 @@ def test_factorize_series_obj(ncats, nelem): df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32) uvals, labels = df["cats"].factorize() - np.testing.assert_array_equal(labels.to_array(), sorted(set(arr))) + np.testing.assert_array_equal(labels.to_numpy(), sorted(set(arr))) assert isinstance(uvals, cp.ndarray) assert isinstance(labels, Index) diff --git a/python/cudf/cudf/tests/test_gpu_arrow_parser.py b/python/cudf/cudf/tests/test_gpu_arrow_parser.py index a088ae9f923..3b3aa72901f 100644 --- a/python/cudf/cudf/tests/test_gpu_arrow_parser.py +++ b/python/cudf/cudf/tests/test_gpu_arrow_parser.py @@ -53,8 +53,8 @@ def test_gpu_parse_arrow_data_cpu_schema(): np.testing.assert_array_less(-105, lon) dct = reader.to_dict() - np.testing.assert_array_equal(lat, dct["dest_lat"].to_array()) - np.testing.assert_array_equal(lon, dct["dest_lon"].to_array()) + np.testing.assert_array_equal(lat, dct["dest_lat"].to_numpy()) + 
np.testing.assert_array_equal(lon, dct["dest_lon"].to_numpy()) def test_gpu_parse_arrow_data_gpu_schema(): @@ -86,8 +86,8 @@ def test_gpu_parse_arrow_data_gpu_schema(): np.testing.assert_array_less(-105, lon) dct = reader.to_dict() - np.testing.assert_array_equal(lat, dct["dest_lat"].to_array()) - np.testing.assert_array_equal(lon, dct["dest_lon"].to_array()) + np.testing.assert_array_equal(lat, dct["dest_lat"].to_numpy()) + np.testing.assert_array_equal(lon, dct["dest_lon"].to_numpy()) def test_gpu_parse_arrow_data_bad_cpu_schema_good_gpu_schema(): @@ -119,8 +119,8 @@ def test_gpu_parse_arrow_data_bad_cpu_schema_good_gpu_schema(): np.testing.assert_array_less(-105, lon) dct = reader.to_dict() - np.testing.assert_array_equal(lat, dct["dest_lat"].to_array()) - np.testing.assert_array_equal(lon, dct["dest_lon"].to_array()) + np.testing.assert_array_equal(lat, dct["dest_lat"].to_numpy()) + np.testing.assert_array_equal(lon, dct["dest_lon"].to_numpy()) expected_values = """ @@ -288,9 +288,9 @@ def test_gpu_parse_arrow_timestamps(dtype): reader = GpuArrowReader(cpu_schema, gpu_data) assert reader[0].name == "timestamp" timestamp_arr = reader[0].data.copy_to_host() - np.testing.assert_array_equal(timestamp_arr, gdf["timestamp"].to_array()) + np.testing.assert_array_equal(timestamp_arr, gdf["timestamp"].to_numpy()) dct = reader.to_dict() - np.testing.assert_array_equal(timestamp_arr, dct["timestamp"].to_array()) + np.testing.assert_array_equal(timestamp_arr, dct["timestamp"].to_numpy()) if __name__ == "__main__": diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index b58078818dd..338e10ebe30 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -196,7 +196,7 @@ def test_groupby_as_index_multiindex(pdf, gdf, as_index): else: # column names don't match - check just the values for gcol, pcol in zip(gdf, pdf): - assert_array_equal(gdf[gcol].to_array(), pdf[pcol].values) + 
assert_array_equal(gdf[gcol].to_numpy(), pdf[pcol].values) def test_groupby_default(pdf, gdf): @@ -244,7 +244,7 @@ def test_groupby_cats(): df["vals"] = np.random.random(len(df)) cats = df["cats"].values_host - vals = df["vals"].to_array() + vals = df["vals"].to_numpy() grouped = df.groupby(["cats"], as_index=False).mean() diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index a3de92ba9e5..4ae86dc1cfc 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -176,7 +176,7 @@ def test_categorical_index(): assert_eq(pdf.index, gdf1.index) assert_eq( pdf.index.codes, - gdf1.index.codes.astype(pdf.index.codes.dtype).to_array(), + gdf1.index.codes.astype(pdf.index.codes.dtype).to_numpy(), ) assert isinstance(gdf2.index, CategoricalIndex) @@ -184,7 +184,7 @@ def test_categorical_index(): assert_eq(pdf.index, gdf2.index) assert_eq( pdf.index.codes, - gdf2.index.codes.astype(pdf.index.codes.dtype).to_array(), + gdf2.index.codes.astype(pdf.index.codes.dtype).to_numpy(), ) @@ -223,7 +223,7 @@ def test_pandas_as_index(): pdf_category_index.codes, gdf_category_index.codes.astype( pdf_category_index.codes.dtype - ).to_array(), + ).to_numpy(), ) @@ -291,7 +291,7 @@ def test_set_index_as_property(): # Check set_index(Series) cdf.index = cdf["b"] - assert_eq(cdf.index._values.to_array(), col2) + assert_eq(cdf.index.to_numpy(), col2) with pytest.raises(ValueError): cdf.index = [list(range(10))] @@ -459,14 +459,14 @@ def test_index_copy_deep(idx, deep): def test_index_isna(idx): pidx = pd.Index(idx, name="idx") gidx = cudf.core.index.Int64Index(idx, name="idx") - assert_eq(gidx.isna().to_array(), pidx.isna()) + assert_eq(gidx.isna().to_numpy(), pidx.isna()) @pytest.mark.parametrize("idx", [[1, None, 3, None, 5]]) def test_index_notna(idx): pidx = pd.Index(idx, name="idx") gidx = cudf.core.index.Int64Index(idx, name="idx") - assert_eq(gidx.notna().to_array(), pidx.notna()) + 
assert_eq(gidx.notna().to_numpy(), pidx.notna()) def test_rangeindex_slice_attr_name(): @@ -674,17 +674,13 @@ def test_index_where(data, condition, other, error): got = gs.where(gs_condition, other=gs_other) np.testing.assert_array_equal( expect.codes, - got.codes.astype(expect.codes.dtype).fillna(-1).to_array(), + got.codes.astype(expect.codes.dtype).fillna(-1).to_numpy(), ) assert_eq(expect.categories, got.categories) else: assert_eq( - ps.where(ps_condition, other=ps_other).fillna( - gs._values.default_na_value() - ), - gs.where(gs_condition, other=gs_other) - .to_pandas() - .fillna(gs._values.default_na_value()), + ps.where(ps_condition, other=ps_other), + gs.where(gs_condition, other=gs_other).to_pandas(), ) else: assert_exceptions_equal( diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 58d39ff35a6..e550c7c374e 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -110,16 +110,16 @@ def test_series_indexing(i1, i2, i3): # Indexing sr1 = series.iloc[i1] assert sr1.null_count == 0 - np.testing.assert_equal(sr1.to_array(), a1[:12]) + np.testing.assert_equal(sr1.to_numpy(), a1[:12]) sr2 = sr1.iloc[i2] assert sr2.null_count == 0 - np.testing.assert_equal(sr2.to_array(), a1[3:12]) + np.testing.assert_equal(sr2.to_numpy(), a1[3:12]) # Index with stride sr3 = sr2.iloc[i3] assert sr3.null_count == 0 - np.testing.assert_equal(sr3.to_array(), a1[3:12:2]) + np.testing.assert_equal(sr3.to_numpy(), a1[3:12:2]) # Integer indexing if isinstance(i1, range): @@ -197,10 +197,10 @@ def test_dataframe_column_name_indexing(): df["a"] = data df[1] = data np.testing.assert_equal( - df["a"].to_array(), np.asarray(range(10), dtype=np.int32) + df["a"].to_numpy(), np.asarray(range(10), dtype=np.int32) ) np.testing.assert_equal( - df[1].to_array(), np.asarray(range(10), dtype=np.int32) + df[1].to_numpy(), np.asarray(range(10), dtype=np.int32) ) pdf = pd.DataFrame() @@ -245,20 +245,20 @@ def 
test_dataframe_slicing(): first_10 = df[:10] assert len(first_10) == 10 assert tuple(first_10.columns) == ("a", "b", "c", "d") - np.testing.assert_equal(first_10["a"].to_array(), ha[:10]) - np.testing.assert_equal(first_10["b"].to_array(), hb[:10]) - np.testing.assert_equal(first_10["c"].to_array(), hc[:10]) - np.testing.assert_equal(first_10["d"].to_array(), hd[:10]) + np.testing.assert_equal(first_10["a"].to_numpy(), ha[:10]) + np.testing.assert_equal(first_10["b"].to_numpy(), hb[:10]) + np.testing.assert_equal(first_10["c"].to_numpy(), hc[:10]) + np.testing.assert_equal(first_10["d"].to_numpy(), hd[:10]) del first_10 # Row slice last 10 last_10 = df[-10:] assert len(last_10) == 10 assert tuple(last_10.columns) == ("a", "b", "c", "d") - np.testing.assert_equal(last_10["a"].to_array(), ha[-10:]) - np.testing.assert_equal(last_10["b"].to_array(), hb[-10:]) - np.testing.assert_equal(last_10["c"].to_array(), hc[-10:]) - np.testing.assert_equal(last_10["d"].to_array(), hd[-10:]) + np.testing.assert_equal(last_10["a"].to_numpy(), ha[-10:]) + np.testing.assert_equal(last_10["b"].to_numpy(), hb[-10:]) + np.testing.assert_equal(last_10["c"].to_numpy(), hc[-10:]) + np.testing.assert_equal(last_10["d"].to_numpy(), hd[-10:]) del last_10 # Row slice [begin:end] @@ -267,10 +267,10 @@ def test_dataframe_slicing(): subrange = df[begin:end] assert len(subrange) == end - begin assert tuple(subrange.columns) == ("a", "b", "c", "d") - np.testing.assert_equal(subrange["a"].to_array(), ha[begin:end]) - np.testing.assert_equal(subrange["b"].to_array(), hb[begin:end]) - np.testing.assert_equal(subrange["c"].to_array(), hc[begin:end]) - np.testing.assert_equal(subrange["d"].to_array(), hd[begin:end]) + np.testing.assert_equal(subrange["a"].to_numpy(), ha[begin:end]) + np.testing.assert_equal(subrange["b"].to_numpy(), hb[begin:end]) + np.testing.assert_equal(subrange["c"].to_numpy(), hc[begin:end]) + np.testing.assert_equal(subrange["d"].to_numpy(), hd[begin:end]) del subrange @@ -527,7 
+527,7 @@ def test_series_loc_categorical(): # order of categories changes, so we can only # compare values: assert_eq( - ps.loc[["a", "d", "e"]].values, gs.loc[["a", "d", "e"]].to_array() + ps.loc[["a", "d", "e"]].values, gs.loc[["a", "d", "e"]].to_numpy() ) assert_eq( @@ -594,9 +594,9 @@ def test_series_iloc(nelem): np.testing.assert_allclose(gs.iloc[nelem - 1], ps.iloc[nelem - 1]) # positive tests for slice - np.testing.assert_allclose(gs.iloc[-1:1].to_array(), ps.iloc[-1:1]) + np.testing.assert_allclose(gs.iloc[-1:1].to_numpy(), ps.iloc[-1:1]) np.testing.assert_allclose( - gs.iloc[nelem - 1 : -1].to_array(), ps.iloc[nelem - 1 : -1] + gs.iloc[nelem - 1 : -1].to_numpy(), ps.iloc[nelem - 1 : -1] ) np.testing.assert_allclose( gs.iloc[0 : nelem - 1].to_pandas(), ps.iloc[0 : nelem - 1] @@ -697,8 +697,8 @@ def test_dataframe_iloc_index_error(): pdf["b"] = hb def assert_col(g, p): - np.testing.assert_equal(g["a"].to_array(), p["a"]) - np.testing.assert_equal(g["b"].to_array(), p["b"]) + np.testing.assert_equal(g["a"].to_numpy(), p["a"]) + np.testing.assert_equal(g["b"].to_numpy(), p["b"]) assert_col(gdf.iloc[nelem * 2], pdf.iloc[nelem * 2]) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index b18cce60bfd..775b866f5ce 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -21,47 +21,30 @@ def make_params(): np.random.seed(0) hows = _JOIN_TYPES - methods = "hash,sort".split(",") # Test specific cases (1) aa = [0, 0, 4, 5, 5] bb = [0, 0, 2, 3, 5] for how in hows: - if how in ["left", "inner", "right", "leftanti", "leftsemi"]: - for method in methods: - yield (aa, bb, how, method) - else: - yield (aa, bb, how, "sort") + yield (aa, bb, how) # Test specific cases (2) aa = [0, 0, 1, 2, 3] bb = [0, 1, 2, 2, 3] for how in hows: - if how in ["left", "inner", "right", "leftanti", "leftsemi"]: - for method in methods: - yield (aa, bb, how, method) - else: - yield (aa, bb, how, "sort") + 
yield (aa, bb, how) # Test large random integer inputs aa = np.random.randint(0, 50, 100) bb = np.random.randint(0, 50, 100) for how in hows: - if how in ["left", "inner", "right", "leftanti", "leftsemi"]: - for method in methods: - yield (aa, bb, how, method) - else: - yield (aa, bb, how, "sort") + yield (aa, bb, how) # Test floating point inputs aa = np.random.random(50) bb = np.random.random(50) for how in hows: - if how in ["left", "inner", "right", "leftanti", "leftsemi"]: - for method in methods: - yield (aa, bb, how, method) - else: - yield (aa, bb, how, "sort") + yield (aa, bb, how) def pd_odd_joins(left, right, join_type): @@ -102,8 +85,8 @@ def assert_join_results_equal(expect, got, how, **kwargs): raise ValueError(f"Not a join result: {type(expect).__name__}") -@pytest.mark.parametrize("aa,bb,how,method", make_params()) -def test_dataframe_join_how(aa, bb, how, method): +@pytest.mark.parametrize("aa,bb,how", make_params()) +def test_dataframe_join_how(aa, bb, how): df = cudf.DataFrame() df["a"] = aa df["b"] = bb @@ -122,7 +105,7 @@ def work_pandas(df, how): def work_gdf(df): df1 = df.set_index("a") df2 = df.set_index("b") - joined = df1.join(df2, how=how, sort=True, method=method) + joined = df1.join(df2, how=how, sort=True) return joined expect = work_pandas(df.to_pandas(), how) @@ -136,8 +119,7 @@ def work_gdf(df): assert got.index.name is None assert list(expect.columns) == list(got.columns) - # test disabled until libgdf sort join gets updated with new api - if method == "hash": + if how in {"left", "inner", "right", "leftanti", "leftsemi"}: assert_eq(sorted(expect.index.values), sorted(got.index.values)) if how != "outer": # Newly introduced ambiguous ValueError thrown when @@ -161,9 +143,9 @@ def work_gdf(df): def _check_series(expect, got): magic = 0xDEADBEAF - direct_equal = np.all(expect.values == got.to_array()) + direct_equal = np.all(expect.values == got.to_numpy()) nanfilled_equal = np.all( - expect.fillna(magic).values == 
got.fillna(magic).to_array() + expect.fillna(magic).values == got.fillna(magic).to_numpy() ) msg = "direct_equal={}, nanfilled_equal={}".format( direct_equal, nanfilled_equal @@ -221,8 +203,8 @@ def test_dataframe_join_cats(): assert list(got.columns) == ["b", "c"] assert len(got) > 0 assert set(got.index.to_pandas()) & set("abc") - assert set(got["b"].to_array()) & set(bb) - assert set(got["c"].to_array()) & set(cc) + assert set(got["b"].to_numpy()) & set(bb) + assert set(got["c"].to_numpy()) & set(cc) def test_dataframe_join_combine_cats(): diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 8c06dbea03f..3391c9c22ee 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -190,7 +190,7 @@ def test_json_lines_basic(json_input, engine): assert all(cu_df.dtypes == ["int64", "int64", "int64"]) for cu_col, pd_col in zip(cu_df.columns, pd_df.columns): assert str(cu_col) == str(pd_col) - np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_array()) + np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) @pytest.mark.filterwarnings("ignore:Using CPU") @@ -209,7 +209,7 @@ def test_json_lines_multiple(tmpdir, json_input, engine): assert all(cu_df.dtypes == ["int64", "int64", "int64"]) for cu_col, pd_col in zip(cu_df.columns, pd_df.columns): assert str(cu_col) == str(pd_col) - np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_array()) + np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) @pytest.mark.parametrize("engine", ["auto", "cudf"]) @@ -240,7 +240,7 @@ def test_json_read_directory(tmpdir, json_input, engine): assert all(cu_df.dtypes == ["int64", "int64", "int64"]) for cu_col, pd_col in zip(cu_df.columns, pd_df.columns): assert str(cu_col) == str(pd_col) - np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_array()) + np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) def 
test_json_lines_byte_range(json_input): @@ -340,9 +340,9 @@ def test_json_bool_values(): # types should be ['bool', 'int64'] np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) - np.testing.assert_array_equal(pd_df[0], cu_df["0"].to_array()) + np.testing.assert_array_equal(pd_df[0], cu_df["0"].to_numpy()) # boolean values should be converted to 0/1 - np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_array()) + np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_numpy()) cu_df = cudf.read_json(buffer, lines=True, dtype=["bool", "long"]) np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) @@ -364,15 +364,9 @@ def test_json_null_literal(buffer): # second column contains only empty fields, type should be set to int8 np.testing.assert_array_equal(df.dtypes, ["float64", "int8"]) np.testing.assert_array_equal( - df["0"].to_array(fillna=np.nan), [1.0, np.nan] - ) - np.testing.assert_array_equal( - df["1"].to_array(fillna=np.nan), - [ - df["1"]._column.default_na_value(), - df["1"]._column.default_na_value(), - ], + df["0"].to_numpy(na_value=np.nan), [1.0, np.nan] ) + np.testing.assert_array_equal(df["1"].to_numpy(na_value=0), [0, 0]) def test_json_bad_protocol_string(): diff --git a/python/cudf/cudf/tests/test_label_encode.py b/python/cudf/cudf/tests/test_label_encode.py index bac324d9c1c..106179e2b47 100644 --- a/python/cudf/cudf/tests/test_label_encode.py +++ b/python/cudf/cudf/tests/test_label_encode.py @@ -44,7 +44,7 @@ def test_label_encode(nelem, dtype): # label encode series ncol = df["cats"].label_encoding(cats=vals) - arr = ncol.to_array() + arr = ncol.to_numpy() # verify labels of new column for i in range(arr.size): @@ -75,7 +75,7 @@ def test_label_encode_drop_one(): # label encode series ncol = df["cats"].label_encoding(cats=vals, dtype="float32") - arr = ncol.to_array() + arr = ncol.to_numpy() # verify labels of new column @@ -110,7 +110,7 @@ def test_label_encode_float_output(): na_sentinel=np.nan, ) - got = 
df2["cats_labels"].to_array(fillna="pandas") + got = df2["cats_labels"].to_numpy(na_value=np.nan) handcoded = np.array([encoder.get(v, np.nan) for v in arr]) np.testing.assert_equal(got, handcoded) diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index 0a3ead6cf31..0292d47f31a 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -18,7 +18,7 @@ def test_onehot_simple(): df["vals"] = np.arange(10, dtype=np.int32) # One Hot (Series) for i, col in enumerate(df["vals"].one_hot_encoding(list(range(10)))): - arr = col.to_array() + arr = col.to_numpy() # Verify 1 in the right position np.testing.assert_equal(arr[i], 1) # Every other slots are 0s @@ -31,7 +31,7 @@ def test_onehot_simple(): assert df2.columns[0] == "vals" for i in range(1, len(df2.columns)): assert df2.columns[i] == "vals_%s" % (i - 1) - got = df2.as_matrix(columns=df2.columns[1:]) + got = df2[df2.columns[1:]].values_host expect = np.identity(got.shape[0]) np.testing.assert_equal(got, expect) @@ -45,7 +45,7 @@ def test_onehot_random(): df2 = df.one_hot_encoding( column="src", prefix="out_", cats=tuple(range(10, 17)) ) - mat = df2.as_matrix(columns=df2.columns[1:]) + mat = df2[df2.columns[1:]].values_host for val in range(low, high): colidx = val - low @@ -73,11 +73,11 @@ def test_onehot_masked(): ) assert tuple(out.columns) == ("a", "a_0", "a_1", "a_2", "a_3", "a_4") - np.testing.assert_array_equal((out["a_0"] == 1).to_array(), arr == 0) - np.testing.assert_array_equal((out["a_1"] == 1).to_array(), arr == 1) - np.testing.assert_array_equal((out["a_2"] == 1).to_array(), arr == 2) - np.testing.assert_array_equal((out["a_3"] == 1).to_array(), arr == 3) - np.testing.assert_array_equal((out["a_4"] == 1).to_array(), arr == 4) + np.testing.assert_array_equal((out["a_0"] == 1).to_numpy(), arr == 0) + np.testing.assert_array_equal((out["a_1"] == 1).to_numpy(), arr == 1) + np.testing.assert_array_equal((out["a_2"] == 1).to_numpy(), 
arr == 2) + np.testing.assert_array_equal((out["a_3"] == 1).to_numpy(), arr == 3) + np.testing.assert_array_equal((out["a_4"] == 1).to_numpy(), arr == 4) def test_onehot_generic_index(): @@ -91,10 +91,10 @@ def test_onehot_generic_index(): "fo", cats=df.fo.unique(), prefix="fo", dtype=np.int32 ) assert set(out.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"} - np.testing.assert_array_equal(values == 0, out.fo_0.to_array()) - np.testing.assert_array_equal(values == 1, out.fo_1.to_array()) - np.testing.assert_array_equal(values == 2, out.fo_2.to_array()) - np.testing.assert_array_equal(values == 3, out.fo_3.to_array()) + np.testing.assert_array_equal(values == 0, out.fo_0.to_numpy()) + np.testing.assert_array_equal(values == 1, out.fo_1.to_numpy()) + np.testing.assert_array_equal(values == 2, out.fo_2.to_numpy()) + np.testing.assert_array_equal(values == 3, out.fo_3.to_numpy()) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_pandas_interop.py b/python/cudf/cudf/tests/test_pandas_interop.py index c90d6f23c2d..78cf5b998e8 100644 --- a/python/cudf/cudf/tests/test_pandas_interop.py +++ b/python/cudf/cudf/tests/test_pandas_interop.py @@ -52,11 +52,11 @@ def test_from_pandas_ex1(): df = DataFrame.from_pandas(pdf) assert tuple(df.columns) == tuple(pdf.columns) - assert np.all(df["a"].to_array() == pdf["a"]) - matches = df["b"].to_array(fillna="pandas") == pdf["b"] + assert np.all(df["a"].to_numpy() == pdf["a"]) + matches = df["b"].to_numpy(na_value=np.nan) == pdf["b"] # the 3d element is False due to (nan == nan) == False assert np.all(matches == [True, True, False, True]) - assert np.isnan(df["b"].to_array(fillna="pandas")[2]) + assert np.isnan(df["b"].to_numpy(na_value=np.nan)[2]) assert np.isnan(pdf["b"][2]) diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index 2e8de9b5d50..9a02d5145bb 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -84,8 +84,8 @@ def 
test_query_ref_env(data, fn): df2 = df.query(query_expr) # check assert len(df2) == np.count_nonzero(expect_mask) - np.testing.assert_array_almost_equal(df2["a"].to_array(), aa[expect_mask]) - np.testing.assert_array_almost_equal(df2["b"].to_array(), bb[expect_mask]) + np.testing.assert_array_almost_equal(df2["a"].to_numpy(), aa[expect_mask]) + np.testing.assert_array_almost_equal(df2["b"].to_numpy(), bb[expect_mask]) def test_query_env_changing(): @@ -95,11 +95,11 @@ def test_query_env_changing(): # first attempt c = 10 got = df.query(expr) - np.testing.assert_array_equal(aa[aa < c], got["a"].to_array()) + np.testing.assert_array_equal(aa[aa < c], got["a"].to_numpy()) # change env c = 50 got = df.query(expr) - np.testing.assert_array_equal(aa[aa < c], got["a"].to_array()) + np.testing.assert_array_equal(aa[aa < c], got["a"].to_numpy()) def test_query_local_dict(): @@ -108,7 +108,7 @@ def test_query_local_dict(): expr = "a < @val" got = df.query(expr, local_dict={"val": 10}) - np.testing.assert_array_equal(aa[aa < 10], got["a"].to_array()) + np.testing.assert_array_equal(aa[aa < 10], got["a"].to_numpy()) # test for datetime df = DataFrame() @@ -118,7 +118,7 @@ def test_query_local_dict(): expr = "datetimes==@search_date" got = df.query(expr, local_dict={"search_date": search_date}) - np.testing.assert_array_equal(data[1], got["datetimes"].to_array()) + np.testing.assert_array_equal(data[1], got["datetimes"].to_numpy()) def test_query_splitted_combine(): diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 43d477190ae..f47e87374dc 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -68,7 +68,7 @@ def test_series_replace(): a2 = np.array([5, 1, 2, 3, 4]) sr1 = cudf.Series(a1) sr2 = sr1.replace(0, 5) - assert_eq(a2, sr2.to_array()) + assert_eq(a2, sr2.to_numpy()) # Categorical psr3 = pd.Series(["one", "two", "three"], dtype="category") @@ -85,35 +85,35 @@ def 
test_series_replace(): # List input a6 = np.array([5, 6, 2, 3, 4]) sr6 = sr1.replace([0, 1], [5, 6]) - assert_eq(a6, sr6.to_array()) + assert_eq(a6, sr6.to_numpy()) with pytest.raises(TypeError): sr1.replace([0, 1], [5.5, 6.5]) # Series input a8 = np.array([5, 5, 5, 3, 4]) - sr8 = sr1.replace(sr1[:3].to_array(), 5) - assert_eq(a8, sr8.to_array()) + sr8 = sr1.replace(sr1[:3].to_numpy(), 5) + assert_eq(a8, sr8.to_numpy()) # large input containing null sr9 = cudf.Series(list(range(400)) + [None]) sr10 = sr9.replace([22, 323, 27, 0], None) assert sr10.null_count == 5 - assert len(sr10.to_array()) == (401 - 5) + assert len(sr10.dropna().to_numpy()) == (401 - 5) sr11 = sr9.replace([22, 323, 27, 0], -1) assert sr11.null_count == 1 - assert len(sr11.to_array()) == (401 - 1) + assert len(sr11.dropna().to_numpy()) == (401 - 1) # large input not containing nulls sr9 = sr9.fillna(-11) sr12 = sr9.replace([22, 323, 27, 0], None) assert sr12.null_count == 4 - assert len(sr12.to_array()) == (401 - 4) + assert len(sr12.dropna().to_numpy()) == (401 - 4) sr13 = sr9.replace([22, 323, 27, 0], -1) assert sr13.null_count == 0 - assert len(sr13.to_array()) == 401 + assert len(sr13.to_numpy()) == 401 def test_series_replace_with_nulls(): @@ -123,12 +123,12 @@ def test_series_replace_with_nulls(): a2 = np.array([-10, 1, 2, 3, 4]) sr1 = cudf.Series(a1) sr2 = sr1.replace(0, None).fillna(-10) - assert_eq(a2, sr2.to_array()) + assert_eq(a2, sr2.to_numpy()) # List input a6 = np.array([-10, 6, 2, 3, 4]) sr6 = sr1.replace([0, 1], [None, 6]).fillna(-10) - assert_eq(a6, sr6.to_array()) + assert_eq(a6, sr6.to_numpy()) sr1 = cudf.Series([0, 1, 2, 3, 4, None]) with pytest.raises(TypeError): @@ -137,11 +137,11 @@ def test_series_replace_with_nulls(): # Series input a8 = np.array([-10, -10, -10, 3, 4, -10]) sr8 = sr1.replace(cudf.Series([-10] * 3, index=sr1[:3]), None).fillna(-10) - assert_eq(a8, sr8.to_array()) + assert_eq(a8, sr8.to_numpy()) a9 = np.array([-10, 6, 2, 3, 4, -10]) sr9 = sr1.replace([0, 
1], [None, 6]).fillna(-10) - assert_eq(a9, sr9.to_array()) + assert_eq(a9, sr9.to_numpy()) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 838ea91c7d3..736bcf131cc 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -40,14 +40,6 @@ def test_null_series(nrows, dtype): psrepr = psrepr.replace("NaN", "") psrepr = psrepr.replace("NaT", "") psrepr = psrepr.replace("None", "") - if ( - dtype.startswith("int") - or dtype.startswith("uint") - or dtype.startswith("long") - ): - psrepr = psrepr.replace( - str(sr._column.default_na_value()) + "\n", "\n" - ) if "UInt" in psrepr: psrepr = psrepr.replace("UInt", "uint") elif "Int" in psrepr: diff --git a/python/cudf/cudf/tests/test_scan.py b/python/cudf/cudf/tests/test_scan.py index 0ef7b89a606..741a9f45d09 100644 --- a/python/cudf/cudf/tests/test_scan.py +++ b/python/cudf/cudf/tests/test_scan.py @@ -38,7 +38,7 @@ def test_cumsum(dtype, nelem): gs = cudf.Series(data) ps = pd.Series(data) np.testing.assert_array_almost_equal( - gs.cumsum().to_array(), ps.cumsum(), decimal=decimal + gs.cumsum().to_numpy(), ps.cumsum(), decimal=decimal ) # dataframe series (named series) @@ -47,7 +47,7 @@ def test_cumsum(dtype, nelem): pdf = pd.DataFrame() pdf["a"] = pd.Series(data) np.testing.assert_array_almost_equal( - gdf.a.cumsum().to_array(), pdf.a.cumsum(), decimal=decimal + gdf.a.cumsum().to_numpy(), pdf.a.cumsum(), decimal=decimal ) @@ -96,7 +96,7 @@ def test_cummin(dtype, nelem): gs = cudf.Series(data) ps = pd.Series(data) np.testing.assert_array_almost_equal( - gs.cummin().to_array(), ps.cummin(), decimal=decimal + gs.cummin().to_numpy(), ps.cummin(), decimal=decimal ) # dataframe series (named series) @@ -105,7 +105,7 @@ def test_cummin(dtype, nelem): pdf = pd.DataFrame() pdf["a"] = pd.Series(data) np.testing.assert_array_almost_equal( - gdf.a.cummin().to_array(), pdf.a.cummin(), decimal=decimal + gdf.a.cummin().to_numpy(), 
pdf.a.cummin(), decimal=decimal ) @@ -153,7 +153,7 @@ def test_cummax(dtype, nelem): gs = cudf.Series(data) ps = pd.Series(data) np.testing.assert_array_almost_equal( - gs.cummax().to_array(), ps.cummax(), decimal=decimal + gs.cummax().to_numpy(), ps.cummax(), decimal=decimal ) # dataframe series (named series) @@ -162,7 +162,7 @@ def test_cummax(dtype, nelem): pdf = pd.DataFrame() pdf["a"] = pd.Series(data) np.testing.assert_array_almost_equal( - gdf.a.cummax().to_array(), pdf.a.cummax(), decimal=decimal + gdf.a.cummax().to_numpy(), pdf.a.cummax(), decimal=decimal ) @@ -210,7 +210,7 @@ def test_cumprod(dtype, nelem): gs = cudf.Series(data) ps = pd.Series(data) np.testing.assert_array_almost_equal( - gs.cumprod().to_array(), ps.cumprod(), decimal=decimal + gs.cumprod().to_numpy(), ps.cumprod(), decimal=decimal ) # dataframe series (named series) @@ -219,7 +219,7 @@ def test_cumprod(dtype, nelem): pdf = pd.DataFrame() pdf["a"] = pd.Series(data) np.testing.assert_array_almost_equal( - gdf.a.cumprod().to_array(), pdf.a.cumprod(), decimal=decimal + gdf.a.cumprod().to_numpy(), pdf.a.cumprod(), decimal=decimal ) diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py index c16c6486cd4..cd029d02d79 100644 --- a/python/cudf/cudf/tests/test_search.py +++ b/python/cudf/cudf/tests/test_search.py @@ -26,13 +26,13 @@ def test_searchsorted(side, obj_class, vals_class): # Reference object can be Series, Index, or Column if obj_class == "index": - sr = cudf.Series.as_index(sr) + sr.reset_index(drop=True) elif obj_class == "column": sr = sr._column # Values can be Series or Index if vals_class == "index": - vals = cudf.Series.as_index(vals) + vals.reset_index(drop=True) psr = sr.to_pandas() pvals = vals.to_pandas() diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 11deb6c0842..ca179703864 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -285,25 +285,25 
@@ def test_series_append_existing_buffers(): a2 = cudf.Series(np.arange(5)) gs = gs.append(a2) assert len(gs) == 15 - np.testing.assert_equal(gs.to_array(), np.hstack([a1, a2.to_array()])) + np.testing.assert_equal(gs.to_numpy(), np.hstack([a1, a2.to_numpy()])) # Ensure appending to previous buffer a3 = cudf.Series(np.arange(3)) gs = gs.append(a3) assert len(gs) == 18 - a4 = np.hstack([a1, a2.to_array(), a3.to_array()]) - np.testing.assert_equal(gs.to_array(), a4) + a4 = np.hstack([a1, a2.to_numpy(), a3.to_numpy()]) + np.testing.assert_equal(gs.to_numpy(), a4) # Appending different dtype a5 = cudf.Series(np.array([1, 2, 3], dtype=np.int32)) a6 = cudf.Series(np.array([4.5, 5.5, 6.5], dtype=np.float64)) gs = a5.append(a6) np.testing.assert_equal( - gs.to_array(), np.hstack([a5.to_array(), a6.to_array()]) + gs.to_numpy(), np.hstack([a5.to_numpy(), a6.to_numpy()]) ) gs = cudf.Series(a6).append(a5) np.testing.assert_equal( - gs.to_array(), np.hstack([a6.to_array(), a5.to_array()]) + gs.to_numpy(), np.hstack([a6.to_numpy(), a5.to_numpy()]) ) @@ -694,7 +694,6 @@ def test_series_round(arr, decimals): expected = pser.round(decimals) assert_eq(result, expected) - np.array_equal(ser.nullmask.to_array(), result.to_array()) def test_series_round_half_up(): @@ -958,14 +957,8 @@ def test_series_update(data, other): ps = gs.to_pandas() - gs_column_before = gs._column - gs.update(g_other) - gs_column_after = gs._column - - assert_eq(gs_column_before.to_array(), gs_column_after.to_array()) - ps.update(p_other) - + gs.update(g_other) assert_eq(gs, ps) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index fed391ac6be..3a42411c839 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -94,10 +94,10 @@ def test_series_argsort(nelem, dtype, asc): res = sr.argsort(ascending=asc) if asc: - expected = np.argsort(sr.to_array(), kind="mergesort") + expected = np.argsort(sr.to_numpy(), kind="mergesort") 
else: - expected = np.argsort(sr.to_array() * -1, kind="mergesort") - np.testing.assert_array_equal(expected, res.to_array()) + expected = np.argsort(sr.to_numpy() * -1, kind="mergesort") + np.testing.assert_array_equal(expected, res.to_numpy()) @pytest.mark.parametrize( @@ -165,8 +165,8 @@ def test_dataframe_nlargest(nelem, n): # Check inds = np.argsort(aa) - assert_eq(res["a"].to_array(), aa[inds][-n:][::-1]) - assert_eq(res["b"].to_array(), bb[inds][-n:][::-1]) + assert_eq(res["a"].to_numpy(), aa[inds][-n:][::-1]) + assert_eq(res["b"].to_numpy(), bb[inds][-n:][::-1]) assert_eq(res.index.values, inds[-n:][::-1]) @@ -180,8 +180,8 @@ def test_dataframe_nsmallest(nelem, n): # Check inds = np.argsort(-aa) - assert_eq(res["a"].to_array(), aa[inds][-n:][::-1]) - assert_eq(res["b"].to_array(), bb[inds][-n:][::-1]) + assert_eq(res["a"].to_numpy(), aa[inds][-n:][::-1]) + assert_eq(res["b"].to_numpy(), bb[inds][-n:][::-1]) assert_eq(res.index.values, inds[-n:][::-1]) diff --git a/python/cudf/cudf/tests/test_sparse_df.py b/python/cudf/cudf/tests/test_sparse_df.py index e10ad8e5306..f7cf597afd2 100644 --- a/python/cudf/cudf/tests/test_sparse_df.py +++ b/python/cudf/cudf/tests/test_sparse_df.py @@ -46,7 +46,7 @@ def test_fillna(): masked_col = gar[8] sr = Series(data=masked_col.data) dense = sr.nans_to_nulls().fillna(123) - np.testing.assert_equal(123, dense.to_array()) + np.testing.assert_equal(123, dense.to_numpy()) assert len(dense) == len(sr) assert dense.null_count == 0 @@ -58,8 +58,8 @@ def test_to_dense_array(): sr = Series.from_masked_array(data=data, mask=mask, null_count=3) assert sr.has_nulls assert sr.null_count != len(sr) - filled = sr.to_array(fillna="pandas") - dense = sr.to_array() + filled = sr.to_numpy(na_value=np.nan) + dense = sr.dropna().to_numpy() assert dense.size < filled.size assert filled.size == len(sr) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 759a6a95798..ebe78d56c3f 100644 --- 
a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -84,7 +84,7 @@ def test_series_unique(): arr = np.random.randint(low=-1, high=10, size=size) mask = arr != -1 sr = cudf.Series.from_masked_array(arr, cudf.Series(mask).as_mask()) - assert set(arr[mask]) == set(sr.unique().to_array()) + assert set(arr[mask]) == set(sr.unique().dropna().to_numpy()) assert len(set(arr[mask])) == sr.nunique() @@ -239,12 +239,12 @@ def test_kurtosis(data, null_flag): pdata.iloc[[0, 2]] = None got = data.kurtosis() - got = got if np.isscalar(got) else got.to_array() + got = got if np.isscalar(got) else got.to_numpy() expected = pdata.kurtosis() np.testing.assert_array_almost_equal(got, expected) got = data.kurt() - got = got if np.isscalar(got) else got.to_array() + got = got if np.isscalar(got) else got.to_numpy() expected = pdata.kurt() np.testing.assert_array_almost_equal(got, expected) @@ -281,7 +281,7 @@ def test_skew(data, null_flag): got = data.skew() expected = pdata.skew() - got = got if np.isscalar(got) else got.to_array() + got = got if np.isscalar(got) else got.to_numpy() np.testing.assert_array_almost_equal(got, expected) with pytest.raises(NotImplementedError): @@ -338,7 +338,7 @@ def test_series_pct_change(data, periods, fill_method): got = cs.pct_change(periods=periods, fill_method=fill_method) expected = ps.pct_change(periods=periods, fill_method=fill_method) np.testing.assert_array_almost_equal( - got.to_array(fillna="pandas"), expected + got.to_numpy(na_value=np.nan), expected ) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index a4ceed258db..b254a6ba02c 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -82,7 +82,7 @@ def test_string_export(ps_gs): assert_eq(expect, got) expect = np.array(ps) - got = gs.to_array() + got = gs.to_numpy() assert_eq(expect, got) expect = pa.Array.from_pandas(ps) diff --git a/python/cudf/cudf/tests/test_timedelta.py 
b/python/cudf/cudf/tests/test_timedelta.py index 773bec56634..36a49aa4b33 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -167,17 +167,15 @@ def test_timedelta_from_pandas(data, dtype): ], ) @pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -@pytest.mark.parametrize("fillna", [None, "pandas"]) -def test_timedelta_series_to_array(data, dtype, fillna): +def test_timedelta_series_to_numpy(data, dtype): gsr = cudf.Series(data, dtype=dtype) expected = np.array( cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype ) - if fillna is None: - expected = expected[~np.isnan(expected)] + expected = expected[~np.isnan(expected)] - actual = gsr.to_array(fillna=fillna) + actual = gsr.dropna().to_numpy() np.testing.assert_array_equal(expected, actual) diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/test_transform.py index 0c246554082..021c4052759 100644 --- a/python/cudf/cudf/tests/test_transform.py +++ b/python/cudf/cudf/tests/test_transform.py @@ -32,4 +32,4 @@ def test_applymap_python_lambda(dtype, udf, testfunc): out_ser = lhs_ser.applymap(udf) result = testfunc(lhs_arr) - np.testing.assert_almost_equal(result, out_ser.to_array()) + np.testing.assert_almost_equal(result, out_ser.to_numpy()) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index f73f1526c7f..c0018dae47d 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -2,7 +2,6 @@ import pandas as pd import pytest -from numba import cuda import cudf from cudf.core.udf.pipeline import nulludf @@ -15,12 +14,7 @@ operator.truediv, operator.floordiv, operator.mod, - pytest.param( - operator.pow, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/8470" - ), - ), + operator.pow, ] comparison_ops = [ @@ -34,13 +28,6 @@ def run_masked_udf_test(func_pdf, func_gdf, data, 
**kwargs): - - # Skip testing CUDA 11.0 - runtime = cuda.cudadrv.runtime.Runtime() - mjr, mnr = runtime.get_version() - if mjr < 11 or (mjr == 11 and mnr < 1): - pytest.skip("Skip testing for CUDA 11.0") - gdf = data pdf = data.to_pandas(nullable=True) @@ -53,6 +40,15 @@ def run_masked_udf_test(func_pdf, func_gdf, data, **kwargs): assert_eq(expect, obtain, **kwargs) +def run_masked_udf_series(func_psr, func_gsr, data, **kwargs): + gsr = data + psr = data.to_pandas(nullable=True) + + expect = psr.apply(func_psr) + obtain = gsr.apply(func_gsr) + assert_eq(expect, obtain, **kwargs) + + @pytest.mark.parametrize("op", arith_ops) def test_arith_masked_vs_masked(op): # This test should test all the typing @@ -91,8 +87,9 @@ def func_gdf(x, y): @pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5]) -def test_arith_masked_vs_constant(op, constant): +@pytest.mark.parametrize("constant", [1, 1.5, True, False]) +@pytest.mark.parametrize("data", [[1, 2, cudf.NA]]) +def test_arith_masked_vs_constant(op, constant, data): def func_pdf(x): return op(x, constant) @@ -100,15 +97,28 @@ def func_pdf(x): def func_gdf(x): return op(x, constant) - # Just a single column -> result will be all NA - gdf = cudf.DataFrame({"data": [1, 2, None]}) + gdf = cudf.DataFrame({"data": data}) + if constant is False and op in { + operator.mod, + operator.pow, + operator.truediv, + operator.floordiv, + }: + # The following tests cases yield undefined behavior: + # - truediv(x, False) because its dividing by zero + # - floordiv(x, False) because its dividing by zero + # - mod(x, False) because its mod by zero, + # - pow(x, False) because we have an NA in the series and pandas + # insists that (NA**0 == 1) where we do not + pytest.skip() run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) @pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5]) -def test_arith_masked_vs_constant_reflected(op, constant): 
+@pytest.mark.parametrize("constant", [1, 1.5, True, False]) +@pytest.mark.parametrize("data", [[2, 3, cudf.NA], [1, cudf.NA, 1]]) +def test_arith_masked_vs_constant_reflected(op, constant, data): def func_pdf(x): return op(constant, x) @@ -117,13 +127,20 @@ def func_gdf(x): return op(constant, x) # Just a single column -> result will be all NA - gdf = cudf.DataFrame({"data": [1, 2, None]}) - + gdf = cudf.DataFrame({"data": data}) + + if constant == 1 and op is operator.pow: + # The following tests cases yield differing results from pandas: + # - 1**NA + # - True**NA + # both due to pandas insisting that this is equal to 1. + pytest.skip() run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) @pytest.mark.parametrize("op", arith_ops) -def test_arith_masked_vs_null(op): +@pytest.mark.parametrize("data", [[1, cudf.NA, 3], [2, 3, cudf.NA]]) +def test_arith_masked_vs_null(op, data): def func_pdf(x): return op(x, pd.NA) @@ -131,7 +148,11 @@ def func_pdf(x): def func_gdf(x): return op(x, cudf.NA) - gdf = cudf.DataFrame({"data": [1, None, 3]}) + gdf = cudf.DataFrame({"data": data}) + + if 1 in gdf["data"] and op is operator.pow: + # In pandas, 1**NA == 1. 
+ pytest.skip() run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) @@ -255,6 +276,18 @@ def func_gdf(x): run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) +def test_apply_return_literal_only(): + def func_pdf(x): + return 5 + + @nulludf + def func_gdf(x): + return 5 + + gdf = cudf.DataFrame({"a": [1, None, 3]}) + run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + + def test_apply_everything(): def func_pdf(w, x, y, z): if x is pd.NA: @@ -290,3 +323,110 @@ def func_gdf(w, x, y, z): } ) run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + + +### + + +@pytest.mark.parametrize( + "data", [cudf.Series([1, 2, 3]), cudf.Series([1, cudf.NA, 3])] +) +def test_series_apply_basic(data): + def func(x): + return x + 1 + + run_masked_udf_series(func, func, data, check_dtype=False) + + +def test_series_apply_null_conditional(): + def func_pdf(x): + if x is pd.NA: + return 42 + else: + return x - 1 + + def func_gdf(x): + if x is cudf.NA: + return 42 + else: + return x - 1 + + data = cudf.Series([1, cudf.NA, 3]) + + run_masked_udf_series(func_pdf, func_gdf, data) + + +### + + +@pytest.mark.parametrize("op", arith_ops) +def test_series_arith_masked_vs_masked(op): + def func(x): + return op(x, x) + + data = cudf.Series([1, cudf.NA, 3]) + run_masked_udf_series(func, func, data, check_dtype=False) + + +@pytest.mark.parametrize("op", comparison_ops) +def test_series_compare_masked_vs_masked(op): + """ + In the series case, only one other MaskedType to compare with + - itself + """ + + def func(x): + return op(x, x) + + data = cudf.Series([1, cudf.NA, 3]) + run_masked_udf_series(func, func, data, check_dtype=False) + + +@pytest.mark.parametrize("op", arith_ops) +@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA]) +def test_series_arith_masked_vs_constant(op, constant): + def func(x): + return op(x, constant) + + # Just a single column -> result will be all NA + data = cudf.Series([1, 2, cudf.NA]) + if constant is cudf.NA and op 
is operator.pow: + # in pandas, 1**NA == 1. In cudf, 1**NA == NA. + with pytest.xfail(): + run_masked_udf_series(func, func, data, check_dtype=False) + return + run_masked_udf_series(func, func, data, check_dtype=False) + + +@pytest.mark.parametrize("op", arith_ops) +@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA]) +def test_series_arith_masked_vs_constant_reflected(op, constant): + def func(x): + return op(constant, x) + + # Just a single column -> result will be all NA + data = cudf.Series([1, 2, cudf.NA]) + if constant is not cudf.NA and constant == 1 and op is operator.pow: + # in pandas, 1**NA == 1. In cudf, 1**NA == NA. + with pytest.xfail(): + run_masked_udf_series(func, func, data, check_dtype=False) + return + run_masked_udf_series(func, func, data, check_dtype=False) + + +def test_series_masked_is_null_conditional(): + def func_psr(x): + if x is pd.NA: + return 42 + else: + return x + + def func_gsr(x): + if x is cudf.NA: + return 42 + else: + return x + + data = cudf.Series([1, cudf.NA, 3, cudf.NA]) + + run_masked_udf_series(func_psr, func_gsr, data, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index 25ebe6fa710..22c78b5f933 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -19,16 +19,16 @@ def test_series_abs(dtype): arr = (np.random.random(1000) * 100).astype(dtype) sr = Series(arr) - np.testing.assert_equal(sr.abs().to_array(), np.abs(arr)) - np.testing.assert_equal(abs(sr).to_array(), abs(arr)) + np.testing.assert_equal(sr.abs().to_numpy(), np.abs(arr)) + np.testing.assert_equal(abs(sr).to_numpy(), abs(arr)) @pytest.mark.parametrize("dtype", utils.INTEGER_TYPES) def test_series_invert(dtype): arr = (np.random.random(1000) * 100).astype(dtype) sr = Series(arr) - np.testing.assert_equal((~sr).to_array(), np.invert(arr)) - np.testing.assert_equal((~sr).to_array(), ~arr) + np.testing.assert_equal((~sr).to_numpy(), np.invert(arr)) + 
np.testing.assert_equal((~sr).to_numpy(), ~arr) @pytest.mark.parametrize("dtype", utils.INTEGER_TYPES + ["bool"]) @@ -41,28 +41,28 @@ def test_series_not(dtype): arr = arr * (np.random.random(1000) * 100).astype(dtype) sr = Series(arr) - result = cudf.logical_not(sr).to_array() + result = cudf.logical_not(sr).to_numpy() expect = np.logical_not(arr) np.testing.assert_equal(result, expect) - np.testing.assert_equal((~sr).to_array(), ~arr) + np.testing.assert_equal((~sr).to_numpy(), ~arr) def test_series_neg(): arr = np.random.random(100) * 100 sr = Series(arr) - np.testing.assert_equal((-sr).to_array(), -arr) + np.testing.assert_equal((-sr).to_numpy(), -arr) def test_series_ceil(): arr = np.random.random(100) * 100 sr = Series(arr) - np.testing.assert_equal(sr.ceil().to_array(), np.ceil(arr)) + np.testing.assert_equal(sr.ceil().to_numpy(), np.ceil(arr)) def test_series_floor(): arr = np.random.random(100) * 100 sr = Series(arr) - np.testing.assert_equal(sr.floor().to_array(), np.floor(arr)) + np.testing.assert_equal(sr.floor().to_numpy(), np.floor(arr)) @pytest.mark.parametrize("nelem", [1, 7, 8, 9, 32, 64, 128]) @@ -77,7 +77,7 @@ def test_validity_ceil(nelem): res = sr.ceil() na_value = -100000 - got = res.fillna(na_value).to_array() + got = res.fillna(na_value).to_numpy() res_mask = np.asarray(bitmask, dtype=np.bool_)[: data.size] expect = np.ceil(data) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 727bbb1c345..7b7fe674210 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -210,6 +210,16 @@ def grouped_window_sizes_from_offset(arr, group_starts, offset): _udf_code_cache: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) +def make_cache_key(udf, sig): + codebytes = udf.__code__.co_code + if udf.__closure__ is not None: + cvars = tuple([x.cell_contents for x in udf.__closure__]) + cvarbytes = dumps(cvars) + else: + cvarbytes = b"" + return codebytes, cvarbytes, sig + + def 
compile_udf(udf, type_signature): """Compile ``udf`` with `numba` @@ -244,14 +254,7 @@ def compile_udf(udf, type_signature): # Check if we've already compiled a similar (but possibly distinct) # function before - codebytes = udf.__code__.co_code - if udf.__closure__ is not None: - cvars = tuple([x.cell_contents for x in udf.__closure__]) - cvarbytes = dumps(cvars) - else: - cvarbytes = b"" - - key = (type_signature, codebytes, cvarbytes) + key = make_cache_key(udf, type_signature) res = _udf_code_cache.get(key) if res: return res diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index bdaf5e144a5..c5620bed078 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -499,6 +499,34 @@ def find_common_type(dtypes): if len(dtypes) == 0: return None + # Early exit for categoricals since they're not hashable and therefore + # can't be put in a set. + if any(cudf.api.types.is_categorical_dtype(dtype) for dtype in dtypes): + if all( + ( + cudf.api.types.is_categorical_dtype(dtype) + and (not dtype.ordered if hasattr(dtype, "ordered") else True) + ) + for dtype in dtypes + ): + if len(set(dtype._categories.dtype for dtype in dtypes)) == 1: + return cudf.CategoricalDtype( + cudf.core.column.concat_columns( + [dtype._categories for dtype in dtypes] + ).unique() + ) + else: + raise ValueError( + "Only unordered categories of the same underlying type " + "may be coerced to a common type." + ) + else: + # TODO: Should this be an error case (mixing categorical with other + # dtypes) or should this return object? Unclear if we have enough + # information to decide right now, may have to come back to this as + # usage of find_common_type increases. 
+ return cudf.dtype("O") + # Aggregate same types dtypes = set(dtypes) diff --git a/python/cudf/requirements/cuda-11.0/dev_requirements.txt b/python/cudf/requirements/cuda-11.0/dev_requirements.txt index f0e2b2f8bcf..d8dce276820 100644 --- a/python/cudf/requirements/cuda-11.0/dev_requirements.txt +++ b/python/cudf/requirements/cuda-11.0/dev_requirements.txt @@ -36,6 +36,6 @@ sphinx sphinx-copybutton sphinx-markdown-tables sphinxcontrib-websupport -transformers +transformers<=4.10.3 typing_extensions wheel diff --git a/python/cudf/requirements/cuda-11.2/dev_requirements.txt b/python/cudf/requirements/cuda-11.2/dev_requirements.txt index df7e78354cd..c11d108360d 100644 --- a/python/cudf/requirements/cuda-11.2/dev_requirements.txt +++ b/python/cudf/requirements/cuda-11.2/dev_requirements.txt @@ -36,6 +36,6 @@ sphinx sphinx-copybutton sphinx-markdown-tables sphinxcontrib-websupport -transformers +transformers<=4.10.3 typing_extensions wheel diff --git a/python/cudf/setup.py b/python/cudf/setup.py index cb8ca9158e1..c081a719808 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -52,7 +52,7 @@ "python-snappy>=0.6.0", "pyorc", "msgpack", - "transformers", + "transformers<=4.10.3", ] } diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index e604e5511da..d8037cadd7c 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -420,7 +420,7 @@ def struct(self): class Index(Series, dd.core.Index): - _partition_type = cudf.Index + _partition_type = cudf.Index # type: ignore def _naive_var(ddf, meta, skipna, ddof, split_every, out): diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 7e2c3a4f36c..0cf9d835523 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -1,6 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
import math from operator import getitem +from typing import Set import numpy as np import pandas as pd @@ -378,7 +379,7 @@ def _is_supported(arg, supported: set): """ if isinstance(arg, (list, dict)): if isinstance(arg, dict): - _global_set = set() + _global_set: Set[str] = set() for col in arg: if isinstance(arg[col], list): _global_set = _global_set.union(set(arg[col])) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 850cc0843cc..f5224681a1e 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -5,6 +5,7 @@ from io import BufferedWriter, BytesIO, IOBase import numpy as np +import pyarrow as pa from pyarrow import dataset as pa_ds, parquet as pq from dask import dataframe as dd @@ -45,7 +46,6 @@ def read_metadata(*args, **kwargs): and strings_to_cats ): new_meta._data[col] = new_meta._data[col].astype("int32") - return (new_meta, stats, parts, index) @classmethod @@ -332,12 +332,13 @@ def set_object_dtypes_from_pa_schema(df, schema): # pyarrow schema. 
if schema: for col_name, col in df._data.items(): - if col_name in schema.names and isinstance( - col, cudf.core.column.StringColumn + typ = schema.field(col_name).type + if ( + col_name in schema.names + and not isinstance(typ, (pa.ListType, pa.StructType)) + and isinstance(col, cudf.core.column.StringColumn) ): - df._data[col_name] = col.astype( - cudf_dtype_from_pa_type(schema.field(col_name).type) - ) + df._data[col_name] = col.astype(cudf_dtype_from_pa_type(typ)) def read_parquet( diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index d512418ed45..4c263ca2e53 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -491,6 +491,7 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): @pytest.mark.parametrize( "data", [ + ["dog", "cat", "fish"], [[0], [1, 2], [3]], [None, [1, 2], [3]], [{"f1": 1}, {"f1": 0, "f2": "dog"}, {"f2": "cat"}], @@ -502,5 +503,7 @@ def test_cudf_dtypes_from_pandas(tmpdir, data): fn = str(tmpdir.join("test.parquet")) dfp = pd.DataFrame({"data": data}) dfp.to_parquet(fn, engine="pyarrow", index=True) - ddf2 = dask_cudf.read_parquet(fn) + # Use `split_row_groups=True` to avoid "fast path" where + # schema is not passed through in older Dask versions + ddf2 = dask_cudf.read_parquet(fn, split_row_groups=True) dd.assert_eq(cudf.from_pandas(dfp), ddf2)