diff --git a/build.sh b/build.sh index e6169b2036b..bee66d819b4 100755 --- a/build.sh +++ b/build.sh @@ -315,9 +315,11 @@ if buildAll || hasArg libcudf; then LIBCUDF_FS=$(ls -lh ${LIB_BUILD_DIR}/libcudf.so | awk '{print $5}') MSG="${MSG}
libcudf.so size: $LIBCUDF_FS" fi - echo "$MSG" - python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${LIB_BUILD_DIR}/ninja_log.html - cp ${LIB_BUILD_DIR}/.ninja_log ${LIB_BUILD_DIR}/ninja.log + BMR_DIR=${RAPIDS_ARTIFACTS_DIR:-"${LIB_BUILD_DIR}"} + echo "Metrics output dir: [$BMR_DIR]" + mkdir -p ${BMR_DIR} + python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${BMR_DIR}/ninja_log.html + cp ${LIB_BUILD_DIR}/.ninja_log ${BMR_DIR}/ninja.log fi if [[ ${INSTALL_TARGET} != "" ]]; then diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 3b45b3ce2e7..b68c2bdbef6 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. set -euo pipefail @@ -14,3 +14,29 @@ rapids-logger "Begin cpp build" rapids-mamba-retry mambabuild conda/recipes/libcudf rapids-upload-conda-to-s3 cpp + +echo "++++++++++++++++++++++++++++++++++++++++++++" + +if [[ -d $RAPIDS_ARTIFACTS_DIR ]]; then + ls -l ${RAPIDS_ARTIFACTS_DIR} +fi + +echo "++++++++++++++++++++++++++++++++++++++++++++" + +FILE=${RAPIDS_ARTIFACTS_DIR}/ninja.log +if [[ -f $FILE ]]; then + echo -e "\x1B[33;1m\x1B[48;5;240m Ninja log for this build available at the following link \x1B[0m" + UPLOAD_NAME=cpp_cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch).ninja.log + rapids-upload-to-s3 "${UPLOAD_NAME}" "${FILE}" +fi + +echo "++++++++++++++++++++++++++++++++++++++++++++" + +FILE=${RAPIDS_ARTIFACTS_DIR}/ninja_log.html +if [[ -f $FILE ]]; then + echo -e "\x1B[33;1m\x1B[48;5;240m Build Metrics Report for this build available at the following link \x1B[0m" + UPLOAD_NAME=cpp_cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch).BuildMetricsReport.html + rapids-upload-to-s3 "${UPLOAD_NAME}" "${FILE}" +fi + +echo "++++++++++++++++++++++++++++++++++++++++++++" diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 0be72486319..983a63d4ce9 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -66,21 +66,5 @@ for gt in "$CONDA_PREFIX"/bin/gtests/{libcudf,libcudf_kafka}/* ; do fi done -if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then - rapids-logger "Memcheck gtests with rmm_mode=cuda" - export GTEST_CUDF_RMM_MODE=cuda - COMPUTE_SANITIZER_CMD="compute-sanitizer --tool memcheck" - for gt in "$CONDA_PREFIX"/bin/gtests/{libcudf,libcudf_kafka}/* ; do - test_name=$(basename ${gt}) - if [[ "$test_name" == "ERROR_TEST" ]]; then - continue - fi - echo "Running gtest $test_name" - ${COMPUTE_SANITIZER_CMD} ${gt} | tee "${RAPIDS_TESTS_DIR}${test_name}.cs.log" - done - unset GTEST_CUDF_RMM_MODE - # TODO: test-results/*.cs.log are processed in CI -fi - rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index b0b86b427b7..fbfcf6e71a2 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -27,6 +27,7 @@ build: - SCCACHE_IDLE_TIMEOUT - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY + - RAPIDS_ARTIFACTS_DIR requirements: build: diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index 0f005c462cc..1b3f4190680 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -57,12 +57,12 @@ static void BM_split(benchmark::State& state, split_type rt) static void generate_bench_args(benchmark::internal::Benchmark* b) { - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; + int constexpr min_rows = 1 << 12; + int constexpr max_rows = 1 << 24; + int constexpr row_mult = 8; + int constexpr min_rowlen = 1 << 5; + int constexpr max_rowlen = 1 << 13; + int constexpr len_mult = 2; for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) { for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { // avoid generating combinations that exceed the cudf column limit diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index fd4c049e2fc..a0ef2155f7d 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -112,11 +112,21 @@ class datasource { /** * @brief Creates a source from a host memory buffer. * + * @deprecated Since 23.04 + * * @param[in] buffer Host buffer object * @return Constructed datasource object */ static std::unique_ptr<datasource> create(host_buffer const& buffer); + /** + * @brief Creates a source from a host memory buffer. + * + * @param[in] buffer Host buffer object + * @return Constructed datasource object + */ + static std::unique_ptr<datasource> create(cudf::host_span<std::byte const> buffer); + /** * @brief Creates a source from a device memory buffer. * diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 06b52563e19..6f97eb768d9 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -150,6 +150,8 @@ struct table_with_metadata { /** * @brief Non-owning view of a host memory buffer * + * @deprecated Since 23.04 + * * Used to describe buffer input in `source_info` objects. */ struct host_buffer { @@ -166,6 +168,22 @@ struct host_buffer { host_buffer(const char* data, size_t size) : data(data), size(size) {} }; +/** + * @brief Returns `true` if the type is byte-like, meaning it is reasonable to pass as a pointer to + * bytes.
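A minimal illustration of the byte-like trait described above — assuming it is exposed as `cudf::io::is_byte_like_type`, matching the surrounding `cudf::io` declarations; the checks mirror the element types its definition lists (`int8_t`, `char`, `uint8_t`, `unsigned char`, `std::byte`) with cv-qualifiers stripped:

```cpp
// Sketch: compile-time checks for the new byte-like trait (assumed to live in cudf::io).
#include <cudf/io/types.hpp>

#include <cstddef>
#include <cstdint>

static_assert(cudf::io::is_byte_like_type<std::byte>());
static_assert(cudf::io::is_byte_like_type<char>());
static_assert(cudf::io::is_byte_like_type<unsigned char>());
static_assert(cudf::io::is_byte_like_type<std::int8_t>());
static_assert(cudf::io::is_byte_like_type<std::uint8_t const>());  // cv-qualifiers are removed first
static_assert(!cudf::io::is_byte_like_type<int>());                // wider integer types are rejected
static_assert(!cudf::io::is_byte_like_type<float>());
```

These are the same element types exercised by the `ByteLikeTypes` list in the Parquet reader-source tests added later in this diff.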
+ * + * @tparam T The representation type + * @return `true` if the type is considered a byte-like type + */ +template +constexpr inline auto is_byte_like_type() +{ + using non_cv_T = std::remove_cv_t; + return std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v; +} + /** * @brief Source information for read interfaces */ @@ -191,21 +209,70 @@ struct source_info { /** * @brief Construct a new source info object for multiple buffers in host memory * + * @deprecated Since 23.04 + * * @param host_buffers Input buffers in host memory */ - explicit source_info(std::vector const& host_buffers) - : _type(io_type::HOST_BUFFER), _host_buffers(host_buffers) + explicit source_info(std::vector const& host_buffers) : _type(io_type::HOST_BUFFER) { + _host_buffers.reserve(host_buffers.size()); + std::transform(host_buffers.begin(), + host_buffers.end(), + std::back_inserter(_host_buffers), + [](auto const hb) { + return cudf::host_span{ + reinterpret_cast(hb.data), hb.size}; + }); } /** * @brief Construct a new source info object for a single buffer * + * @deprecated Since 23.04 + * * @param host_data Input buffer in host memory * @param size Size of the buffer */ explicit source_info(const char* host_data, size_t size) - : _type(io_type::HOST_BUFFER), _host_buffers({{host_data, size}}) + : _type(io_type::HOST_BUFFER), + _host_buffers( + {cudf::host_span(reinterpret_cast(host_data), size)}) + { + } + + /** + * @brief Construct a new source info object for multiple buffers in host memory + * + * @param host_buffers Input buffers in host memory + */ + template >())> + explicit source_info(cudf::host_span> const host_buffers) + : _type(io_type::HOST_BUFFER) + { + if constexpr (not std::is_same_v, std::byte>) { + _host_buffers.reserve(host_buffers.size()); + std::transform(host_buffers.begin(), + host_buffers.end(), + std::back_inserter(_host_buffers), + [](auto const s) { + return cudf::host_span{ + reinterpret_cast(s.data()), s.size()}; + }); + } else { + _host_buffers.assign(host_buffers.begin(), host_buffers.end()); + } + } + + /** + * @brief Construct a new source info object for a single buffer + * + * @param host_data Input buffer in host memory + */ + template >())> + explicit source_info(cudf::host_span host_data) + : _type(io_type::HOST_BUFFER), + _host_buffers{cudf::host_span( + reinterpret_cast(host_data.data()), host_data.size())} { } @@ -289,7 +356,7 @@ struct source_info { private: io_type _type = io_type::FILEPATH; std::vector _filepaths; - std::vector _host_buffers; + std::vector> _host_buffers; std::vector> _device_buffers; std::vector _user_sources; }; diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index c2f7b18d443..71d64900398 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -329,10 +329,16 @@ std::unique_ptr datasource::create(const std::string& filepath, } std::unique_ptr datasource::create(host_buffer const& buffer) +{ + return create( + cudf::host_span{reinterpret_cast(buffer.data), buffer.size}); +} + +std::unique_ptr datasource::create(cudf::host_span buffer) { // Use Arrow IO buffer class for zero-copy reads of host memory return std::make_unique(std::make_shared( - reinterpret_cast(buffer.data), buffer.size)); + reinterpret_cast(buffer.data()), buffer.size())); } std::unique_ptr datasource::create(cudf::device_span buffer) diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index c11d7ad47f9..18599fb568a 100644 --- 
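As a usage sketch for the span-based source overloads above: the snippet below shows how a caller might wrap an in-memory buffer in a `cudf::host_span` and hand it to `cudf::io::source_info` for a Parquet read. It assumes the `source_info(cudf::host_span<T>)` constructor introduced in this diff; the function name and the choice of a `std::vector<char>` buffer are illustrative only.

```cpp
// Sketch: reading Parquet from host memory via the new span-based source_info.
#include <cudf/io/parquet.hpp>
#include <cudf/io/types.hpp>
#include <cudf/utilities/span.hpp>

#include <vector>

cudf::io::table_with_metadata read_parquet_from_memory(std::vector<char> const& buffer)
{
  // Non-owning view of the host memory; no copy is made here.
  auto const span = cudf::host_span<char const>(buffer.data(), buffer.size());

  // Any byte-like element type (char, unsigned char, std::byte, ...) is accepted.
  auto const source  = cudf::io::source_info(span);
  auto const options = cudf::io::parquet_reader_options::builder(source).build();
  return cudf::io::read_parquet(options);
}
```

The new `BufferSourceTypes` test below performs the same round trip for each byte-like element type.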
a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "split.cuh" + #include #include #include @@ -31,14 +33,10 @@ #include #include -#include -#include -#include #include #include #include #include -#include #include #include @@ -46,321 +44,8 @@ namespace cudf { namespace strings { namespace detail { -using string_index_pair = thrust::pair; - namespace { -/** - * @brief Base class for delimiter-based tokenizers. - * - * These are common methods used by both split and rsplit tokenizer functors. - */ -struct base_split_tokenizer { - __device__ const char* get_base_ptr() const - { - return d_strings.child(strings_column_view::chars_column_index).data(); - } - - __device__ string_view const get_string(size_type idx) const - { - return d_strings.element(idx); - } - - __device__ bool is_valid(size_type idx) const { return d_strings.is_valid(idx); } - - /** - * @brief Initialize token elements for all strings. - * - * The process_tokens() only handles creating tokens for strings that contain - * delimiters. This function will initialize the output tokens for all - * strings by assigning null entries for null and empty strings and the - * string itself for strings with no delimiters. - * - * The tokens are placed in output order so that all tokens for each output - * column are stored consecutively in `d_all_tokens`. - * - * @param idx Index of string in column - * @param column_count Number of columns in output - * @param d_all_tokens Tokens vector for all strings - */ - __device__ void init_tokens(size_type idx, - size_type column_count, - string_index_pair* d_all_tokens) const - { - auto d_tokens = d_all_tokens + idx; - if (is_valid(idx)) { - auto d_str = get_string(idx); - *d_tokens = string_index_pair{d_str.data(), d_str.size_bytes()}; - --column_count; - d_tokens += d_strings.size(); - } - // this is like fill() but output needs to be strided - for (size_type col = 0; col < column_count; ++col) - d_tokens[d_strings.size() * col] = string_index_pair{nullptr, 0}; - } - - base_split_tokenizer(column_device_view const& d_strings, - string_view const& d_delimiter, - size_type max_tokens) - : d_strings(d_strings), d_delimiter(d_delimiter), max_tokens(max_tokens) - { - } - - protected: - column_device_view const d_strings; // strings to split - string_view const d_delimiter; // delimiter for split - size_type max_tokens; -}; - -/** - * @brief The tokenizer functions for split(). - * - * The methods here count delimiters, tokens, and output token elements - * for each string in a strings column. - */ -struct split_tokenizer_fn : base_split_tokenizer { - /** - * @brief This will create tokens around each delimiter honoring the string boundaries - * in which the delimiter resides. - * - * Each token is placed in `d_all_tokens` so they align consecutively - * with other tokens for the same output column. - * That is, `d_tokens[col * strings_count + string_index]` is the token at column `col` - * for string at `string_index`. 
- * - * @param idx Index of the delimiter in the chars column - * @param d_token_counts Token counts for each string - * @param d_positions The beginning byte position of each delimiter - * @param positions_count Number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_all_tokens All output tokens for the strings column - */ - __device__ void process_tokens(size_type idx, - size_type const* d_token_counts, - size_type const* d_positions, - size_type positions_count, - size_type const* d_indexes, - string_index_pair* d_all_tokens) const - { - size_type str_idx = d_indexes[idx]; - if ((idx > 0) && d_indexes[idx - 1] == str_idx) - return; // the first delimiter for the string rules them all - --str_idx; // all of these are off by 1 from the upper_bound call - size_type token_count = d_token_counts[str_idx]; // max_tokens already included - const char* const base_ptr = get_base_ptr(); // d_positions values are based on this ptr - // this string's tokens output - auto d_tokens = d_all_tokens + str_idx; - // this string - const string_view d_str = get_string(str_idx); - const char* str_ptr = d_str.data(); // beginning of the string - const char* const str_end_ptr = str_ptr + d_str.size_bytes(); // end of the string - // build the index-pair of each token for this string - for (size_type col = 0; col < token_count; ++col) { - auto next_delim = ((idx + col) < positions_count) // boundary check for delims in last string - ? (base_ptr + d_positions[idx + col]) // start of next delimiter - : str_end_ptr; // or end of this string - auto eptr = (next_delim < str_end_ptr) // make sure delimiter is inside this string - && (col + 1 < token_count) // and this is not the last token - ? next_delim - : str_end_ptr; - // store the token into the output vector - d_tokens[col * d_strings.size()] = - string_index_pair{str_ptr, static_cast(eptr - str_ptr)}; - // point past this delimiter - str_ptr = eptr + d_delimiter.size_bytes(); - } - } - - /** - * @brief Returns `true` if the byte at `idx` is the start of the delimiter. - * - * @param idx Index of a byte in the chars column. - * @param d_offsets Offsets values to locate the chars ranges. - * @param chars_bytes Total number of characters to process. - * @return true if delimiter is found starting at position `idx` - */ - __device__ bool is_delimiter(size_type idx, // chars index - int32_t const* d_offsets, - size_type chars_bytes) const - { - auto d_chars = get_base_ptr() + d_offsets[0]; - if (idx + d_delimiter.size_bytes() > chars_bytes) return false; - return d_delimiter.compare(d_chars + idx, d_delimiter.size_bytes()) == 0; - } - - /** - * @brief This counts the tokens for strings that contain delimiters. 
- * - * @param idx Index of a delimiter - * @param d_positions Start positions of all the delimiters - * @param positions_count The number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_counts The token counts for all the strings - */ - __device__ void count_tokens(size_type idx, // delimiter index - size_type const* d_positions, - size_type positions_count, - size_type const* d_indexes, - size_type* d_counts) const - { - size_type str_idx = d_indexes[idx]; - if ((idx > 0) && d_indexes[idx - 1] == str_idx) - return; // first delimiter found handles all of them for this string - auto const delim_length = d_delimiter.size_bytes(); - string_view const d_str = get_string(str_idx - 1); - const char* const base_ptr = get_base_ptr(); - size_type delim_count = 0; // re-count delimiters to compute the token-count - size_type last_pos = d_positions[idx] - delim_length; - while ((idx < positions_count) && (d_indexes[idx] == str_idx)) { - // make sure the whole delimiter is inside the string before counting it - auto d_pos = d_positions[idx]; - if (((base_ptr + d_pos + delim_length - 1) < (d_str.data() + d_str.size_bytes())) && - ((d_pos - last_pos) >= delim_length)) { - ++delim_count; // only count if the delimiter fits - last_pos = d_pos; // overlapping delimiters are ignored too - } - ++idx; - } - // the number of tokens is delim_count+1 but capped to max_tokens - d_counts[str_idx - 1] = - ((max_tokens > 0) && (delim_count + 1 > max_tokens)) ? max_tokens : delim_count + 1; - } - - split_tokenizer_fn(column_device_view const& d_strings, - string_view const& d_delimiter, - size_type max_tokens) - : base_split_tokenizer(d_strings, d_delimiter, max_tokens) - { - } -}; - -/** - * @brief The tokenizer functions for split(). - * - * The methods here count delimiters, tokens, and output token elements - * for each string in a strings column. - * - * Same as split_tokenizer_fn except tokens are counted from the end of each string. - */ -struct rsplit_tokenizer_fn : base_split_tokenizer { - /** - * @brief This will create tokens around each delimiter honoring the string boundaries - * in which the delimiter resides. - * - * The tokens are processed from the end of each string so the `max_tokens` - * is honored correctly. - * - * Each token is placed in `d_all_tokens` so they align consecutively - * with other tokens for the same output column. - * That is, `d_tokens[col * strings_count + string_index]` is the token at column `col` - * for string at `string_index`. 
- * - * @param idx Index of the delimiter in the chars column - * @param d_token_counts Token counts for each string - * @param d_positions The ending byte position of each delimiter - * @param positions_count Number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_all_tokens All output tokens for the strings column - */ - __device__ void process_tokens(size_type idx, // delimiter position index - size_type const* d_token_counts, // token counts for each string - size_type const* d_positions, // end of each delimiter - size_type positions_count, // total number of delimiters - size_type const* d_indexes, // string indices for each delimiter - string_index_pair* d_all_tokens) const - { - size_type str_idx = d_indexes[idx]; - if ((idx + 1 < positions_count) && d_indexes[idx + 1] == str_idx) - return; // the last delimiter for the string rules them all - --str_idx; // all of these are off by 1 from the upper_bound call - size_type token_count = d_token_counts[str_idx]; // max_tokens already included - const char* const base_ptr = get_base_ptr(); // d_positions values are based on this ptr - // this string's tokens output - auto d_tokens = d_all_tokens + str_idx; - // this string - const string_view d_str = get_string(str_idx); - const char* const str_begin_ptr = d_str.data(); // beginning of the string - const char* str_ptr = str_begin_ptr + d_str.size_bytes(); // end of the string - // build the index-pair of each token for this string - for (size_type col = 0; col < token_count; ++col) { - auto prev_delim = (idx >= col) // boundary check for delims in first string - ? (base_ptr + d_positions[idx - col] + 1) // end of prev delimiter - : str_begin_ptr; // or the start of this string - auto sptr = (prev_delim > str_begin_ptr) // make sure delimiter is inside the string - && (col + 1 < token_count) // and this is not the last token - ? prev_delim - : str_begin_ptr; - // store the token into the output -- building the array backwards - d_tokens[d_strings.size() * (token_count - 1 - col)] = - string_index_pair{sptr, static_cast(str_ptr - sptr)}; - str_ptr = sptr - d_delimiter.size_bytes(); // get ready for the next prev token - } - } - - /** - * @brief Returns `true` if the byte at `idx` is the end of the delimiter. - * - * @param idx Index of a byte in the chars column. - * @param d_offsets Offsets values to locate the chars ranges. - * @return true if delimiter is found ending at position `idx` - */ - __device__ bool is_delimiter(size_type idx, int32_t const* d_offsets, size_type) const - { - auto delim_length = d_delimiter.size_bytes(); - if (idx < delim_length - 1) return false; - auto d_chars = get_base_ptr() + d_offsets[0]; - return d_delimiter.compare(d_chars + idx - (delim_length - 1), delim_length) == 0; - } - - /** - * @brief This counts the tokens for strings that contain delimiters. - * - * Token counting starts at the end of the string to honor the `max_tokens` - * appropriately. 
- * - * @param idx Index of a delimiter - * @param d_positions End positions of all the delimiters - * @param positions_count The number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_counts The token counts for all the strings - */ - __device__ void count_tokens(size_type idx, - size_type const* d_positions, - size_type positions_count, - size_type const* d_indexes, - size_type* d_counts) const - { - size_type str_idx = d_indexes[idx]; // 1-based string index created by upper_bound() - if ((idx > 0) && d_indexes[idx - 1] == str_idx) - return; // first delimiter found handles all of them for this string - auto const delim_length = d_delimiter.size_bytes(); - const string_view d_str = get_string(str_idx - 1); // -1 for 0-based index - const char* const base_ptr = get_base_ptr(); - size_type delim_count = 0; - size_type last_pos = d_positions[idx] - delim_length; - while ((idx < positions_count) && (d_indexes[idx] == str_idx)) { - // make sure the whole delimiter is inside the string before counting it - auto d_pos = d_positions[idx]; - if (((base_ptr + d_pos + 1 - delim_length) >= d_str.data()) && - ((d_pos - last_pos) >= delim_length)) { - ++delim_count; // only count if the delimiter fits - last_pos = d_pos; // overlapping delimiters are also ignored - } - ++idx; - } - // the number of tokens is delim_count+1 but capped to max_tokens - d_counts[str_idx - 1] = - ((max_tokens > 0) && (delim_count + 1 > max_tokens)) ? max_tokens : delim_count + 1; - } - - rsplit_tokenizer_fn(column_device_view const& d_strings, - string_view const& d_delimiter, - size_type max_tokens) - : base_split_tokenizer(d_strings, d_delimiter, max_tokens) - { - } -}; - /** * @brief Generic split function called by split() and rsplit(). * @@ -423,125 +108,42 @@ struct rsplit_tokenizer_fn : base_split_tokenizer { * @return table of columns for the output of the split */ template -std::unique_ptr split_fn(strings_column_view const& strings_column, +std::unique_ptr
split_fn(strings_column_view const& input, Tokenizer tokenizer, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { std::vector> results; - auto const strings_count = strings_column.size(); - if (strings_count == 0) { - results.push_back(make_empty_column(type_id::STRING)); + if (input.size() == input.null_count()) { + results.push_back(std::make_unique(input.parent(), stream, mr)); return std::make_unique
(std::move(results)); } - auto d_offsets = strings_column.offsets_begin(); - auto const chars_bytes = - cudf::detail::get_value( - strings_column.offsets(), strings_column.offset() + strings_count, stream) - - cudf::detail::get_value(strings_column.offsets(), strings_column.offset(), stream); + // builds the offsets and the vector of all tokens + auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr); + auto const d_offsets = offsets->view().template data(); + auto const d_tokens = tokens.data(); - // count the number of delimiters in the entire column - auto const delimiter_count = - thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { - return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); - }); - - // create vector of every delimiter position in the chars column - rmm::device_uvector delimiter_positions(delimiter_count, stream); - auto d_positions = delimiter_positions.data(); - auto copy_end = thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - delimiter_positions.begin(), - [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { - return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); - }); - - // create vector of string indices for each delimiter - rmm::device_uvector string_indices(delimiter_count, stream); // these will - auto d_string_indices = string_indices.data(); // be strings that only contain delimiters - thrust::upper_bound(rmm::exec_policy(stream), - d_offsets, - d_offsets + strings_count, - delimiter_positions.begin(), - copy_end, - string_indices.begin()); - - // compute the number of tokens per string - rmm::device_uvector token_counts(strings_count, stream); - auto d_token_counts = token_counts.data(); - // first, initialize token counts for strings without delimiters in them - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_token_counts, - [tokenizer] __device__(size_type idx) { - // null are 0, all others 1 - return static_cast(tokenizer.is_valid(idx)); - }); - - // now compute the number of tokens in each string - thrust::for_each_n( + // compute the maximum number of tokens for any string + auto const columns_count = thrust::transform_reduce( rmm::exec_policy(stream), thrust::make_counting_iterator(0), - delimiter_count, - [tokenizer, d_positions, delimiter_count, d_string_indices, d_token_counts] __device__( - size_type idx) { - tokenizer.count_tokens(idx, d_positions, delimiter_count, d_string_indices, d_token_counts); - }); - - // the columns_count is the maximum number of tokens for any string - auto const columns_count = thrust::reduce( - rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); - // boundary case: if no columns, return one null column (custrings issue #119) - if (columns_count == 0) { - results.push_back(std::make_unique( - data_type{type_id::STRING}, - strings_count, - rmm::device_buffer{0, stream, mr}, // no data - cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), - strings_count)); - } + thrust::make_counting_iterator(input.size()), + [d_offsets] __device__(auto idx) -> size_type { return d_offsets[idx + 1] - d_offsets[idx]; }, + 0, + thrust::maximum{}); - // create working area to hold all token positions - rmm::device_uvector tokens(columns_count 
* strings_count, stream); - string_index_pair* d_tokens = tokens.data(); - // initialize the token positions - // -- accounts for nulls, empty, and strings with no delimiter in them - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - [tokenizer, columns_count, d_tokens] __device__(size_type idx) { - tokenizer.init_tokens(idx, columns_count, d_tokens); - }); - - // get the positions for every token using the delimiter positions - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - delimiter_count, - [tokenizer, - d_token_counts, - d_positions, - delimiter_count, - d_string_indices, - d_tokens] __device__(size_type idx) { - tokenizer.process_tokens( - idx, d_token_counts, d_positions, delimiter_count, d_string_indices, d_tokens); - }); - - // Create each column. - // - Each pair points to the strings for that column for each row. - // - Create the strings column from the vector using the strings factory. + // build strings columns for each token position for (size_type col = 0; col < columns_count; ++col) { - auto column_tokens = d_tokens + (col * strings_count); - results.emplace_back( - make_strings_column(column_tokens, column_tokens + strings_count, stream, mr)); + auto itr = cudf::detail::make_counting_transform_iterator( + 0, [d_tokens, d_offsets, col] __device__(size_type idx) { + auto const offset = d_offsets[idx]; + auto const token_count = d_offsets[idx + 1] - offset; + return (col < token_count) ? d_tokens[offset + col] : string_index_pair{nullptr, 0}; + }); + results.emplace_back(make_strings_column(itr, itr + input.size(), stream, mr)); } + return std::make_unique
(std::move(results)); } diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh new file mode 100644 index 00000000000..41213dac58b --- /dev/null +++ b/cpp/src/strings/split/split.cuh @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cudf::strings::detail { + +/** + * @brief Base class for delimiter-based tokenizers + * + * These are common methods used by both split and rsplit tokenizer functors. + * + * The Derived class is required to implement the `process_tokens` function. + */ +template +struct base_split_tokenizer { + __device__ char const* get_base_ptr() const + { + return d_strings.child(strings_column_view::chars_column_index).data(); + } + + __device__ string_view const get_string(size_type idx) const + { + return d_strings.element(idx); + } + + __device__ bool is_valid(size_type idx) const { return d_strings.is_valid(idx); } + + /** + * @brief Returns `true` if the byte at `idx` is the start of the delimiter + * + * @param idx Index of a byte in the chars column + * @param d_offsets Offsets values to locate the chars ranges + * @param chars_bytes Total number of characters to process + * @return true if delimiter is found starting at position `idx` + */ + __device__ bool is_delimiter(size_type idx, + size_type const* d_offsets, + size_type chars_bytes) const + { + auto const d_chars = get_base_ptr() + d_offsets[0]; + if (idx + d_delimiter.size_bytes() > chars_bytes) { return false; } + return d_delimiter.compare(d_chars + idx, d_delimiter.size_bytes()) == 0; + } + + /** + * @brief This counts the tokens for strings that contain delimiters + * + * Counting tokens is the same regardless if counting from the left + * or from the right. This logic counts from the left which is simpler. + * The count will be truncated appropriately to the max_tokens value. 
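To make the counting rule above concrete, here is a host-side sketch — an illustration only, not the device code — of the same two rules: a candidate delimiter is skipped when it overlaps the previously counted match, and the result is capped by `max_tokens`. The standalone `count_tokens` helper and its `main` are hypothetical; the expected values match the `MultiByteDelimiters` test added later in this diff. (The device version additionally checks that each delimiter fits inside the string, because its positions come from a scan over the whole chars column.)

```cpp
// Host-side sketch of the token-counting rule (overlap skipping + max_tokens cap).
#include <cassert>
#include <string>

int count_tokens(std::string const& str, std::string const& delim, int max_tokens = 0)
{
  auto const delim_size = static_cast<int>(delim.size());
  int token_count = 1;           // a valid string always yields at least one token
  int last_pos    = -delim_size; // ensures the first match is always counted
  for (auto pos = str.find(delim); pos != std::string::npos; pos = str.find(delim, pos + 1)) {
    // skip matches that overlap the previously counted delimiter
    if (static_cast<int>(pos) - last_pos >= delim_size) {
      ++token_count;
      last_pos = static_cast<int>(pos);
    }
  }
  // a max_tokens of 0 means "no limit"
  return (max_tokens > 0 && token_count > max_tokens) ? max_tokens : token_count;
}

int main()
{
  assert(count_tokens("w:::x", "::") == 2);     // "w", ":x"
  assert(count_tokens("y::::z", "::") == 3);    // "y", "", "z"
  assert(count_tokens(":::c:::", "::") == 3);   // "", ":c", ":"
  assert(count_tokens("y::::z", "::", 2) == 2); // capped by max_tokens
  return 0;
}
```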
+ * + * @param idx Index of input string + * @param d_positions Start positions of all the delimiters + * @param d_delimiter_offsets Offsets per string to delimiters in d_positions + */ + __device__ size_type count_tokens(size_type idx, + size_type const* d_positions, + size_type const* d_delimiter_offsets) const + { + if (!is_valid(idx)) { return 0; } + + auto const delim_size = d_delimiter.size_bytes(); + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr() + delim_size - 1; + auto const delimiters = + cudf::device_span(d_positions + d_delimiter_offsets[idx], + d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]); + + size_type token_count = 1; // all strings will have at least one token + size_type last_pos = delimiters[0] - delim_size; + for (auto d_pos : delimiters) { + // delimiter must fit in string && overlapping delimiters are ignored + if (((base_ptr + d_pos) < d_str_end) && ((d_pos - last_pos) >= delim_size)) { + ++token_count; + last_pos = d_pos; + } + } + // number of tokens is capped to max_tokens + return ((max_tokens > 0) && (token_count > max_tokens)) ? max_tokens : token_count; + } + + /** + * @brief This will create tokens around each delimiter honoring the string boundaries + * in which the delimiter resides + * + * Each token is placed in `d_all_tokens` so they align consecutively + * with other tokens for the same output column. + * + * The actual token extraction is performed in the subclass process_tokens() function. + * + * @param idx Index of the string to tokenize + * @param d_tokens_offsets Token offsets for each string + * @param d_positions The beginning byte position of each delimiter + * @param d_delimiter_offsets Offsets to d_positions to each delimiter set per string + * @param d_all_tokens All output tokens for the strings column + */ + __device__ void get_tokens(size_type idx, + size_type const* d_tokens_offsets, + size_type const* d_positions, + size_type const* d_delimiter_offsets, + string_index_pair* d_all_tokens) const + { + auto const d_tokens = // this string's tokens output + cudf::device_span(d_all_tokens + d_tokens_offsets[idx], + d_tokens_offsets[idx + 1] - d_tokens_offsets[idx]); + + if (!is_valid(idx)) { return; } + + auto const d_str = get_string(idx); + + // max_tokens already included in token counts + if (d_tokens.size() == 1) { + d_tokens[0] = string_index_pair{d_str.data(), d_str.size_bytes()}; + return; + } + + auto const delimiters = + cudf::device_span(d_positions + d_delimiter_offsets[idx], + d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]); + + auto& derived = static_cast(*this); + derived.process_tokens(d_str, delimiters, d_tokens); + } + + base_split_tokenizer(column_device_view const& d_strings, + string_view const& d_delimiter, + size_type max_tokens) + : d_strings(d_strings), d_delimiter(d_delimiter), max_tokens(max_tokens) + { + } + + protected: + column_device_view const d_strings; // strings to split + string_view const d_delimiter; // delimiter for split + size_type max_tokens; // maximum number of tokens to identify +}; + +/** + * @brief The tokenizer functions for forward splitting + */ +struct split_tokenizer_fn : base_split_tokenizer { + /** + * @brief This will create tokens around each delimiter honoring the string boundaries + * + * The tokens are processed from the beginning of each string ignoring overlapping + * delimiters and honoring the `max_tokens` value. 
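Worked example for the forward pass: splitting "y::::z" on "::", the column-wide scan reports candidate positions 1, 2, and 3 (byte offsets within the string). Position 2 begins inside the match already consumed at position 1, so `process_tokens` skips it and emits the tokens "y", "" and "z", which is exactly what the new `MultiByteDelimiters` test expects from `split_record`.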
+ * + * @param d_str String to tokenize + * @param d_delimiters Positions of delimiters for this string + * @param d_tokens Output vector to store tokens for this string + */ + __device__ void process_tokens(string_view const d_str, + device_span d_delimiters, + device_span d_tokens) const + { + auto const base_ptr = get_base_ptr(); // d_positions values based on this + auto str_ptr = d_str.data(); + auto const str_end = str_ptr + d_str.size_bytes(); // end of the string + auto const token_count = static_cast(d_tokens.size()); + auto const delim_size = d_delimiter.size_bytes(); + + // build the index-pair of each token for this string + size_type token_idx = 0; + for (auto d_pos : d_delimiters) { + auto const next_delim = base_ptr + d_pos; + if (next_delim < str_ptr || ((next_delim + delim_size) > str_end)) { continue; } + auto const end_ptr = (token_idx + 1 < token_count) ? next_delim : str_end; + + // store the token into the output vector + d_tokens[token_idx++] = + string_index_pair{str_ptr, static_cast(thrust::distance(str_ptr, end_ptr))}; + + // setup for next token + str_ptr = end_ptr + delim_size; + } + // include anything leftover + if (token_idx < token_count) { + d_tokens[token_idx] = + string_index_pair{str_ptr, static_cast(thrust::distance(str_ptr, str_end))}; + } + } + + split_tokenizer_fn(column_device_view const& d_strings, + string_view const& d_delimiter, + size_type max_tokens) + : base_split_tokenizer(d_strings, d_delimiter, max_tokens) + { + } +}; + +/** + * @brief The tokenizer functions for backwards splitting + * + * Same as split_tokenizer_fn except delimiters are searched from the end of each string. + */ +struct rsplit_tokenizer_fn : base_split_tokenizer { + /** + * @brief This will create tokens around each delimiter honoring the string boundaries + * + * The tokens are processed from the end of each string ignoring overlapping + * delimiters and honoring the `max_tokens` value. + * + * @param d_str String to tokenize + * @param d_delimiters Positions of delimiters for this string + * @param d_tokens Output vector to store tokens for this string + */ + __device__ void process_tokens(string_view const d_str, + device_span d_delimiters, + device_span d_tokens) const + { + auto const base_ptr = get_base_ptr(); // d_positions values are based on this ptr + auto const str_begin = d_str.data(); // beginning of the string + auto const token_count = static_cast(d_tokens.size()); + auto const delim_count = static_cast(d_delimiters.size()); + auto const delim_size = d_delimiter.size_bytes(); + + // build the index-pair of each token for this string + auto str_ptr = str_begin + d_str.size_bytes(); + size_type token_idx = 0; + for (auto d = delim_count - 1; d >= 0; --d) { // read right-to-left + auto const prev_delim = base_ptr + d_delimiters[d] + delim_size; + if (prev_delim > str_ptr || ((prev_delim - delim_size) < str_begin)) { continue; } + auto const start_ptr = (token_idx + 1 < token_count) ? prev_delim : str_begin; + + // store the token into the output vector right-to-left + d_tokens[token_count - token_idx - 1] = + string_index_pair{start_ptr, static_cast(thrust::distance(start_ptr, str_ptr))}; + + // setup for next token + str_ptr = start_ptr - delim_size; + ++token_idx; + } + // include anything leftover (rightover?) 
+ if (token_idx < token_count) { + d_tokens[0] = + string_index_pair{str_begin, static_cast(thrust::distance(str_begin, str_ptr))}; + } + } + + rsplit_tokenizer_fn(column_device_view const& d_strings, + string_view const& d_delimiter, + size_type max_tokens) + : base_split_tokenizer(d_strings, d_delimiter, max_tokens) + { + } +}; + +/** + * @brief Helper function used by split/rsplit and split_record/rsplit_record + * + * This function returns all the token/split positions within the input column as processed by + * the given tokenizer. It also returns the offsets for each set of tokens identified per string. + * + * @tparam Tokenizer Type of the tokenizer object + * + * @param input The input column of strings to split + * @param tokenizer Object used for counting and identifying delimiters and tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned objects' device memory. + */ +template +std::pair, rmm::device_uvector> split_helper( + strings_column_view const& input, + Tokenizer tokenizer, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const strings_count = input.size(); + auto const chars_bytes = + cudf::detail::get_value(input.offsets(), input.offset() + strings_count, stream) - + cudf::detail::get_value(input.offsets(), input.offset(), stream); + + auto d_offsets = input.offsets_begin(); + + // count the number of delimiters in the entire column + auto const delimiter_count = + thrust::count_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { + return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); + }); + // Create a vector of every delimiter position in the chars column. + // These may include overlapping or otherwise out-of-bounds delimiters which + // will be resolved during token processing. 
+ auto delimiter_positions = rmm::device_uvector(delimiter_count, stream); + auto d_positions = delimiter_positions.data(); + auto const copy_end = + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + delimiter_positions.begin(), + [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { + return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); + }); + + // create a vector of offsets to each string's delimiter set within delimiter_positions + auto const delimiter_offsets = [&] { + // first, create a vector of string indices for each delimiter + auto string_indices = rmm::device_uvector(delimiter_count, stream); + thrust::upper_bound(rmm::exec_policy(stream), + d_offsets, + d_offsets + strings_count, + delimiter_positions.begin(), + copy_end, + string_indices.begin()); + + // compute delimiter offsets per string + auto delimiter_offsets = rmm::device_uvector(strings_count + 1, stream); + auto d_delimiter_offsets = delimiter_offsets.data(); + + // memset to zero-out the delimiter counts for any null-entries or strings with no delimiters + CUDF_CUDA_TRY(cudaMemsetAsync( + d_delimiter_offsets, 0, delimiter_offsets.size() * sizeof(size_type), stream.value())); + + // next, count the number of delimiters per string + auto d_string_indices = string_indices.data(); // identifies strings with delimiters only + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + delimiter_count, + [d_string_indices, d_delimiter_offsets] __device__(size_type idx) { + auto const str_idx = d_string_indices[idx] - 1; + atomicAdd(d_delimiter_offsets + str_idx, 1); + }); + // finally, convert the delimiter counts into offsets + thrust::exclusive_scan(rmm::exec_policy(stream), + delimiter_offsets.begin(), + delimiter_offsets.end(), + delimiter_offsets.begin()); + return delimiter_offsets; + }(); + auto const d_delimiter_offsets = delimiter_offsets.data(); + + // compute the number of tokens per string + auto token_counts = rmm::device_uvector(strings_count, stream); + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + token_counts.begin(), + [tokenizer, d_positions, d_delimiter_offsets] __device__(size_type idx) -> size_type { + return tokenizer.count_tokens(idx, d_positions, d_delimiter_offsets); + }); + + // create offsets from the counts for return to the caller + auto offsets = std::get<0>( + cudf::detail::make_offsets_child_column(token_counts.begin(), token_counts.end(), stream, mr)); + auto const total_tokens = + cudf::detail::get_value(offsets->view(), strings_count, stream); + auto const d_tokens_offsets = offsets->view().data(); + + // build a vector of all the token positions for all the strings + auto tokens = rmm::device_uvector(total_tokens, stream); + auto d_tokens = tokens.data(); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + strings_count, + [tokenizer, d_tokens_offsets, d_positions, d_delimiter_offsets, d_tokens] __device__( + size_type idx) { + tokenizer.get_tokens(idx, d_tokens_offsets, d_positions, d_delimiter_offsets, d_tokens); + }); + + return std::make_pair(std::move(offsets), std::move(tokens)); +} + +} // namespace cudf::strings::detail diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index d935ad0b1da..5b79fdefb5a 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ 
-1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "split.cuh" + #include #include #include @@ -23,14 +25,12 @@ #include #include #include -#include #include #include #include #include -#include #include #include @@ -38,108 +38,43 @@ namespace cudf { namespace strings { namespace detail { -using string_index_pair = thrust::pair; - namespace { -enum class Dir { FORWARD, BACKWARD }; - -/** - * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. - * - * The number of tokens is the same regardless if counting from the beginning - * or the end of the string. - */ -struct token_counter_fn { - column_device_view const d_strings; // strings to split - string_view const d_delimiter; // delimiter for split - size_type const max_tokens = std::numeric_limits::max(); - - __device__ size_type operator()(size_type idx) const - { - if (d_strings.is_null(idx)) { return 0; } - - auto const d_str = d_strings.element(idx); - size_type token_count = 0; - size_type start_pos = 0; - while (token_count < max_tokens - 1) { - auto const delimiter_pos = d_str.find(d_delimiter, start_pos); - if (delimiter_pos == string_view::npos) break; - token_count++; - start_pos = delimiter_pos + d_delimiter.length(); - } - return token_count + 1; // always at least one token - } -}; - -/** - * @brief Identify the tokens from the `idx'th` string element of `d_strings`. - */ -template -struct token_reader_fn { - column_device_view const d_strings; // strings to split - string_view const d_delimiter; // delimiter for split - int32_t* d_token_offsets{}; // for locating tokens in d_tokens - string_index_pair* d_tokens{}; - - __device__ string_index_pair resolve_token(string_view const& d_str, - size_type start_pos, - size_type end_pos, - size_type delimiter_pos) const - { - if (dir == Dir::FORWARD) { - auto const byte_offset = d_str.byte_offset(start_pos); - return string_index_pair{d_str.data() + byte_offset, - d_str.byte_offset(delimiter_pos) - byte_offset}; - } else { - auto const byte_offset = d_str.byte_offset(delimiter_pos + d_delimiter.length()); - return string_index_pair{d_str.data() + byte_offset, - d_str.byte_offset(end_pos) - byte_offset}; - } +template +std::unique_ptr split_record_fn(strings_column_view const& input, + Tokenizer tokenizer, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty()) { return make_empty_column(type_id::LIST); } + if (input.size() == input.null_count()) { + auto offsets = std::make_unique(input.offsets(), stream, mr); + auto results = make_empty_column(type_id::STRING); + return make_lists_column(input.size(), + std::move(offsets), + std::move(results), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); } - __device__ void operator()(size_type idx) - { - if (d_strings.is_null(idx)) { return; } + // builds the offsets and the vector of all tokens + auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr); - auto const token_offset = d_token_offsets[idx]; - auto const token_count = d_token_offsets[idx + 1] - token_offset; - auto d_result = d_tokens + token_offset; - auto const d_str = d_strings.element(idx); - if (d_str.empty()) { - // Pandas str.split("") for non-whitespace delimiter is an empty string - *d_result = 
string_index_pair{"", 0}; - return; - } + // build a strings column from the tokens + auto strings_child = make_strings_column(tokens.begin(), tokens.end(), stream, mr); - size_type token_idx = 0; - size_type start_pos = 0; // updates only if moving forward - size_type end_pos = d_str.length(); // updates only if moving backward - while (token_idx < token_count - 1) { - auto const delimiter_pos = dir == Dir::FORWARD ? d_str.find(d_delimiter, start_pos) - : d_str.rfind(d_delimiter, start_pos, end_pos); - if (delimiter_pos == string_view::npos) break; - auto const token = resolve_token(d_str, start_pos, end_pos, delimiter_pos); - if (dir == Dir::FORWARD) { - d_result[token_idx] = token; - start_pos = delimiter_pos + d_delimiter.length(); - } else { - d_result[token_count - 1 - token_idx] = token; - end_pos = delimiter_pos; - } - token_idx++; - } + return make_lists_column(input.size(), + std::move(offsets), + std::move(strings_child), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} - // set last token to remainder of the string - if (dir == Dir::FORWARD) { - auto const offset_bytes = d_str.byte_offset(start_pos); - d_result[token_idx] = - string_index_pair{d_str.data() + offset_bytes, d_str.byte_offset(end_pos) - offset_bytes}; - } else { - d_result[0] = string_index_pair{d_str.data(), d_str.byte_offset(end_pos)}; - } - } -}; +enum class Dir { FORWARD, BACKWARD }; /** * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. @@ -196,7 +131,7 @@ struct whitespace_token_reader_fn { whitespace_string_tokenizer tokenizer(d_str, dir != Dir::FORWARD); size_type token_idx = 0; position_pair token{0, 0}; - if (dir == Dir::FORWARD) { + if constexpr (dir == Dir::FORWARD) { while (tokenizer.next_token() && (token_idx < token_count)) { token = tokenizer.get_token(); d_result[token_idx++] = @@ -224,11 +159,11 @@ struct whitespace_token_reader_fn { // The output is one list item per string template -std::unique_ptr split_record_fn(strings_column_view const& strings, - TokenCounter counter, - TokenReader reader, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr whitespace_split_record_fn(strings_column_view const& strings, + TokenCounter counter, + TokenReader reader, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // create offsets column by counting the number of tokens per string auto strings_count = strings.size(); @@ -244,7 +179,7 @@ std::unique_ptr split_record_fn(strings_column_view const& strings, rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); // last entry is the total number of tokens to be generated - auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); + auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); // split each string into an array of index-pair values rmm::device_uvector tokens(total_tokens, stream); reader.d_token_offsets = d_offsets; @@ -277,18 +212,21 @@ std::unique_ptr split_record(strings_column_view const& strings, auto d_strings_column_ptr = column_device_view::create(strings.parent(), stream); if (delimiter.size() == 0) { - return split_record_fn(strings, - whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens}, - whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens}, - stream, - mr); + return whitespace_split_record_fn( + strings, + whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens}, + 
whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens}, + stream, + mr); } else { string_view d_delimiter(delimiter.data(), delimiter.size()); - return split_record_fn(strings, - token_counter_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, - token_reader_fn{*d_strings_column_ptr, d_delimiter}, - stream, - mr); + if (dir == Dir::FORWARD) { + return split_record_fn( + strings, split_tokenizer_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, stream, mr); + } else { + return split_record_fn( + strings, rsplit_tokenizer_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, stream, mr); + } } } diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 21752196430..48f69e3ecd3 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -357,6 +357,10 @@ struct ParquetWriterSchemaTest : public ParquetWriterTest { auto type() { return cudf::data_type{cudf::type_to_id()}; } }; +template +struct ParquetReaderSourceTest : public ParquetReaderTest { +}; + // Declare typed test cases // TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352 using SupportedTypes = cudf::test::Types; @@ -369,6 +373,8 @@ using SupportedTimestampTypes = cudf::test::Types; TYPED_TEST_SUITE(ParquetWriterTimestampTypeTest, SupportedTimestampTypes); TYPED_TEST_SUITE(ParquetWriterSchemaTest, cudf::test::AllTypes); +using ByteLikeTypes = cudf::test::Types; +TYPED_TEST_SUITE(ParquetReaderSourceTest, ByteLikeTypes); // Base test fixture for chunked writer tests struct ParquetChunkedWriterTest : public cudf::test::BaseFixture { @@ -5113,4 +5119,72 @@ TEST_P(ParquetSizedTest, DictionaryTest) EXPECT_EQ(nbits, GetParam()); } +TYPED_TEST(ParquetReaderSourceTest, BufferSourceTypes) +{ + using T = TypeParam; + + srand(31337); + auto table = create_random_fixed_table(5, 5, true); + + std::vector out_buffer; + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), *table); + cudf::io::write_parquet(out_opts); + + { + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info( + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size()))); + const auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*table, result.tbl->view()); + } + + { + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(cudf::host_span( + reinterpret_cast(out_buffer.data()), out_buffer.size()))); + const auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*table, result.tbl->view()); + } +} + +TYPED_TEST(ParquetReaderSourceTest, BufferSourceArrayTypes) +{ + using T = TypeParam; + + srand(31337); + auto table = create_random_fixed_table(5, 5, true); + + std::vector out_buffer; + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&out_buffer), *table); + cudf::io::write_parquet(out_opts); + + auto full_table = cudf::concatenate(std::vector({*table, *table})); + + { + auto spans = std::vector>{ + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size()), + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size())}; + cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info(cudf::host_span>(spans.data(), spans.size()))); + const auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*full_table, 
result.tbl->view()); + } + + { + auto spans = std::vector>{ + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size()), + cudf::host_span(reinterpret_cast(out_buffer.data()), out_buffer.size())}; + cudf::io::parquet_reader_options in_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info(cudf::host_span>(spans.data(), spans.size()))); + const auto result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*full_table, result.tbl->view()); + } +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 73d5adab427..714c1ad416a 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -308,6 +308,82 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } +TEST_F(StringsSplitTest, MultiByteDelimiters) +{ + // Overlapping delimiters + auto input = + cudf::test::strings_column_wrapper({"u::", "w:::x", "y::::z", "::a", ":::b", ":::c:::"}); + auto view = cudf::strings_column_view(input); + using LCW = cudf::test::lists_column_wrapper; + { + auto result = cudf::strings::split_record(view, cudf::string_scalar("::")); + auto expected_left = LCW({LCW{"u", ""}, + LCW{"w", ":x"}, + LCW{"y", "", "z"}, + LCW{"", "a"}, + LCW{"", ":b"}, + LCW{"", ":c", ":"}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_left); + result = cudf::strings::rsplit_record(view, cudf::string_scalar("::")); + auto expected_right = LCW({LCW{"u", ""}, + LCW{"w:", "x"}, + LCW{"y", "", "z"}, + LCW{"", "a"}, + LCW{":", "b"}, + LCW{":", "c:", ""}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_right); + } + { + auto result = cudf::strings::split(view, cudf::string_scalar("::")); + + auto c0 = cudf::test::strings_column_wrapper({"u", "w", "y", "", "", ""}); + auto c1 = cudf::test::strings_column_wrapper({"", ":x", "", "a", ":b", ":c"}); + auto c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ":"}, {0, 0, 1, 0, 0, 1}); + std::vector> expected_columns; + expected_columns.push_back(c0.release()); + expected_columns.push_back(c1.release()); + expected_columns.push_back(c2.release()); + auto expected_left = std::make_unique(std::move(expected_columns)); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected_left); + + result = cudf::strings::rsplit(view, cudf::string_scalar("::")); + + c0 = cudf::test::strings_column_wrapper({"u", "w:", "y", "", ":", ":"}); + c1 = cudf::test::strings_column_wrapper({"", "x", "", "a", "b", "c:"}); + c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ""}, {0, 0, 1, 0, 0, 1}); + expected_columns.push_back(c0.release()); + expected_columns.push_back(c1.release()); + expected_columns.push_back(c2.release()); + auto expected_right = std::make_unique(std::move(expected_columns)); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected_right); + } + + // Delimiters that span across adjacent strings + input = cudf::test::strings_column_wrapper({"{a=1}:{b=2}:", "{c=3}", ":{}:{}"}); + view = cudf::strings_column_view(input); + { + auto result = cudf::strings::split_record(view, cudf::string_scalar("}:{")); + auto expected = LCW({LCW{"{a=1", "b=2}:"}, LCW{"{c=3}"}, LCW{":{", "}"}}); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + result = cudf::strings::rsplit_record(view, cudf::string_scalar("}:{")); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + } + { + auto result = cudf::strings::split(view, cudf::string_scalar("}:{")); + + auto c0 = cudf::test::strings_column_wrapper({"{a=1", "{c=3}", ":{"}); + auto c1 = cudf::test::strings_column_wrapper({"b=2}:", "", "}"}, {1, 0, 1}); + std::vector> expected_columns; + expected_columns.push_back(c0.release()); + expected_columns.push_back(c1.release()); + auto expected = std::make_unique(std::move(expected_columns)); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected); + + result = cudf::strings::rsplit(view, cudf::string_scalar("}:{")); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected); + } +} + TEST_F(StringsSplitTest, SplitRegex) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index db64dcb08c7..937077c89c9 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -1045,8 +1045,8 @@ void decimal128Cv() { BigInteger bigInteger2 = new BigInteger("14"); BigInteger bigInteger3 = new BigInteger("152345742357340573405745"); final BigInteger[] bigInts = new BigInteger[] {bigInteger1, bigInteger2, bigInteger3}; - try (ColumnVector v = ColumnVector.decimalFromBigInt(-dec32Scale1, bigInts)) { - HostColumnVector hostColumnVector = v.copyToHost(); + try (ColumnVector v = ColumnVector.decimalFromBigInt(-dec32Scale1, bigInts); + HostColumnVector hostColumnVector = v.copyToHost()) { assertEquals(bigInteger1, hostColumnVector.getBigDecimal(0).unscaledValue()); assertEquals(bigInteger2, hostColumnVector.getBigDecimal(1).unscaledValue()); assertEquals(bigInteger3, hostColumnVector.getBigDecimal(2).unscaledValue()); diff --git a/java/src/test/java/ai/rapids/cudf/ScalarTest.java b/java/src/test/java/ai/rapids/cudf/ScalarTest.java index 86c340bb321..f4b652a7d03 100644 --- a/java/src/test/java/ai/rapids/cudf/ScalarTest.java +++ b/java/src/test/java/ai/rapids/cudf/ScalarTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -41,7 +41,7 @@ public void testDoubleClose() { } @Test - public void testIncRef() { + public void testIncRefAndDoubleFree() { Scalar s = Scalar.fromNull(DType.INT32); try (Scalar ignored1 = s) { try (Scalar ignored2 = s.incRefCount()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 4f00bc7493d..c31bcf4f78d 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -244,7 +244,7 @@ void testOrderByWithNullsAndStrings() { } @Test - void testTableCreationIncreasesRefCount() { + void testTableCreationIncreasesRefCountWithDoubleFree() { //tests the Table increases the refcount on column vectors assertThrows(IllegalStateException.class, () -> { try (ColumnVector v1 = ColumnVector.build(DType.INT32, 5, Range.appendInts(5)); diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index ac058b1d9a1..c528eb69575 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -72,15 +72,6 @@ endif() include(rapids-cython) if(NOT cudf_FOUND) - # TODO: This will not be necessary once we upgrade to CMake 3.22, which will pull in the required - # languages for the C++ project even if this project does not require those languages. - include(rapids-cuda) - rapids_cuda_init_architectures(cudf-python) - enable_language(CUDA) - # Since cudf only enables CUDA optionally we need to manually include the file that - # rapids_cuda_init_architectures relies on `project` including. - include("${CMAKE_PROJECT_cudf-python_INCLUDE}") - set(BUILD_TESTS OFF) set(BUILD_BENCHMARKS OFF) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 965b413e84f..fb1bcf6d673 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -607,7 +607,7 @@ def _scatter_by_slice( start, stop, step = key.indices(len(self)) if start >= stop: return None - num_keys = (stop - start) // step + num_keys = len(range(start, stop, step)) self._check_scatter_key_length(num_keys, value) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 6b2fb90e95b..d59226ee17a 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -347,3 +347,13 @@ def test_series_setitem_upcasting_string_value(): assert_eq(pd.Series([10, 0, 0], dtype=int), sr) with pytest.raises(ValueError): sr[0] = "non-integer" + + +def test_scatter_by_slice_with_start_and_step(): + source = pd.Series([1, 2, 3, 4, 5]) + csource = cudf.from_pandas(source) + target = pd.Series([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + ctarget = cudf.from_pandas(target) + target[1::2] = source + ctarget[1::2] = csource + assert_eq(target, ctarget) diff --git a/python/cudf/udf_cpp/CMakeLists.txt b/python/cudf/udf_cpp/CMakeLists.txt index ebf47ee8469..0c07236682f 100644 --- a/python/cudf/udf_cpp/CMakeLists.txt +++ b/python/cudf/udf_cpp/CMakeLists.txt @@ -18,8 +18,6 @@ include(rapids-cmake) include(rapids-cpm) include(rapids-find) -rapids_cuda_init_architectures(udf-cpp) - rapids_cpm_init() rapids_find_package(
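On the Python side, the `_scatter_by_slice` change fixes an off-by-one in the key count: for `target[1::2] = source` on a length-10 series, `key.indices(len(self))` yields `start=1, stop=10, step=2`, so the old `(stop - start) // step` computes `(10 - 1) // 2 = 4` even though the slice covers five positions (1, 3, 5, 7, 9). `len(range(start, stop, step))` returns the correct 5, which is the case the new `test_scatter_by_slice_with_start_and_step` test exercises.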