From 7da233b279bf84a501e9c2e3041cbc6fb335e610 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 21 Feb 2023 08:12:54 -0500
Subject: [PATCH] Rework logic in cudf::strings::split_record to improve
 performance (#12729)

Updates the `cudf::strings::split_record` logic to match the more optimized
code in `cudf::strings::split`. The optimized code performs much better for
longer strings (>64 bytes) by parallelizing over the character bytes to find
delimiters before determining split tokens. This led to refactoring the code
so that both APIs can share the optimized code.

Also fixes a bug found when using overlapped delimiters. Additional tests
were added for multi-byte delimiters which can overlap and span multiple
adjacent strings.

Closes #12694

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Yunsong Wang (https://github.com/PointKernel)
  - https://github.com/nvdbaranec

URL: https://github.com/rapidsai/cudf/pull/12729
---
 cpp/benchmarks/string/split.cpp       |  14 +-
 cpp/src/strings/split/split.cu        | 448 ++------------------------
 cpp/src/strings/split/split.cuh       | 403 +++++++++++++++++++++++
 cpp/src/strings/split/split_record.cu | 168 +++-------
 cpp/tests/strings/split_tests.cpp     |  78 ++++-
 5 files changed, 565 insertions(+), 546 deletions(-)
 create mode 100644 cpp/src/strings/split/split.cuh

diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp
index 0f005c462cc..1b3f4190680 100644
--- a/cpp/benchmarks/string/split.cpp
+++ b/cpp/benchmarks/string/split.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -57,12 +57,12 @@ static void BM_split(benchmark::State& state, split_type rt)
 
 static void generate_bench_args(benchmark::internal::Benchmark* b)
 {
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 1 << 5;
-  int const max_rowlen = 1 << 13;
-  int const len_mult   = 4;
+  int constexpr min_rows   = 1 << 12;
+  int constexpr max_rows   = 1 << 24;
+  int constexpr row_mult   = 8;
+  int constexpr min_rowlen = 1 << 5;
+  int constexpr max_rowlen = 1 << 13;
+  int constexpr len_mult   = 2;
   for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) {
     for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) {
       // avoid generating combinations that exceed the cudf column limit
diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu
index c11d7ad47f9..18599fb568a 100644
--- a/cpp/src/strings/split/split.cu
+++ b/cpp/src/strings/split/split.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "split.cuh"
+
 #include
 #include
 #include
@@ -31,14 +33,10 @@
 #include
 #include
 #include
-#include
-#include
-#include
 #include
 #include
 #include
 #include
-#include
 #include
 #include
 
@@ -46,321 +44,8 @@ namespace cudf {
 namespace strings {
 namespace detail {
-using string_index_pair = thrust::pair<const char*, size_type>;
-
 namespace {
 
-/**
- * @brief Base class for delimiter-based tokenizers.
- * - * These are common methods used by both split and rsplit tokenizer functors. - */ -struct base_split_tokenizer { - __device__ const char* get_base_ptr() const - { - return d_strings.child(strings_column_view::chars_column_index).data(); - } - - __device__ string_view const get_string(size_type idx) const - { - return d_strings.element(idx); - } - - __device__ bool is_valid(size_type idx) const { return d_strings.is_valid(idx); } - - /** - * @brief Initialize token elements for all strings. - * - * The process_tokens() only handles creating tokens for strings that contain - * delimiters. This function will initialize the output tokens for all - * strings by assigning null entries for null and empty strings and the - * string itself for strings with no delimiters. - * - * The tokens are placed in output order so that all tokens for each output - * column are stored consecutively in `d_all_tokens`. - * - * @param idx Index of string in column - * @param column_count Number of columns in output - * @param d_all_tokens Tokens vector for all strings - */ - __device__ void init_tokens(size_type idx, - size_type column_count, - string_index_pair* d_all_tokens) const - { - auto d_tokens = d_all_tokens + idx; - if (is_valid(idx)) { - auto d_str = get_string(idx); - *d_tokens = string_index_pair{d_str.data(), d_str.size_bytes()}; - --column_count; - d_tokens += d_strings.size(); - } - // this is like fill() but output needs to be strided - for (size_type col = 0; col < column_count; ++col) - d_tokens[d_strings.size() * col] = string_index_pair{nullptr, 0}; - } - - base_split_tokenizer(column_device_view const& d_strings, - string_view const& d_delimiter, - size_type max_tokens) - : d_strings(d_strings), d_delimiter(d_delimiter), max_tokens(max_tokens) - { - } - - protected: - column_device_view const d_strings; // strings to split - string_view const d_delimiter; // delimiter for split - size_type max_tokens; -}; - -/** - * @brief The tokenizer functions for split(). - * - * The methods here count delimiters, tokens, and output token elements - * for each string in a strings column. - */ -struct split_tokenizer_fn : base_split_tokenizer { - /** - * @brief This will create tokens around each delimiter honoring the string boundaries - * in which the delimiter resides. - * - * Each token is placed in `d_all_tokens` so they align consecutively - * with other tokens for the same output column. - * That is, `d_tokens[col * strings_count + string_index]` is the token at column `col` - * for string at `string_index`. 
- * - * @param idx Index of the delimiter in the chars column - * @param d_token_counts Token counts for each string - * @param d_positions The beginning byte position of each delimiter - * @param positions_count Number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_all_tokens All output tokens for the strings column - */ - __device__ void process_tokens(size_type idx, - size_type const* d_token_counts, - size_type const* d_positions, - size_type positions_count, - size_type const* d_indexes, - string_index_pair* d_all_tokens) const - { - size_type str_idx = d_indexes[idx]; - if ((idx > 0) && d_indexes[idx - 1] == str_idx) - return; // the first delimiter for the string rules them all - --str_idx; // all of these are off by 1 from the upper_bound call - size_type token_count = d_token_counts[str_idx]; // max_tokens already included - const char* const base_ptr = get_base_ptr(); // d_positions values are based on this ptr - // this string's tokens output - auto d_tokens = d_all_tokens + str_idx; - // this string - const string_view d_str = get_string(str_idx); - const char* str_ptr = d_str.data(); // beginning of the string - const char* const str_end_ptr = str_ptr + d_str.size_bytes(); // end of the string - // build the index-pair of each token for this string - for (size_type col = 0; col < token_count; ++col) { - auto next_delim = ((idx + col) < positions_count) // boundary check for delims in last string - ? (base_ptr + d_positions[idx + col]) // start of next delimiter - : str_end_ptr; // or end of this string - auto eptr = (next_delim < str_end_ptr) // make sure delimiter is inside this string - && (col + 1 < token_count) // and this is not the last token - ? next_delim - : str_end_ptr; - // store the token into the output vector - d_tokens[col * d_strings.size()] = - string_index_pair{str_ptr, static_cast(eptr - str_ptr)}; - // point past this delimiter - str_ptr = eptr + d_delimiter.size_bytes(); - } - } - - /** - * @brief Returns `true` if the byte at `idx` is the start of the delimiter. - * - * @param idx Index of a byte in the chars column. - * @param d_offsets Offsets values to locate the chars ranges. - * @param chars_bytes Total number of characters to process. - * @return true if delimiter is found starting at position `idx` - */ - __device__ bool is_delimiter(size_type idx, // chars index - int32_t const* d_offsets, - size_type chars_bytes) const - { - auto d_chars = get_base_ptr() + d_offsets[0]; - if (idx + d_delimiter.size_bytes() > chars_bytes) return false; - return d_delimiter.compare(d_chars + idx, d_delimiter.size_bytes()) == 0; - } - - /** - * @brief This counts the tokens for strings that contain delimiters. 
- * - * @param idx Index of a delimiter - * @param d_positions Start positions of all the delimiters - * @param positions_count The number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_counts The token counts for all the strings - */ - __device__ void count_tokens(size_type idx, // delimiter index - size_type const* d_positions, - size_type positions_count, - size_type const* d_indexes, - size_type* d_counts) const - { - size_type str_idx = d_indexes[idx]; - if ((idx > 0) && d_indexes[idx - 1] == str_idx) - return; // first delimiter found handles all of them for this string - auto const delim_length = d_delimiter.size_bytes(); - string_view const d_str = get_string(str_idx - 1); - const char* const base_ptr = get_base_ptr(); - size_type delim_count = 0; // re-count delimiters to compute the token-count - size_type last_pos = d_positions[idx] - delim_length; - while ((idx < positions_count) && (d_indexes[idx] == str_idx)) { - // make sure the whole delimiter is inside the string before counting it - auto d_pos = d_positions[idx]; - if (((base_ptr + d_pos + delim_length - 1) < (d_str.data() + d_str.size_bytes())) && - ((d_pos - last_pos) >= delim_length)) { - ++delim_count; // only count if the delimiter fits - last_pos = d_pos; // overlapping delimiters are ignored too - } - ++idx; - } - // the number of tokens is delim_count+1 but capped to max_tokens - d_counts[str_idx - 1] = - ((max_tokens > 0) && (delim_count + 1 > max_tokens)) ? max_tokens : delim_count + 1; - } - - split_tokenizer_fn(column_device_view const& d_strings, - string_view const& d_delimiter, - size_type max_tokens) - : base_split_tokenizer(d_strings, d_delimiter, max_tokens) - { - } -}; - -/** - * @brief The tokenizer functions for split(). - * - * The methods here count delimiters, tokens, and output token elements - * for each string in a strings column. - * - * Same as split_tokenizer_fn except tokens are counted from the end of each string. - */ -struct rsplit_tokenizer_fn : base_split_tokenizer { - /** - * @brief This will create tokens around each delimiter honoring the string boundaries - * in which the delimiter resides. - * - * The tokens are processed from the end of each string so the `max_tokens` - * is honored correctly. - * - * Each token is placed in `d_all_tokens` so they align consecutively - * with other tokens for the same output column. - * That is, `d_tokens[col * strings_count + string_index]` is the token at column `col` - * for string at `string_index`. 
- * - * @param idx Index of the delimiter in the chars column - * @param d_token_counts Token counts for each string - * @param d_positions The ending byte position of each delimiter - * @param positions_count Number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_all_tokens All output tokens for the strings column - */ - __device__ void process_tokens(size_type idx, // delimiter position index - size_type const* d_token_counts, // token counts for each string - size_type const* d_positions, // end of each delimiter - size_type positions_count, // total number of delimiters - size_type const* d_indexes, // string indices for each delimiter - string_index_pair* d_all_tokens) const - { - size_type str_idx = d_indexes[idx]; - if ((idx + 1 < positions_count) && d_indexes[idx + 1] == str_idx) - return; // the last delimiter for the string rules them all - --str_idx; // all of these are off by 1 from the upper_bound call - size_type token_count = d_token_counts[str_idx]; // max_tokens already included - const char* const base_ptr = get_base_ptr(); // d_positions values are based on this ptr - // this string's tokens output - auto d_tokens = d_all_tokens + str_idx; - // this string - const string_view d_str = get_string(str_idx); - const char* const str_begin_ptr = d_str.data(); // beginning of the string - const char* str_ptr = str_begin_ptr + d_str.size_bytes(); // end of the string - // build the index-pair of each token for this string - for (size_type col = 0; col < token_count; ++col) { - auto prev_delim = (idx >= col) // boundary check for delims in first string - ? (base_ptr + d_positions[idx - col] + 1) // end of prev delimiter - : str_begin_ptr; // or the start of this string - auto sptr = (prev_delim > str_begin_ptr) // make sure delimiter is inside the string - && (col + 1 < token_count) // and this is not the last token - ? prev_delim - : str_begin_ptr; - // store the token into the output -- building the array backwards - d_tokens[d_strings.size() * (token_count - 1 - col)] = - string_index_pair{sptr, static_cast(str_ptr - sptr)}; - str_ptr = sptr - d_delimiter.size_bytes(); // get ready for the next prev token - } - } - - /** - * @brief Returns `true` if the byte at `idx` is the end of the delimiter. - * - * @param idx Index of a byte in the chars column. - * @param d_offsets Offsets values to locate the chars ranges. - * @return true if delimiter is found ending at position `idx` - */ - __device__ bool is_delimiter(size_type idx, int32_t const* d_offsets, size_type) const - { - auto delim_length = d_delimiter.size_bytes(); - if (idx < delim_length - 1) return false; - auto d_chars = get_base_ptr() + d_offsets[0]; - return d_delimiter.compare(d_chars + idx - (delim_length - 1), delim_length) == 0; - } - - /** - * @brief This counts the tokens for strings that contain delimiters. - * - * Token counting starts at the end of the string to honor the `max_tokens` - * appropriately. 
- * - * @param idx Index of a delimiter - * @param d_positions End positions of all the delimiters - * @param positions_count The number of delimiters - * @param d_indexes Indices of the strings for each delimiter - * @param d_counts The token counts for all the strings - */ - __device__ void count_tokens(size_type idx, - size_type const* d_positions, - size_type positions_count, - size_type const* d_indexes, - size_type* d_counts) const - { - size_type str_idx = d_indexes[idx]; // 1-based string index created by upper_bound() - if ((idx > 0) && d_indexes[idx - 1] == str_idx) - return; // first delimiter found handles all of them for this string - auto const delim_length = d_delimiter.size_bytes(); - const string_view d_str = get_string(str_idx - 1); // -1 for 0-based index - const char* const base_ptr = get_base_ptr(); - size_type delim_count = 0; - size_type last_pos = d_positions[idx] - delim_length; - while ((idx < positions_count) && (d_indexes[idx] == str_idx)) { - // make sure the whole delimiter is inside the string before counting it - auto d_pos = d_positions[idx]; - if (((base_ptr + d_pos + 1 - delim_length) >= d_str.data()) && - ((d_pos - last_pos) >= delim_length)) { - ++delim_count; // only count if the delimiter fits - last_pos = d_pos; // overlapping delimiters are also ignored - } - ++idx; - } - // the number of tokens is delim_count+1 but capped to max_tokens - d_counts[str_idx - 1] = - ((max_tokens > 0) && (delim_count + 1 > max_tokens)) ? max_tokens : delim_count + 1; - } - - rsplit_tokenizer_fn(column_device_view const& d_strings, - string_view const& d_delimiter, - size_type max_tokens) - : base_split_tokenizer(d_strings, d_delimiter, max_tokens) - { - } -}; - /** * @brief Generic split function called by split() and rsplit(). * @@ -423,125 +108,42 @@ struct rsplit_tokenizer_fn : base_split_tokenizer { * @return table of columns for the output of the split */ template -std::unique_ptr split_fn(strings_column_view const& strings_column, +std::unique_ptr
split_fn(strings_column_view const& input,
                                Tokenizer tokenizer,
                                rmm::cuda_stream_view stream,
                                rmm::mr::device_memory_resource* mr)
 {
   std::vector<std::unique_ptr<column>> results;
-  auto const strings_count = strings_column.size();
-  if (strings_count == 0) {
-    results.push_back(make_empty_column(type_id::STRING));
+  if (input.size() == input.null_count()) {
+    results.push_back(std::make_unique<column>(input.parent(), stream, mr));
     return std::make_unique<table>
(std::move(results)); } - auto d_offsets = strings_column.offsets_begin(); - auto const chars_bytes = - cudf::detail::get_value( - strings_column.offsets(), strings_column.offset() + strings_count, stream) - - cudf::detail::get_value(strings_column.offsets(), strings_column.offset(), stream); + // builds the offsets and the vector of all tokens + auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr); + auto const d_offsets = offsets->view().template data(); + auto const d_tokens = tokens.data(); - // count the number of delimiters in the entire column - auto const delimiter_count = - thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { - return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); - }); - - // create vector of every delimiter position in the chars column - rmm::device_uvector delimiter_positions(delimiter_count, stream); - auto d_positions = delimiter_positions.data(); - auto copy_end = thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - delimiter_positions.begin(), - [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { - return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); - }); - - // create vector of string indices for each delimiter - rmm::device_uvector string_indices(delimiter_count, stream); // these will - auto d_string_indices = string_indices.data(); // be strings that only contain delimiters - thrust::upper_bound(rmm::exec_policy(stream), - d_offsets, - d_offsets + strings_count, - delimiter_positions.begin(), - copy_end, - string_indices.begin()); - - // compute the number of tokens per string - rmm::device_uvector token_counts(strings_count, stream); - auto d_token_counts = token_counts.data(); - // first, initialize token counts for strings without delimiters in them - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_token_counts, - [tokenizer] __device__(size_type idx) { - // null are 0, all others 1 - return static_cast(tokenizer.is_valid(idx)); - }); - - // now compute the number of tokens in each string - thrust::for_each_n( + // compute the maximum number of tokens for any string + auto const columns_count = thrust::transform_reduce( rmm::exec_policy(stream), thrust::make_counting_iterator(0), - delimiter_count, - [tokenizer, d_positions, delimiter_count, d_string_indices, d_token_counts] __device__( - size_type idx) { - tokenizer.count_tokens(idx, d_positions, delimiter_count, d_string_indices, d_token_counts); - }); - - // the columns_count is the maximum number of tokens for any string - auto const columns_count = thrust::reduce( - rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); - // boundary case: if no columns, return one null column (custrings issue #119) - if (columns_count == 0) { - results.push_back(std::make_unique( - data_type{type_id::STRING}, - strings_count, - rmm::device_buffer{0, stream, mr}, // no data - cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), - strings_count)); - } + thrust::make_counting_iterator(input.size()), + [d_offsets] __device__(auto idx) -> size_type { return d_offsets[idx + 1] - d_offsets[idx]; }, + 0, + thrust::maximum{}); - // create working area to hold all token positions - rmm::device_uvector tokens(columns_count 
* strings_count, stream);
-  string_index_pair* d_tokens = tokens.data();
-  // initialize the token positions
-  // -- accounts for nulls, empty, and strings with no delimiter in them
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<size_type>(0),
-                     strings_count,
-                     [tokenizer, columns_count, d_tokens] __device__(size_type idx) {
-                       tokenizer.init_tokens(idx, columns_count, d_tokens);
-                     });
-
-  // get the positions for every token using the delimiter positions
-  thrust::for_each_n(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<size_type>(0),
-    delimiter_count,
-    [tokenizer,
-     d_token_counts,
-     d_positions,
-     delimiter_count,
-     d_string_indices,
-     d_tokens] __device__(size_type idx) {
-      tokenizer.process_tokens(
-        idx, d_token_counts, d_positions, delimiter_count, d_string_indices, d_tokens);
-    });
-
-  // Create each column.
-  // - Each pair points to the strings for that column for each row.
-  // - Create the strings column from the vector using the strings factory.
+  // build strings columns for each token position
   for (size_type col = 0; col < columns_count; ++col) {
-    auto column_tokens = d_tokens + (col * strings_count);
-    results.emplace_back(
-      make_strings_column(column_tokens, column_tokens + strings_count, stream, mr));
+    auto itr = cudf::detail::make_counting_transform_iterator(
+      0, [d_tokens, d_offsets, col] __device__(size_type idx) {
+        auto const offset      = d_offsets[idx];
+        auto const token_count = d_offsets[idx + 1] - offset;
+        return (col < token_count) ? d_tokens[offset + col] : string_index_pair{nullptr, 0};
+      });
+    results.emplace_back(make_strings_column(itr, itr + input.size(), stream, mr));
   }
+
   return std::make_unique<table>
(std::move(results)); } diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh new file mode 100644 index 00000000000..41213dac58b --- /dev/null +++ b/cpp/src/strings/split/split.cuh @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cudf::strings::detail { + +/** + * @brief Base class for delimiter-based tokenizers + * + * These are common methods used by both split and rsplit tokenizer functors. + * + * The Derived class is required to implement the `process_tokens` function. + */ +template +struct base_split_tokenizer { + __device__ char const* get_base_ptr() const + { + return d_strings.child(strings_column_view::chars_column_index).data(); + } + + __device__ string_view const get_string(size_type idx) const + { + return d_strings.element(idx); + } + + __device__ bool is_valid(size_type idx) const { return d_strings.is_valid(idx); } + + /** + * @brief Returns `true` if the byte at `idx` is the start of the delimiter + * + * @param idx Index of a byte in the chars column + * @param d_offsets Offsets values to locate the chars ranges + * @param chars_bytes Total number of characters to process + * @return true if delimiter is found starting at position `idx` + */ + __device__ bool is_delimiter(size_type idx, + size_type const* d_offsets, + size_type chars_bytes) const + { + auto const d_chars = get_base_ptr() + d_offsets[0]; + if (idx + d_delimiter.size_bytes() > chars_bytes) { return false; } + return d_delimiter.compare(d_chars + idx, d_delimiter.size_bytes()) == 0; + } + + /** + * @brief This counts the tokens for strings that contain delimiters + * + * Counting tokens is the same regardless if counting from the left + * or from the right. This logic counts from the left which is simpler. + * The count will be truncated appropriately to the max_tokens value. 
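+ *
+ * For example, with delimiter "::" the string ":::c:::" has "::" matches at
+ * byte offsets {0,1,4,5}; the overlap rule (d_pos - last_pos >= delim_size)
+ * keeps only offsets {0,4}, so the count is 3 tokens: "", ":c", and ":"
+ * (cf. the MultiByteDelimiters test in this PR).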
+ * + * @param idx Index of input string + * @param d_positions Start positions of all the delimiters + * @param d_delimiter_offsets Offsets per string to delimiters in d_positions + */ + __device__ size_type count_tokens(size_type idx, + size_type const* d_positions, + size_type const* d_delimiter_offsets) const + { + if (!is_valid(idx)) { return 0; } + + auto const delim_size = d_delimiter.size_bytes(); + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr() + delim_size - 1; + auto const delimiters = + cudf::device_span(d_positions + d_delimiter_offsets[idx], + d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]); + + size_type token_count = 1; // all strings will have at least one token + size_type last_pos = delimiters[0] - delim_size; + for (auto d_pos : delimiters) { + // delimiter must fit in string && overlapping delimiters are ignored + if (((base_ptr + d_pos) < d_str_end) && ((d_pos - last_pos) >= delim_size)) { + ++token_count; + last_pos = d_pos; + } + } + // number of tokens is capped to max_tokens + return ((max_tokens > 0) && (token_count > max_tokens)) ? max_tokens : token_count; + } + + /** + * @brief This will create tokens around each delimiter honoring the string boundaries + * in which the delimiter resides + * + * Each token is placed in `d_all_tokens` so they align consecutively + * with other tokens for the same output column. + * + * The actual token extraction is performed in the subclass process_tokens() function. + * + * @param idx Index of the string to tokenize + * @param d_tokens_offsets Token offsets for each string + * @param d_positions The beginning byte position of each delimiter + * @param d_delimiter_offsets Offsets to d_positions to each delimiter set per string + * @param d_all_tokens All output tokens for the strings column + */ + __device__ void get_tokens(size_type idx, + size_type const* d_tokens_offsets, + size_type const* d_positions, + size_type const* d_delimiter_offsets, + string_index_pair* d_all_tokens) const + { + auto const d_tokens = // this string's tokens output + cudf::device_span(d_all_tokens + d_tokens_offsets[idx], + d_tokens_offsets[idx + 1] - d_tokens_offsets[idx]); + + if (!is_valid(idx)) { return; } + + auto const d_str = get_string(idx); + + // max_tokens already included in token counts + if (d_tokens.size() == 1) { + d_tokens[0] = string_index_pair{d_str.data(), d_str.size_bytes()}; + return; + } + + auto const delimiters = + cudf::device_span(d_positions + d_delimiter_offsets[idx], + d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]); + + auto& derived = static_cast(*this); + derived.process_tokens(d_str, delimiters, d_tokens); + } + + base_split_tokenizer(column_device_view const& d_strings, + string_view const& d_delimiter, + size_type max_tokens) + : d_strings(d_strings), d_delimiter(d_delimiter), max_tokens(max_tokens) + { + } + + protected: + column_device_view const d_strings; // strings to split + string_view const d_delimiter; // delimiter for split + size_type max_tokens; // maximum number of tokens to identify +}; + +/** + * @brief The tokenizer functions for forward splitting + */ +struct split_tokenizer_fn : base_split_tokenizer { + /** + * @brief This will create tokens around each delimiter honoring the string boundaries + * + * The tokens are processed from the beginning of each string ignoring overlapping + * delimiters and honoring the `max_tokens` value. 
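+ *
+ * For example, splitting "w:::x" on "::" finds matches at byte offsets
+ * {1,2}; the match at offset 2 overlaps the one already consumed at offset 1
+ * and is skipped, so the resulting tokens are "w" and ":x"
+ * (cf. the MultiByteDelimiters test in this PR).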
+ * + * @param d_str String to tokenize + * @param d_delimiters Positions of delimiters for this string + * @param d_tokens Output vector to store tokens for this string + */ + __device__ void process_tokens(string_view const d_str, + device_span d_delimiters, + device_span d_tokens) const + { + auto const base_ptr = get_base_ptr(); // d_positions values based on this + auto str_ptr = d_str.data(); + auto const str_end = str_ptr + d_str.size_bytes(); // end of the string + auto const token_count = static_cast(d_tokens.size()); + auto const delim_size = d_delimiter.size_bytes(); + + // build the index-pair of each token for this string + size_type token_idx = 0; + for (auto d_pos : d_delimiters) { + auto const next_delim = base_ptr + d_pos; + if (next_delim < str_ptr || ((next_delim + delim_size) > str_end)) { continue; } + auto const end_ptr = (token_idx + 1 < token_count) ? next_delim : str_end; + + // store the token into the output vector + d_tokens[token_idx++] = + string_index_pair{str_ptr, static_cast(thrust::distance(str_ptr, end_ptr))}; + + // setup for next token + str_ptr = end_ptr + delim_size; + } + // include anything leftover + if (token_idx < token_count) { + d_tokens[token_idx] = + string_index_pair{str_ptr, static_cast(thrust::distance(str_ptr, str_end))}; + } + } + + split_tokenizer_fn(column_device_view const& d_strings, + string_view const& d_delimiter, + size_type max_tokens) + : base_split_tokenizer(d_strings, d_delimiter, max_tokens) + { + } +}; + +/** + * @brief The tokenizer functions for backwards splitting + * + * Same as split_tokenizer_fn except delimiters are searched from the end of each string. + */ +struct rsplit_tokenizer_fn : base_split_tokenizer { + /** + * @brief This will create tokens around each delimiter honoring the string boundaries + * + * The tokens are processed from the end of each string ignoring overlapping + * delimiters and honoring the `max_tokens` value. + * + * @param d_str String to tokenize + * @param d_delimiters Positions of delimiters for this string + * @param d_tokens Output vector to store tokens for this string + */ + __device__ void process_tokens(string_view const d_str, + device_span d_delimiters, + device_span d_tokens) const + { + auto const base_ptr = get_base_ptr(); // d_positions values are based on this ptr + auto const str_begin = d_str.data(); // beginning of the string + auto const token_count = static_cast(d_tokens.size()); + auto const delim_count = static_cast(d_delimiters.size()); + auto const delim_size = d_delimiter.size_bytes(); + + // build the index-pair of each token for this string + auto str_ptr = str_begin + d_str.size_bytes(); + size_type token_idx = 0; + for (auto d = delim_count - 1; d >= 0; --d) { // read right-to-left + auto const prev_delim = base_ptr + d_delimiters[d] + delim_size; + if (prev_delim > str_ptr || ((prev_delim - delim_size) < str_begin)) { continue; } + auto const start_ptr = (token_idx + 1 < token_count) ? prev_delim : str_begin; + + // store the token into the output vector right-to-left + d_tokens[token_count - token_idx - 1] = + string_index_pair{start_ptr, static_cast(thrust::distance(start_ptr, str_ptr))}; + + // setup for next token + str_ptr = start_ptr - delim_size; + ++token_idx; + } + // include anything leftover (rightover?) 
+ if (token_idx < token_count) { + d_tokens[0] = + string_index_pair{str_begin, static_cast(thrust::distance(str_begin, str_ptr))}; + } + } + + rsplit_tokenizer_fn(column_device_view const& d_strings, + string_view const& d_delimiter, + size_type max_tokens) + : base_split_tokenizer(d_strings, d_delimiter, max_tokens) + { + } +}; + +/** + * @brief Helper function used by split/rsplit and split_record/rsplit_record + * + * This function returns all the token/split positions within the input column as processed by + * the given tokenizer. It also returns the offsets for each set of tokens identified per string. + * + * @tparam Tokenizer Type of the tokenizer object + * + * @param input The input column of strings to split + * @param tokenizer Object used for counting and identifying delimiters and tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned objects' device memory. + */ +template +std::pair, rmm::device_uvector> split_helper( + strings_column_view const& input, + Tokenizer tokenizer, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const strings_count = input.size(); + auto const chars_bytes = + cudf::detail::get_value(input.offsets(), input.offset() + strings_count, stream) - + cudf::detail::get_value(input.offsets(), input.offset(), stream); + + auto d_offsets = input.offsets_begin(); + + // count the number of delimiters in the entire column + auto const delimiter_count = + thrust::count_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { + return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); + }); + // Create a vector of every delimiter position in the chars column. + // These may include overlapping or otherwise out-of-bounds delimiters which + // will be resolved during token processing. 
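+  // For example, with delimiter "}:{" the adjacent rows "{a=1}:{b=2}:" and
+  // "{c=3}" produce a byte-level match that spans the row boundary; it is
+  // collected here and later rejected by the tokenizer's bounds checks.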
+ auto delimiter_positions = rmm::device_uvector(delimiter_count, stream); + auto d_positions = delimiter_positions.data(); + auto const copy_end = + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + delimiter_positions.begin(), + [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { + return tokenizer.is_delimiter(idx, d_offsets, chars_bytes); + }); + + // create a vector of offsets to each string's delimiter set within delimiter_positions + auto const delimiter_offsets = [&] { + // first, create a vector of string indices for each delimiter + auto string_indices = rmm::device_uvector(delimiter_count, stream); + thrust::upper_bound(rmm::exec_policy(stream), + d_offsets, + d_offsets + strings_count, + delimiter_positions.begin(), + copy_end, + string_indices.begin()); + + // compute delimiter offsets per string + auto delimiter_offsets = rmm::device_uvector(strings_count + 1, stream); + auto d_delimiter_offsets = delimiter_offsets.data(); + + // memset to zero-out the delimiter counts for any null-entries or strings with no delimiters + CUDF_CUDA_TRY(cudaMemsetAsync( + d_delimiter_offsets, 0, delimiter_offsets.size() * sizeof(size_type), stream.value())); + + // next, count the number of delimiters per string + auto d_string_indices = string_indices.data(); // identifies strings with delimiters only + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + delimiter_count, + [d_string_indices, d_delimiter_offsets] __device__(size_type idx) { + auto const str_idx = d_string_indices[idx] - 1; + atomicAdd(d_delimiter_offsets + str_idx, 1); + }); + // finally, convert the delimiter counts into offsets + thrust::exclusive_scan(rmm::exec_policy(stream), + delimiter_offsets.begin(), + delimiter_offsets.end(), + delimiter_offsets.begin()); + return delimiter_offsets; + }(); + auto const d_delimiter_offsets = delimiter_offsets.data(); + + // compute the number of tokens per string + auto token_counts = rmm::device_uvector(strings_count, stream); + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + token_counts.begin(), + [tokenizer, d_positions, d_delimiter_offsets] __device__(size_type idx) -> size_type { + return tokenizer.count_tokens(idx, d_positions, d_delimiter_offsets); + }); + + // create offsets from the counts for return to the caller + auto offsets = std::get<0>( + cudf::detail::make_offsets_child_column(token_counts.begin(), token_counts.end(), stream, mr)); + auto const total_tokens = + cudf::detail::get_value(offsets->view(), strings_count, stream); + auto const d_tokens_offsets = offsets->view().data(); + + // build a vector of all the token positions for all the strings + auto tokens = rmm::device_uvector(total_tokens, stream); + auto d_tokens = tokens.data(); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + strings_count, + [tokenizer, d_tokens_offsets, d_positions, d_delimiter_offsets, d_tokens] __device__( + size_type idx) { + tokenizer.get_tokens(idx, d_tokens_offsets, d_positions, d_delimiter_offsets, d_tokens); + }); + + return std::make_pair(std::move(offsets), std::move(tokens)); +} + +} // namespace cudf::strings::detail diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index d935ad0b1da..5b79fdefb5a 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ 
-1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "split.cuh" + #include #include #include @@ -23,14 +25,12 @@ #include #include #include -#include #include #include #include #include -#include #include #include @@ -38,108 +38,43 @@ namespace cudf { namespace strings { namespace detail { -using string_index_pair = thrust::pair; - namespace { -enum class Dir { FORWARD, BACKWARD }; - -/** - * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. - * - * The number of tokens is the same regardless if counting from the beginning - * or the end of the string. - */ -struct token_counter_fn { - column_device_view const d_strings; // strings to split - string_view const d_delimiter; // delimiter for split - size_type const max_tokens = std::numeric_limits::max(); - - __device__ size_type operator()(size_type idx) const - { - if (d_strings.is_null(idx)) { return 0; } - - auto const d_str = d_strings.element(idx); - size_type token_count = 0; - size_type start_pos = 0; - while (token_count < max_tokens - 1) { - auto const delimiter_pos = d_str.find(d_delimiter, start_pos); - if (delimiter_pos == string_view::npos) break; - token_count++; - start_pos = delimiter_pos + d_delimiter.length(); - } - return token_count + 1; // always at least one token - } -}; - -/** - * @brief Identify the tokens from the `idx'th` string element of `d_strings`. - */ -template -struct token_reader_fn { - column_device_view const d_strings; // strings to split - string_view const d_delimiter; // delimiter for split - int32_t* d_token_offsets{}; // for locating tokens in d_tokens - string_index_pair* d_tokens{}; - - __device__ string_index_pair resolve_token(string_view const& d_str, - size_type start_pos, - size_type end_pos, - size_type delimiter_pos) const - { - if (dir == Dir::FORWARD) { - auto const byte_offset = d_str.byte_offset(start_pos); - return string_index_pair{d_str.data() + byte_offset, - d_str.byte_offset(delimiter_pos) - byte_offset}; - } else { - auto const byte_offset = d_str.byte_offset(delimiter_pos + d_delimiter.length()); - return string_index_pair{d_str.data() + byte_offset, - d_str.byte_offset(end_pos) - byte_offset}; - } +template +std::unique_ptr split_record_fn(strings_column_view const& input, + Tokenizer tokenizer, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty()) { return make_empty_column(type_id::LIST); } + if (input.size() == input.null_count()) { + auto offsets = std::make_unique(input.offsets(), stream, mr); + auto results = make_empty_column(type_id::STRING); + return make_lists_column(input.size(), + std::move(offsets), + std::move(results), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); } - __device__ void operator()(size_type idx) - { - if (d_strings.is_null(idx)) { return; } + // builds the offsets and the vector of all tokens + auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr); - auto const token_offset = d_token_offsets[idx]; - auto const token_count = d_token_offsets[idx + 1] - token_offset; - auto d_result = d_tokens + token_offset; - auto const d_str = d_strings.element(idx); - if (d_str.empty()) { - // Pandas str.split("") for non-whitespace delimiter is an empty string - *d_result = 
string_index_pair{"", 0}; - return; - } + // build a strings column from the tokens + auto strings_child = make_strings_column(tokens.begin(), tokens.end(), stream, mr); - size_type token_idx = 0; - size_type start_pos = 0; // updates only if moving forward - size_type end_pos = d_str.length(); // updates only if moving backward - while (token_idx < token_count - 1) { - auto const delimiter_pos = dir == Dir::FORWARD ? d_str.find(d_delimiter, start_pos) - : d_str.rfind(d_delimiter, start_pos, end_pos); - if (delimiter_pos == string_view::npos) break; - auto const token = resolve_token(d_str, start_pos, end_pos, delimiter_pos); - if (dir == Dir::FORWARD) { - d_result[token_idx] = token; - start_pos = delimiter_pos + d_delimiter.length(); - } else { - d_result[token_count - 1 - token_idx] = token; - end_pos = delimiter_pos; - } - token_idx++; - } + return make_lists_column(input.size(), + std::move(offsets), + std::move(strings_child), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} - // set last token to remainder of the string - if (dir == Dir::FORWARD) { - auto const offset_bytes = d_str.byte_offset(start_pos); - d_result[token_idx] = - string_index_pair{d_str.data() + offset_bytes, d_str.byte_offset(end_pos) - offset_bytes}; - } else { - d_result[0] = string_index_pair{d_str.data(), d_str.byte_offset(end_pos)}; - } - } -}; +enum class Dir { FORWARD, BACKWARD }; /** * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. @@ -196,7 +131,7 @@ struct whitespace_token_reader_fn { whitespace_string_tokenizer tokenizer(d_str, dir != Dir::FORWARD); size_type token_idx = 0; position_pair token{0, 0}; - if (dir == Dir::FORWARD) { + if constexpr (dir == Dir::FORWARD) { while (tokenizer.next_token() && (token_idx < token_count)) { token = tokenizer.get_token(); d_result[token_idx++] = @@ -224,11 +159,11 @@ struct whitespace_token_reader_fn { // The output is one list item per string template -std::unique_ptr split_record_fn(strings_column_view const& strings, - TokenCounter counter, - TokenReader reader, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr whitespace_split_record_fn(strings_column_view const& strings, + TokenCounter counter, + TokenReader reader, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // create offsets column by counting the number of tokens per string auto strings_count = strings.size(); @@ -244,7 +179,7 @@ std::unique_ptr split_record_fn(strings_column_view const& strings, rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); // last entry is the total number of tokens to be generated - auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); + auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); // split each string into an array of index-pair values rmm::device_uvector tokens(total_tokens, stream); reader.d_token_offsets = d_offsets; @@ -277,18 +212,21 @@ std::unique_ptr split_record(strings_column_view const& strings, auto d_strings_column_ptr = column_device_view::create(strings.parent(), stream); if (delimiter.size() == 0) { - return split_record_fn(strings, - whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens}, - whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens}, - stream, - mr); + return whitespace_split_record_fn( + strings, + whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens}, + 
whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens}, + stream, + mr); } else { string_view d_delimiter(delimiter.data(), delimiter.size()); - return split_record_fn(strings, - token_counter_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, - token_reader_fn{*d_strings_column_ptr, d_delimiter}, - stream, - mr); + if (dir == Dir::FORWARD) { + return split_record_fn( + strings, split_tokenizer_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, stream, mr); + } else { + return split_record_fn( + strings, rsplit_tokenizer_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, stream, mr); + } } } diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 73d5adab427..714c1ad416a 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -308,6 +308,82 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } +TEST_F(StringsSplitTest, MultiByteDelimiters) +{ + // Overlapping delimiters + auto input = + cudf::test::strings_column_wrapper({"u::", "w:::x", "y::::z", "::a", ":::b", ":::c:::"}); + auto view = cudf::strings_column_view(input); + using LCW = cudf::test::lists_column_wrapper; + { + auto result = cudf::strings::split_record(view, cudf::string_scalar("::")); + auto expected_left = LCW({LCW{"u", ""}, + LCW{"w", ":x"}, + LCW{"y", "", "z"}, + LCW{"", "a"}, + LCW{"", ":b"}, + LCW{"", ":c", ":"}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_left); + result = cudf::strings::rsplit_record(view, cudf::string_scalar("::")); + auto expected_right = LCW({LCW{"u", ""}, + LCW{"w:", "x"}, + LCW{"y", "", "z"}, + LCW{"", "a"}, + LCW{":", "b"}, + LCW{":", "c:", ""}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_right); + } + { + auto result = cudf::strings::split(view, cudf::string_scalar("::")); + + auto c0 = cudf::test::strings_column_wrapper({"u", "w", "y", "", "", ""}); + auto c1 = cudf::test::strings_column_wrapper({"", ":x", "", "a", ":b", ":c"}); + auto c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ":"}, {0, 0, 1, 0, 0, 1}); + std::vector> expected_columns; + expected_columns.push_back(c0.release()); + expected_columns.push_back(c1.release()); + expected_columns.push_back(c2.release()); + auto expected_left = std::make_unique(std::move(expected_columns)); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected_left); + + result = cudf::strings::rsplit(view, cudf::string_scalar("::")); + + c0 = cudf::test::strings_column_wrapper({"u", "w:", "y", "", ":", ":"}); + c1 = cudf::test::strings_column_wrapper({"", "x", "", "a", "b", "c:"}); + c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ""}, {0, 0, 1, 0, 0, 1}); + expected_columns.push_back(c0.release()); + expected_columns.push_back(c1.release()); + expected_columns.push_back(c2.release()); + auto expected_right = std::make_unique(std::move(expected_columns)); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected_right); + } + + // Delimiters that span across adjacent strings + input = cudf::test::strings_column_wrapper({"{a=1}:{b=2}:", "{c=3}", ":{}:{}"}); + view = cudf::strings_column_view(input); + { + auto result = cudf::strings::split_record(view, cudf::string_scalar("}:{")); + auto expected = 
LCW({LCW{"{a=1", "b=2}:"}, LCW{"{c=3}"}, LCW{":{", "}"}});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+    result = cudf::strings::rsplit_record(view, cudf::string_scalar("}:{"));
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+  }
+  {
+    auto result = cudf::strings::split(view, cudf::string_scalar("}:{"));
+
+    auto c0 = cudf::test::strings_column_wrapper({"{a=1", "{c=3}", ":{"});
+    auto c1 = cudf::test::strings_column_wrapper({"b=2}:", "", "}"}, {1, 0, 1});
+    std::vector<std::unique_ptr<cudf::column>> expected_columns;
+    expected_columns.push_back(c0.release());
+    expected_columns.push_back(c1.release());
+    auto expected = std::make_unique<cudf::table>(std::move(expected_columns));
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected);
+
+    result = cudf::strings::rsplit(view, cudf::string_scalar("}:{"));
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, *expected);
+  }
+}
+
 TEST_F(StringsSplitTest, SplitRegex)
 {
   std::vector<const char*> h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""};
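For quick reference, a minimal host-side sketch of the overlap behavior these
tests pin down (hypothetical demo code, not part of the patch; it assumes only
the public cudf and cudf_test headers shown, and the expected rows mirror the
MultiByteDelimiters test above):

    #include <cudf/scalar/scalar.hpp>
    #include <cudf/strings/split/split.hpp>
    #include <cudf/strings/strings_column_view.hpp>
    #include <cudf_test/column_wrapper.hpp>

    void overlap_demo()
    {
      // Overlapping "::" matches are consumed left-to-right, so "y::::z"
      // yields three tokens rather than five.
      auto const input  = cudf::test::strings_column_wrapper({"y::::z", ":::c:::"});
      auto const view   = cudf::strings_column_view(input);
      auto const result = cudf::strings::split_record(view, cudf::string_scalar("::"));
      // row 0 -> ["y", "", "z"]; row 1 -> ["", ":c", ":"]
    }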