From db3d6633cc5051cf8f665f43b991e043fbc21eac Mon Sep 17 00:00:00 2001
From: davidwendt
Date: Tue, 14 Jul 2020 11:08:51 -0400
Subject: [PATCH 1/8] change split-record to return list column

---
 cpp/include/cudf/strings/split/split.hpp |  83 ++--
 cpp/src/strings/split/split_record.cu    | 550 +++++++----------------
 cpp/src/strings/split/split_utils.cuh    | 118 +++++
 cpp/tests/strings/split_tests.cpp        | 329 ++++----------
 4 files changed, 413 insertions(+), 667 deletions(-)
 create mode 100644 cpp/src/strings/split/split_utils.cuh

diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp
index 371048287ca..4abd7ea54c8 100644
--- a/cpp/include/cudf/strings/split/split.hpp
+++ b/cpp/include/cudf/strings/split/split.hpp
@@ -82,82 +82,69 @@ std::unique_ptr rsplit(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

 /**
- * @brief The result(s) of a `contiguous_(r)split_record`
+ * @brief Splits individual string elements into a list of tokens.
  *
- * Each column_view resulting from a split operation performed by
- * contiguous_split_record will be returned wrapped in a
- * `contiguous_split_record_result`. The column data addresses stored in the
- * column_view objects are not owned by top level cudf::column objects. The
- * backing memory is instead owned by the `all_data` field and in one contiguous
- * block.
+ * Each element generates an array of tokens that are stored in a
+ * resulting list column.
  *
- * The user is responsible for assuring that the `column_views` or any derived
- * objects do not outlive the memory owned by `all_data`
- */
-struct contiguous_split_record_result {
-  std::vector column_views;
-  std::unique_ptr all_data;
-};
-
-/**
- * @brief Splits each element of the input column to a column of tokens storing
- * the resulting columns in a single contiguous block of memory.
+ * The number of elements in the output list will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * tokens for that row. The resulting number of tokens in each row can vary
+ * from 0 to `maxsplit+1`.
  *
- * This function splits each element in the input column to a column of tokens.
- * The number of columns in the output vector will be the same as the number of
- * elements in the input column. The column length will coincide with the
- * number of tokens; the resulting columns wrapped in the returned object may
- * have different sizes.
+ * The `delimiter` is searched within each string from beginning to end
+ * and splitting stops when either `maxsplit` or the end of the string is reached.
  *
- * Splitting a null string element will result in an empty output column.
+ * A null string element will result in a null list item for that row.
  *
- * @throws cudf:logic_error if `delimiter` is invalid.
+ * @throw cudf::logic_error if `delimiter` is invalid.
  *
  * @param strings A column of string elements to be split.
- * @param delimiter UTF-8 encoded string indicating the split points in each
- * string.
+ * @param delimiter The string to identify split points in each string.
  * Default of empty string indicates split on whitespace.
  * @param maxsplit Maximum number of splits to perform.
  * Default of -1 indicates all possible splits on each string.
  * @param mr Device memory resource used to allocate the returned result's device memory.
- * @return contiguous_split_record_result New vector of strings column_view
- * objects
- * (each column_view element of the vector holds splits from a string
- * element of the input column).
+ * @return List column of strings
+ * Each vector of the list column holds splits from a single row
+ * element of the input column.
  */
-contiguous_split_record_result contiguous_split_record(
+std::unique_ptr split_record(
   strings_column_view const& strings,
   string_scalar const& delimiter = string_scalar(""),
   size_type maxsplit = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

 /**
- * @brief Splits each element of the input column from the end to a column of
- * tokens storing the resulting columns in a single contiguous block of memory.
+ * @brief Splits individual string elements into a list of tokens starting
+ * from the end of each string.
+ *
+ * Each element generates an array of tokens that are stored in a
+ * resulting list column.
+ *
+ * The number of elements in the output list will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * tokens for that row. The resulting number of tokens in each row can vary
+ * from 0 to `maxsplit+1`.
  *
- * This function splits each element in the input column to a column of tokens.
- * The number of columns in the output vector will be the same as the number of
- * elements in the input column. The column length will coincide with the
- * number of tokens; the resulting columns wrapped in the returned object may
- * have different sizes.
+ * The `delimiter` is searched from end to beginning within each string
+ * and splitting stops when either `maxsplit` or the end of the string is reached.
  *
- * Splitting a null string element will result in an empty output column.
+ * A null string element will result in a null list item for that row.
  *
- * @throws cudf:logic_error if `delimiter` is invalid.
+ * @throw cudf::logic_error if `delimiter` is invalid.
  *
  * @param strings A column of string elements to be split.
- * @param delimiter UTF-8 encoded string indicating the split points in each
- * string.
+ * @param delimiter The string to identify split points in each string.
  * Default of empty string indicates split on whitespace.
  * @param maxsplit Maximum number of splits to perform.
  * Default of -1 indicates all possible splits on each string.
  * @param mr Device memory resource used to allocate the returned result's device memory.
- * @return contiguous_split_record_result New vector of strings column_view
- * objects
- * (each column_view element of the vector holds splits from a string
- * element of the input column).
+ * @return List column of strings
+ * Each vector of the list column holds splits from a single row
+ * element of the input column.
*/ -contiguous_split_record_result contiguous_rsplit_record( +std::unique_ptr rsplit_record( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 4a069906f80..19ea2503ca3 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -16,125 +16,81 @@ #include #include +#include +#include #include -#include +#include #include #include #include +#include +#include #include -#include namespace cudf { namespace strings { namespace detail { -namespace { -// align all column size allocations to this boundary so that all output column buffers -// start at that alignment. -static constexpr size_type split_align = 64; +using string_index_pair = thrust::pair; -__device__ size_type compute_memory_size(size_type token_count, size_type token_size_sum) -{ - return cudf::detail::round_up_pow2(token_size_sum, split_align) + - cudf::detail::round_up_pow2((token_count + 1) * static_cast(sizeof(size_type)), - split_align); -} - -struct copy_info { - size_type idx{}; - size_type token_count{}; - size_type token_size_sum{}; - void* memory_ptr{}; -}; +namespace { enum class Dir { FORWARD, BACKWARD }; /** - * @brief Compute the number of tokens, the total byte sizes of the tokens, and - * required memory size for the `idx'th` string element of `d_strings`. + * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. */ template -struct token_reader_fn { +struct token_counter_fn { column_device_view const d_strings; // strings to split string_view const d_delimiter; // delimiter for split size_type const max_tokens = std::numeric_limits::max(); - bool const has_validity = false; - template - __device__ size_type compute_token_char_bytes(string_view const& d_str, - size_type start_pos, - size_type end_pos, - size_type delimiter_pos) const + __device__ size_type operator()(size_type idx) const { - if (last) { - return dir == Dir::FORWARD ? d_str.byte_offset(end_pos) - d_str.byte_offset(start_pos) - : d_str.byte_offset(end_pos); - } else { - return dir == Dir::FORWARD ? d_str.byte_offset(delimiter_pos) - d_str.byte_offset(start_pos) - : d_str.byte_offset(end_pos) - - d_str.byte_offset(delimiter_pos + d_delimiter.length()); - } - } - - // returns a tuple of token count, sum of token sizes in bytes, and required - // memory block size - __device__ thrust::tuple operator()(size_type idx) const - { - if (has_validity && d_strings.is_null(idx)) { - return thrust::make_tuple(0, 0, 0); - } + if (d_strings.is_null(idx)) { return 0; } - auto const d_str = d_strings.element(idx); - size_type token_count = 0; - size_type token_size_sum = 0; - size_type start_pos = 0; // updates only if moving forward - auto end_pos = d_str.length(); // updates only if moving backward + auto const d_str = d_strings.element(idx); + size_type token_count = 0; + size_type start_pos = 0; // updates only if moving forward + size_type end_pos = d_str.length(); // updates only if moving backward while (token_count < max_tokens - 1) { auto const delimiter_pos = dir == Dir::FORWARD ? 
d_str.find(d_delimiter, start_pos) : d_str.rfind(d_delimiter, start_pos, end_pos); - if (delimiter_pos != -1) { - token_count++; - token_size_sum += compute_token_char_bytes(d_str, start_pos, end_pos, delimiter_pos); - if (dir == Dir::FORWARD) { - start_pos = delimiter_pos + d_delimiter.length(); - } else { - end_pos = delimiter_pos; - } - } else { - break; - } + if (delimiter_pos < 0) break; + token_count++; + if (dir == Dir::FORWARD) + start_pos = delimiter_pos + d_delimiter.length(); + else + end_pos = delimiter_pos; } - token_count++; - token_size_sum += compute_token_char_bytes(d_str, start_pos, end_pos, -1); - - auto const memory_size = compute_memory_size(token_count, token_size_sum); - - return thrust::make_tuple( - token_count, token_size_sum, memory_size); + return token_count + 1; // always at least one token } }; /** - * @brief Copy the tokens from the `idx'th` string element of `d_strings` to - * the contiguous memory buffer. + * @brief Identify the tokens from the `idx'th` string element of `d_strings`. */ template -struct token_copier_fn { +struct token_reader_fn { column_device_view const d_strings; // strings to split string_view const d_delimiter; // delimiter for split - bool const has_validity = false; + int32_t* d_token_offsets{}; // for locating tokens in d_tokens + string_index_pair* d_tokens{}; template - __device__ thrust::pair compute_src_byte_offset_and_token_char_bytes( - string_view const& d_str, size_type start_pos, size_type end_pos, size_type delimiter_pos) const + __device__ string_index_pair resolve_token(string_view const& d_str, + size_type start_pos, + size_type end_pos, + size_type delimiter_pos) const { if (last) { auto const src_byte_offset = dir == Dir::FORWARD ? d_str.byte_offset(start_pos) : 0; auto const token_char_bytes = dir == Dir::FORWARD ? d_str.byte_offset(end_pos) - src_byte_offset : d_str.byte_offset(end_pos); - return thrust::make_pair(src_byte_offset, token_char_bytes); + return string_index_pair{d_str.data() + src_byte_offset, token_char_bytes}; } else { auto const src_byte_offset = dir == Dir::FORWARD ? d_str.byte_offset(start_pos) @@ -142,123 +98,71 @@ struct token_copier_fn { auto const token_char_bytes = dir == Dir::FORWARD ? 
d_str.byte_offset(delimiter_pos) - src_byte_offset : d_str.byte_offset(end_pos) - src_byte_offset; - return thrust::make_pair(src_byte_offset, token_char_bytes); + return string_index_pair{d_str.data() + src_byte_offset, token_char_bytes}; } } - __device__ void operator()(copy_info const info) const + __device__ void operator()(size_type idx) { - if (info.token_count == 0) { return; } - - auto memory_ptr = static_cast(info.memory_ptr); - - auto const char_buf_size = cudf::detail::round_up_pow2(info.token_size_sum, split_align); - auto const char_buf_ptr = memory_ptr; - memory_ptr += char_buf_size; - auto const offset_buf_ptr = reinterpret_cast(memory_ptr); + if (d_strings.is_null(idx)) { return; } + + auto const token_offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - token_offset; + auto d_result = d_tokens + token_offset; + auto const d_str = d_strings.element(idx); + if (d_str.empty()) { + *d_result = string_index_pair{"", 0}; + return; + } - auto const d_str = d_strings.element(info.idx); - size_type token_idx = 0; - size_type char_bytes_copied = 0; - size_type start_pos = 0; // updates only if moving forward - auto end_pos = d_str.length(); // updates only if moving backward - while (token_idx < info.token_count - 1) { + size_type token_idx = 0; + size_type start_pos = 0; // updates only if moving forward + size_type end_pos = d_str.length(); // updates only if moving backward + while (token_idx < token_count - 1) { auto const delimiter_pos = dir == Dir::FORWARD ? d_str.find(d_delimiter, start_pos) : d_str.rfind(d_delimiter, start_pos, end_pos); - if (delimiter_pos != -1) { - auto const offset_size_pair = compute_src_byte_offset_and_token_char_bytes( - d_str, start_pos, end_pos, delimiter_pos); - if (dir == Dir::FORWARD) { - thrust::copy(thrust::seq, - d_str.data() + offset_size_pair.first, - d_str.data() + offset_size_pair.first + offset_size_pair.second, - char_buf_ptr + char_bytes_copied); - offset_buf_ptr[token_idx] = char_bytes_copied; - } else { - auto const char_buf_offset = - info.token_size_sum - char_bytes_copied - offset_size_pair.second; - thrust::copy(thrust::seq, - d_str.data() + offset_size_pair.first, - d_str.data() + offset_size_pair.first + offset_size_pair.second, - char_buf_ptr + char_buf_offset); - offset_buf_ptr[info.token_count - 1 - token_idx] = char_buf_offset; - } - token_idx++; - char_bytes_copied += offset_size_pair.second; - if (dir == Dir::FORWARD) { - start_pos = delimiter_pos + d_delimiter.length(); - } else { - end_pos = delimiter_pos; - } - } else { - break; - } + if (delimiter_pos < 0) break; + auto const token = resolve_token(d_str, start_pos, end_pos, delimiter_pos); + if (dir == Dir::FORWARD) + d_result[token_idx] = token; + else + d_result[token_count - 1 - token_idx] = token; + + token_idx++; + if (dir == Dir::FORWARD) + start_pos = delimiter_pos + d_delimiter.length(); + else + end_pos = delimiter_pos; } - auto const offset_size_pair = - compute_src_byte_offset_and_token_char_bytes(d_str, start_pos, end_pos, -1); - if (dir == Dir::FORWARD) { - thrust::copy(thrust::seq, - d_str.data() + offset_size_pair.first, - d_str.data() + offset_size_pair.first + offset_size_pair.second, - char_buf_ptr + char_bytes_copied); - offset_buf_ptr[token_idx] = char_bytes_copied; - } else { - thrust::copy(thrust::seq, d_str.data(), d_str.data() + offset_size_pair.second, char_buf_ptr); - offset_buf_ptr[0] = 0; - } - offset_buf_ptr[info.token_count] = info.token_size_sum; + auto const last_token = resolve_token(d_str, start_pos, 
end_pos, -1); + if (dir == Dir::FORWARD) + d_result[token_idx] = last_token; + else + d_result[0] = last_token; } }; /** - * @brief Compute the number of tokens, the total byte sizes of the tokens, and - * required memory size for the `idx'th` string element of `d_strings`. + * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. */ -template -struct whitespace_token_reader_fn { +struct whitespace_token_counter_fn { column_device_view const d_strings; // strings to split size_type const max_tokens = std::numeric_limits::max(); - bool const has_validity = false; - template - __device__ size_type compute_token_char_bytes(string_view const& d_str, - size_type cur_pos, - size_type to_token_pos) const + __device__ size_type operator()(size_type idx) const { - if (last) { - return dir == Dir::FORWARD - ? d_str.byte_offset(d_str.length()) - d_str.byte_offset(to_token_pos) - : d_str.byte_offset(to_token_pos + 1) - d_str.byte_offset(0); - } else { - return dir == Dir::FORWARD - ? d_str.byte_offset(cur_pos) - d_str.byte_offset(to_token_pos) - : d_str.byte_offset(to_token_pos + 1) - d_str.byte_offset(cur_pos + 1); - } - } + if (d_strings.is_null(idx)) { return 0; } - __device__ thrust::tuple operator()(size_type idx) const - { - if (has_validity && d_strings.is_null(idx)) { - return thrust::make_tuple(0, 0, 0); - } - - auto const d_str = d_strings.element(idx); - size_type token_count = 0; - size_type token_size_sum = 0; - auto spaces = true; - auto reached_max_tokens = false; - size_type to_token_pos = 0; - for (size_type i = 0; i < d_str.length(); ++i) { - auto const cur_pos = dir == Dir::FORWARD ? i : d_str.length() - 1 - i; - auto const ch = d_str[cur_pos]; + auto const d_str = d_strings.element(idx); + size_type token_count = 0; + auto spaces = true; + auto reached_max_tokens = false; + for (auto ch : d_str) { if (spaces != (ch <= ' ')) { - if (spaces) { // from whitespace(s) to a new token - to_token_pos = cur_pos; - } else { // from a token to whitespace(s) + if (!spaces) { if (token_count < max_tokens - 1) { token_count++; - token_size_sum += compute_token_char_bytes(d_str, cur_pos, to_token_pos); } else { reached_max_tokens = true; break; @@ -267,217 +171,105 @@ struct whitespace_token_reader_fn { spaces = !spaces; } } - if (reached_max_tokens || !spaces) { - token_count++; - token_size_sum += compute_token_char_bytes(d_str, -1, to_token_pos); - } - - if (token_count == 0) { // note that pandas.Series.str.split("", pat=" ") - // returns one token (i.e. "") while - // pandas.Series.str.split("") returns 0 token. - return thrust::make_tuple(0, 0, 0); - } - - auto const memory_size = compute_memory_size(token_count, token_size_sum); - - return thrust::make_tuple( - token_count, token_size_sum, memory_size); + // pandas.Series.str.split("") returns 0 tokens. + if (reached_max_tokens || !spaces) token_count++; + return token_count; } }; /** - * @brief Copy the tokens from the `idx'th` string element of `d_strings` to - * the contiguous memory buffer. + * @brief Identify the tokens from the `idx'th` string element of `d_strings`. 
*/ template -struct whitespace_token_copier_fn { +struct whitespace_token_reader_fn { column_device_view const d_strings; // strings to split - bool const has_validity = false; + size_type const max_tokens{}; + int32_t* d_token_offsets{}; + string_index_pair* d_tokens{}; - template - __device__ thrust::pair compute_src_byte_offset_and_token_char_bytes( - string_view const& d_str, - size_type cur_pos, - size_type to_token_pos, - size_type remaining_bytes) const + __device__ void operator()(size_type idx) { - if (last) { - auto const token_char_bytes = remaining_bytes; - auto const src_byte_offset = dir == Dir::FORWARD - ? d_str.byte_offset(to_token_pos) - : d_str.byte_offset(to_token_pos + 1) - token_char_bytes; - return thrust::make_pair(src_byte_offset, token_char_bytes); + auto const token_offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - token_offset; + if (token_count == 0) { return; } + auto d_result = d_tokens + token_offset; + + auto const d_str = d_strings.element(idx); + whitespace_string_tokenizer tokenizer(d_str, dir != Dir::FORWARD); + size_type token_idx = 0; + string_view last_token{}; + if (dir == Dir::FORWARD) { + while (tokenizer.next_token() && (token_idx < token_count)) { + last_token = tokenizer.get_token(); + d_result[token_idx++] = string_index_pair{last_token.data(), last_token.size_bytes()}; + } + if (token_count == max_tokens) { + d_result[token_idx - 1] = string_index_pair{ + last_token.data(), + static_cast(d_str.data() + d_str.size_bytes() - last_token.data())}; + } } else { - auto const src_byte_offset = - dir == Dir::FORWARD ? d_str.byte_offset(to_token_pos) : d_str.byte_offset(cur_pos + 1); - auto const token_char_bytes = dir == Dir::FORWARD - ? d_str.byte_offset(cur_pos) - src_byte_offset - : d_str.byte_offset(to_token_pos + 1) - src_byte_offset; - return thrust::make_pair(src_byte_offset, token_char_bytes); - } - } - - __device__ void operator()(copy_info const info) const - { - if (info.token_count == 0) { return; } - - auto memory_ptr = static_cast(info.memory_ptr); - - auto const char_buf_size = cudf::detail::round_up_pow2(info.token_size_sum, split_align); - auto const char_buf_ptr = memory_ptr; - memory_ptr += char_buf_size; - auto const offset_buf_ptr = reinterpret_cast(memory_ptr); - - auto const d_str = d_strings.element(info.idx); - size_type token_idx = 0; - size_type char_bytes_copied = 0; - auto spaces = true; - size_type to_token_pos = 0; - for (size_type i = 0; i < d_str.length(); ++i) { - auto const cur_pos = dir == Dir::FORWARD ? 
i : d_str.length() - 1 - i; - auto const ch = d_str[cur_pos]; - if (spaces != (ch <= ' ')) { - if (spaces) { // from whitespace(s) to a new token - to_token_pos = cur_pos; - } else { // from a token to whitespace(s) - if (token_idx < info.token_count - 1) { - auto const offset_size_pair = compute_src_byte_offset_and_token_char_bytes( - d_str, cur_pos, to_token_pos, info.token_size_sum - char_bytes_copied); - if (dir == Dir::FORWARD) { - thrust::copy(thrust::seq, - d_str.data() + offset_size_pair.first, - d_str.data() + offset_size_pair.first + offset_size_pair.second, - char_buf_ptr + char_bytes_copied); - offset_buf_ptr[token_idx] = char_bytes_copied; - } else { - auto const char_buf_offset = - info.token_size_sum - char_bytes_copied - offset_size_pair.second; - thrust::copy(thrust::seq, - d_str.data() + offset_size_pair.first, - d_str.data() + offset_size_pair.first + offset_size_pair.second, - char_buf_ptr + char_buf_offset); - offset_buf_ptr[info.token_count - 1 - token_idx] = char_buf_offset; - } - token_idx++; - char_bytes_copied += offset_size_pair.second; - } else { - break; - } - } - spaces = !spaces; + while (tokenizer.prev_token() && (token_idx < token_count)) { + last_token = tokenizer.get_token(); + d_result[token_count - 1 - token_idx] = + string_index_pair{last_token.data(), last_token.size_bytes()}; + ++token_idx; } - } - if (token_idx < info.token_count) { - auto const offset_size_pair = compute_src_byte_offset_and_token_char_bytes( - d_str, -1, to_token_pos, info.token_size_sum - char_bytes_copied); - if (dir == Dir::FORWARD) { - thrust::copy(thrust::seq, - d_str.data() + offset_size_pair.first, - d_str.data() + offset_size_pair.first + offset_size_pair.second, - char_buf_ptr + char_bytes_copied); - offset_buf_ptr[token_idx] = char_bytes_copied; - } else { - thrust::copy(thrust::seq, - d_str.data() + offset_size_pair.first, - d_str.data() + offset_size_pair.first + offset_size_pair.second, - char_buf_ptr); - offset_buf_ptr[0] = 0; + if (token_count == max_tokens) { + --token_idx; + d_result[token_count - 1 - token_idx] = string_index_pair{ + d_str.data(), + static_cast(last_token.data() + last_token.size_bytes() - d_str.data())}; } } - offset_buf_ptr[info.token_count] = info.token_size_sum; } }; -// Generic split function used by split_record and rsplit_record -template -contiguous_split_record_result contiguous_split_record_fn(strings_column_view const& strings, - TokenReader reader, - TokenCopier copier, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) -{ - // read each string element of the input column to count the number of tokens - // and compute the memory offsets +} // namespace +// The output is one list item per string +template +std::unique_ptr split_record_fn(strings_column_view const& strings, + TokenCounter counter, + TokenReader reader, + rmm::mr::device_memory_resource* mr, + cudaStream_t stream) +{ + // create offsets column by counting the number of tokens per string auto strings_count = strings.size(); - rmm::device_vector d_token_counts(strings_count); - rmm::device_vector d_token_size_sums(strings_count); - rmm::device_vector d_memory_offsets(strings_count + 1); - + auto offsets = make_numeric_column( + data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().data(); thrust::transform(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), - thrust::make_zip_iterator(thrust::make_tuple( - 
d_token_counts.begin(), d_token_size_sums.begin(), d_memory_offsets.begin())), - reader); - - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), - d_memory_offsets.begin(), - d_memory_offsets.end(), - d_memory_offsets.begin()); - - // allocate and copy - - thrust::host_vector h_token_counts = d_token_counts; - thrust::host_vector h_token_size_sums = d_token_size_sums; - thrust::host_vector h_memory_offsets = d_memory_offsets; - - auto memory_size = h_memory_offsets.back(); - auto all_data_ptr = std::make_unique(memory_size, stream, mr); - - auto d_all_data_ptr = reinterpret_cast(all_data_ptr->data()); - auto d_token_counts_ptr = d_token_counts.data().get(); - auto d_memory_offsets_ptr = d_memory_offsets.data().get(); - auto d_token_size_sums_ptr = d_token_size_sums.data().get(); - auto copy_info_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [d_all_data_ptr, d_token_counts_ptr, d_memory_offsets_ptr, d_token_size_sums_ptr] __device__( - auto i) { - return copy_info{i, - d_token_counts_ptr[i], - d_token_size_sums_ptr[i], - d_all_data_ptr + d_memory_offsets_ptr[i]}; - }); - - thrust::for_each( - rmm::exec_policy(stream)->on(stream), copy_info_begin, copy_info_begin + strings_count, copier); - - // update column_view objects - - std::vector column_views{}; - for (size_type i = 0; i < strings_count; ++i) { - if (h_token_counts[i] == 0) { - column_views.emplace_back(strings.parent().type(), 0, nullptr); - } else { - auto memory_ptr = d_all_data_ptr + h_memory_offsets[i]; - auto char_buf_size = cudf::util::round_up_safe(h_token_size_sums[i], split_align); - - auto char_buf_ptr = memory_ptr; - memory_ptr += char_buf_size; - auto offset_buf_ptr = reinterpret_cast(memory_ptr); - - column_views.emplace_back( - strings.parent().type(), - h_token_counts[i], - nullptr, - nullptr, - UNKNOWN_NULL_COUNT, - 0, - std::vector{ - column_view(strings.offsets().type(), h_token_counts[i] + 1, offset_buf_ptr), - column_view(strings.chars().type(), h_token_size_sums[i], char_buf_ptr)}); - } - } - - CUDA_TRY(cudaStreamSynchronize(stream)); - - return contiguous_split_record_result{std::move(column_views), std::move(all_data_ptr)}; + d_offsets, + counter); + thrust::exclusive_scan( + rmm::exec_policy(stream)->on(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + + // last entry is the total number of tokens to be generated + auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); + // split each string into an array of index-pair values + rmm::device_vector tokens(total_tokens); + reader.d_token_offsets = d_offsets; + reader.d_tokens = tokens.data().get(); + thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + strings_count, + reader); + // convert the index-pairs into one big strings column + auto strings_output = make_strings_column(tokens.begin(), tokens.end(), mr, stream); + // create a lists column using the offsets and the strings columns + return make_lists_column(strings_count, + std::move(offsets), + std::move(strings_output), + strings.null_count(), + copy_bitmask(strings.parent(), stream, mr)); } -} // namespace - template -contiguous_split_record_result contiguous_split_record( +std::unique_ptr split_record( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, @@ -488,24 +280,21 @@ contiguous_split_record_result contiguous_split_record( // makes consistent with Pandas size_type max_tokens = maxsplit > 0 ? 
maxsplit + 1 : std::numeric_limits::max(); - auto has_validity = strings.parent().nullable(); auto d_strings_column_ptr = column_device_view::create(strings.parent(), stream); if (delimiter.size() == 0) { - return contiguous_split_record_fn( - strings, - whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens, has_validity}, - whitespace_token_copier_fn{*d_strings_column_ptr, has_validity}, - mr, - stream); + return split_record_fn(strings, + whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens}, + whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens}, + mr, + stream); } else { string_view d_delimiter(delimiter.data(), delimiter.size()); - return contiguous_split_record_fn( - strings, - token_reader_fn{*d_strings_column_ptr, d_delimiter, max_tokens, has_validity}, - token_copier_fn{*d_strings_column_ptr, d_delimiter, has_validity}, - mr, - stream); + return split_record_fn(strings, + token_counter_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, + token_reader_fn{*d_strings_column_ptr, d_delimiter}, + mr, + stream); } } @@ -513,23 +302,22 @@ contiguous_split_record_result contiguous_split_record( // external APIs -contiguous_split_record_result contiguous_split_record(strings_column_view const& strings, - string_scalar const& delimiter, - size_type maxsplit, - rmm::mr::device_memory_resource* mr) +std::unique_ptr split_record(strings_column_view const& strings, + string_scalar const& delimiter, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contiguous_split_record(strings, delimiter, maxsplit, mr, 0); + return detail::split_record(strings, delimiter, maxsplit, mr, 0); } -contiguous_split_record_result contiguous_rsplit_record(strings_column_view const& strings, - string_scalar const& delimiter, - size_type maxsplit, - rmm::mr::device_memory_resource* mr) +std::unique_ptr rsplit_record(strings_column_view const& strings, + string_scalar const& delimiter, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contiguous_split_record( - strings, delimiter, maxsplit, mr, 0); + return detail::split_record(strings, delimiter, maxsplit, mr, 0); } } // namespace strings diff --git a/cpp/src/strings/split/split_utils.cuh b/cpp/src/strings/split/split_utils.cuh new file mode 100644 index 00000000000..5c9f98f273a --- /dev/null +++ b/cpp/src/strings/split/split_utils.cuh @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * @brief Instantiated for each string to manage navigating tokens from + * the beginning or the end of that string. + */ +struct whitespace_string_tokenizer { + /** + * @brief Identifies the position range of the next token in the given + * string at the specified iterator position. + * + * Tokens are delimited by one or more whitespace characters. 
+ * + * @return true if a token has been found + */ + __device__ bool next_token() + { + if (itr != d_str.begin()) { // skip these 2 lines the first time through + ++itr; + start_position = itr.byte_offset(); // end_position + 1; + } + if (start_position >= d_str.size_bytes()) return false; + // continue search for the next token + end_position = d_str.size_bytes(); + for (; itr < d_str.end(); ++itr) { + if (spaces == (*itr <= ' ')) { + if (spaces) + start_position = (itr + 1).byte_offset(); + else + end_position = (itr + 1).byte_offset(); + continue; + } + spaces = !spaces; + if (spaces) { + end_position = itr.byte_offset(); + break; + } + } + return start_position < end_position; + } + + /** + * @brief Identifies the position range of the previous token in the given + * string at the specified iterator position. + * + * Tokens are delimited by one or more whitespace characters. + * + * @return true if a token has been found + */ + __device__ bool prev_token() + { + end_position = start_position - 1; + --itr; + if (end_position <= 0) return false; + // continue search for the next token + start_position = 0; + for (; itr >= d_str.begin(); --itr) { + if (spaces == (*itr <= ' ')) { + if (spaces) + end_position = itr.byte_offset(); + else + start_position = itr.byte_offset(); + continue; + } + spaces = !spaces; + if (spaces) { + start_position = (itr + 1).byte_offset(); + break; + } + } + return start_position < end_position; + } + + __device__ string_view get_token() const + { + return string_view{d_str.data() + start_position, end_position - start_position}; + } + + __device__ whitespace_string_tokenizer(string_view const& d_str, bool reverse = false) + : d_str{d_str}, + spaces(true), + start_position{reverse ? d_str.size_bytes() + 1 : 0}, + end_position{d_str.size_bytes()}, + itr{reverse ? 
d_str.end() : d_str.begin()} + { + } + + private: + string_view const d_str; + bool spaces; // true if current position is whitespace + cudf::string_view::const_iterator itr; + size_type start_position; + size_type end_position; +}; + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index ebb1e1e78f7..2958b2892f3 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -275,7 +275,7 @@ TEST_F(StringsSplitTest, AllNullsCase) EXPECT_TRUE(column.null_count() == column.size()); } -TEST_F(StringsSplitTest, ContiguousSplitRecord) +TEST_F(StringsSplitTest, SplitRecord) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; cudf::test::strings_column_wrapper strings( @@ -283,34 +283,17 @@ TEST_F(StringsSplitTest, ContiguousSplitRecord) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view strings_view(strings); - - std::vector h_expected1{"", "Héllo", "thesé"}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"are", "some", "", ""}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{"tést", "String"}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{""}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); - - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - - auto result = cudf::strings::contiguous_split_record(strings_view, cudf::string_scalar(" ")); - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 0; i < result.column_views.size(); ++i) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + auto result = + cudf::strings::split_record(cudf::strings_column_view(strings), cudf::string_scalar(" ")); + cudf::lists_column_view lcv(result->view()); + cudf::test::strings_column_wrapper expected( + {"", "Héllo", "thesé", "are", "some", "", "", "tést", "String", ""}); + cudf::test::fixed_width_column_wrapper offsets({0, 3, 3, 7, 9, 10}); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousSplitRecordWithMaxSplit) +TEST_F(StringsSplitTest, SplitRecordWithMaxSplit) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; cudf::test::strings_column_wrapper strings( @@ -318,34 +301,18 @@ TEST_F(StringsSplitTest, ContiguousSplitRecordWithMaxSplit) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view strings_view(strings); - - std::vector h_expected1{"", "Héllo thesé"}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"are", "some 
"}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{"tést", "String"}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{""}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); + auto result = + cudf::strings::split_record(cudf::strings_column_view(strings), cudf::string_scalar(" "), 1); - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - - auto result = cudf::strings::contiguous_split_record(strings_view, cudf::string_scalar(" "), 1); - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 0; i < result.column_views.size(); ++i) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + cudf::lists_column_view lcv(result->view()); + cudf::test::strings_column_wrapper expected( + {"", "Héllo thesé", "are", "some ", "tést", "String", ""}); + cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 4, 6, 7}); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousSplitRecordWhitespace) +TEST_F(StringsSplitTest, SplitRecordWhitespace) { std::vector h_strings{ " Héllo thesé", nullptr, "are\tsome ", "tést\nString", " "}; @@ -354,34 +321,15 @@ TEST_F(StringsSplitTest, ContiguousSplitRecordWhitespace) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view strings_view(strings); - - std::vector h_expected1{"Héllo", "thesé"}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"are", "some"}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{"tést", "String"}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); - - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - - auto result = cudf::strings::contiguous_split_record(strings_view); - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 0; i < result.column_views.size(); ++i) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + auto result = cudf::strings::split_record(cudf::strings_column_view(strings)); + cudf::lists_column_view lcv(result->view()); + cudf::test::strings_column_wrapper expected({"Héllo", "thesé", "are", "some", "tést", "String"}); + cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 4, 6, 6}); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousSplitRecordWhitespaceWithMaxSplit) 
+TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) { std::vector h_strings{ " Héllo thesé ", nullptr, "are\tsome ", "tést\nString", " "}; @@ -390,34 +338,17 @@ TEST_F(StringsSplitTest, ContiguousSplitRecordWhitespaceWithMaxSplit) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view strings_view(strings); - - std::vector h_expected1{"Héllo", "thesé "}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"are", "some "}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{"tést", "String"}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); - - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - - auto result = cudf::strings::contiguous_split_record(strings_view, cudf::string_scalar(""), 1); - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 0; i < result.column_views.size(); ++i) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + auto result = + cudf::strings::split_record(cudf::strings_column_view(strings), cudf::string_scalar(""), 1); + cudf::lists_column_view lcv(result->view()); + cudf::test::strings_column_wrapper expected( + {"Héllo", "thesé ", "are", "some ", "tést", "String"}); + cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 4, 6, 6}); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousRSplitRecord) +TEST_F(StringsSplitTest, RSplitRecord) { std::vector h_strings{ "héllo", nullptr, "a_bc_déf", "a__bc", "_ab_cd", "ab_cd_", "", " a b ", " a bbb c"}; @@ -426,46 +357,31 @@ TEST_F(StringsSplitTest, ContiguousRSplitRecord) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view strings_view(strings); - - std::vector h_expected1{"héllo"}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"a", "bc", "déf"}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{"a", "", "bc"}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{"", "ab", "cd"}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); - std::vector h_expected6{"ab", "cd", ""}; - cudf::test::strings_column_wrapper expected6(h_expected6.begin(), h_expected6.end()); - std::vector h_expected7{""}; - cudf::test::strings_column_wrapper expected7(h_expected7.begin(), h_expected7.end()); - std::vector h_expected8{" a b "}; - cudf::test::strings_column_wrapper expected8(h_expected8.begin(), h_expected8.end()); - std::vector 
h_expected9{" a bbb c"}; - cudf::test::strings_column_wrapper expected9(h_expected9.begin(), h_expected9.end()); - - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - expected_columns.push_back(expected6.release()); - expected_columns.push_back(expected7.release()); - expected_columns.push_back(expected8.release()); - expected_columns.push_back(expected9.release()); - - auto result = cudf::strings::contiguous_rsplit_record(strings_view, cudf::string_scalar("_")); - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 0; i < result.column_views.size(); i++) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + cudf::test::strings_column_wrapper expected({"héllo", + "a", + "bc", + "déf", + "a", + "", + "bc", + "", + "ab", + "cd", + "ab", + "cd", + "", + "", + " a b ", + " a bbb c"}); + cudf::test::fixed_width_column_wrapper offsets({0, 1, 1, 4, 7, 10, 13, 14, 15, 16}); + auto result = + cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar("_")); + cudf::lists_column_view lcv(result->view()); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousRSplitRecordWithMaxSplit) +TEST_F(StringsSplitTest, RSplitRecordWithMaxSplit) { std::vector h_strings{"héllo", nullptr, @@ -481,46 +397,20 @@ TEST_F(StringsSplitTest, ContiguousRSplitRecordWithMaxSplit) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view strings_view(strings); + cudf::test::strings_column_wrapper expected( + {"héllo", "a", "bc", "déf", "___a", "", "bc", "_ab", "cd", "", + "ab", "cd", "", "", " a b _", "", "", "_", "", " a bbb c"}); + cudf::test::fixed_width_column_wrapper offsets({0, 1, 1, 4, 7, 10, 13, 14, 17, 20}); - std::vector h_expected1{"héllo"}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"a", "bc", "déf"}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{"___a", "", "bc"}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{"_ab", "cd", ""}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); - std::vector h_expected6{"ab", "cd", ""}; - cudf::test::strings_column_wrapper expected6(h_expected6.begin(), h_expected6.end()); - std::vector h_expected7{""}; - cudf::test::strings_column_wrapper expected7(h_expected7.begin(), h_expected7.end()); - std::vector h_expected8{" a b _", "", ""}; - cudf::test::strings_column_wrapper expected8(h_expected8.begin(), h_expected8.end()); - std::vector h_expected9{"_", "", " a bbb c"}; - cudf::test::strings_column_wrapper expected9(h_expected9.begin(), h_expected9.end()); + auto result = + cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar("_"), 2); - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - 
expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - expected_columns.push_back(expected6.release()); - expected_columns.push_back(expected7.release()); - expected_columns.push_back(expected8.release()); - expected_columns.push_back(expected9.release()); - - auto result = cudf::strings::contiguous_rsplit_record(strings_view, cudf::string_scalar("_"), 2); - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 0; i < result.column_views.size(); i++) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + cudf::lists_column_view lcv(result->view()); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousRSplitRecordWhitespace) +TEST_F(StringsSplitTest, RSplitRecordWhitespace) { std::vector h_strings{"héllo", nullptr, "a_bc_déf", "", " a\tb ", " a\r bbb c"}; cudf::test::strings_column_wrapper strings( @@ -528,37 +418,17 @@ TEST_F(StringsSplitTest, ContiguousRSplitRecordWhitespace) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view strings_view(strings); + cudf::test::strings_column_wrapper expected({"héllo", "a_bc_déf", "a", "b", "a", "bbb", "c"}); + cudf::test::fixed_width_column_wrapper offsets({0, 1, 1, 2, 2, 4, 7}); - std::vector h_expected1{"héllo"}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"a_bc_déf"}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{"a", "b"}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); - std::vector h_expected6{"a", "bbb", "c"}; - cudf::test::strings_column_wrapper expected6(h_expected6.begin(), h_expected6.end()); + auto result = cudf::strings::rsplit_record(cudf::strings_column_view(strings)); - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - expected_columns.push_back(expected6.release()); - auto result = cudf::strings::contiguous_rsplit_record(strings_view); - - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 4; i < 5; i++) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + cudf::lists_column_view lcv(result->view()); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousRSplitRecordWhitespaceWithMaxSplit) +TEST_F(StringsSplitTest, RSplitRecordWhitespaceWithMaxSplit) { std::vector h_strings{ " héllo Asher ", nullptr, " a_bc_déf ", "", " a\tb ", " a\r bbb c"}; @@ -567,44 +437,27 @@ TEST_F(StringsSplitTest, ContiguousRSplitRecordWhitespaceWithMaxSplit) h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view 
strings_view(strings); - - std::vector h_expected1{" héllo", "Asher"}; - cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end()); - std::vector h_expected2{}; - cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); - std::vector h_expected3{"a_bc_déf"}; - cudf::test::strings_column_wrapper expected3(h_expected3.begin(), h_expected3.end()); - std::vector h_expected4{}; - cudf::test::strings_column_wrapper expected4(h_expected4.begin(), h_expected4.end()); - std::vector h_expected5{" a", "b"}; - cudf::test::strings_column_wrapper expected5(h_expected5.begin(), h_expected5.end()); - std::vector h_expected6{" a\r bbb", "c"}; - cudf::test::strings_column_wrapper expected6(h_expected6.begin(), h_expected6.end()); - - std::vector> expected_columns; - expected_columns.push_back(expected1.release()); - expected_columns.push_back(expected2.release()); - expected_columns.push_back(expected3.release()); - expected_columns.push_back(expected4.release()); - expected_columns.push_back(expected5.release()); - expected_columns.push_back(expected6.release()); - auto result = cudf::strings::contiguous_rsplit_record(strings_view, cudf::string_scalar(""), 1); - - EXPECT_TRUE(result.column_views.size() == expected_columns.size()); - for (size_t i = 4; i < 5; i++) { - cudf::test::expect_columns_equal(result.column_views[i], *expected_columns[i]); - } + cudf::test::strings_column_wrapper expected( + {" héllo", "Asher", "a_bc_déf", " a", "b", " a\r bbb", "c"}); + cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 3, 3, 5, 7}); + + auto result = + cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar(""), 1); + cudf::lists_column_view lcv(result->view()); + cudf::test::print(lcv.offsets()); + cudf::test::print(lcv.child()); + cudf::test::expect_columns_equal(lcv.child(), expected); + cudf::test::expect_columns_equal(lcv.offsets(), offsets); } -TEST_F(StringsSplitTest, ContiguousSplitRecordZeroSizeStringsColumns) +TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto split_record_result = cudf::strings::contiguous_split_record(zero_size_strings_column); - EXPECT_TRUE(split_record_result.column_views.size() == 0); - auto rsplit_record_result = cudf::strings::contiguous_rsplit_record(zero_size_strings_column); - EXPECT_TRUE(rsplit_record_result.column_views.size() == 0); + auto split_record_result = cudf::strings::split_record(zero_size_strings_column); + EXPECT_TRUE(split_record_result->size() == 0); + auto rsplit_record_result = cudf::strings::rsplit_record(zero_size_strings_column); + EXPECT_TRUE(rsplit_record_result->size() == 0); } TEST_F(StringsSplitTest, Partition) From c3dad9e81299fb2a72112660538cef668921c607 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 14 Jul 2020 11:23:31 -0400 Subject: [PATCH 2/8] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9228e873d4b..fce7556420f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -116,6 +116,7 @@ - PR #5662 Make Java ColumnVector(long nativePointer) constructor public - PR #5679 Use `pickle5` to test older Python versions - PR #5684 Use `pickle5` in `Serializable` (when available) +- PR #5687 Change strings::split_record to return a lists column ## Bug Fixes From bc62892e7c67334bedf1b372d6c1764f8a855bd0 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 14 Jul 2020 13:24:25 
-0400 Subject: [PATCH 3/8] remove test print lines --- cpp/tests/strings/split_tests.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 2958b2892f3..95756e2fd33 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -444,8 +444,6 @@ TEST_F(StringsSplitTest, RSplitRecordWhitespaceWithMaxSplit) auto result = cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar(""), 1); cudf::lists_column_view lcv(result->view()); - cudf::test::print(lcv.offsets()); - cudf::test::print(lcv.child()); cudf::test::expect_columns_equal(lcv.child(), expected); cudf::test::expect_columns_equal(lcv.offsets(), offsets); } From a6b8f3614c52030800ccff661d2f69de1aa54ba9 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 14 Jul 2020 13:24:56 -0400 Subject: [PATCH 4/8] refactor whitespace tokenize utility from split() --- cpp/src/strings/split/split.cu | 100 ++------------------------ cpp/src/strings/split/split_record.cu | 20 +++--- cpp/src/strings/split/split_utils.cuh | 7 +- 3 files changed, 16 insertions(+), 111 deletions(-) diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 89d09d56517..3d7d902551f 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include // upper_bound() #include // copy_if() @@ -34,8 +35,8 @@ namespace cudf { namespace strings { namespace detail { + using string_index_pair = thrust::pair; -using position_pair = thrust::pair; namespace { @@ -582,99 +583,6 @@ struct base_whitespace_split_tokenizer { size_type max_tokens; // maximum number of tokens }; -/** - * @brief Instantiated for each string to manage navigating tokens from - * the beginning or the end of that string. - */ -struct whitespace_string_tokenizer { - /** - * @brief Identifies the position range of the next token in the given - * string at the specified iterator position. - * - * Tokens are delimited by one or more whitespace characters. - * - * @return true if a token has been found - */ - __device__ bool next_token() - { - if (itr != d_str.begin()) { // skip these 2 lines the first time through - start_position = end_position + 1; - ++itr; - } - if (start_position >= d_str.length()) return false; - // continue search for the next token - end_position = d_str.length(); - for (; itr < d_str.end(); ++itr) { - if (spaces == (*itr <= ' ')) { - if (spaces) - start_position = itr.position() + 1; - else - end_position = itr.position() + 1; - continue; - } - spaces = !spaces; - if (spaces) { - end_position = itr.position(); - break; - } - } - return start_position < end_position; - } - - /** - * @brief Identifies the position range of the previous token in the given - * string at the specified iterator position. - * - * Tokens are delimited by one or more whitespace characters. 
- * - * @return true if a token has been found - */ - __device__ bool prev_token() - { - end_position = start_position - 1; - --itr; - if (end_position <= 0) return false; - // continue search for the next token - start_position = 0; - for (; itr >= d_str.begin(); --itr) { - if (spaces == (*itr <= ' ')) { - if (spaces) - end_position = itr.position(); - else - start_position = itr.position(); - continue; - } - spaces = !spaces; - if (spaces) { - start_position = itr.position() + 1; - break; - } - } - return start_position < end_position; - } - - __device__ position_pair token_byte_positions() - { - return position_pair{d_str.byte_offset(start_position), d_str.byte_offset(end_position)}; - } - - __device__ whitespace_string_tokenizer(string_view const& d_str, bool reverse = false) - : d_str{d_str}, - spaces(true), - start_position{reverse ? d_str.length() + 1 : 0}, - end_position{d_str.length()}, - itr{reverse ? d_str.end() : d_str.begin()} - { - } - - private: - string_view const d_str; - bool spaces; // true if current position is whitespace - cudf::string_view::const_iterator itr; - size_type start_position; - size_type end_position; -}; - /** * @brief The tokenizer functions for split() with whitespace. * @@ -709,7 +617,7 @@ struct whitespace_split_tokenizer_fn : base_whitespace_split_tokenizer { size_type token_idx = 0; position_pair token{0, 0}; while (tokenizer.next_token() && (token_idx < token_count)) { - token = tokenizer.token_byte_positions(); + token = tokenizer.get_token(); d_tokens[d_strings.size() * (token_idx++)] = string_index_pair{d_str.data() + token.first, (token.second - token.first)}; } @@ -760,7 +668,7 @@ struct whitespace_rsplit_tokenizer_fn : base_whitespace_split_tokenizer { size_type token_idx = 0; position_pair token{0, 0}; while (tokenizer.prev_token() && (token_idx < token_count)) { - token = tokenizer.token_byte_positions(); + token = tokenizer.get_token(); d_tokens[d_strings.size() * (token_count - 1 - token_idx)] = string_index_pair{d_str.data() + token.first, (token.second - token.first)}; ++token_idx; diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 19ea2503ca3..c0d515b9d56 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -197,29 +197,27 @@ struct whitespace_token_reader_fn { auto const d_str = d_strings.element(idx); whitespace_string_tokenizer tokenizer(d_str, dir != Dir::FORWARD); size_type token_idx = 0; - string_view last_token{}; + position_pair token{0, 0}; if (dir == Dir::FORWARD) { while (tokenizer.next_token() && (token_idx < token_count)) { - last_token = tokenizer.get_token(); - d_result[token_idx++] = string_index_pair{last_token.data(), last_token.size_bytes()}; + token = tokenizer.get_token(); + d_result[token_idx++] = + string_index_pair{d_str.data() + token.first, token.second - token.first}; } if (token_count == max_tokens) { - d_result[token_idx - 1] = string_index_pair{ - last_token.data(), - static_cast(d_str.data() + d_str.size_bytes() - last_token.data())}; + d_result[token_idx - 1] = + string_index_pair{d_str.data() + token.first, d_str.size_bytes() - token.first}; } } else { while (tokenizer.prev_token() && (token_idx < token_count)) { - last_token = tokenizer.get_token(); + token = tokenizer.get_token(); d_result[token_count - 1 - token_idx] = - string_index_pair{last_token.data(), last_token.size_bytes()}; + string_index_pair{d_str.data() + token.first, token.second - token.first}; ++token_idx; } if (token_count == max_tokens) { 
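        // Token count was capped at max_tokens: widen the leftmost stored token so it
        // spans from the start of the string through the end of the last token found,
        // keeping the unsplit remainder in the first list element (mirroring
        // Pandas-style rsplit with a maxsplit limit).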
--token_idx; - d_result[token_count - 1 - token_idx] = string_index_pair{ - d_str.data(), - static_cast(last_token.data() + last_token.size_bytes() - d_str.data())}; + d_result[token_count - 1 - token_idx] = string_index_pair{d_str.data(), token.second}; } } } diff --git a/cpp/src/strings/split/split_utils.cuh b/cpp/src/strings/split/split_utils.cuh index 5c9f98f273a..a6afd1bef10 100644 --- a/cpp/src/strings/split/split_utils.cuh +++ b/cpp/src/strings/split/split_utils.cuh @@ -20,6 +20,8 @@ namespace cudf { namespace strings { namespace detail { +using position_pair = thrust::pair; + /** * @brief Instantiated for each string to manage navigating tokens from * the beginning or the end of that string. @@ -91,10 +93,7 @@ struct whitespace_string_tokenizer { return start_position < end_position; } - __device__ string_view get_token() const - { - return string_view{d_str.data() + start_position, end_position - start_position}; - } + __device__ position_pair get_token() const { return position_pair{start_position, end_position}; } __device__ whitespace_string_tokenizer(string_view const& d_str, bool reverse = false) : d_str{d_str}, From 2d880a125ab11ba5808b4b7c8f918d0c0b13ee91 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 14 Jul 2020 13:45:13 -0400 Subject: [PATCH 5/8] remove unneeded template case --- cpp/src/strings/split/split_record.cu | 68 ++++++++++++--------------- 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index c0d515b9d56..d41fc01d31c 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -40,8 +40,10 @@ enum class Dir { FORWARD, BACKWARD }; /** * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. + * + * The number of tokens is the same regardless if counting from the beginning + * or the end of the string. */ -template struct token_counter_fn { column_device_view const d_strings; // strings to split string_view const d_delimiter; // delimiter for split @@ -53,17 +55,12 @@ struct token_counter_fn { auto const d_str = d_strings.element(idx); size_type token_count = 0; - size_type start_pos = 0; // updates only if moving forward - size_type end_pos = d_str.length(); // updates only if moving backward + size_type start_pos = 0; while (token_count < max_tokens - 1) { - auto const delimiter_pos = dir == Dir::FORWARD ? d_str.find(d_delimiter, start_pos) - : d_str.rfind(d_delimiter, start_pos, end_pos); + auto const delimiter_pos = d_str.find(d_delimiter, start_pos); if (delimiter_pos < 0) break; token_count++; - if (dir == Dir::FORWARD) - start_pos = delimiter_pos + d_delimiter.length(); - else - end_pos = delimiter_pos; + start_pos = delimiter_pos + d_delimiter.length(); } return token_count + 1; // always at least one token } @@ -79,27 +76,18 @@ struct token_reader_fn { int32_t* d_token_offsets{}; // for locating tokens in d_tokens string_index_pair* d_tokens{}; - template __device__ string_index_pair resolve_token(string_view const& d_str, size_type start_pos, size_type end_pos, size_type delimiter_pos) const { - if (last) { - auto const src_byte_offset = dir == Dir::FORWARD ? d_str.byte_offset(start_pos) : 0; - auto const token_char_bytes = dir == Dir::FORWARD - ? d_str.byte_offset(end_pos) - src_byte_offset - : d_str.byte_offset(end_pos); - return string_index_pair{d_str.data() + src_byte_offset, token_char_bytes}; - } else { - auto const src_byte_offset = dir == Dir::FORWARD - ? 
d_str.byte_offset(start_pos) - : d_str.byte_offset(delimiter_pos + d_delimiter.length()); - auto const token_char_bytes = dir == Dir::FORWARD - ? d_str.byte_offset(delimiter_pos) - src_byte_offset - : d_str.byte_offset(end_pos) - src_byte_offset; - return string_index_pair{d_str.data() + src_byte_offset, token_char_bytes}; - } + auto const src_byte_offset = dir == Dir::FORWARD + ? d_str.byte_offset(start_pos) + : d_str.byte_offset(delimiter_pos + d_delimiter.length()); + auto const token_char_bytes = dir == Dir::FORWARD + ? d_str.byte_offset(delimiter_pos) - src_byte_offset + : d_str.byte_offset(end_pos) - src_byte_offset; + return string_index_pair{d_str.data() + src_byte_offset, token_char_bytes}; } __device__ void operator()(size_type idx) @@ -111,6 +99,7 @@ struct token_reader_fn { auto d_result = d_tokens + token_offset; auto const d_str = d_strings.element(idx); if (d_str.empty()) { + // Pandas str.split("") for non-whitespace delimiter is an empty string *d_result = string_index_pair{"", 0}; return; } @@ -122,24 +111,25 @@ struct token_reader_fn { auto const delimiter_pos = dir == Dir::FORWARD ? d_str.find(d_delimiter, start_pos) : d_str.rfind(d_delimiter, start_pos, end_pos); if (delimiter_pos < 0) break; - auto const token = resolve_token(d_str, start_pos, end_pos, delimiter_pos); - if (dir == Dir::FORWARD) + auto const token = resolve_token(d_str, start_pos, end_pos, delimiter_pos); + if (dir == Dir::FORWARD) { d_result[token_idx] = token; - else + start_pos = delimiter_pos + d_delimiter.length(); + } else { d_result[token_count - 1 - token_idx] = token; - + end_pos = delimiter_pos; + } token_idx++; - if (dir == Dir::FORWARD) - start_pos = delimiter_pos + d_delimiter.length(); - else - end_pos = delimiter_pos; } - auto const last_token = resolve_token(d_str, start_pos, end_pos, -1); - if (dir == Dir::FORWARD) - d_result[token_idx] = last_token; - else - d_result[0] = last_token; + // set last token to remainder of the string + if (dir == Dir::FORWARD) { + auto const offset_bytes = d_str.byte_offset(start_pos); + d_result[token_idx] = + string_index_pair{d_str.data() + offset_bytes, d_str.byte_offset(end_pos) - offset_bytes}; + } else { + d_result[0] = string_index_pair{d_str.data(), d_str.byte_offset(end_pos)}; + } } }; @@ -289,7 +279,7 @@ std::unique_ptr split_record( } else { string_view d_delimiter(delimiter.data(), delimiter.size()); return split_record_fn(strings, - token_counter_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, + token_counter_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, token_reader_fn{*d_strings_column_ptr, d_delimiter}, mr, stream); From 154ff73ca88550e8e3126946ba667db6686a4fce Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 14 Jul 2020 16:34:56 -0400 Subject: [PATCH 6/8] simplify resolve-token logic --- cpp/src/strings/split/split_record.cu | 31 ++++++++++++++------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index d41fc01d31c..7d0aee57bd5 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -81,13 +81,15 @@ struct token_reader_fn { size_type end_pos, size_type delimiter_pos) const { - auto const src_byte_offset = dir == Dir::FORWARD - ? d_str.byte_offset(start_pos) - : d_str.byte_offset(delimiter_pos + d_delimiter.length()); - auto const token_char_bytes = dir == Dir::FORWARD - ? 
d_str.byte_offset(delimiter_pos) - src_byte_offset - : d_str.byte_offset(end_pos) - src_byte_offset; - return string_index_pair{d_str.data() + src_byte_offset, token_char_bytes}; + if (dir == Dir::FORWARD) { + auto const byte_offset = d_str.byte_offset(start_pos); + return string_index_pair{d_str.data() + byte_offset, + d_str.byte_offset(delimiter_pos) - byte_offset}; + } else { + auto const byte_offset = d_str.byte_offset(delimiter_pos + d_delimiter.length()); + return string_index_pair{d_str.data() + byte_offset, + d_str.byte_offset(end_pos) - byte_offset}; + } } __device__ void operator()(size_type idx) @@ -194,10 +196,8 @@ struct whitespace_token_reader_fn { d_result[token_idx++] = string_index_pair{d_str.data() + token.first, token.second - token.first}; } - if (token_count == max_tokens) { - d_result[token_idx - 1] = - string_index_pair{d_str.data() + token.first, d_str.size_bytes() - token.first}; - } + --token_idx; + token.second = d_str.size_bytes() - token.first; } else { while (tokenizer.prev_token() && (token_idx < token_count)) { token = tokenizer.get_token(); @@ -205,11 +205,12 @@ struct whitespace_token_reader_fn { string_index_pair{d_str.data() + token.first, token.second - token.first}; ++token_idx; } - if (token_count == max_tokens) { - --token_idx; - d_result[token_count - 1 - token_idx] = string_index_pair{d_str.data(), token.second}; - } + token_idx = token_count - token_idx; // token_count - 1 - (token_idx-1) + token.first = 0; } + // reset last token only if we hit the max + if (token_count == max_tokens) + d_result[token_idx] = string_index_pair{d_str.data() + token.first, token.second}; } }; From b7f26ddeef5988186272feaa7e20ccbf1a27b31d Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 15 Jul 2020 11:07:39 -0400 Subject: [PATCH 7/8] add examples in the doxygen comments --- cpp/include/cudf/strings/split/split.hpp | 116 +++++++++++++++++++---- 1 file changed, 98 insertions(+), 18 deletions(-) diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp index 4abd7ea54c8..87e423236e9 100644 --- a/cpp/include/cudf/strings/split/split.hpp +++ b/cpp/include/cudf/strings/split/split.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -82,19 +82,57 @@ std::unique_ptr
rsplit( rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); /** - * @brief Splits individual strings elements in to a list of tokens. + * @brief Splits individual strings elements into a list of strings. * - * Each element generates an array of tokens that are stored in a - * resulting list column. + * Each element generates an array of strings that are stored in an output + * lists column. * - * The number of elements in the output list will be the same as the number of + * The number of elements in the output column will be the same as the number of * elements in the input column. Each individual list item will contain the - * tokens for that row. The resulting number of tokens in each row can vary - * from 0 to `maxsplit+1`. + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. * * The `delimiter` is searched within each string from beginning to end * and splitting stops when either `maxsplit` or the end of the string is reached. * + * If a delimiter is not whitespace and occurs adjacent to another delimiter, + * an empty string is produced for that split occurrence. Likewise, a non-whitespace + * delimiter produces an empty string if it appears at the beginning or the end + * of a string. + * + * @code{.pseudo} + * s = ["a_bc_def_g", "a__bc", "_ab_cd", "ab_cd_"] + * s1 = split_record(s, "_") + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * s2 = split_record(s, "_", 1) + * s2 is a lists column of strings: + * [ ["a", "bc_def_g"], + * ["a", "_bc"], + * ["", "ab_cd"], + * ["ab", "cd_"] ] + * @endcode + * + * A whitespace delimiter produces no empty strings. + * @code{.pseudo} + * s = ["a bc def", "a bc", " ab cd", "ab cd "] + * s1 = split_record(s, "") + * s1 is a lists column of strings: + * [ ["a", "bc", "def"], + * ["a", "bc"], + * ["ab", "cd"], + * ["ab", "cd"] ] + * s2 = split_record(s, "", 1) + * s2 is a lists column of strings: + * [ ["a", "bc def"], + * ["a", "bc"], + * ["ab", "cd"], + * ["ab", "cd "] ] + * @endcode + * * A null string element will result in a null list item for that row. * * @throw cudf:logic_error if `delimiter` is invalid. @@ -105,8 +143,8 @@ std::unique_ptr
rsplit( * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. * @param mr Device memory resource used to allocate the returned result's device memory. - * @return List column of strings - * Each vector of the list column holds splits from a single row + * @return Lists column of strings + * Each vector of the lists column holds splits from a single row * element of the input column. */ std::unique_ptr split_record( @@ -116,19 +154,61 @@ std::unique_ptr split_record( rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); /** - * @brief Splits individual strings elements in to a list of tokens starting + * @brief Splits individual strings elements into a list of strings starting * from the end of each string. * - * Each element generates an array of tokens that are stored in a - * resulting list column. + * Each element generates an array of strings that are stored in an output + * lists column. * - * The number of elements in the output list will be the same as the number of + * The number of elements in the output column will be the same as the number of * elements in the input column. Each individual list item will contain the - * tokens for that row. The resulting number of tokens in each row can vary - * from 0 to `maxsplit+1`. + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. * * The `delimiter` is searched from end to beginning within each string - * and splitting stops when either `maxsplit` or the end of the string is reached. + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * If a delimiter is not whitespace and occurs adjacent to another delimiter, + * an empty string is produced for that split occurrence. Likewise, a non-whitespace + * delimiter produces an empty string if it appears at the beginning or the end + * of a string. + * + * Note that `rsplit_record` and `split_record` produce equivalent results for + * the default `maxsplit` value. + * + * @code{.pseudo} + * s = ["a_bc_def_g", "a__bc", "_ab_cd", "ab_cd_"] + * s1 = rsplit_record(s, "_") + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * s2 = rsplit_record(s, "_", 1) + * s2 is a lists column of strings: + * [ ["a_bc_def", "g"], + * ["a_", "bc"], + * ["_ab", "cd"], + * ["ab_cd", ""] ] + * @endcode + * + * A whitespace delimiter produces no empty strings. + * @code{.pseudo} + * s = ["a bc def", "a bc", " ab cd", "ab cd "] + * s1 = rsplit_record(s, "") + * s1 is a lists column of strings: + * [ ["a", "bc", "def"], + * ["a", "bc"], + * ["ab", "cd"], + * ["ab", "cd"] ] + * s2 = rsplit_record(s, "", 1) + * s2 is a lists column of strings: + * [ ["a bc", "def"], + * ["a", "bc"], + * [" ab", "cd"], + * ["ab", "cd"] ] + * @endcode * * A null string element will result in a null list item for that row. * @@ -140,8 +220,8 @@ std::unique_ptr split_record( * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. * @param mr Device memory resource used to allocate the returned result's device memory. - * @return List column of strings - * Each vector of the list column holds splits from a single row + * @return Lists column of strings + * Each vector of the lists column holds splits from a single row * element of the input column. 
*/ std::unique_ptr rsplit_record( From ca7212d5625e7c4ea51e8286e8a65830798cf4a8 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Mon, 20 Jul 2020 13:16:38 -0400 Subject: [PATCH 8/8] use lists_column_wrapper to create expected gtests results --- cpp/tests/strings/split_tests.cpp | 172 +++++++++++++----------------- 1 file changed, 77 insertions(+), 95 deletions(-) diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 95756e2fd33..ffb875d330f 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -278,107 +278,89 @@ TEST_F(StringsSplitTest, AllNullsCase) TEST_F(StringsSplitTest, SplitRecord) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); auto result = cudf::strings::split_record(cudf::strings_column_view(strings), cudf::string_scalar(" ")); - cudf::lists_column_view lcv(result->view()); - cudf::test::strings_column_wrapper expected( - {"", "Héllo", "thesé", "are", "some", "", "", "tést", "String", ""}); - cudf::test::fixed_width_column_wrapper offsets({0, 3, 3, 7, 9, 10}); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", "", ""}, LCW{"tést", "String"}, LCW{""}}, + validity); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, SplitRecordWithMaxSplit) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); auto result = cudf::strings::split_record(cudf::strings_column_view(strings), cudf::string_scalar(" "), 1); - cudf::lists_column_view lcv(result->view()); - cudf::test::strings_column_wrapper expected( - {"", "Héllo thesé", "are", "some ", "tést", "String", ""}); - cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 4, 6, 7}); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{"", "Héllo thesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, + validity); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, SplitRecordWhitespace) { std::vector h_strings{ " Héllo thesé", nullptr, "are\tsome ", "tést\nString", " "}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), 
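    // validity: a row is non-null iff its host string pointer is non-null;
    // null input rows become null list rows in the split_record result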
validity); auto result = cudf::strings::split_record(cudf::strings_column_view(strings)); - cudf::lists_column_view lcv(result->view()); - cudf::test::strings_column_wrapper expected({"Héllo", "thesé", "are", "some", "tést", "String"}); - cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 4, 6, 6}); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"Héllo", "thesé"}, LCW{}, LCW{"are", "some"}, LCW{"tést", "String"}, LCW{}}, + validity); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) { std::vector h_strings{ " Héllo thesé ", nullptr, "are\tsome ", "tést\nString", " "}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); auto result = cudf::strings::split_record(cudf::strings_column_view(strings), cudf::string_scalar(""), 1); - cudf::lists_column_view lcv(result->view()); - cudf::test::strings_column_wrapper expected( - {"Héllo", "thesé ", "are", "some ", "tést", "String"}); - cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 4, 6, 6}); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"Héllo", "thesé "}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{}}, + validity); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, RSplitRecord) { std::vector h_strings{ "héllo", nullptr, "a_bc_déf", "a__bc", "_ab_cd", "ab_cd_", "", " a b ", " a bbb c"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - - cudf::test::strings_column_wrapper expected({"héllo", - "a", - "bc", - "déf", - "a", - "", - "bc", - "", - "ab", - "cd", - "ab", - "cd", - "", - "", - " a b ", - " a bbb c"}); - cudf::test::fixed_width_column_wrapper offsets({0, 1, 1, 4, 7, 10, 13, 14, 15, 16}); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"héllo"}, + LCW{}, + LCW{"a", "bc", "déf"}, + LCW{"a", "", "bc"}, + LCW{"", "ab", "cd"}, + LCW{"ab", "cd", ""}, + LCW{""}, + LCW{" a b "}, + LCW{" a bbb c"}}, + validity); auto result = cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar("_")); - cudf::lists_column_view lcv(result->view()); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, RSplitRecordWithMaxSplit) @@ -392,60 +374,60 @@ TEST_F(StringsSplitTest, RSplitRecordWithMaxSplit) "", " a b ___", "___ a bbb c"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return 
str != nullptr; })); - - cudf::test::strings_column_wrapper expected( - {"héllo", "a", "bc", "déf", "___a", "", "bc", "_ab", "cd", "", - "ab", "cd", "", "", " a b _", "", "", "_", "", " a bbb c"}); - cudf::test::fixed_width_column_wrapper offsets({0, 1, 1, 4, 7, 10, 13, 14, 17, 20}); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"héllo"}, + LCW{}, + LCW{"a", "bc", "déf"}, + LCW{"___a", "", "bc"}, + LCW{"_ab", "cd", ""}, + LCW{"ab", "cd", ""}, + LCW{""}, + LCW{" a b _", "", ""}, + LCW{"_", "", " a bbb c"}}, + validity); auto result = cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar("_"), 2); - cudf::lists_column_view lcv(result->view()); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, RSplitRecordWhitespace) { std::vector h_strings{"héllo", nullptr, "a_bc_déf", "", " a\tb ", " a\r bbb c"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); - cudf::test::strings_column_wrapper expected({"héllo", "a_bc_déf", "a", "b", "a", "bbb", "c"}); - cudf::test::fixed_width_column_wrapper offsets({0, 1, 1, 2, 2, 4, 7}); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{"héllo"}, LCW{}, LCW{"a_bc_déf"}, LCW{}, LCW{"a", "b"}, LCW{"a", "bbb", "c"}}, + validity); auto result = cudf::strings::rsplit_record(cudf::strings_column_view(strings)); - cudf::lists_column_view lcv(result->view()); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, RSplitRecordWhitespaceWithMaxSplit) { std::vector h_strings{ " héllo Asher ", nullptr, " a_bc_déf ", "", " a\tb ", " a\r bbb c"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); - cudf::test::strings_column_wrapper expected( - {" héllo", "Asher", "a_bc_déf", " a", "b", " a\r bbb", "c"}); - cudf::test::fixed_width_column_wrapper offsets({0, 2, 2, 3, 3, 5, 7}); + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{" héllo", "Asher"}, LCW{}, LCW{"a_bc_déf"}, LCW{}, LCW{" a", "b"}, LCW{" a\r bbb", "c"}}, + validity); auto result = cudf::strings::rsplit_record(cudf::strings_column_view(strings), cudf::string_scalar(""), 1); - cudf::lists_column_view lcv(result->view()); - cudf::test::expect_columns_equal(lcv.child(), expected); - cudf::test::expect_columns_equal(lcv.offsets(), offsets); + cudf::test::expect_columns_equal(result->view(), expected); } TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns)
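To close, the lists_column_wrapper pattern used throughout these refactored tests reduces to a minimal form like the sketch below; this is an editorial illustration only, the test name and column contents are hypothetical, and it assumes the cudf test utilities already included by this file:

TEST_F(StringsSplitTest, SplitRecordMinimalSketch)
{
  cudf::test::strings_column_wrapper input({"a_b", "c"});
  auto result = cudf::strings::split_record(cudf::strings_column_view(input),
                                            cudf::string_scalar("_"));

  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
  LCW expected({LCW{"a", "b"}, LCW{"c"}});  // one list row per input row
  cudf::test::expect_columns_equal(result->view(), expected);
}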