From eaba42e4d7631302c81c4caf2f3d29fb24f3c45d Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 25 Jan 2022 19:47:19 -0500
Subject: [PATCH 01/39] Add libcudf strings split API that accepts regex
 pattern

---
 conda/recipes/libcudf/meta.yaml             |   1 +
 cpp/CMakeLists.txt                          |   1 +
 cpp/include/cudf/strings/split/split_re.hpp |  82 ++++++++
 cpp/src/strings/split/split_record_re.cu    | 215 ++++++++++++++++++++
 cpp/tests/strings/split_tests.cpp           |  37 +++-
 5 files changed, 335 insertions(+), 1 deletion(-)
 create mode 100644 cpp/include/cudf/strings/split/split_re.hpp
 create mode 100644 cpp/src/strings/split/split_record_re.cu

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 2cbe5173de0..01ad8d4e270 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -214,6 +214,7 @@ test:
     - test -f $PREFIX/include/cudf/strings/replace_re.hpp
     - test -f $PREFIX/include/cudf/strings/split/partition.hpp
     - test -f $PREFIX/include/cudf/strings/split/split.hpp
+    - test -f $PREFIX/include/cudf/strings/split/split_re.hpp
     - test -f $PREFIX/include/cudf/strings/string_view.hpp
     - test -f $PREFIX/include/cudf/strings/strings_column_view.hpp
     - test -f $PREFIX/include/cudf/strings/strip.hpp
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e4637408110..b25d6ff3703 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -444,6 +444,7 @@ add_library(
   src/strings/split/partition.cu
   src/strings/split/split.cu
   src/strings/split/split_record.cu
+  src/strings/split/split_record_re.cu
   src/strings/strings_column_factories.cu
   src/strings/strings_column_view.cpp
   src/strings/strings_scalar_factories.cpp
diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
new file mode 100644
index 00000000000..b69bd1c5991
--- /dev/null
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+namespace cudf {
+namespace strings {
+/**
+ * @addtogroup strings_split
+ * @{
+ * @file
+ */
+
+/**
+ * @brief Splits individual strings elements into a list of strings
+ * using a regex pattern to delimit each string.
+ *
+ * Each element generates an array of strings that are stored in an output
+ * lists column.
+ *
+ * The number of elements in the output column will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * new strings for that row. The resulting number of strings in each row can vary
+ * from 0 to `maxsplit + 1`.
+ *
+ * The `pattern` is used to identify the separation points within a string
+ * and splitting stops when either `maxsplit` or the end of the string is reached.
+ *
+ * An empty input string will produce a corresponding empty list item output row.
+ * A null row will produce a corresponding null list item output row.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
+ * s1 = split_record(s, "[_ ]")
+ * s1 is a lists column of strings:
+ *     [ ["a", "bc", "def", "g"],
+ *       ["a", "", "bc"],
+ *       ["", "ab", "cd"],
+ *       ["ab", "cd", ""] ]
+ * s2 = split_record(s, "[ _]", 1)
+ * s2 is a lists column of strings:
+ *     [ ["a", "bc_def_g"],
+ *       ["a", "_bc"],
+ *       ["", "ab_cd"],
+ *       ["ab", "cd_"] ]
+ * @endcode
+ *
+ * @throw cudf:logic_error if `pattern` is empty.
+ *
+ * @param strings A column of string elements to be split.
+ * @param pattern The regex pattern for delimiting characters within each string.
+ * @param maxsplit Maximum number of splits to perform.
+ *        Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory.
+ * @return Lists column of strings
+ *         Each vector of the lists column holds splits from a single row
+ *         element of the input column.
+ */
+std::unique_ptr<column> split_record_re(
+  strings_column_view const& strings,
+  std::string const& pattern,
+  size_type maxsplit                  = -1,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/** @} */  // end of doxygen group
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_record_re.cu
new file mode 100644
index 00000000000..d197ee9c7e3
--- /dev/null
+++ b/cpp/src/strings/split/split_record_re.cu
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <strings/regex/regex.cuh>
+#include <strings/utilities.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/get_value.cuh>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/strings/detail/strings_column_factories.cuh>
+#include <cudf/strings/split/split_re.hpp>
+#include <cudf/strings/string_view.cuh>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <thrust/for_each.h>
+#include <thrust/scan.h>
+#include <thrust/transform.h>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+
+using string_index_pair = thrust::pair<const char*, size_type>;
+
+namespace {
+
+/**
+ * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`.
+ */
+template <int stack_size>
+struct token_counter_fn {
+  column_device_view const d_strings;  // strings to split
+  reprog_device prog;
+  size_type const max_tokens;
+
+  __device__ size_type operator()(size_type idx)
+  {
+    if (d_strings.is_null(idx)) { return 0; }
+
+    auto const d_str      = d_strings.element<string_view>(idx);
+    size_type token_count = 0;
+
+    int32_t begin = 0;
+    int32_t end   = -1;
+    while (token_count < max_tokens - 1) {
+      if (prog.find<stack_size>(idx, d_str, begin, end) <= 0) { break; }
+      token_count++;
+      begin = end + (begin == end);
+      end   = -1;
+    }
+    return token_count + 1;  // always at least one token
+  }
+};
+
+/**
+ * @brief Identify the tokens from the `idx'th` string element of `d_strings`.
+ */
+template <int stack_size>
+struct token_reader_fn {
+  column_device_view const d_strings;
+  reprog_device prog;
+  int32_t const* d_token_offsets;
+  string_index_pair* d_tokens;
+
+  __device__ void operator()(size_type idx)
+  {
+    if (d_strings.is_null(idx)) { return; }
+
+    auto const token_offset = d_token_offsets[idx];
+    auto const token_count  = d_token_offsets[idx + 1] - token_offset;
+    auto d_result           = d_tokens + token_offset;
+    auto const d_str        = d_strings.element<string_view>(idx);
+    if (d_str.empty()) {
+      // return empty string output for empty string input
+      *d_result = string_index_pair{"", 0};
+      return;
+    }
+
+    size_type token_idx = 0;
+    size_type begin     = 0;
+    size_type end       = d_str.length();
+    size_type last_pos  = 0;
+    while (token_idx < token_count - 1) {
+      if (prog.find<stack_size>(idx, d_str, begin, end) <= 0) { break; }
+
+      auto const start_pos = d_str.byte_offset(begin);
+      auto const end_pos   = d_str.byte_offset(end);
+      d_result[token_idx]  = string_index_pair{d_str.data() + last_pos, start_pos - last_pos};
+
+      begin = end + (begin == end);
+      end   = d_str.length();
+      token_idx++;
+      last_pos = end_pos;
+    }
+
+    // set last token to remainder of the string
+    if (last_pos <= d_str.size_bytes()) {
+      d_result[token_idx] =
+        string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos};
+    }
+  }
+};
+
+}  // namespace
+
+// The output is one list item per string
+std::unique_ptr<column> split_record_re(
+  strings_column_view const& input,
+  std::string const& pattern,
+  size_type maxsplit,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+{
+  CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
+
+  auto const max_tokens    = maxsplit > 0 ? maxsplit + 1 : std::numeric_limits<size_type>::max();
+  auto const strings_count = input.size();
+
+  auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
+  auto d_strings = column_device_view::create(input.parent(), stream);
+
+  auto offsets = make_numeric_column(
+    data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
+  auto d_offsets = offsets->mutable_view().data<int32_t>();
+
+  auto const begin = thrust::make_counting_iterator<size_type>(0);
+  auto const end   = thrust::make_counting_iterator<size_type>(strings_count);
+
+  // create offsets column by counting the number of tokens per string
+  auto const regex_insts = d_prog->insts_counts();
+  if (regex_insts <= RX_SMALL_INSTS) {
+    token_counter_fn<RX_STACK_SMALL> counter{*d_strings, *d_prog, max_tokens};
+    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
+  } else if (regex_insts <= RX_MEDIUM_INSTS) {
+    token_counter_fn<RX_STACK_MEDIUM> counter{*d_strings, *d_prog, max_tokens};
+    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
+  } else if (regex_insts <= RX_LARGE_INSTS) {
+    token_counter_fn<RX_STACK_LARGE> counter{*d_strings, *d_prog, max_tokens};
+    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
+  } else {
+    token_counter_fn<RX_STACK_ANY> counter{*d_strings, *d_prog, max_tokens};
+    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
+  }
+  // convert counts into offsets
+  thrust::exclusive_scan(
+    rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets);
+
+  // last entry is the total number of tokens to be generated
+  auto total_tokens = cudf::detail::get_value<int32_t>(offsets->view(), strings_count, stream);
+
+  printf("instruction = %d\ntotal_tokens = %d\nbegin,end = %d,%d\n",
+         regex_insts,
+         total_tokens,
+         *begin,
+         *end);
+  // split each string into an array of index-pair values
+  rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
+  if (regex_insts <= RX_SMALL_INSTS) {
+    token_reader_fn<RX_STACK_SMALL> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
+  } else if (regex_insts <= RX_MEDIUM_INSTS) {
+    token_reader_fn<RX_STACK_MEDIUM> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
+  } else if (regex_insts <= RX_LARGE_INSTS) {
+    token_reader_fn<RX_STACK_LARGE> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
+  } else {
+    token_reader_fn<RX_STACK_ANY> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
+  }
+
+  // convert the index-pairs into one big strings column
+  auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr);
+  // create a lists column using the offsets and the strings columns
+  return make_lists_column(strings_count,
+                           std::move(offsets),
+                           std::move(strings_output),
+                           input.null_count(),
+                           copy_bitmask(input.parent(), stream, mr),
+                           stream,
+                           mr);
+}
+
+}  // namespace detail
+
+// external APIs
+
+std::unique_ptr<column> split_record_re(strings_column_view const& input,
+                                        std::string const& pattern,
+                                        size_type maxsplit,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::split_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr);
+}
+
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index de4e48fd70a..7eddc947d40 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/split/partition.hpp>
 #include <cudf/strings/split/split.hpp>
+#include <cudf/strings/split/split_re.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 
@@ -339,6 +340,40 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
 }
 
+TEST_F(StringsSplitTest, SplitRecordRegex)
+{
+  std::vector<const char*> h_strings{" Héllo thesé", nullptr, "are some  ", "tést String", ""};
+  auto validity =
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+  cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
+  auto sv = cudf::strings_column_view(input);
+
+  auto result = cudf::strings::split_record_re(sv, "[eé]");
+
+  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
+  LCW expected(
+    {LCW{" H", "llo th", "s", ""}, LCW{}, LCW{"ar", " som", "  "}, LCW{"t", "st String"}, LCW{""}},
+    validity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+}
+
+TEST_F(StringsSplitTest, SplitRecordRegexWithMaxSplit)
+{
+  std::vector<const char*> h_strings{" Héllo\tthesé", nullptr, "are\nsome  ", "tést\rString", ""};
+  auto validity =
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+  cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
+  auto sv = cudf::strings_column_view(input);
+
+  auto result = cudf::strings::split_record_re(sv, "\\s", 1);
+
+  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
+  LCW expected(
+    {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some  "}, LCW{"tést", "String"}, LCW{""}},
+    validity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+}
+
 TEST_F(StringsSplitTest, RSplitRecord)
 {
   std::vector<const char*> h_strings{

From a83243646975f0c398910bb0cb80affa40c214bf Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 26 Jan 2022 11:25:18 -0500
Subject: [PATCH 02/39] add error-checking gtests

---
 cpp/tests/strings/split_tests.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index 7eddc947d40..badb84536ba 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -654,6 +654,11 @@ TEST_F(StringsSplitTest, InvalidParameter)
                cudf::logic_error);
   EXPECT_THROW(cudf::strings::rsplit(strings_view, cudf::string_scalar("", false)),
                cudf::logic_error);
+  EXPECT_THROW(cudf::strings::split_record(strings_view, cudf::string_scalar("", false)),
+               cudf::logic_error);
+  EXPECT_THROW(cudf::strings::rsplit_record(strings_view, cudf::string_scalar("", false)),
+               cudf::logic_error);
+  EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error);
   EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)),
                cudf::logic_error);
   EXPECT_THROW(cudf::strings::rpartition(strings_view, cudf::string_scalar("", false)),

From baccf10d8b7c447b6ff9b77451df6ce9afc2fe65 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Wed, 26 Jan 2022 13:29:17 -0700
Subject: [PATCH 03/39] Add JNI

---
 java/src/main/native/src/ColumnViewJni.cpp | 33 ++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 38c6bb3740e..ee00d8707f8 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -576,6 +576,23 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRegex(JNIEnv *env, jclass,
+                                                                             jlong column_view,
+                                                                             jlong delimiter) {
+  JNI_NULL_CHECK(env, column_view, "column is null", 0);
+  JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
+    cudf::strings_column_view scv(*cv);
+    cudf::string_scalar *ss_scalar = reinterpret_cast<cudf::string_scalar *>(delimiter);
+
+    std::unique_ptr<cudf::table> table_result = cudf::strings::split_re(scv, *ss_scalar);
+    return cudf::jni::convert_table_for_return(env, table_result);
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass,
                                                                          jlong column_view,
                                                                          jlong delimiter,
@@ -592,6 +609,22 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecordRegex(JNIEnv *env, jclass,
+                                                                              jlong column_view,
+                                                                              jlong delimiter,
+                                                                              jint max_split) {
+  JNI_NULL_CHECK(env, column_view, "column is null", 0);
+  JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
+    cudf::strings_column_view scv(*cv);
+    cudf::string_scalar *ss_scalar = reinterpret_cast<cudf::string_scalar *>(delimiter);
+    return release_as_jlong(cudf::strings::split_record_re(scv, *ss_scalar, max_split));
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_split(JNIEnv *env, jclass clazz,
                                                                   jlong input_column,
                                                                   jintArray split_indices) {

From d33f79bb9c89d014a4e6a374067e8a88c366aafa Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 27 Jan 2022 10:49:01 -0500
Subject: [PATCH 04/39] use count_matches utility

---
 cpp/src/strings/split/split_record_re.cu | 63 ++++++++----------------
 1 file changed, 21 insertions(+), 42 deletions(-)

diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_record_re.cu
index d197ee9c7e3..eff0c511393 100644
--- a/cpp/src/strings/split/split_record_re.cu
+++ b/cpp/src/strings/split/split_record_re.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <strings/count_matches.hpp>
 #include <strings/regex/regex.cuh>
 #include <strings/utilities.hpp>
 
@@ -42,30 +43,23 @@ using string_index_pair = thrust::pair<const char*, size_type>;
 namespace {
 
 /**
- * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`.
+ * @brief Convert match counts to token counts.
+ *
+ * The matches are the delimiters and the tokens are what is left:
+ * `token1, delimiter, token2, delimiter, token3, etc`
+ * Usually `token_count = match_count + 1` even with empty strings.
+ * However, we need to account for the max_tokens and null rows.
  */
-template <int stack_size>
-struct token_counter_fn {
-  column_device_view const d_strings;  // strings to split
-  reprog_device prog;
+struct match_to_token_count_fn {
+  column_device_view const d_strings;
+  size_type const* d_counts;
   size_type const max_tokens;
 
   __device__ size_type operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) { return 0; }
-
-    auto const d_str      = d_strings.element<string_view>(idx);
-    size_type token_count = 0;
-
-    int32_t begin = 0;
-    int32_t end   = -1;
-    while (token_count < max_tokens - 1) {
-      if (prog.find<stack_size>(idx, d_str, begin, end) <= 0) { break; }
-      token_count++;
-      begin = end + (begin == end);
-      end   = -1;
-    }
-    return token_count + 1;  // always at least one token
+    auto const match_count = d_counts[idx];
+    return std::min(match_count, max_tokens) + 1;
   }
 };
 
@@ -130,34 +124,23 @@ std::unique_ptr<column> split_record_re(
 {
   CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
 
-  auto const max_tokens    = maxsplit > 0 ? maxsplit + 1 : std::numeric_limits<size_type>::max();
+  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
   auto const strings_count = input.size();
 
   auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
   auto d_strings = column_device_view::create(input.parent(), stream);
 
-  auto offsets = make_numeric_column(
-    data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
+  auto offsets   = count_matches(*d_strings, *d_prog, stream, mr);
   auto d_offsets = offsets->mutable_view().data<int32_t>();
 
   auto const begin = thrust::make_counting_iterator<size_type>(0);
   auto const end   = thrust::make_counting_iterator<size_type>(strings_count);
-
-  // create offsets column by counting the number of tokens per string
-  auto const regex_insts = d_prog->insts_counts();
-  if (regex_insts <= RX_SMALL_INSTS) {
-    token_counter_fn<RX_STACK_SMALL> counter{*d_strings, *d_prog, max_tokens};
-    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
-  } else if (regex_insts <= RX_MEDIUM_INSTS) {
-    token_counter_fn<RX_STACK_MEDIUM> counter{*d_strings, *d_prog, max_tokens};
-    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
-  } else if (regex_insts <= RX_LARGE_INSTS) {
-    token_counter_fn<RX_STACK_LARGE> counter{*d_strings, *d_prog, max_tokens};
-    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
-  } else {
-    token_counter_fn<RX_STACK_ANY> counter{*d_strings, *d_prog, max_tokens};
-    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
-  }
+  // convert match counts to tokens
+  thrust::transform(rmm::exec_policy(stream),
+                    begin,
+                    end,
+                    d_offsets,
+                    match_to_token_count_fn{*d_strings, d_offsets, max_tokens});
   // convert counts into offsets
   thrust::exclusive_scan(
     rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets);
@@ -165,13 +148,9 @@ std::unique_ptr<column> split_record_re(
   // last entry is the total number of tokens to be generated
   auto total_tokens = cudf::detail::get_value<int32_t>(offsets->view(), strings_count, stream);
 
-  printf("instruction = %d\ntotal_tokens = %d\nbegin,end = %d,%d\n",
-         regex_insts,
-         total_tokens,
-         *begin,
-         *end);
   // split each string into an array of index-pair values
   rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
+  auto const regex_insts = d_prog->insts_counts();
   if (regex_insts <= RX_SMALL_INSTS) {
     token_reader_fn<RX_STACK_SMALL> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);

From 9c74fdffbc2c3ddc7e4a248a4c837996c8c25bf4 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 27 Jan 2022 10:49:16 -0500
Subject: [PATCH 05/39] add split_re declaration

---
 cpp/include/cudf/strings/split/split_re.hpp | 62 +++++++++++++++++++--
 1 file changed, 58 insertions(+), 4 deletions(-)

diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index b69bd1c5991..54d590fcf71 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table.hpp>
 
 namespace cudf {
 namespace strings {
@@ -26,6 +27,59 @@ namespace strings {
  * @file
  */
 
+/**
+ * @brief Splits individual strings elements into a table of strings columns
+ * using a regex pattern to delimit each string.
+ *
+ * Each element generates an array of strings that are stored in corresponding
+ * rows in the output table.
+ *
+ * The number of elements in the output table will be the same as the number of
+ * elements in the input column. The row for each column will contain the
+ * new strings produced from that input row.
+ *
+ * The resulting number of columns will be the maximum number of tokens found
+ * in any input row.
+ *
+ * The `pattern` is used to identify the separation points within a string
+ * and splitting stops when either `maxsplit` or the end of the string is reached.
+ *
+ * An empty input string will produce a corresponding empty string in the
+ * corresponding row in the first column.
+ *
+ * A null row will produce corresponding null rows in the output table.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
+ * s1 = split_re(s, "[_ ]")
+ * s1 is a table of strings columns:
+ *     [ ["a", "a", "", "ab"],
+ *       ["bc", "", "ab", "cd"],
+ *       ["def", "bc", "cd", ""],
+ *       ["g", null, null, null] ]
+ * s2 = split_re(s, "[ _]", 1)
+ * s2 is a table of strings columns:
+ *     [ ["a", "a", "", "ab"],
+ *       ["bc def_g", "_bc", "ab cd", "cd "] ]
+ * @endcode
+ *
+ * @throw cudf::logic_error if `pattern` is empty.
+ *
+ * @param strings A column of string elements to be split.
+ * @param pattern The regex pattern for delimiting characters within each string.
+ * @param maxsplit Maximum number of splits to perform.
+ *        Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory.
+ * @return A table of strings columns.
+ *         Column `i` of the table holds the `i`-th token split from each row
+ *         element of the input column (null where a row has fewer tokens).
+ */
+std::unique_ptr<table> split_re(
+  strings_column_view const& strings,
+  std::string const& pattern,
+  size_type maxsplit                  = -1,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Splits individual strings elements into a list of strings
  * using a regex pattern to delimit each string.
@@ -54,10 +108,10 @@ namespace strings {
  *       ["ab", "cd", ""] ]
  * s2 = split_record(s, "[ _]", 1)
  * s2 is a lists column of strings:
- *     [ ["a", "bc_def_g"],
+ *     [ ["a", "bc def_g"],
  *       ["a", "_bc"],
- *       ["", "ab_cd"],
- *       ["ab", "cd_"] ]
+ *       ["", "ab cd"],
+ *       ["ab", "cd "] ]
  * @endcode
  *
  * @throw cudf:logic_error if `pattern` is empty.

From 1a89db5f53ed21952183d8ab9f2d4e6e800b1175 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 27 Jan 2022 18:01:11 -0500
Subject: [PATCH 06/39] split_re implementation and tests

---
 cpp/src/strings/split/split_record_re.cu | 193 ++++++++++++++++++-----
 cpp/tests/strings/split_tests.cpp        | 110 +++++++++----
 2 files changed, 235 insertions(+), 68 deletions(-)

diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_record_re.cu
index eff0c511393..3f916d0138f 100644
--- a/cpp/src/strings/split/split_record_re.cu
+++ b/cpp/src/strings/split/split_record_re.cu
@@ -22,6 +22,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/get_value.cuh>
+#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/split/split_re.hpp>
@@ -70,7 +71,7 @@ template <int stack_size>
 struct token_reader_fn {
   column_device_view const d_strings;
   reprog_device prog;
-  int32_t const* d_token_offsets;
+  offset_type const* d_token_offsets;
   string_index_pair* d_tokens;
 
   __device__ void operator()(size_type idx)
@@ -81,11 +82,6 @@ struct token_reader_fn {
     auto const token_count  = d_token_offsets[idx + 1] - token_offset;
     auto d_result           = d_tokens + token_offset;
     auto const d_str        = d_strings.element<string_view>(idx);
-    if (d_str.empty()) {
-      // return empty string output for empty string input
-      *d_result = string_index_pair{"", 0};
-      return;
-    }
 
     size_type token_idx = 0;
     size_type begin     = 0;
@@ -112,61 +108,105 @@ struct token_reader_fn {
   }
 };
 
-}  // namespace
-
-// The output is one list item per string
-std::unique_ptr<column> split_record_re(
-  strings_column_view const& input,
-  std::string const& pattern,
-  size_type maxsplit,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
-{
-  CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
-
-  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
-  auto const strings_count = input.size();
+struct tokens_transform_fn {
+  column_device_view const d_strings;
+  string_index_pair const* d_tokens;
+  offset_type const* d_token_offsets;
+  size_type const column_index;
 
-  auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
-  auto d_strings = column_device_view::create(input.parent(), stream);
+  __device__ string_index_pair operator()(size_type idx) const
+  {
+    auto const offset      = d_token_offsets[idx];
+    auto const token_count = d_token_offsets[idx + 1] - offset;
+    if (d_strings.is_null(idx)) { return string_index_pair{nullptr, 0}; }
+    if (column_index > token_count - 1) { return string_index_pair{nullptr, 0}; }
+    return d_tokens[offset + column_index];
+  }
+};
 
-  auto offsets   = count_matches(*d_strings, *d_prog, stream, mr);
-  auto d_offsets = offsets->mutable_view().data<int32_t>();
+/**
+ * @brief Call regex to split each input string into tokens.
+ *
+ * This will also convert the `offsets` values from counts to offsets.
+ *
+ * @param d_strings Strings to split
+ * @param d_prog Regex to evaluate against each string
+ * @param max_tokens The maximum number of tokens for each split.
+ * @param offsets The number of matches on input.
+ *                The offsets for each token in each string on output.
+ * @param stream CUDA stream used for kernel launches.
+ */
+rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d_strings,
+                                                     reprog_device& d_prog,
+                                                     size_type max_tokens,
+                                                     mutable_column_view& offsets,
+                                                     rmm::cuda_stream_view stream)
+{
+  auto d_offsets           = offsets.data<offset_type>();
+  auto const strings_count = d_strings.size();
 
   auto const begin = thrust::make_counting_iterator<size_type>(0);
   auto const end   = thrust::make_counting_iterator<size_type>(strings_count);
+
   // convert match counts to tokens
-  thrust::transform(rmm::exec_policy(stream),
-                    begin,
-                    end,
-                    d_offsets,
-                    match_to_token_count_fn{*d_strings, d_offsets, max_tokens});
+  match_to_token_count_fn match_fn{d_strings, d_offsets, max_tokens};
+  thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, match_fn);
+
   // convert counts into offsets
-  thrust::exclusive_scan(
-    rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets);
+  thrust::exclusive_scan(rmm::exec_policy(stream),
+                         offsets.begin<offset_type>(),
+                         offsets.end<offset_type>(),
+                         offsets.begin<offset_type>());
 
-  // last entry is the total number of tokens to be generated
-  auto total_tokens = cudf::detail::get_value<int32_t>(offsets->view(), strings_count, stream);
+  // the last entry is the total number of tokens to be generated
+  auto const total_tokens = cudf::detail::get_value<offset_type>(offsets, strings_count, stream);
 
-  // split each string into an array of index-pair values
+  // generate tokens for each string
   rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
-  auto const regex_insts = d_prog->insts_counts();
+  auto const regex_insts = d_prog.insts_counts();
   if (regex_insts <= RX_SMALL_INSTS) {
-    token_reader_fn<RX_STACK_SMALL> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_SMALL> reader{d_strings, d_prog, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   } else if (regex_insts <= RX_MEDIUM_INSTS) {
-    token_reader_fn<RX_STACK_MEDIUM> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_MEDIUM> reader{d_strings, d_prog, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   } else if (regex_insts <= RX_LARGE_INSTS) {
-    token_reader_fn<RX_STACK_LARGE> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_LARGE> reader{d_strings, d_prog, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   } else {
-    token_reader_fn<RX_STACK_ANY> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_ANY> reader{d_strings, d_prog, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   }
 
-  // convert the index-pairs into one big strings column
+  return tokens;
+}
+
+}  // namespace
+
+// The output is one list item per string
+std::unique_ptr<column> split_record_re(strings_column_view const& input,
+                                        std::string const& pattern,
+                                        size_type maxsplit,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
+
+  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
+  auto const strings_count = input.size();
+
+  auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
+  auto d_strings = column_device_view::create(input.parent(), stream);
+
+  auto offsets      = count_matches(*d_strings, *d_prog, stream, mr);
+  auto offsets_view = offsets->mutable_view();
+
+  // get split tokens from the input column
+  auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream);
+
+  // convert the tokens into one big strings column
   auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr);
+
   // create a lists column using the offsets and the strings columns
   return make_lists_column(strings_count,
                            std::move(offsets),
@@ -177,10 +217,83 @@ std::unique_ptr<column> split_record_re(
                            mr);
 }
 
+std::unique_ptr<table> split_re(strings_column_view const& input,
+                                std::string const& pattern,
+                                size_type maxsplit,
+                                rmm::cuda_stream_view stream,
+                                rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
+
+  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
+  auto const strings_count = input.size();
+
+  std::vector<std::unique_ptr<column>> results;
+  if (strings_count == 0) {
+    results.push_back(make_empty_column(type_id::STRING));
+    return std::make_unique<table>(std::move(results));
+  }
+
+  auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
+  auto d_strings = column_device_view::create(input.parent(), stream);
+
+  auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource());
+  auto offsets_view = offsets->mutable_view();
+
+  // get split tokens from the input column
+  auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream);
+
+  // the columns_count is the maximum number of tokens for any string in the input column
+  auto const begin = thrust::make_counting_iterator<size_type>(0);
+  auto const end   = thrust::make_counting_iterator<size_type>(strings_count);
+  auto d_offsets   = offsets_view.data<offset_type>();
+  auto size_lambda = [d_offsets] __device__(auto const idx) -> size_type {
+    return d_offsets[idx + 1] - d_offsets[idx];
+  };
+  auto const columns_count = thrust::transform_reduce(
+    rmm::exec_policy(stream), begin, end, size_lambda, 0, thrust::maximum<size_type>{});
+
+  // boundary case: if no columns, return one all-null column (custrings issue #119)
+  if (columns_count == 0) {
+    results.push_back(std::make_unique<column>(
+      data_type{type_id::STRING},
+      strings_count,
+      rmm::device_buffer{0, stream, mr},  // no data
+      cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr),
+      strings_count));
+    return std::make_unique<table>(std::move(results));
+  }
+
+  // convert the tokens into multiple strings columns
+  auto make_strings_lambda = [&](size_type column_index) {
+    // returns appropriate token for each row/column
+    auto indices_itr = cudf::detail::make_counting_transform_iterator(
+      0, tokens_transform_fn{*d_strings, tokens.data(), d_offsets, column_index});
+    return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr);
+  };
+  // create each column of tokens
+  results.resize(columns_count);
+  std::transform(thrust::make_counting_iterator<size_type>(0),
+                 thrust::make_counting_iterator<size_type>(columns_count),
+                 results.begin(),
+                 make_strings_lambda);
+
+  return std::make_unique<table>(std::move(results));
+}
+
 }  // namespace detail
 
 // external APIs
 
+std::unique_ptr<table> split_re(strings_column_view const& input,
+                                std::string const& pattern,
+                                size_type maxsplit,
+                                rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::split_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr);
+}
+
 std::unique_ptr<column> split_record_re(strings_column_view const& input,
                                         std::string const& pattern,
                                         size_type maxsplit,
diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index badb84536ba..f541a6b0e81 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -247,33 +247,13 @@ TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns)
     cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0);
   auto results = cudf::strings::split(zero_size_strings_column);
   EXPECT_TRUE(results->num_columns() == 1);
-  cudf::test::expect_strings_empty(results->get_column(0));
+  EXPECT_TRUE(results->num_rows() == 0);
   results = cudf::strings::rsplit(zero_size_strings_column);
   EXPECT_TRUE(results->num_columns() == 1);
-  cudf::test::expect_strings_empty(results->get_column(0));
-}
-
-// This test specifically for https://github.com/rapidsai/custrings/issues/119
-TEST_F(StringsSplitTest, AllNullsCase)
-{
-  std::vector<const char*> h_strings{nullptr, nullptr, nullptr};
-  cudf::test::strings_column_wrapper strings(
-    h_strings.begin(),
-    h_strings.end(),
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  auto results = cudf::strings::split(cudf::strings_column_view(strings));
-  EXPECT_TRUE(results->num_columns() == 1);
-  auto column = results->get_column(0).view();
-  EXPECT_TRUE(column.size() == 3);
-  EXPECT_TRUE(column.has_nulls());
-  EXPECT_TRUE(column.null_count() == column.size());
-  results = cudf::strings::split(cudf::strings_column_view(strings), cudf::string_scalar("-"));
+  EXPECT_TRUE(results->num_rows() == 0);
+  results = cudf::strings::split_re(zero_size_strings_column, "\\s");
   EXPECT_TRUE(results->num_columns() == 1);
-  column = results->get_column(0);
-  EXPECT_TRUE(column.size() == 3);
-  EXPECT_TRUE(column.has_nulls());
-  EXPECT_TRUE(column.null_count() == column.size());
+  EXPECT_TRUE(results->num_rows() == 0);
 }
 
 TEST_F(StringsSplitTest, SplitRecord)
@@ -340,6 +320,54 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
 }
 
+TEST_F(StringsSplitTest, SplitRegex)
+{
+  std::vector<const char*> h_strings{" Héllo thesé", nullptr, "are some  ", "tést String", ""};
+  auto validity =
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+  cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
+  auto sv = cudf::strings_column_view(input);
+
+  {
+    auto result = cudf::strings::split_re(sv, "\\s+");
+
+    cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1});
+    cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""}, {1, 0, 1, 1, 0});
+    cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0});
+    auto expected = cudf::table_view({col0, col1, col2});
+    CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected);
+  }
+
+  {
+    auto result = cudf::strings::split_re(sv, "[eé]");
+
+    cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, {1, 0, 1, 1, 1});
+    cudf::test::strings_column_wrapper col1({"llo th", "", " som", "st String", ""},
+                                            {1, 0, 1, 1, 0});
+    cudf::test::strings_column_wrapper col2({"s", "", "  ", "", ""}, {1, 0, 1, 0, 0});
+    cudf::test::strings_column_wrapper col3({"", "", "", "", ""}, {1, 0, 0, 0, 0});
+    auto expected = cudf::table_view({col0, col1, col2, col3});
+    CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected);
+  }
+}
+
+TEST_F(StringsSplitTest, SplitRegexWithMaxSplit)
+{
+  std::vector<const char*> h_strings{" Héllo\tthesé", nullptr, "are\nsome  ", "tést\rString", ""};
+  auto validity =
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+  cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
+  auto sv = cudf::strings_column_view(input);
+
+  auto result = cudf::strings::split_re(sv, "\\s+", 1);
+
+  cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1});
+  cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some  ", "String", ""},
+                                          {1, 0, 1, 1, 0});
+  auto expected = cudf::table_view({col0, col1});
+  CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected);
+}
+
 TEST_F(StringsSplitTest, SplitRecordRegex)
 {
   std::vector<const char*> h_strings{" Héllo thesé", nullptr, "are some  ", "tést String", ""};
@@ -469,10 +497,35 @@ TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns)
 {
   cudf::column_view zero_size_strings_column(
     cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0);
-  auto split_record_result = cudf::strings::split_record(zero_size_strings_column);
-  EXPECT_TRUE(split_record_result->size() == 0);
-  auto rsplit_record_result = cudf::strings::rsplit_record(zero_size_strings_column);
-  EXPECT_TRUE(rsplit_record_result->size() == 0);
+  auto result = cudf::strings::split_record(zero_size_strings_column);
+  EXPECT_TRUE(result->size() == 0);
+  result = cudf::strings::rsplit_record(zero_size_strings_column);
+  EXPECT_TRUE(result->size() == 0);
+  result = cudf::strings::split_record_re(zero_size_strings_column, "\\s");
+  EXPECT_TRUE(result->size() == 0);
+}
+
+// This test specifically for https://github.com/rapidsai/custrings/issues/119
+TEST_F(StringsSplitTest, AllNullsCase)
+{
+  cudf::test::strings_column_wrapper input({"", "", ""}, {0, 0, 0});
+  auto sv = cudf::strings_column_view(input);
+
+  auto results = cudf::strings::split(sv);
+  EXPECT_TRUE(results->num_columns() == 1);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
+  results = cudf::strings::split(sv, cudf::string_scalar("-"));
+  EXPECT_TRUE(results->num_columns() == 1);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
+  results = cudf::strings::rsplit(sv);
+  EXPECT_TRUE(results->num_columns() == 1);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
+  results = cudf::strings::rsplit(sv, cudf::string_scalar("-"));
+  EXPECT_TRUE(results->num_columns() == 1);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
+  results = cudf::strings::split_re(sv, "-");
+  EXPECT_TRUE(results->num_columns() == 1);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
 }
 
 TEST_F(StringsSplitTest, Partition)
@@ -658,6 +711,7 @@ TEST_F(StringsSplitTest, InvalidParameter)
                cudf::logic_error);
   EXPECT_THROW(cudf::strings::rsplit_record(strings_view, cudf::string_scalar("", false)),
                cudf::logic_error);
+  EXPECT_THROW(cudf::strings::split_re(strings_view, ""), cudf::logic_error);
   EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error);
   EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)),
                cudf::logic_error);

From 8599d0cba24ef963c28361455769969a3764a430 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 28 Jan 2022 08:16:31 -0500
Subject: [PATCH 07/39] rename split_record_re.cu to split_re.cu

---
 cpp/CMakeLists.txt                                        | 2 +-
 cpp/src/strings/split/{split_record_re.cu => split_re.cu} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename cpp/src/strings/split/{split_record_re.cu => split_re.cu} (100%)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f96ef4945b9..407e1f9a858 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -445,8 +445,8 @@ add_library(
   src/strings/search/find_multiple.cu
   src/strings/split/partition.cu
   src/strings/split/split.cu
+  src/strings/split/split_re.cu
   src/strings/split/split_record.cu
-  src/strings/split/split_record_re.cu
   src/strings/strings_column_factories.cu
   src/strings/strings_column_view.cpp
   src/strings/strings_scalar_factories.cpp
diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_re.cu
similarity index 100%
rename from cpp/src/strings/split/split_record_re.cu
rename to cpp/src/strings/split/split_re.cu

From b6d7453b66c3548e4e47499de66e7eae0fa0b2fb Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 31 Jan 2022 14:05:56 -0500
Subject: [PATCH 08/39] refactored split_re/rsplit_re functions

---
 cpp/include/cudf/strings/split/split_re.hpp | 130 +++++++--
 cpp/src/strings/split/split_re.cu           | 281 ++++++++++++--------
 cpp/tests/strings/split_tests.cpp           | 162 +++++++----
 3 files changed, 394 insertions(+), 179 deletions(-)

diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index 54d590fcf71..cf6d23ccd28 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -32,22 +32,19 @@ namespace strings {
  * using a regex pattern to delimit each string.
  *
  * Each element generates an array of strings that are stored in corresponding
- * rows in the output table.
+ * rows in the output table -- `table[col,row] = token[col] of string[row]`
+ * where `token` is the substring between each delimiter.
  *
  * The number of elements in the output table will be the same as the number of
- * elements in the input column. The row for each column will contain the
- * new strings produced from that input row.
- *
- * The resulting number of columns will be the maximum number of tokens found
- * in any input row.
+ * elements in the input column. The resulting number of columns will be the
+ * maximum number of tokens found in any input row.
  *
  * The `pattern` is used to identify the separation points within a string
  * and splitting stops when either `maxsplit` or the end of the string is reached.
  *
  * An empty input string will produce a corresponding empty string in the
- * corresponding row in the first column.
- *
- * A null row will produce a corresponding null rows in the output table.
+ * corresponding row of the first column.
+ * A null row will produce corresponding null rows in the output table.
  *
  * @code{.pseudo}
  * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
@@ -60,7 +57,7 @@ namespace strings {
  * s2 = split_re(s, "[ _]", 1)
  * s2 is a table of strings columns:
  *     [ ["a", "a", "", "ab"],
- *       ["bc def_g", "_bc", "ab_cd", "cd "] ]
+ *       ["bc def_g", "_bc", "ab cd", "cd "] ]
  * @endcode
  *
  * @throw cudf:logic_error if `pattern` is empty.
@@ -70,9 +67,7 @@ namespace strings {
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
  * @param mr Device memory resource used to allocate the returned result's device memory.
- * @return Lists column of strings
- *         Each vector of the lists column holds splits from a single row
- *         element of the input column.
+ * @return A table of columns of strings.
  */
 std::unique_ptr<table> split_re(
   strings_column_view const& strings,
@@ -81,9 +76,59 @@ std::unique_ptr<table> split_re(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Splits individual strings elements into a list of strings
+ * @brief Splits individual strings elements into a table of strings columns
  * using a regex pattern to delimit each string.
  *
+ * Each element generates an array of strings that are stored in corresponding
+ * rows in the output table -- `table[col,row] = token[col] of string[row]`
+ * where `token` is the substring between each delimiter.
+ *
+ * The number of elements in the output table will be the same as the number of
+ * elements in the input column. The resulting number of columns will be the
+ * maximum number of tokens found in any input row.
+ *
+ * Splitting occurs by traversing starting from the end of the input string.
+ * The `pattern` is used to identify the separation points within the string
+ * and splitting stops when either `maxsplit` or the beginning of the string
+ * is reached.
+ *
+ * An empty input string will produce a corresponding empty string in the
+ * corresponding row of the first column.
+ * A null row will produce corresponding null rows in the output table.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
+ * s1 = rsplit_re(s, "[_ ]")
+ * s1 is a table of strings columns:
+ *     [ ["a", "a", "", "ab"],
+ *       ["bc", "", "ab", "cd"],
+ *       ["def", "bc", "cd", ""],
+ *       ["g", null, null, null] ]
+ * s2 = rsplit_re(s, "[ _]", 1)
+ * s2 is a table of strings columns:
+ *     [ ["a_bc def", "a_", "_ab", "ab_cd"],
+ *       ["g", "bc", "cd", ""] ]
+ * @endcode
+ *
+ * @throw cudf::logic_error if `pattern` is empty.
+ *
+ * @param strings A column of string elements to be split.
+ * @param pattern The regex pattern for delimiting characters within each string.
+ * @param maxsplit Maximum number of splits to perform.
+ *        Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory.
+ * @return A table of columns of strings.
+ */
+std::unique_ptr<table> rsplit_re(
+  strings_column_view const& strings,
+  std::string const& pattern,
+  size_type maxsplit                  = -1,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Splits individual strings elements into a list of strings
+ * using the given regex pattern to delimit each string.
+ *
  * Each element generates an array of strings that are stored in an output
  * lists column.
  *
@@ -96,7 +141,7 @@ std::unique_ptr<table> split_re(
  * and splitting stops when either `maxsplit` or the end of the string is reached.
  *
  * An empty input string will produce a corresponding empty list item output row.
- * A null row will produce a corresponding null list item output row.
+ * A null row will produce a corresponding null output row.
  *
  * @code{.pseudo}
  * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
@@ -121,9 +166,7 @@ std::unique_ptr<table> split_re(
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
  * @param mr Device memory resource used to allocate the returned result's device memory.
- * @return Lists column of strings
- *         Each vector of the lists column holds splits from a single row
- *         element of the input column.
+ * @return Lists column of strings.
  */
 std::unique_ptr<column> split_record_re(
   strings_column_view const& strings,
@@ -131,6 +174,57 @@ std::unique_ptr<column> split_record_re(
   size_type maxsplit                  = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Splits individual strings elements into a list of strings
+ * using the given regex pattern to delimit each string.
+ *
+ * Each element generates an array of strings that are stored in an output
+ * lists column.
+ *
+ * The number of elements in the output column will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * new strings for that row. The resulting number of strings in each row can vary
+ * from 0 to `maxsplit + 1`.
+ *
+ * Splitting occurs by traversing starting from the end of the input string.
+ * The `pattern` is used to identify the separation points within a string
+ * and splitting stops when either `maxsplit` or the beginning of the string
+ * is reached.
+ *
+ * An empty input string will produce a corresponding empty list item output row.
+ * A null row will produce a corresponding null output row.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
+ * s1 = rsplit_record_re(s, "[_ ]")
+ * s1 is a lists column of strings:
+ *     [ ["a", "bc", "def", "g"],
+ *       ["a", "", "bc"],
+ *       ["", "ab", "cd"],
+ *       ["ab", "cd", ""] ]
+ * s2 = rsplit_record_re(s, "[ _]", 1)
+ * s2 is a lists column of strings:
+ *     [ ["a_bc def", "g"],
+ *       ["a_", "bc"],
+ *       ["_ab", "cd"],
+ *       ["ab_cd", ""] ]
+ * @endcode
+ *
+ * @throw cudf::logic_error if `pattern` is empty.
+ *
+ * @param strings A column of string elements to be split.
+ * @param pattern The regex pattern for delimiting characters within each string.
+ * @param maxsplit Maximum number of splits to perform.
+ *        Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory.
+ * @return Lists column of strings.
+ */
+std::unique_ptr<column> rsplit_record_re(
+  strings_column_view const& strings,
+  std::string const& pattern,
+  size_type maxsplit                  = -1,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of doxygen group
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index 3f916d0138f..9427a900d8d 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -32,36 +32,18 @@
 #include <rmm/cuda_stream_view.hpp>
 
 #include <thrust/for_each.h>
-#include <thrust/scan.h>
-#include <thrust/transform.h>
+#include <thrust/transform_scan.h>
 
 namespace cudf {
 namespace strings {
 namespace detail {
-
-using string_index_pair = thrust::pair<const char*, size_type>;
-
 namespace {
 
-/**
- * @brief Convert match counts to token counts.
- *
- * The matches are the delimiters and the tokens are what is left:
- * `token1, delimiter, token2, delimiter, token3, etc`
- * Usually `token_count = match_count + 1` even with empty strings.
- * However, we need to account for the max_tokens and null rows.
- */
-struct match_to_token_count_fn {
-  column_device_view const d_strings;
-  size_type const* d_counts;
-  size_type const max_tokens;
+using string_index_pair = thrust::pair<const char*, size_type>;
 
-  __device__ size_type operator()(size_type idx)
-  {
-    if (d_strings.is_null(idx)) { return 0; }
-    auto const match_count = d_counts[idx];
-    return std::min(match_count, max_tokens) + 1;
-  }
+enum class split_direction {
+  FORWARD,  ///< for split logic
+  BACKWARD  ///< for rsplit logic
 };
 
 /**
@@ -71,56 +53,58 @@ template <int stack_size>
 struct token_reader_fn {
   column_device_view const d_strings;
   reprog_device prog;
+  split_direction const direction;
   offset_type const* d_token_offsets;
   string_index_pair* d_tokens;
 
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) { return; }
+    auto const d_str = d_strings.element<string_view>(idx);
 
     auto const token_offset = d_token_offsets[idx];
     auto const token_count  = d_token_offsets[idx + 1] - token_offset;
-    auto d_result           = d_tokens + token_offset;
-    auto const d_str        = d_strings.element<string_view>(idx);
+    auto d_result           = d_tokens + token_offset;  // store tokens here
 
     size_type token_idx = 0;
-    size_type begin     = 0;
+    size_type begin     = 0;  // characters
     size_type end       = d_str.length();
-    size_type last_pos  = 0;
-    while (token_idx < token_count - 1) {
-      if (prog.find<stack_size>(idx, d_str, begin, end) <= 0) { break; }
-
-      auto const start_pos = d_str.byte_offset(begin);
-      auto const end_pos   = d_str.byte_offset(end);
-      d_result[token_idx]  = string_index_pair{d_str.data() + last_pos, start_pos - last_pos};
-
-      begin = end + (begin == end);
-      end   = d_str.length();
-      token_idx++;
-      last_pos = end_pos;
+    size_type last_pos  = 0;  // bytes
+    while (prog.find<stack_size>(idx, d_str, begin, end) > 0) {
+      // get the token (characters just before this match)
+      auto token = string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos};
+      // store it if we have space
+      if (token_idx < token_count - 1) {
+        d_result[token_idx++] = token;
+      } else {
+        if (direction == split_direction::FORWARD) { break; }  // we are done
+        for (auto l = 0; l < token_idx - 1; ++l) {
+          d_result[l] = d_result[l + 1];  // shift left
+        }
+        d_result[token_idx - 1] = token;
+      }
+      // setup for next match
+      last_pos = d_str.byte_offset(end);
+      begin    = end + (begin == end);
+      end      = d_str.length();
     }
 
-    // set last token to remainder of the string
+    // set the last token to the remainder of the string
     if (last_pos <= d_str.size_bytes()) {
       d_result[token_idx] =
         string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos};
     }
-  }
-};
-
-struct tokens_transform_fn {
-  column_device_view const d_strings;
-  string_index_pair const* d_tokens;
-  offset_type const* d_token_offsets;
-  size_type const column_index;
 
-  __device__ string_index_pair operator()(size_type idx) const
-  {
-    auto const offset      = d_token_offsets[idx];
-    auto const token_count = d_token_offsets[idx + 1] - offset;
-    if (d_strings.is_null(idx)) { return string_index_pair{nullptr, 0}; }
-    if (column_index > token_count - 1) { return string_index_pair{nullptr, 0}; }
-    return d_tokens[offset + column_index];
+    if (direction == split_direction::BACKWARD) {
+      // update first entry -- this happens when max-tokens is hit before the end
+      auto const first_offset =
+        d_result[0].first
+          ? static_cast<size_type>(thrust::distance(d_str.data(), d_result[0].first))
+          : 0;
+      if (first_offset) {
+        d_result[0] = string_index_pair{d_str.data(), first_offset + d_result[0].second};
+      }
+    }
   }
 };
 
@@ -138,6 +122,7 @@ struct tokens_transform_fn {
  */
 rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d_strings,
                                                      reprog_device& d_prog,
+                                                     split_direction direction,
                                                      size_type max_tokens,
                                                      mutable_column_view& offsets,
                                                      rmm::cuda_stream_view stream)
@@ -148,15 +133,12 @@ rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d
   auto const begin = thrust::make_counting_iterator<size_type>(0);
   auto const end   = thrust::make_counting_iterator<size_type>(strings_count);
 
-  // convert match counts to tokens
-  match_to_token_count_fn match_fn{d_strings, d_offsets, max_tokens};
-  thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, match_fn);
-
-  // convert counts into offsets
-  thrust::exclusive_scan(rmm::exec_policy(stream),
-                         offsets.begin<offset_type>(),
-                         offsets.end<offset_type>(),
-                         offsets.begin<offset_type>());
+  // convert match counts to token offsets
+  auto map_fn = [d_strings, d_offsets, max_tokens] __device__(auto idx) {
+    return d_strings.is_null(idx) ? 0 : std::min(d_offsets[idx], max_tokens) + 1;
+  };
+  thrust::transform_exclusive_scan(
+    rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus<offset_type>{});
 
   // the last entry is the total number of tokens to be generated
   auto const total_tokens = cudf::detail::get_value<offset_type>(offsets, strings_count, stream);
@@ -165,60 +147,48 @@ rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d
   rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
   auto const regex_insts = d_prog.insts_counts();
   if (regex_insts <= RX_SMALL_INSTS) {
-    token_reader_fn<RX_STACK_SMALL> reader{d_strings, d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_SMALL> reader{d_strings, d_prog, direction, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   } else if (regex_insts <= RX_MEDIUM_INSTS) {
-    token_reader_fn<RX_STACK_MEDIUM> reader{d_strings, d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_MEDIUM> reader{d_strings, d_prog, direction, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   } else if (regex_insts <= RX_LARGE_INSTS) {
-    token_reader_fn<RX_STACK_LARGE> reader{d_strings, d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_LARGE> reader{d_strings, d_prog, direction, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   } else {
-    token_reader_fn<RX_STACK_ANY> reader{d_strings, d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_ANY> reader{d_strings, d_prog, direction, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   }
 
   return tokens;
 }
 
-}  // namespace
-
-// The output is one list item per string
-std::unique_ptr<column> split_record_re(strings_column_view const& input,
-                                        std::string const& pattern,
-                                        size_type maxsplit,
-                                        rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
-
-  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
-  auto const strings_count = input.size();
-
-  auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
-  auto d_strings = column_device_view::create(input.parent(), stream);
-
-  auto offsets      = count_matches(*d_strings, *d_prog, stream, mr);
-  auto offsets_view = offsets->mutable_view();
-
-  // get split tokens from the input column
-  auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream);
-
-  // convert the tokens into one big strings column
-  auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr);
+/**
+ * @brief Returns string pair for the specified column for each string in `d_strings`
+ *
+ * This is used to build the table result of a split.
+ * Null is returned if the row is null of if the `column_index` is larger
+ * than the token count for that string.
+ */
+struct tokens_transform_fn {
+  column_device_view const d_strings;
+  string_index_pair const* d_tokens;
+  offset_type const* d_token_offsets;
+  size_type const column_index;
 
-  // create a lists column using the offsets and the strings columns
-  return make_lists_column(strings_count,
-                           std::move(offsets),
-                           std::move(strings_output),
-                           input.null_count(),
-                           copy_bitmask(input.parent(), stream, mr),
-                           stream,
-                           mr);
-}
+  __device__ string_index_pair operator()(size_type idx) const
+  {
+    auto const offset      = d_token_offsets[idx];
+    auto const token_count = d_token_offsets[idx + 1] - offset;
+    return (column_index > token_count - 1) || d_strings.is_null(idx)
+             ? string_index_pair{nullptr, 0}
+             : d_tokens[offset + column_index];
+  }
+};
 
 std::unique_ptr<table> split_re(strings_column_view const& input,
                                 std::string const& pattern,
+                                split_direction direction,
                                 size_type maxsplit,
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
@@ -239,19 +209,21 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
 
   auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource());
   auto offsets_view = offsets->mutable_view();
+  auto d_offsets    = offsets_view.data<offset_type>();
 
   // get split tokens from the input column
-  auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream);
+  auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
 
   // the columns_count is the maximum number of tokens for any string in the input column
-  auto const begin = thrust::make_counting_iterator<size_type>(0);
-  auto const end   = thrust::make_counting_iterator<size_type>(strings_count);
-  auto d_offsets   = offsets_view.data<offset_type>();
-  auto size_lambda = [d_offsets] __device__(auto const idx) -> size_type {
-    return d_offsets[idx + 1] - d_offsets[idx];
-  };
   auto const columns_count = thrust::transform_reduce(
-    rmm::exec_policy(stream), begin, end, size_lambda, 0, thrust::maximum<size_type>{});
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator<size_type>(0),
+    thrust::make_counting_iterator<size_type>(strings_count),
+    [d_offsets] __device__(auto const idx) -> size_type {
+      return d_offsets[idx + 1] - d_offsets[idx];
+    },
+    0,
+    thrust::maximum<size_type>{});
 
   // boundary case: if no columns, return one all-null column (custrings issue #119)
   if (columns_count == 0) {
@@ -271,7 +243,7 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
       0, tokens_transform_fn{*d_strings, tokens.data(), d_offsets, column_index});
     return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr);
   };
-  // create each column of tokens
+  // build a vector of columns
   results.resize(columns_count);
   std::transform(thrust::make_counting_iterator<size_type>(0),
                  thrust::make_counting_iterator<size_type>(columns_count),
@@ -281,6 +253,78 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
   return std::make_unique<table>(std::move(results));
 }
 
+std::unique_ptr<column> split_record_re(strings_column_view const& input,
+                                        std::string const& pattern,
+                                        split_direction direction,
+                                        size_type maxsplit,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
+
+  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
+  auto const strings_count = input.size();
+
+  auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
+  auto d_strings = column_device_view::create(input.parent(), stream);
+
+  auto offsets      = count_matches(*d_strings, *d_prog, stream, mr);
+  auto offsets_view = offsets->mutable_view();
+
+  // get split tokens from the input column
+  auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
+
+  // convert the tokens into one big strings column
+  auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr);
+
+  // create a lists column using the offsets and the strings columns
+  return make_lists_column(strings_count,
+                           std::move(offsets),
+                           std::move(strings_output),
+                           input.null_count(),
+                           copy_bitmask(input.parent(), stream, mr),
+                           stream,
+                           mr);
+}
+
+}  // namespace
+
+std::unique_ptr<table> split_re(strings_column_view const& input,
+                                std::string const& pattern,
+                                size_type maxsplit,
+                                rmm::cuda_stream_view stream,
+                                rmm::mr::device_memory_resource* mr)
+{
+  return split_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr);
+}
+
+std::unique_ptr<column> split_record_re(strings_column_view const& input,
+                                        std::string const& pattern,
+                                        size_type maxsplit,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  return split_record_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr);
+}
+
+std::unique_ptr<table> rsplit_re(strings_column_view const& input,
+                                 std::string const& pattern,
+                                 size_type maxsplit,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  return split_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr);
+}
+
+std::unique_ptr<column> rsplit_record_re(strings_column_view const& input,
+                                         std::string const& pattern,
+                                         size_type maxsplit,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  return split_record_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr);
+}
+
 }  // namespace detail
 
 // external APIs
@@ -303,5 +347,22 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
   return detail::split_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr);
 }
 
+std::unique_ptr<table> rsplit_re(strings_column_view const& input,
+                                 std::string const& pattern,
+                                 size_type maxsplit,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::rsplit_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<column> rsplit_record_re(strings_column_view const& input,
+                                         std::string const& pattern,
+                                         size_type maxsplit,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::rsplit_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr);
+}
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index f541a6b0e81..d0b695bbc93 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -25,8 +25,8 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <tests/strings/utilities.h>
 
 #include <vector>
 
@@ -241,21 +241,6 @@ TEST_F(StringsSplitTest, RSplitWhitespaceWithMax)
   CUDF_TEST_EXPECT_TABLES_EQUAL(*results, *expected);
 }
 
-TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns)
-{
-  cudf::column_view zero_size_strings_column(
-    cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0);
-  auto results = cudf::strings::split(zero_size_strings_column);
-  EXPECT_TRUE(results->num_columns() == 1);
-  EXPECT_TRUE(results->num_rows() == 0);
-  results = cudf::strings::rsplit(zero_size_strings_column);
-  EXPECT_TRUE(results->num_columns() == 1);
-  EXPECT_TRUE(results->num_rows() == 0);
-  results = cudf::strings::split_re(zero_size_strings_column, "\\s");
-  EXPECT_TRUE(results->num_columns() == 1);
-  EXPECT_TRUE(results->num_rows() == 0);
-}
-
 TEST_F(StringsSplitTest, SplitRecord)
 {
   std::vector<const char*> h_strings{" Héllo thesé", nullptr, "are some  ", "tést String", ""};
@@ -331,41 +316,30 @@ TEST_F(StringsSplitTest, SplitRegex)
   {
     auto result = cudf::strings::split_re(sv, "\\s+");
 
-    cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1});
+    cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, validity);
     cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""}, {1, 0, 1, 1, 0});
     cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0});
     auto expected = cudf::table_view({col0, col1, col2});
-    CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected);
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
+
+    result = cudf::strings::rsplit_re(sv, "\\s+");
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
   }
 
   {
     auto result = cudf::strings::split_re(sv, "[eé]");
 
-    cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, {1, 0, 1, 1, 1});
+    cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, validity);
     cudf::test::strings_column_wrapper col1({"llo th", "", " som", "st String", ""},
                                             {1, 0, 1, 1, 0});
     cudf::test::strings_column_wrapper col2({"s", "", "  ", "", ""}, {1, 0, 1, 0, 0});
     cudf::test::strings_column_wrapper col3({"", "", "", "", ""}, {1, 0, 0, 0, 0});
     auto expected = cudf::table_view({col0, col1, col2, col3});
-    CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected);
-  }
-}
-
-TEST_F(StringsSplitTest, SplitRegexWithMaxSplit)
-{
-  std::vector<const char*> h_strings{" Héllo\tthesé", nullptr, "are\nsome  ", "tést\rString", ""};
-  auto validity =
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
-  cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
-  auto sv = cudf::strings_column_view(input);
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
 
-  auto result = cudf::strings::split_re(sv, "\\s+", 1);
-
-  cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1});
-  cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some  ", "String", ""},
-                                          {1, 0, 1, 1, 0});
-  auto expected = cudf::table_view({col0, col1});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected);
+    result = cudf::strings::rsplit_re(sv, "[eé]");
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
+  }
 }
 
 TEST_F(StringsSplitTest, SplitRecordRegex)
@@ -376,30 +350,60 @@ TEST_F(StringsSplitTest, SplitRecordRegex)
   cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
   auto sv = cudf::strings_column_view(input);
 
-  auto result = cudf::strings::split_record_re(sv, "[eé]");
-
   using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
-  LCW expected(
-    {LCW{" H", "llo th", "s", ""}, LCW{}, LCW{"ar", " som", "  "}, LCW{"t", "st String"}, LCW{""}},
-    validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+  {
+    auto result = cudf::strings::split_record_re(sv, "\\s+");
+
+    LCW expected(
+      {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", ""}, LCW{"tést", "String"}, LCW{""}},
+      validity);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+
+    result = cudf::strings::rsplit_record_re(sv, "\\s+");
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  }
+
+  {
+    auto result = cudf::strings::split_record_re(sv, "[eé]");
+
+    LCW expected({LCW{" H", "llo th", "s", ""},
+                  LCW{},
+                  LCW{"ar", " som", "  "},
+                  LCW{"t", "st String"},
+                  LCW{""}},
+                 validity);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+
+    result = cudf::strings::rsplit_record_re(sv, "[eé]");
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  }
 }
 
-TEST_F(StringsSplitTest, SplitRecordRegexWithMaxSplit)
+TEST_F(StringsSplitTest, SplitRegexWithMaxSplit)
 {
   std::vector<const char*> h_strings{" Héllo\tthesé", nullptr, "are\nsome  ", "tést\rString", ""};
   auto validity =
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
   cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
   auto sv = cudf::strings_column_view(input);
+  {
+    auto result = cudf::strings::split_re(sv, "\\s+", 1);
 
-  auto result = cudf::strings::split_record_re(sv, "\\s", 1);
+    cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1});
+    cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some  ", "String", ""},
+                                            {1, 0, 1, 1, 0});
+    auto expected = cudf::table_view({col0, col1});
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
+  }
+  {
+    auto result = cudf::strings::split_record_re(sv, "\\s", 1);
 
-  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
-  LCW expected(
-    {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some  "}, LCW{"tést", "String"}, LCW{""}},
-    validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+    using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
+    LCW expected(
+      {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some  "}, LCW{"tést", "String"}, LCW{""}},
+      validity);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  }
 }
 
 TEST_F(StringsSplitTest, RSplitRecord)
@@ -493,16 +497,58 @@ TEST_F(StringsSplitTest, RSplitRecordWhitespaceWithMaxSplit)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
 }
 
-TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns)
+TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit)
+{
+  std::vector<const char*> h_strings{" Héllo\tthesé", nullptr, "are some\n ", "tést\rString", ""};
+  auto validity =
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+  cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
+  auto sv = cudf::strings_column_view(input);
+
+  {
+    auto result = cudf::strings::rsplit_re(sv, "\\s+", 1);
+
+    cudf::test::strings_column_wrapper col0({" Héllo", "", "are some", "tést", ""}, validity);
+    cudf::test::strings_column_wrapper col1({"thesé", "", "", "String", ""}, {1, 0, 1, 1, 0});
+    auto expected = cudf::table_view({col0, col1});
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
+  }
+  {
+    auto result = cudf::strings::rsplit_record_re(sv, "\\s+", 1);
+
+    using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
+    LCW expected(
+      {LCW{" Héllo", "thesé"}, LCW{}, LCW{"are some", ""}, LCW{"tést", "String"}, LCW{""}},
+      validity);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  }
+}
+
+TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns)
 {
   cudf::column_view zero_size_strings_column(
     cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0);
+  auto results = cudf::strings::split(zero_size_strings_column);
+  EXPECT_TRUE(results->num_columns() == 1);
+  EXPECT_TRUE(results->num_rows() == 0);
+  results = cudf::strings::rsplit(zero_size_strings_column);
+  EXPECT_TRUE(results->num_columns() == 1);
+  EXPECT_TRUE(results->num_rows() == 0);
+  results = cudf::strings::split_re(zero_size_strings_column, "\\s");
+  EXPECT_TRUE(results->num_columns() == 1);
+  EXPECT_TRUE(results->num_rows() == 0);
+  results = cudf::strings::rsplit_re(zero_size_strings_column, "\\s");
+  EXPECT_TRUE(results->num_columns() == 1);
+  EXPECT_TRUE(results->num_rows() == 0);
+
   auto result = cudf::strings::split_record(zero_size_strings_column);
   EXPECT_TRUE(result->size() == 0);
   result = cudf::strings::rsplit_record(zero_size_strings_column);
   EXPECT_TRUE(result->size() == 0);
   result = cudf::strings::split_record_re(zero_size_strings_column, "\\s");
   EXPECT_TRUE(result->size() == 0);
+  result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s");
+  EXPECT_TRUE(result->size() == 0);
 }
 
 // This test specifically for https://github.com/rapidsai/custrings/issues/119
@@ -526,6 +572,20 @@ TEST_F(StringsSplitTest, AllNullsCase)
   results = cudf::strings::split_re(sv, "-");
   EXPECT_TRUE(results->num_columns() == 1);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
+  results = cudf::strings::rsplit_re(sv, "-");
+  EXPECT_TRUE(results->num_columns() == 1);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
+
+  auto result = cudf::strings::split_record(sv);
+  using LCW   = cudf::test::lists_column_wrapper<cudf::string_view>;
+  LCW expected({LCW{}, LCW{}, LCW{}}, cudf::test::iterators::all_nulls());
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  result = cudf::strings::rsplit_record(sv);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  result = cudf::strings::split_record_re(sv, "-");
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  result = cudf::strings::rsplit_record_re(sv, "-");
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
 }
 
 TEST_F(StringsSplitTest, Partition)

From 7bc451b142c84c2505416ae5d8f2d9d979a1989f Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 31 Jan 2022 17:26:52 -0500
Subject: [PATCH 09/39] remove unneeded if-check

---
 cpp/src/strings/split/split_re.cu | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index 9427a900d8d..9dcf7e6f17b 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -64,7 +64,7 @@ struct token_reader_fn {
 
     auto const token_offset = d_token_offsets[idx];
     auto const token_count  = d_token_offsets[idx + 1] - token_offset;
-    auto d_result           = d_tokens + token_offset;  // store tokens here
+    auto const d_result     = d_tokens + token_offset;  // store tokens here
 
     size_type token_idx = 0;
     size_type begin     = 0;  // characters
@@ -72,7 +72,8 @@ struct token_reader_fn {
     size_type last_pos  = 0;  // bytes
     while (prog.find<stack_size>(idx, d_str, begin, end) > 0) {
       // get the token (characters just before this match)
-      auto token = string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos};
+      auto const token =
+        string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos};
       // store it if we have space
       if (token_idx < token_count - 1) {
         d_result[token_idx++] = token;
@@ -90,13 +91,10 @@ struct token_reader_fn {
     }
 
     // set the last token to the remainder of the string
-    if (last_pos <= d_str.size_bytes()) {
-      d_result[token_idx] =
-        string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos};
-    }
+    d_result[token_idx] = string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos};
 
     if (direction == split_direction::BACKWARD) {
-      // update first entry -- this happens when max-tokens is hit before the end
+      // update first entry -- this happens when max-tokens is hit before the end of the string
       auto const first_offset =
         d_result[0].first
           ? static_cast<size_type>(thrust::distance(d_str.data(), d_result[0].first))
@@ -127,11 +125,11 @@ rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d
                                                      mutable_column_view& offsets,
                                                      rmm::cuda_stream_view stream)
 {
-  auto d_offsets           = offsets.data<offset_type>();
   auto const strings_count = d_strings.size();
 
-  auto const begin = thrust::make_counting_iterator<size_type>(0);
-  auto const end   = thrust::make_counting_iterator<size_type>(strings_count);
+  auto const begin     = thrust::make_counting_iterator<size_type>(0);
+  auto const end       = thrust::make_counting_iterator<size_type>(strings_count);
+  auto const d_offsets = offsets.data<offset_type>();
 
   // convert match counts to token offsets
   auto map_fn = [d_strings, d_offsets, max_tokens] __device__(auto idx) {
@@ -140,7 +138,7 @@ rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d
   thrust::transform_exclusive_scan(
     rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus<offset_type>{});
 
-  // the last entry is the total number of tokens to be generated
+  // the last offset entry is the total number of tokens to be generated
   auto const total_tokens = cudf::detail::get_value<offset_type>(offsets, strings_count, stream);
 
   // generate tokens for each string
@@ -204,14 +202,16 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
     return std::make_unique<table>(std::move(results));
   }
 
+  // create the regex device prog from the given pattern
   auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
   auto d_strings = column_device_view::create(input.parent(), stream);
 
+  // count the number of delimiters matched in each string
   auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource());
   auto offsets_view = offsets->mutable_view();
   auto d_offsets    = offsets_view.data<offset_type>();
 
-  // get split tokens from the input column
+  // get the split tokens from the input column
   auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
 
   // the columns_count is the maximum number of tokens for any string in the input column
@@ -265,13 +265,15 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
   auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
   auto const strings_count = input.size();
 
+  // create the regex device prog from the given pattern
   auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
   auto d_strings = column_device_view::create(input.parent(), stream);
 
+  // count the number of delimiters matched in each string
   auto offsets      = count_matches(*d_strings, *d_prog, stream, mr);
   auto offsets_view = offsets->mutable_view();
 
-  // get split tokens from the input column
+  // get the split tokens from the input column
   auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
 
   // convert the tokens into one big strings column

From 93887b1877733bfc97c29606f8c9a221d8304efb Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 31 Jan 2022 17:27:29 -0500
Subject: [PATCH 10/39] add all empty and all null test cases

---
 cpp/tests/strings/split_tests.cpp | 36 ++++++++++++++++---------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index d0b695bbc93..4650cbc3c44 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -541,14 +541,14 @@ TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns)
   EXPECT_TRUE(results->num_columns() == 1);
   EXPECT_TRUE(results->num_rows() == 0);
 
-  auto result = cudf::strings::split_record(zero_size_strings_column);
-  EXPECT_TRUE(result->size() == 0);
-  result = cudf::strings::rsplit_record(zero_size_strings_column);
-  EXPECT_TRUE(result->size() == 0);
-  result = cudf::strings::split_record_re(zero_size_strings_column, "\\s");
-  EXPECT_TRUE(result->size() == 0);
-  result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s");
-  EXPECT_TRUE(result->size() == 0);
+  auto list_result = cudf::strings::split_record(zero_size_strings_column);
+  EXPECT_TRUE(list_result->size() == 0);
+  list_result = cudf::strings::rsplit_record(zero_size_strings_column);
+  EXPECT_TRUE(list_result->size() == 0);
+  list_result = cudf::strings::split_record_re(zero_size_strings_column, "\\s");
+  EXPECT_TRUE(list_result->size() == 0);
+  list_result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s");
+  EXPECT_TRUE(list_result->size() == 0);
 }
 
 // This test specifically for https://github.com/rapidsai/custrings/issues/119
@@ -576,16 +576,16 @@ TEST_F(StringsSplitTest, AllNullsCase)
   EXPECT_TRUE(results->num_columns() == 1);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
 
-  auto result = cudf::strings::split_record(sv);
-  using LCW   = cudf::test::lists_column_wrapper<cudf::string_view>;
+  auto list_result = cudf::strings::split_record(sv);
+  using LCW        = cudf::test::lists_column_wrapper<cudf::string_view>;
   LCW expected({LCW{}, LCW{}, LCW{}}, cudf::test::iterators::all_nulls());
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
-  result = cudf::strings::rsplit_record(sv);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
-  result = cudf::strings::split_record_re(sv, "-");
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
-  result = cudf::strings::rsplit_record_re(sv, "-");
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected);
+  list_result = cudf::strings::rsplit_record(sv);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected);
+  list_result = cudf::strings::split_record_re(sv, "-");
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected);
+  list_result = cudf::strings::rsplit_record_re(sv, "-");
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected);
 }
 
 TEST_F(StringsSplitTest, Partition)
@@ -773,6 +773,8 @@ TEST_F(StringsSplitTest, InvalidParameter)
                cudf::logic_error);
   EXPECT_THROW(cudf::strings::split_re(strings_view, ""), cudf::logic_error);
   EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::rsplit_re(strings_view, ""), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::rsplit_record_re(strings_view, ""), cudf::logic_error);
   EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)),
                cudf::logic_error);
   EXPECT_THROW(cudf::strings::rpartition(strings_view, cudf::string_scalar("", false)),

From c88eeae8727b9c94f05d15c0e9e3e9714107bf39 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 1 Feb 2022 12:34:45 -0500
Subject: [PATCH 11/39] add more maxsplit gtests

---
 cpp/include/cudf/strings/split/split_re.hpp | 50 +++++++++++----------
 cpp/src/strings/split/split_re.cu           | 28 +++++++-----
 cpp/tests/strings/split_tests.cpp           | 34 +++++++++++---
 3 files changed, 71 insertions(+), 41 deletions(-)

diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index cf6d23ccd28..c6dc1e5c697 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -28,18 +28,18 @@ namespace strings {
  */
 
 /**
- * @brief Splits individual strings elements into a table of strings columns
+ * @brief Splits strings elements into a table of strings columns
  * using a regex pattern to delimit each string.
  *
- * Each element generates an array of strings that are stored in corresponding
- * rows in the output table -- `table[col,row] = token[col] of string[row]`
- * where `token` is the substring between each delimiter.
+ * Each element generates a vector of strings that are stored in corresponding
+ * rows in the output table -- `table[col,row] = token[col] of strings[row]`
+ * where `token` is a substring between delimiters.
  *
- * The number of elements in the output table will be the same as the number of
+ * The number of rows in the output table will be the same as the number of
  * elements in the input column. The resulting number of columns will be the
  * maximum number of tokens found in any input row.
  *
- * The `pattern` is used to identify the separation points within a string
+ * The `pattern` is used to identify the delimiters within a string
  * and splitting stops when either `maxsplit` or the end of the string is reached.
  *
  * An empty input string will produce a corresponding empty string in the
@@ -62,7 +62,7 @@ namespace strings {
  *
  * @throw cudf:logic_error if `pattern` is empty.
  *
- * @param strings A column of string elements to be split.
+ * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
@@ -70,25 +70,25 @@ namespace strings {
  * @return A table of columns of strings.
  */
 std::unique_ptr<table> split_re(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   std::string const& pattern,
   size_type maxsplit                  = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Splits individual strings elements into a table of strings columns
+ * @brief Splits strings elements into a table of strings columns
  * using a regex pattern to delimit each string.
  *
- * Each element generates an array of strings that are stored in corresponding
+ * Each element generates a vector of strings that are stored in corresponding
  * rows in the output table -- `table[col,row] = token[col] of string[row]`
  * where `token` is the substring between each delimiter.
  *
- * The number of elements in the output table will be the same as the number of
+ * The number of rows in the output table will be the same as the number of
  * elements in the input column. The resulting number of columns will be the
  * maximum number of tokens found in any input row.
  *
  * Splitting occurs by traversing starting from the end of the input string.
- * The `pattern` is used to identify the separation points within the string
+ * The `pattern` is used to identify the delimiters within a string
  * and splitting stops when either `maxsplit` or the beginning of the string
  * is reached.
  *
@@ -112,7 +112,7 @@ std::unique_ptr<table> split_re(
  *
  * @throw cudf:logic_error if `pattern` is empty.
  *
- * @param strings A column of string elements to be split.
+ * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
@@ -120,24 +120,25 @@ std::unique_ptr<table> split_re(
  * @return A table of columns of strings.
  */
 std::unique_ptr<table> rsplit_re(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   std::string const& pattern,
   size_type maxsplit                  = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Splits individual strings elements into a list of strings
+ * @brief Splits strings elements into a list column of strings
  * using the given regex pattern to delimit each string.
  *
  * Each element generates an array of strings that are stored in an output
- * lists column.
+ * lists column -- `list[row] = [token1, token2, ...] found in input[row]`
+ * where `token` is a substring between delimiters.
  *
  * The number of elements in the output column will be the same as the number of
  * elements in the input column. Each individual list item will contain the
  * new strings for that row. The resulting number of strings in each row can vary
  * from 0 to `maxsplit + 1`.
  *
- * The `pattern` is used to identify the separation points within a string
+ * The `pattern` is used to identify the delimiters within a string
  * and splitting stops when either `maxsplit` or the end of the string is reached.
  *
  * An empty input string will produce a corresponding empty list item output row.
@@ -161,7 +162,7 @@ std::unique_ptr<table> rsplit_re(
  *
  * @throw cudf:logic_error if `pattern` is empty.
  *
- * @param strings A column of string elements to be split.
+ * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
@@ -169,17 +170,18 @@ std::unique_ptr<table> rsplit_re(
  * @return Lists column of strings.
  */
 std::unique_ptr<column> split_record_re(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   std::string const& pattern,
   size_type maxsplit                  = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Splits individual strings elements into a list of strings
+ * @brief Splits strings elements into a list column of strings
  * using the given regex pattern to delimit each string.
  *
- * Each element generates an array of strings that are stored in an output
- * lists column.
+ * Each element generates a vector of strings that are stored in an output
+ * lists column -- `list[row] = [token1, token2, ...] found in input[row]`
+ * where `token` is a substring between delimiters.
  *
  * The number of elements in the output column will be the same as the number of
  * elements in the input column. Each individual list item will contain the
@@ -212,7 +214,7 @@ std::unique_ptr<column> split_record_re(
  *
  * @throw cudf:logic_error if `pattern` is empty.
  *
- * @param strings A column of string elements to be split.
+ * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
@@ -220,7 +222,7 @@ std::unique_ptr<column> split_record_re(
  * @return Lists column of strings.
  */
 std::unique_ptr<column> rsplit_record_re(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   std::string const& pattern,
   size_type maxsplit                  = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index 9dcf7e6f17b..dd71533c773 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -48,6 +48,10 @@ enum class split_direction {
 
 /**
  * @brief Identify the tokens from the `idx'th` string element of `d_strings`.
+ *
+ * Each string's tokens are stored in the `d_tokens` vector.
+ * The `d_token_offsets` specifies the output position within `d_tokens`
+ * for each string.
  */
 template <int stack_size>
 struct token_reader_fn {
@@ -118,12 +122,12 @@ struct token_reader_fn {
  *                The offsets for each token in each string on output.
  * @param stream CUDA stream used for kernel launches.
  */
-rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d_strings,
-                                                     reprog_device& d_prog,
-                                                     split_direction direction,
-                                                     size_type max_tokens,
-                                                     mutable_column_view& offsets,
-                                                     rmm::cuda_stream_view stream)
+rmm::device_uvector<string_index_pair> generate_tokens(column_device_view const& d_strings,
+                                                       reprog_device& d_prog,
+                                                       split_direction direction,
+                                                       size_type max_tokens,
+                                                       mutable_column_view& offsets,
+                                                       rmm::cuda_stream_view stream)
 {
   auto const strings_count = d_strings.size();
 
@@ -165,7 +169,7 @@ rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d
  * @brief Returns string pair for the specified column for each string in `d_strings`
  *
  * This is used to build the table result of a split.
- * Null is returned if the row is null of if the `column_index` is larger
+ * Null is returned if the row is null or if the `column_index` is larger
  * than the token count for that string.
  */
 struct tokens_transform_fn {
@@ -211,10 +215,10 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
   auto offsets_view = offsets->mutable_view();
   auto d_offsets    = offsets_view.data<offset_type>();
 
-  // get the split tokens from the input column
-  auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
+  // get the split tokens from the input column; this also converts the counts into offsets
+  auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
 
-  // the columns_count is the maximum number of tokens for any string in the input column
+  // the output column count is the maximum number of tokens generated for any input string
   auto const columns_count = thrust::transform_reduce(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<size_type>(0),
@@ -273,8 +277,8 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
   auto offsets      = count_matches(*d_strings, *d_prog, stream, mr);
   auto offsets_view = offsets->mutable_view();
 
-  // get the split tokens from the input column
-  auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
+  // get the split tokens from the input column; this also converts the counts into offsets
+  auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
 
   // convert the tokens into one big strings column
   auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr);
diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index 4650cbc3c44..f0d7315929b 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -322,6 +322,7 @@ TEST_F(StringsSplitTest, SplitRegex)
     auto expected = cudf::table_view({col0, col1, col2});
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
 
+    // rsplit == split when using default parameters
     result = cudf::strings::rsplit_re(sv, "\\s+");
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
   }
@@ -337,6 +338,7 @@ TEST_F(StringsSplitTest, SplitRegex)
     auto expected = cudf::table_view({col0, col1, col2, col3});
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
 
+    // rsplit == split when using default parameters
     result = cudf::strings::rsplit_re(sv, "[eé]");
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
   }
@@ -359,6 +361,7 @@ TEST_F(StringsSplitTest, SplitRecordRegex)
       validity);
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
 
+    // rsplit == split when using default parameters
     result = cudf::strings::rsplit_record_re(sv, "\\s+");
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
   }
@@ -374,6 +377,7 @@ TEST_F(StringsSplitTest, SplitRecordRegex)
                  validity);
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
 
+    // rsplit == split when using default parameters
     result = cudf::strings::rsplit_record_re(sv, "[eé]");
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
   }
@@ -394,15 +398,31 @@ TEST_F(StringsSplitTest, SplitRegexWithMaxSplit)
                                             {1, 0, 1, 1, 0});
     auto expected = cudf::table_view({col0, col1});
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
+
+    // splitting everything gives the same output as maxsplit==2 for this test input column
+    result         = cudf::strings::split_re(sv, "\\s+", 2);
+    auto expected2 = cudf::strings::split_re(sv, "\\s+");
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected2->view());
   }
   {
     auto result = cudf::strings::split_record_re(sv, "\\s", 1);
 
     using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
-    LCW expected(
+    LCW expected1(
       {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some  "}, LCW{"tést", "String"}, LCW{""}},
       validity);
-    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected1);
+
+    result = cudf::strings::split_record_re(sv, "\\s", 2);
+    LCW expected2(
+      {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", " "}, LCW{"tést", "String"}, LCW{""}},
+      validity);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected2);
+
+    // splitting everything gives the same output as maxsplit==3 for this test input column
+    result         = cudf::strings::split_record_re(sv, "\\s", 3);
+    auto expected0 = cudf::strings::split_record_re(sv, "\\s");
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view());
   }
 }
 
@@ -521,6 +541,11 @@ TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit)
       {LCW{" Héllo", "thesé"}, LCW{}, LCW{"are some", ""}, LCW{"tést", "String"}, LCW{""}},
       validity);
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+
+    // splitting everything gives the same output as any maxsplit > 2 for this test input column
+    result         = cudf::strings::rsplit_record_re(sv, "\\s+", 3);
+    auto expected0 = cudf::strings::rsplit_record_re(sv, "\\s+");
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view());
   }
 }
 
@@ -760,9 +785,8 @@ TEST_F(StringsSplitTest, PartitionZeroSizeStringsColumns)
 
 TEST_F(StringsSplitTest, InvalidParameter)
 {
-  std::vector<const char*> h_strings{"string left intentionally blank"};
-  cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
-  auto strings_view = cudf::strings_column_view(strings);
+  cudf::test::strings_column_wrapper input({"string left intentionally blank"});
+  auto strings_view = cudf::strings_column_view(input);
   EXPECT_THROW(cudf::strings::split(strings_view, cudf::string_scalar("", false)),
                cudf::logic_error);
   EXPECT_THROW(cudf::strings::rsplit(strings_view, cudf::string_scalar("", false)),

From 79887d85c9d38916e13069cff2ad76a02ed9a59d Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 4 Feb 2022 13:41:00 -0700
Subject: [PATCH 12/39] Change JNI to add a new boolean flag for regex split

---
 java/src/main/native/src/ColumnViewJni.cpp | 82 ++++++++--------------
 1 file changed, 29 insertions(+), 53 deletions(-)

diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index aef6b57230e..3985de41b32 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -61,6 +61,7 @@
 #include <cudf/strings/replace.hpp>
 #include <cudf/strings/replace_re.hpp>
 #include <cudf/strings/split/split.hpp>
+#include <cudf/strings/split/split_re.hpp>
 #include <cudf/strings/strip.hpp>
 #include <cudf/strings/substring.hpp>
 #include <cudf/structs/structs_column_view.hpp>
@@ -561,67 +562,42 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env,
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass,
-                                                                        jlong column_view,
-                                                                        jlong delimiter_ptr,
-                                                                        jint max_split) {
-  JNI_NULL_CHECK(env, column_view, "column is null", 0);
-  JNI_NULL_CHECK(env, delimiter_ptr, "string scalar delimiter is null", 0);
+                                                                        jlong input_handle,
+                                                                        jlong delimiter_handle,
+                                                                        jint max_split,
+                                                                        jboolean split_by_regex) {
+  JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
+  JNI_NULL_CHECK(env, delimiter_handle, "string scalar delimiter is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::strings_column_view const scv{*reinterpret_cast<cudf::column_view *>(column_view)};
-    auto delimiter = reinterpret_cast<cudf::string_scalar *>(delimiter_ptr);
+    auto const input = reinterpret_cast<cudf::column_view *>(input_handle);
+    auto const strs_input = cudf::strings_column_view{*input};
+    auto const delimiter = reinterpret_cast<cudf::string_scalar *>(delimiter_handle);
 
-    return cudf::jni::convert_table_for_return(env,
-                                               cudf::strings::split(scv, *delimiter, max_split));
-  }
-  CATCH_STD(env, 0);
-}
-
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRegex(JNIEnv *env, jclass,
-                                                                             jlong column_view,
-                                                                             jlong delimiter) {
-  JNI_NULL_CHECK(env, column_view, "column is null", 0);
-  JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0);
-  try {
-    cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
-    cudf::strings_column_view scv(*cv);
-    cudf::string_scalar *ss_scalar = reinterpret_cast<cudf::string_scalar *>(delimiter);
-
-    std::unique_ptr<cudf::table> table_result = cudf::strings::split_re(scv, *ss_scalar);
-    return cudf::jni::convert_table_for_return(env, table_result);
+    auto result = split_by_regex ? cudf::strings::split_re(strs_input, *delimiter, max_split) :
+                                   cudf::strings::split(strs_input, *delimiter, max_split);
+    return cudf::jni::convert_table_for_return(env, std::move(result));
   }
   CATCH_STD(env, 0);
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass,
-                                                                         jlong column_view,
-                                                                         jlong delimiter,
-                                                                         jint max_split) {
-  JNI_NULL_CHECK(env, column_view, "column is null", 0);
-  JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0);
-  try {
-    cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
-    cudf::strings_column_view scv(*cv);
-    cudf::string_scalar *ss_scalar = reinterpret_cast<cudf::string_scalar *>(delimiter);
-    return release_as_jlong(cudf::strings::split_record(scv, *ss_scalar, max_split));
-  }
-  CATCH_STD(env, 0);
-}
-
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecordRegex(JNIEnv *env, jclass,
-                                                                              jlong column_view,
-                                                                              jlong delimiter,
-                                                                              jint max_split) {
-  JNI_NULL_CHECK(env, column_view, "column is null", 0);
-  JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0);
-  try {
-    cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
-    cudf::strings_column_view scv(*cv);
-    cudf::string_scalar *ss_scalar = reinterpret_cast<cudf::string_scalar *>(delimiter);
-    return release_as_jlong(cudf::strings::split_record_re(scv, *ss_scalar, max_split));
+                                                                         jlong input_handle,
+                                                                         jlong delimiter_handle,
+                                                                         jint max_split,
+                                                                         jboolean split_by_regex) {
+  JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
+  JNI_NULL_CHECK(env, delimiter_handle, "delimiter_handle is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const input = reinterpret_cast<cudf::column_view *>(input_handle);
+    auto const strs_input = cudf::strings_column_view{*input};
+    auto const delimiter = reinterpret_cast<cudf::string_scalar *>(delimiter_handle);
+
+    auto result = split_by_regex ?
+                      cudf::strings::split_record_re(strs_input, *delimiter, max_split) :
+                      cudf::strings::split_record(strs_input, *delimiter, max_split);
+    return release_as_jlong(result);
   }
   CATCH_STD(env, 0);
 }

From 28524b3c6bd7cd061dbea92f2c6710396dfc97b6 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 4 Feb 2022 14:19:33 -0700
Subject: [PATCH 13/39] Implement all possible overloads for stringSplit
 binding

---
 .../main/java/ai/rapids/cudf/ColumnView.java  | 154 +++++++++++++-----
 1 file changed, 116 insertions(+), 38 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 8155fe79080..1bf0302d1a1 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -826,18 +826,18 @@ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... co
   /**
    * Creates a deep copy of a column while replacing the validity mask. The validity mask is the
    * device_vector equivalent of the boolean column given as argument.
-   * 
+   *
    * The boolColumn must have the same number of rows as the current column.
-   * The result column will have the same number of rows as the current column. 
+   * The result column will have the same number of rows as the current column.
    * For all indices `i` where the boolColumn is `true`, the result column will have a valid value at index i.
    * For all other values (i.e. `false` or `null`), the result column will have nulls.
-   * 
+   *
    * If the current column has a null at a given index `i`, and the new validity mask is `true` at index `i`,
    * then the row value is undefined.
-   * 
+   *
    * @param boolColumn bool column whose value is to be used as the validity mask.
    * @return Deep copy of the column with replaced validity mask.
-   */    
+   */
   public final ColumnVector copyWithBooleanColumnAsValidity(ColumnView boolColumn) {
     return new ColumnVector(copyWithBooleanColumnAsValidity(getNativeView(), boolColumn.getNativeView()));
   }
@@ -2352,81 +2352,157 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) {
    * @param delimiter UTF-8 encoded string identifying the split points in each string.
    *                  An empty string indicates split on whitespace.
    * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * @param splitByRegex a boolean flag indicating whether the input string will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
    * @return New table of strings columns.
    */
-  public final Table stringSplit(Scalar delimiter, int maxSplit) {
+  public final Table stringSplit(Scalar delimiter, int maxSplit, boolean splitByRegex) {
     assert type.equals(DType.STRING) : "column type must be a String";
     assert delimiter != null : "delimiter may not be null";
     assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar";
-    return new Table(stringSplit(this.getNativeView(), delimiter.getScalarHandle(), maxSplit));
+    return new Table(stringSplit(this.getNativeView(),
+                                 delimiter.getScalarHandle(),
+                                 maxSplit,
+                                 splitByRegex));
   }
-  
+
   /**
-   * Returns a list of columns by splitting each string using the specified delimiter.
+   * Returns a list of columns by splitting each string using the specified string literal delimiter.
    * The number of rows in the output columns will be the same as the input column.
    * Null entries are added for a row where split results have been exhausted.
    * Null string entries return corresponding null output columns.
    * @param delimiter UTF-8 encoded string identifying the split points in each string.
    *                  An empty string indicates split on whitespace.
+   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
    * @return New table of strings columns.
    */
-  public final Table stringSplit(Scalar delimiter) {
-    return stringSplit(delimiter, -1);
+  public final Table stringSplit(Scalar delimiter, int maxSplit) {
+    return stringSplit(delimiter, maxSplit, false);
   }
 
   /**
-   * Returns a list of columns by splitting each string using whitespace as the delimiter.
+   * Returns a list of columns by splitting each string using the specified delimiter.
    * The number of rows in the output columns will be the same as the input column.
    * Null entries are added for a row where split results have been exhausted.
    * Null string entries return corresponding null output columns.
+   * @param delimiter UTF-8 encoded string identifying the split points in each string.
+   *                  An empty string splits on whitespace; an empty regex pattern is invalid.
+   * @param splitByRegex a boolean flag indicating whether the input string will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
    * @return New table of strings columns.
    */
-  public final Table stringSplit() {
-    try (Scalar emptyString = Scalar.fromString("")) {
-      return stringSplit(emptyString, -1);
-    }
+  public final Table stringSplit(Scalar delimiter, boolean splitByRegex) {
+    return stringSplit(delimiter, -1, splitByRegex);
   }
 
   /**
-   * Returns a column of lists of strings by splitting each string using whitespace as the delimiter.
+   * Returns a list of columns by splitting each string using the specified string literal delimiter.
+   * The number of rows in the output columns will be the same as the input column.
+   * Null entries are added for a row where split results have been exhausted.
+   * Null string entries return corresponding null output columns.
+   * @param delimiter UTF-8 encoded string identifying the split points in each string.
+   *                  An empty string indicates split on whitespace.
+   * @return New table of strings columns.
    */
-  public final ColumnVector stringSplitRecord() {
-    return stringSplitRecord(-1);
+  public final Table stringSplit(Scalar delimiter) {
+    return stringSplit(delimiter, -1, false);
   }
 
   /**
-   * Returns a column of lists of strings by splitting each string using whitespace as the delimiter.
+   * Returns a list of columns by splitting each string using whitespace as the delimiter.
+   * The number of rows in the output columns will be the same as the input column.
+   * Null entries are added for a row where split results have been exhausted.
+   * Null string entries return corresponding null output columns.
    * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * @return New table of strings columns.
    */
-  public final ColumnVector stringSplitRecord(int maxSplit) {
+  public final Table stringSplit(int maxSplit) {
     try (Scalar emptyString = Scalar.fromString("")) {
-      return stringSplitRecord(emptyString, maxSplit);
+      return stringSplit(emptyString, maxSplit, false);
     }
   }
 
   /**
-   * Returns a column of lists of strings by splitting each string using the specified delimiter.
+   * Returns a list of columns by splitting each string using whitespace as the delimiter.
+   * The number of rows in the output columns will be the same as the input column.
+   * Null entries are added for a row where split results have been exhausted.
+   * Null string entries return corresponding null output columns.
+   * @return New table of strings columns.
+   */
+  public final Table stringSplit() {
+    return stringSplit(-1);
+  }
+
+  /**
+   * Returns a column that is a list of strings. Each string list is made by splitting each input
+   * string using the specified delimiter.
    * @param delimiter UTF-8 encoded string identifying the split points in each string.
    *                  An empty string indicates split on whitespace.
+   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * @param splitByRegex a boolean flag indicating whether the input string will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   * @return New column of lists of strings.
    */
-  public final ColumnVector stringSplitRecord(Scalar delimiter) {
-    return stringSplitRecord(delimiter, -1);
+  public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit, boolean splitByRegex) {
+    assert type.equals(DType.STRING) : "column type must be String";
+    assert delimiter != null : "delimiter may not be null";
+    assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar";
+    return new ColumnVector(
+        stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit, splitByRegex));
   }
 
   /**
    * Returns a column that is a list of strings. Each string list is made by splitting each input
-   * string using the specified delimiter.
+   * string using the specified string literal delimiter.
    * @param delimiter UTF-8 encoded string identifying the split points in each string.
    *                  An empty string indicates split on whitespace.
    * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
    * @return New table of strings columns.
    */
   public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit) {
-    assert type.equals(DType.STRING) : "column type must be a String";
+    assert type.equals(DType.STRING) : "column type must be String";
     assert delimiter != null : "delimiter may not be null";
     assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar";
     return new ColumnVector(
-        stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit));
+        stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit, false));
+  }
+
+  /**
+   * Returns a column of lists of strings by splitting each string using the specified delimiter.
+   * @param delimiter UTF-8 encoded string identifying the split points in each string.
+   *                  An empty string splits on whitespace; an empty regex pattern is invalid.
+   * @param splitByRegex a boolean flag indicating whether the input string will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   */
+  public final ColumnVector stringSplitRecord(Scalar delimiter, boolean splitByRegex) {
+    return stringSplitRecord(delimiter, -1, splitByRegex);
+  }
+
+  /**
+   * Returns a column of lists of strings by splitting each string using the specified string
+   * literal delimiter.
+   * @param delimiter UTF-8 encoded string identifying the split points in each string.
+   *                  An empty string indicates split on whitespace.
+   */
+  public final ColumnVector stringSplitRecord(Scalar delimiter) {
+    return stringSplitRecord(delimiter, -1, false);
+  }
+
+  /**
+   * Returns a column of lists of strings by splitting each string using whitespace as the delimiter.
+   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   */
+  public final ColumnVector stringSplitRecord(int maxSplit) {
+    try (Scalar emptyString = Scalar.fromString("")) {
+      return stringSplitRecord(emptyString, maxSplit, false);
+    }
+  }
+
+  /**
+   * Returns a column of lists of strings by splitting each string using whitespace as the delimiter.
+   */
+  public final ColumnVector stringSplitRecord() {
+    return stringSplitRecord(-1);
   }
 
   /**
@@ -3248,7 +3324,7 @@ public enum FindOptions {FIND_FIRST, FIND_LAST};
    * Create a column of int32 indices, indicating the position of the scalar search key
    * in each list row.
    * All indices are 0-based. If a search key is not found, the index is set to -1.
-   * The index is set to null if one of the following is true: 
+   * The index is set to null if one of the following is true:
    * 1. The search key is null.
    * 2. The list row is null.
    * @param key The scalar search key
@@ -3265,7 +3341,7 @@ public final ColumnVector listIndexOf(Scalar key, FindOptions findOption) {
    * Create a column of int32 indices, indicating the position of each row in the
    * search key column in the corresponding row of the lists column.
    * All indices are 0-based. If a search key is not found, the index is set to -1.
-   * The index is set to null if one of the following is true: 
+   * The index is set to null if one of the following is true:
    * 1. The search key row is null.
    * 2. The list row is null.
    * @param keys ColumnView of search keys.
@@ -3537,9 +3613,11 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle
    * @param delimiter  UTF-8 encoded string identifying the split points in each string.
    * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
    */
-  private static native long[] stringSplit(long columnView, long delimiter, int maxSplit);
+  private static native long[] stringSplit(long columnView, long delimiter, int maxSplit,
+                                           boolean splitByRegex);
 
-  private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit);
+  private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit,
+                                               boolean splitByRegex);
 
   /**
    * Native method to calculate substring from a given string column. 0 indexing.
@@ -3714,7 +3792,7 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat
   /**
    * Native method to search list rows for null elements.
    * @param nativeView the column view handle of the list
-   * @return column handle of the resultant boolean column 
+   * @return column handle of the resultant boolean column
    */
   private static native long listContainsNulls(long nativeView);
 
@@ -3896,20 +3974,20 @@ private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] vi
   /**
    * Native method to deep copy a column while replacing the null mask. The null mask is the
    * device_vector equivalent of the boolean column given as argument.
-   * 
+   *
    * The boolColumn must have the same number of rows as the exemplar column.
    * The result column will have the same number of rows as the exemplar.
    * For all indices `i` where the boolean column is `true`, the result column will have a valid value at index i.
    * For all other values (i.e. `false` or `null`), the result column will have nulls.
-   * 
+   *
    * If the exemplar column has a null at a given index `i`, and the new validity mask is `true` at index `i`,
    * then the resultant row value is undefined.
-   * 
+   *
    * @param exemplarViewHandle column view of the column that is deep copied.
    * @param boolColumnViewHandle bool column whose value is to be used as the null mask.
    * @return Deep copy of the column with replaced null mask.
-   */                                                      
-  private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, 
+   */
+  private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle,
                                                              long boolColumnViewHandle) throws CudfException;
 
   ////////

From 75ffaf839b034517d7af58e8d91fdc5629ce6e12 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 4 Feb 2022 15:59:02 -0700
Subject: [PATCH 14/39] Pass split delimiter to JNI as String, not Scalar

---
 .../main/java/ai/rapids/cudf/ColumnView.java  | 81 +++++++++----------
 java/src/main/native/src/ColumnViewJni.cpp    | 29 ++++---
 .../java/ai/rapids/cudf/ColumnVectorTest.java | 40 ++++-----
 3 files changed, 72 insertions(+), 78 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 1bf0302d1a1..6d176e5934a 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -2349,21 +2349,16 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) {
    * The number of rows in the output columns will be the same as the input column.
    * Null entries are added for a row where split results have been exhausted.
    * Null string entries return corresponding null output columns.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
+   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
    *                  An empty string indicates split on whitespace.
    * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
    * @param splitByRegex a boolean flag indicating whether the input string will be split by a
    *                     regular expression pattern or just by a string literal delimiter.
    * @return New table of strings columns.
    */
-  public final Table stringSplit(Scalar delimiter, int maxSplit, boolean splitByRegex) {
+  public final Table stringSplit(String delimiter, int maxSplit, boolean splitByRegex) {
     assert type.equals(DType.STRING) : "column type must be a String";
-    assert delimiter != null : "delimiter may not be null";
-    assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar";
-    return new Table(stringSplit(this.getNativeView(),
-                                 delimiter.getScalarHandle(),
-                                 maxSplit,
-                                 splitByRegex));
+    return new Table(stringSplit(this.getNativeView(), delimiter, maxSplit, splitByRegex));
   }
 
   /**
@@ -2371,12 +2366,12 @@ public final Table stringSplit(Scalar delimiter, int maxSplit, boolean splitByRe
    * The number of rows in the output columns will be the same as the input column.
    * Null entries are added for a row where split results have been exhausted.
    * Null string entries return corresponding null output columns.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
+   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
    *                  An empty string indicates split on whitespace.
    * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
    * @return New table of strings columns.
    */
-  public final Table stringSplit(Scalar delimiter, int maxSplit) {
+  public final Table stringSplit(String delimiter, int maxSplit) {
     return stringSplit(delimiter, maxSplit, false);
   }
 
@@ -2385,13 +2380,13 @@ public final Table stringSplit(Scalar delimiter, int maxSplit) {
    * The number of rows in the output columns will be the same as the input column.
    * Null entries are added for a row where split results have been exhausted.
    * Null string entries return corresponding null output columns.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
+   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
    *                  An empty string indicates split on whitespace.
    * @param splitByRegex a boolean flag indicating whether the input string will be split by a
    *                     regular expression pattern or just by a string literal delimiter.
    * @return New table of strings columns.
    */
-  public final Table stringSplit(Scalar delimiter, boolean splitByRegex) {
+  public final Table stringSplit(String delimiter, boolean splitByRegex) {
     return stringSplit(delimiter, -1, splitByRegex);
   }
 
@@ -2400,11 +2395,11 @@ public final Table stringSplit(Scalar delimiter, boolean splitByRegex) {
    * The number of rows in the output columns will be the same as the input column.
    * Null entries are added for a row where split results have been exhausted.
    * Null string entries return corresponding null output columns.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
+   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
    *                  An empty string indicates split on whitespace.
    * @return New table of strings columns.
    */
-  public final Table stringSplit(Scalar delimiter) {
+  public final Table stringSplit(String delimiter) {
     return stringSplit(delimiter, -1, false);
   }
 
@@ -2417,9 +2412,8 @@ public final Table stringSplit(Scalar delimiter) {
    * @return New table of strings columns.
    */
   public final Table stringSplit(int maxSplit) {
-    try (Scalar emptyString = Scalar.fromString("")) {
-      return stringSplit(emptyString, maxSplit, false);
-    }
+    String emptyString = "";
+    return stringSplit(emptyString, maxSplit, false);
   }
 
   /**
@@ -2436,55 +2430,48 @@ public final Table stringSplit() {
   /**
    * Returns a column that is a list of strings. Each string list is made by splitting each input
    * string using the specified delimiter.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
+   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
    *                  An empty string indicates split on whitespace.
    * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
    * @param splitByRegex a boolean flag indicating whether the input string will be split by a
    *                     regular expression pattern or just by a string literal delimiter.
    * @return New table of strings columns.
    */
-  public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit, boolean splitByRegex) {
+  public final ColumnVector stringSplitRecord(String delimiter, int maxSplit, boolean splitByRegex) {
     assert type.equals(DType.STRING) : "column type must be String";
-    assert delimiter != null : "delimiter may not be null";
-    assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar";
     return new ColumnVector(
-        stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit, splitByRegex));
+        stringSplitRecord(this.getNativeView(), delimiter, maxSplit, splitByRegex));
   }
 
   /**
    * Returns a column that is a list of strings. Each string list is made by splitting each input
    * string using the specified string literal delimiter.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
+   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
    *                  An empty string indicates split on whitespace.
    * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
    * @return New table of strings columns.
    */
-  public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit) {
-    assert type.equals(DType.STRING) : "column type must be String";
-    assert delimiter != null : "delimiter may not be null";
-    assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar";
-    return new ColumnVector(
-        stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit, false));
+  public final ColumnVector stringSplitRecord(String delimiter, int maxSplit) {
+    return stringSplitRecord(delimiter, maxSplit, false);
   }
-
   /**
    * Returns a column of lists of strings by splitting each string using the specified delimiter.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
+   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
    *                  An empty string indicates split on whitespace.
    * @param splitByRegex a boolean flag indicating whether the input string will be split by a
    *                     regular expression pattern or just by a string literal delimiter.
    */
-  public final ColumnVector stringSplitRecord(Scalar delimiter, boolean splitByRegex) {
+  public final ColumnVector stringSplitRecord(String delimiter, boolean splitByRegex) {
     return stringSplitRecord(delimiter, -1, splitByRegex);
   }
 
   /**
    * Returns a column of lists of strings by splitting each string using the specified string
    * literal delimiter.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
+   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
    *                  An empty string indicates split on whitespace.
    */
-  public final ColumnVector stringSplitRecord(Scalar delimiter) {
+  public final ColumnVector stringSplitRecord(String delimiter) {
     return stringSplitRecord(delimiter, -1, false);
   }
 
@@ -2493,9 +2480,8 @@ public final ColumnVector stringSplitRecord(Scalar delimiter) {
    * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
    */
   public final ColumnVector stringSplitRecord(int maxSplit) {
-    try (Scalar emptyString = Scalar.fromString("")) {
-      return stringSplitRecord(emptyString, maxSplit, false);
-    }
+    String emptyString = "";
+    return stringSplitRecord(emptyString, maxSplit, false);
   }
 
   /**
@@ -3607,16 +3593,27 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle
   private static native long substringLocate(long columnView, long substringScalar, int start, int end);
 
   /**
-   * Native method which returns array of columns by splitting each string using the specified
+   * Native method which returns an array of columns by splitting each string using the specified
    * delimiter.
-   * @param columnView native handle of the cudf::column_view being operated on.
-   * @param delimiter  UTF-8 encoded string identifying the split points in each string.
+   * @param nativeHandle native handle of the cudf::column_view being operated on.
+   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
    * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * @param splitByRegex a boolean flag indicating whether the input string will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
    */
-  private static native long[] stringSplit(long columnView, long delimiter, int maxSplit,
+  private static native long[] stringSplit(long nativeHandle, String delimiter, int maxSplit,
                                            boolean splitByRegex);
 
-  private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit,
+  /**
+   * Native method which returns a LIST column by splitting each string into a list of strings
+   * using the specified delimiter.
+   * @param nativeHandle native handle of the cudf::column_view being operated on.
+   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
+   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * @param splitByRegex a boolean flag indicating whether the input string will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   */
+  private static native long stringSplitRecord(long nativeHandle, String delimiter, int maxSplit,
                                                boolean splitByRegex);
 
   /**
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 3985de41b32..c6964f98a53 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -563,19 +563,23 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env,
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass,
                                                                         jlong input_handle,
-                                                                        jlong delimiter_handle,
+                                                                        jstring delimiter,
                                                                         jint max_split,
                                                                         jboolean split_by_regex) {
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
-  JNI_NULL_CHECK(env, delimiter_handle, "string scalar delimiter is null", 0);
   try {
     cudf::jni::auto_set_device(env);
+
     auto const input = reinterpret_cast<cudf::column_view *>(input_handle);
     auto const strs_input = cudf::strings_column_view{*input};
-    auto const delimiter = reinterpret_cast<cudf::string_scalar *>(delimiter_handle);
 
-    auto result = split_by_regex ? cudf::strings::split_re(strs_input, *delimiter, max_split) :
-                                   cudf::strings::split(strs_input, *delimiter, max_split);
+    auto const delimiter_content = env->GetStringUTFChars(delimiter, nullptr);
+    auto const str_delimiter = std::string{delimiter_content};
+
+    auto result =
+        split_by_regex ?
+            cudf::strings::split_re(strs_input, str_delimiter, max_split) :
+            cudf::strings::split(strs_input, cudf::string_scalar{str_delimiter}, max_split);
     return cudf::jni::convert_table_for_return(env, std::move(result));
   }
   CATCH_STD(env, 0);
@@ -583,20 +587,23 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass,
                                                                          jlong input_handle,
-                                                                         jlong delimiter_handle,
+                                                                         jstring delimiter,
                                                                          jint max_split,
                                                                          jboolean split_by_regex) {
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
-  JNI_NULL_CHECK(env, delimiter_handle, "delimiter_handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
+
     auto const input = reinterpret_cast<cudf::column_view *>(input_handle);
     auto const strs_input = cudf::strings_column_view{*input};
-    auto const delimiter = reinterpret_cast<cudf::string_scalar *>(delimiter_handle);
 
-    auto result = split_by_regex ?
-                      cudf::strings::split_record_re(strs_input, *delimiter, max_split) :
-                      cudf::strings::split_record(strs_input, *delimiter, max_split);
+    auto const delimiter_content = env->GetStringUTFChars(delimiter, nullptr);
+    auto const str_delimiter = std::string{delimiter_content};
+
+    auto result =
+        split_by_regex ?
+            cudf::strings::split_record_re(strs_input, str_delimiter, max_split) :
+            cudf::strings::split_record(strs_input, cudf::string_scalar{str_delimiter}, max_split);
     return release_as_jlong(result);
   }
   CATCH_STD(env, 0);
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index f9c8029ed84..05c7f2e18ce 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -4741,24 +4741,25 @@ void testListSortRowsWithStringChild() {
 
   @Test
   void testStringSplitRecord() {
-      try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", "null", "", "ARé some", "test strings");
-           ColumnVector expected = ColumnVector.fromLists(
-                   new HostColumnVector.ListType(true,
-                       new HostColumnVector.BasicType(true, DType.STRING)),
-                   Arrays.asList("Héllo", "there"),
-                   Arrays.asList("thésé"),
-                   Arrays.asList("null"),
-                   Arrays.asList(""),
-                   Arrays.asList("ARé", "some"),
-                   Arrays.asList("test", "strings"));
-           Scalar pattern = Scalar.fromString(" ");
-           ColumnVector result = v.stringSplitRecord(pattern, -1)) {
-          assertColumnsAreEqual(expected, result);
-      }
+    String pattern = " ";
+    try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", "null", "", "ARé some", "test strings");
+         ColumnVector expected = ColumnVector.fromLists(
+                 new HostColumnVector.ListType(true,
+                     new HostColumnVector.BasicType(true, DType.STRING)),
+                 Arrays.asList("Héllo", "there"),
+                 Arrays.asList("thésé"),
+                 Arrays.asList("null"),
+                 Arrays.asList(""),
+                 Arrays.asList("ARé", "some"),
+                 Arrays.asList("test", "strings"));
+         ColumnVector result = v.stringSplitRecord(pattern, -1)) {
+        assertColumnsAreEqual(expected, result);
+    }
   }
 
   @Test
   void testStringSplit() {
+    String pattern = " ";
     try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", "ARé some things", "test strings here");
          Table expectedSplitOnce = new Table.TestBuilder()
          .column("Héllo", "thésé", null, "", "ARé", "test")
@@ -4769,7 +4770,6 @@ void testStringSplit() {
          .column("there", null, null, null, "some", "strings")
          .column("all", null, null, null, "things", "here")
          .build();
-         Scalar pattern = Scalar.fromString(" ");
          Table resultSplitOnce = v.stringSplit(pattern, 1);
          Table resultSplitAll = v.stringSplit(pattern)) {
           assertTablesAreEqual(expectedSplitOnce, resultSplitOnce);
@@ -4790,16 +4790,6 @@ void teststringSplitWhiteSpace() {
 
   @Test
   void teststringSplitThrowsException() {
-    assertThrows(CudfException.class, () -> {
-      try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings");
-           Scalar delimiter = Scalar.fromString(null);
-           Table result = cv.stringSplit(delimiter)) {}
-    });
-    assertThrows(AssertionError.class, () -> {
-    try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings");
-         Scalar delimiter = Scalar.fromInt(1);
-         Table result = cv.stringSplit(delimiter)) {}
-    });
     assertThrows(AssertionError.class, () -> {
       try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings");
            Table result = cv.stringSplit(null)) {}

From b3604c9057cbe896824c62972fbc5970bcacf2e7 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 4 Feb 2022 16:12:48 -0700
Subject: [PATCH 15/39] Fix capitalization in string split test names

---
 java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 05c7f2e18ce..024640fefb1 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -4778,7 +4778,7 @@ void testStringSplit() {
   }
 
   @Test
-  void teststringSplitWhiteSpace() {
+  void testStringSplitWhiteSpace() {
     try (ColumnVector v = ColumnVector.fromStrings("Héllo thesé", null, "are\tsome", "tést\nString", " ");
          Table expected = new Table.TestBuilder().column("Héllo", null, "are", "tést", null)
          .column("thesé", null, "some", "String", null)
@@ -4789,7 +4789,7 @@ void teststringSplitWhiteSpace() {
   }
 
   @Test
-  void teststringSplitThrowsException() {
+  void testStringSplitThrowsException() {
     assertThrows(AssertionError.class, () -> {
       try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings");
            Table result = cv.stringSplit(null)) {}

From 61605ef727e361d9c83b311e67afb7284022903a Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 4 Feb 2022 16:15:41 -0700
Subject: [PATCH 16/39] Remove testStringSplitThrowsException

---
 java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 024640fefb1..e9d7bcbfcbd 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -4788,14 +4788,6 @@ void testStringSplitWhiteSpace() {
     }
   }
 
-  @Test
-  void testStringSplitThrowsException() {
-    assertThrows(AssertionError.class, () -> {
-      try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings");
-           Table result = cv.stringSplit(null)) {}
-    });
-  }
-
   @Test
   void testsubstringColumn() {
     try (ColumnVector v = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings");

From 8307bfe091b3e1e67cc623ea007cc9400a24ab44 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 4 Feb 2022 16:15:46 -0700
Subject: [PATCH 17/39] Assert delimiter is non-null in string split APIs

---
 java/src/main/java/ai/rapids/cudf/ColumnView.java | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 6d176e5934a..740163f7482 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -2357,6 +2357,7 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) {
    * @return New table of strings columns.
    */
   public final Table stringSplit(String delimiter, int maxSplit, boolean splitByRegex) {
+    assert delimiter != null : "delimiter is null";
     assert type.equals(DType.STRING) : "column type must be a String";
     return new Table(stringSplit(this.getNativeView(), delimiter, maxSplit, splitByRegex));
   }
@@ -2438,6 +2439,7 @@ public final Table stringSplit() {
    * @return New table of strings columns.
    */
   public final ColumnVector stringSplitRecord(String delimiter, int maxSplit, boolean splitByRegex) {
+    assert delimiter != null : "delimiter is null";
     assert type.equals(DType.STRING) : "column type must be String";
     return new ColumnVector(
         stringSplitRecord(this.getNativeView(), delimiter, maxSplit, splitByRegex));

From 75dc621f804bcfe0ff48bc5a7d7b2f4e5ba9a217 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 4 Feb 2022 16:25:23 -0700
Subject: [PATCH 18/39] Assert regex split delimiter is non-empty

---
 java/src/main/java/ai/rapids/cudf/ColumnView.java | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 740163f7482..b6503e01cf3 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -2358,6 +2358,7 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) {
    */
   public final Table stringSplit(String delimiter, int maxSplit, boolean splitByRegex) {
     assert delimiter != null : "delimiter is null";
+    assert (delimiter.length() == 0) ^ splitByRegex : "cannot split by empty regex";
     assert type.equals(DType.STRING) : "column type must be a String";
     return new Table(stringSplit(this.getNativeView(), delimiter, maxSplit, splitByRegex));
   }
@@ -2440,6 +2441,7 @@ public final Table stringSplit() {
    */
   public final ColumnVector stringSplitRecord(String delimiter, int maxSplit, boolean splitByRegex) {
     assert delimiter != null : "delimiter is null";
+    assert (delimiter.length() == 0) ^ splitByRegex : "cannot split by empty regex";
     assert type.equals(DType.STRING) : "column type must be String";
     return new ColumnVector(
         stringSplitRecord(this.getNativeView(), delimiter, maxSplit, splitByRegex));

From 4a2066272f0a9dd9fe349fbc02abb3e128fd2565 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 4 Feb 2022 16:28:11 -0700
Subject: [PATCH 19/39] Fix empty-regex assert rejecting literal delimiters

---
 java/src/main/java/ai/rapids/cudf/ColumnView.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index b6503e01cf3..912f9f7649b 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -2358,7 +2358,7 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) {
    */
   public final Table stringSplit(String delimiter, int maxSplit, boolean splitByRegex) {
     assert delimiter != null : "delimiter is null";
-    assert (delimiter.length() == 0) ^ splitByRegex : "cannot split by empty regex";
+    assert delimiter.length() > 0 || !splitByRegex : "cannot split by empty regex";
     assert type.equals(DType.STRING) : "column type must be a String";
     return new Table(stringSplit(this.getNativeView(), delimiter, maxSplit, splitByRegex));
   }
@@ -2441,7 +2441,7 @@ public final Table stringSplit() {
    */
   public final ColumnVector stringSplitRecord(String delimiter, int maxSplit, boolean splitByRegex) {
     assert delimiter != null : "delimiter is null";
-    assert (delimiter.length() == 0) ^ splitByRegex : "cannot split by empty regex";
+    assert delimiter.length() > 0 || !splitByRegex : "cannot split by empty regex";
     assert type.equals(DType.STRING) : "column type must be String";
     return new ColumnVector(
         stringSplitRecord(this.getNativeView(), delimiter, maxSplit, splitByRegex));

From f915f7e92e2adf091c16d182d8b152de27b66c92 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 4 Feb 2022 17:38:53 -0700
Subject: [PATCH 20/39] Fix string construction from jstring

---
 java/src/main/native/src/ColumnViewJni.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index c6964f98a53..dcc6085c509 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -573,8 +573,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *
     auto const input = reinterpret_cast<cudf::column_view *>(input_handle);
     auto const strs_input = cudf::strings_column_view{*input};
 
-    auto const delimiter_content = env->GetStringUTFChars(delimiter, nullptr);
-    auto const str_delimiter = std::string{delimiter_content};
+    auto const delimiter_chars = env->GetStringUTFChars(delimiter, nullptr);
+    auto const delimiter_size = env->GetStringUTFLength(delimiter);
+    auto const str_delimiter = std::string(delimiter_chars, delimiter_size);
+    env->ReleaseStringUTFChars(delimiter, delimiter_chars);
 
     auto result =
         split_by_regex ?
@@ -597,8 +599,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv
     auto const input = reinterpret_cast<cudf::column_view *>(input_handle);
     auto const strs_input = cudf::strings_column_view{*input};
 
-    auto const delimiter_content = env->GetStringUTFChars(delimiter, nullptr);
-    auto const str_delimiter = std::string{delimiter_content};
+    auto const delimiter_chars = env->GetStringUTFChars(delimiter, nullptr);
+    auto const delimiter_size = env->GetStringUTFLength(delimiter);
+    auto const str_delimiter = std::string(delimiter_chars, delimiter_size);
+    env->ReleaseStringUTFChars(delimiter, delimiter_chars);
 
     auto result =
         split_by_regex ?

From 4176563e6dca76b1618c4661eaac4b31574d99fb Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 7 Feb 2022 14:44:37 -0700
Subject: [PATCH 21/39] Rename variable and rewrite javadoc

---
 .../main/java/ai/rapids/cudf/ColumnView.java  | 62 ++++++++++++-------
 1 file changed, 40 insertions(+), 22 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 912f9f7649b..e7af237dd9a 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -2351,16 +2351,19 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) {
    * Null string entries return corresponding null output columns.
    * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
    *                  An empty string indicates split on whitespace.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * @param limit the maximum size of the array resulting from splitting the input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
    * @param splitByRegex a boolean flag indicating whether the input string will be split by a
    *                     regular expression pattern or just by a string literal delimiter.
    * @return New table of strings columns.
    */
-  public final Table stringSplit(String delimiter, int maxSplit, boolean splitByRegex) {
+  public final Table stringSplit(String delimiter, int limit, boolean splitByRegex) {
     assert delimiter != null : "delimiter is null";
     assert delimiter.length() > 0 || !splitByRegex : "cannot split by empty regex";
     assert type.equals(DType.STRING) : "column type must be a String";
-    return new Table(stringSplit(this.getNativeView(), delimiter, maxSplit, splitByRegex));
+    assert limit != 0 && limit != 1;
+    return new Table(stringSplit(this.getNativeView(), delimiter, limit, splitByRegex));
   }
 
   /**
@@ -2370,11 +2373,13 @@ public final Table stringSplit(String delimiter, int maxSplit, boolean splitByRe
    * Null string entries return corresponding null output columns.
    * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
    *                  An empty string indicates split on whitespace.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * @param limit the maximum size of the array resulting from splitting the input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
    * @return New table of strings columns.
    */
-  public final Table stringSplit(String delimiter, int maxSplit) {
-    return stringSplit(delimiter, maxSplit, false);
+  public final Table stringSplit(String delimiter, int limit) {
+    return stringSplit(delimiter, limit, false);
   }
 
   /**
@@ -2410,12 +2415,14 @@ public final Table stringSplit(String delimiter) {
    * The number of rows in the output columns will be the same as the input column.
    * Null entries are added for a row where split results have been exhausted.
    * Null string entries return corresponding null output columns.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * @param limit the maximum size of the array resulting from splitting the input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
    * @return New table of strings columns.
    */
-  public final Table stringSplit(int maxSplit) {
+  public final Table stringSplit(int limit) {
     String emptyString = "";
-    return stringSplit(emptyString, maxSplit, false);
+    return stringSplit(emptyString, limit, false);
   }
 
   /**
@@ -2434,17 +2441,20 @@ public final Table stringSplit() {
    * string using the specified delimiter.
    * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
    *                  An empty string indicates split on whitespace.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * @param limit the maximum size of the array resulting from splitting the input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
    * @param splitByRegex a boolean flag indicating whether the input string will be split by a
    *                     regular expression pattern or just by a string literal delimiter.
    * @return New table of strings columns.
    */
-  public final ColumnVector stringSplitRecord(String delimiter, int maxSplit, boolean splitByRegex) {
+  public final ColumnVector stringSplitRecord(String delimiter, int limit, boolean splitByRegex) {
     assert delimiter != null : "delimiter is null";
     assert delimiter.length() > 0 || !splitByRegex : "cannot split by empty regex";
     assert type.equals(DType.STRING) : "column type must be String";
+    assert limit != 0 && limit != 1;
     return new ColumnVector(
-        stringSplitRecord(this.getNativeView(), delimiter, maxSplit, splitByRegex));
+        stringSplitRecord(this.getNativeView(), delimiter, limit, splitByRegex));
   }
 
   /**
@@ -2452,11 +2462,13 @@ public final ColumnVector stringSplitRecord(String delimiter, int maxSplit, bool
    * string using the specified string literal delimiter.
    * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
    *                  An empty string indicates split on whitespace.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * @param limit the maximum size of the array resulting from splitting the input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
    * @return New table of strings columns.
    */
-  public final ColumnVector stringSplitRecord(String delimiter, int maxSplit) {
-    return stringSplitRecord(delimiter, maxSplit, false);
+  public final ColumnVector stringSplitRecord(String delimiter, int limit) {
+    return stringSplitRecord(delimiter, limit, false);
   }
   /**
    * Returns a column of lists of strings by splitting each string using the specified delimiter.
@@ -2481,11 +2493,13 @@ public final ColumnVector stringSplitRecord(String delimiter) {
 
   /**
    * Returns a column of lists of strings by splitting each string using whitespace as the delimiter.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * @param limit the maximum size of the array resulting from splitting the input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
    */
-  public final ColumnVector stringSplitRecord(int maxSplit) {
+  public final ColumnVector stringSplitRecord(int limit) {
     String emptyString = "";
-    return stringSplitRecord(emptyString, maxSplit, false);
+    return stringSplitRecord(emptyString, limit, false);
   }
 
   /**
@@ -3601,11 +3615,13 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle
    * delimiter.
    * @param nativeHandle native handle of the cudf::column_view being operated on.
    * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * @param limit the maximum size of the array resulting from splitting the input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
    * @param splitByRegex a boolean flag indicating whether the input string will be split by a
    *                     regular expression pattern or just by a string literal delimiter.
    */
-  private static native long[] stringSplit(long nativeHandle, String delimiter, int maxSplit,
+  private static native long[] stringSplit(long nativeHandle, String delimiter, int limit,
                                            boolean splitByRegex);
 
   /**
@@ -3613,11 +3629,13 @@ private static native long[] stringSplit(long nativeHandle, String delimiter, in
    * using the specified delimiter.
    * @param nativeHandle native handle of the cudf::column_view being operated on.
    * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * @param limit the maximum size of the array resulting from splitting the input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
    * @param splitByRegex a boolean flag indicating whether the input string will be split by a
    *                     regular expression pattern or just by a string literal delimiter.
    */
-  private static native long stringSplitRecord(long nativeHandle, String delimiter, int maxSplit,
+  private static native long stringSplitRecord(long nativeHandle, String delimiter, int limit,
                                                boolean splitByRegex);
 
   /**

From 6d8bcc94c86214148fc310d48e7608d8344fa647 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 7 Feb 2022 14:48:33 -0700
Subject: [PATCH 22/39] Convert java limit to cudf max_split

---
 java/src/main/native/src/ColumnViewJni.cpp | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index dcc6085c509..167f53493f4 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -564,7 +564,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env,
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass,
                                                                         jlong input_handle,
                                                                         jstring delimiter,
-                                                                        jint max_split,
+                                                                        jint limit,
                                                                         jboolean split_by_regex) {
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
   try {
@@ -578,6 +578,12 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *
     auto const str_delimiter = std::string(delimiter_chars, delimiter_size);
     env->ReleaseStringUTFChars(delimiter, delimiter_chars);
 
+    if (limit == 0 || limit == 1) {
+      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
+                    "limit == 0 or limit == 1 are not supported", 0);
+    }
+    auto const max_split = limit > 1 ? limit - 1 : limit;
+
     auto result =
         split_by_regex ?
             cudf::strings::split_re(strs_input, str_delimiter, max_split) :
@@ -590,7 +596,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass,
                                                                          jlong input_handle,
                                                                          jstring delimiter,
-                                                                         jint max_split,
+                                                                         jint limit,
                                                                          jboolean split_by_regex) {
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
   try {
@@ -604,6 +610,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv
     auto const str_delimiter = std::string(delimiter_chars, delimiter_size);
     env->ReleaseStringUTFChars(delimiter, delimiter_chars);
 
+    if (limit == 0 || limit == 1) {
+      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
+                    "limit == 0 or limit == 1 are not supported", 0);
+    }
+    auto const max_split = limit > 1 ? limit - 1 : limit;
+
     auto result =
         split_by_regex ?
             cudf::strings::split_record_re(strs_input, str_delimiter, max_split) :

From 2e6450fd610c960564c5feab35cb949a592838a7 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Mon, 7 Feb 2022 15:00:43 -0700
Subject: [PATCH 23/39] Fix Java test

---
 java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index e9d7bcbfcbd..bda1a6bda55 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -4761,7 +4761,7 @@ void testStringSplitRecord() {
   void testStringSplit() {
     String pattern = " ";
     try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", "ARé some things", "test strings here");
-         Table expectedSplitOnce = new Table.TestBuilder()
+         Table expectedSplitLimit2 = new Table.TestBuilder()
          .column("Héllo", "thésé", null, "", "ARé", "test")
          .column("there all", null, null, null, "some things", "strings here")
          .build();
@@ -4770,9 +4770,9 @@ void testStringSplit() {
          .column("there", null, null, null, "some", "strings")
          .column("all", null, null, null, "things", "here")
          .build();
-         Table resultSplitOnce = v.stringSplit(pattern, 1);
+         Table resultSplitLimit2 = v.stringSplit(pattern, 2);
          Table resultSplitAll = v.stringSplit(pattern)) {
-          assertTablesAreEqual(expectedSplitOnce, resultSplitOnce);
+          assertTablesAreEqual(expectedSplitLimit2, resultSplitLimit2);
           assertTablesAreEqual(expectedSplitAll, resultSplitAll);
     }
   }

From eb8c326cd1b2ea1e9f673b6a16e07533ce637f14 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 7 Feb 2022 19:51:00 -0500
Subject: [PATCH 24/39] fix doxygen typo in @throw line

---
 cpp/include/cudf/strings/split/split_re.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index c6dc1e5c697..d61b802efe9 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -60,7 +60,7 @@ namespace strings {
  *       ["bc def_g", "_bc", "ab cd", "cd "] ]
  * @endcode
  *
- * @throw cudf:logic_error if `pattern` is empty.
+ * @throw cudf::logic_error if `pattern` is empty.
  *
  * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.
@@ -110,7 +110,7 @@ std::unique_ptr<table> split_re(
  *       ["g", "bc", "cd", "cd "] ]
  * @endcode
  *
- * @throw cudf:logic_error if `pattern` is empty.
+ * @throw cudf::logic_error if `pattern` is empty.
  *
  * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.
@@ -160,7 +160,7 @@ std::unique_ptr<table> rsplit_re(
  *       ["ab", "cd "] ]
  * @endcode
  *
- * @throw cudf:logic_error if `pattern` is empty.
+ * @throw cudf::logic_error if `pattern` is empty.
  *
  * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.
@@ -212,7 +212,7 @@ std::unique_ptr<column> split_record_re(
  *       ["ab_cd", ""] ]
  * @endcode
  *
- * @throw cudf:logic_error if `pattern` is empty.
+ * @throw cudf::logic_error if `pattern` is empty.
  *
  * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.

From d6ee8837ff3f523816d96f444e1b001d14debdf7 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 7 Feb 2022 19:51:37 -0500
Subject: [PATCH 25/39] refactor max-tokens calculation into helper function

---
 cpp/src/strings/split/split_re.cu | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index dd71533c773..d80148f2fe6 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -98,7 +98,7 @@ struct token_reader_fn {
     d_result[token_idx] = string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos};
 
     if (direction == split_direction::BACKWARD) {
-      // update first entry -- this happens when max-tokens is hit before the end of the string
+      // update first entry -- this happens when max_tokens is hit before the end of the string
       auto const first_offset =
         d_result[0].first
           ? static_cast<size_type>(thrust::distance(d_str.data(), d_result[0].first))
@@ -117,6 +117,7 @@ struct token_reader_fn {
  *
  * @param d_strings Strings to split
  * @param d_prog Regex to evaluate against each string
+ * @param direction Whether tokens are generated forwards or backwards.
  * @param max_tokens The maximum number of tokens for each split.
  * @param offsets The number of matches on input.
  *                The offsets for each token in each string on output.
@@ -125,12 +126,14 @@ struct token_reader_fn {
 rmm::device_uvector<string_index_pair> generate_tokens(column_device_view const& d_strings,
                                                        reprog_device& d_prog,
                                                        split_direction direction,
-                                                       size_type max_tokens,
+                                                       size_type maxsplit,
                                                        mutable_column_view& offsets,
                                                        rmm::cuda_stream_view stream)
 {
   auto const strings_count = d_strings.size();
 
+  auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
+
   auto const begin     = thrust::make_counting_iterator<size_type>(0);
   auto const end       = thrust::make_counting_iterator<size_type>(strings_count);
   auto const d_offsets = offsets.data<offset_type>();
@@ -182,7 +185,7 @@ struct tokens_transform_fn {
   {
     auto const offset      = d_token_offsets[idx];
     auto const token_count = d_token_offsets[idx + 1] - offset;
-    return (column_index > token_count - 1) || d_strings.is_null(idx)
+    return (column_index >= token_count) || d_strings.is_null(idx)
              ? string_index_pair{nullptr, 0}
              : d_tokens[offset + column_index];
   }
@@ -197,7 +200,6 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
 {
   CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
 
-  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
   auto const strings_count = input.size();
 
   std::vector<std::unique_ptr<column>> results;
@@ -216,7 +218,7 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
   auto d_offsets    = offsets_view.data<offset_type>();
 
   // get the split tokens from the input column; this also converts the counts into offsets
-  auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
+  auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream);
 
   // the output column count is the maximum number of tokens generated for any input string
   auto const columns_count = thrust::transform_reduce(
@@ -266,7 +268,6 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
 {
   CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
 
-  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
   auto const strings_count = input.size();
 
   // create the regex device prog from the given pattern
@@ -278,7 +279,7 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
   auto offsets_view = offsets->mutable_view();
 
   // get the split tokens from the input column; this also converts the counts into offsets
-  auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
+  auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream);
 
   // convert the tokens into one big strings column
   auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr);

From f528107d2daeb430b0ae89d92becb1477c6e0de1 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Tue, 8 Feb 2022 09:27:51 -0700
Subject: [PATCH 26/39] Fix typo

---
 java/src/main/native/src/ColumnViewJni.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 167f53493f4..6b0172acd99 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -580,7 +580,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *
 
     if (limit == 0 || limit == 1) {
       JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
-                    "limit == 0 or limit == 1 are not supported", 0);
+                    "limit == 0 and limit == 1 are not supported", 0);
     }
     auto const max_split = limit > 1 ? limit - 1 : limit;
 
@@ -612,7 +612,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv
 
     if (limit == 0 || limit == 1) {
       JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
-                    "limit == 0 or limit == 1 are not supported", 0);
+                    "limit == 0 and limit == 1 are not supported", 0);
     }
     auto const max_split = limit > 1 ? limit - 1 : limit;
 

From 70a4e342b26bed44df064b46293ae537cb7b7184 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Thu, 10 Feb 2022 14:35:18 -0700
Subject: [PATCH 27/39] Remove support for empty delimiter

---
 .../main/java/ai/rapids/cudf/ColumnView.java  | 56 ++-----------------
 java/src/main/native/src/ColumnVectorJni.cpp  |  1 +
 java/src/main/native/src/ColumnViewJni.cpp    | 39 +++++++++----
 3 files changed, 36 insertions(+), 60 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index d28211d808f..c364643373d 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -2359,10 +2359,10 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) {
    * @return New table of strings columns.
    */
   public final Table stringSplit(String delimiter, int limit, boolean splitByRegex) {
-    assert delimiter != null : "delimiter is null";
-    assert delimiter.length() > 0 || !splitByRegex : "cannot split by empty regex";
     assert type.equals(DType.STRING) : "column type must be a String";
-    assert limit != 0 && limit != 1;
+    assert delimiter != null : "delimiter is null";
+    assert delimiter.length() > 0 : "empty delimiter is not supported";
+    assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported";
     return new Table(stringSplit(this.getNativeView(), delimiter, limit, splitByRegex));
   }
 
@@ -2410,32 +2410,6 @@ public final Table stringSplit(String delimiter) {
     return stringSplit(delimiter, -1, false);
   }
 
-  /**
-   * Returns a list of columns by splitting each string using whitespace as the delimiter.
-   * The number of rows in the output columns will be the same as the input column.
-   * Null entries are added for a row where split results have been exhausted.
-   * Null string entries return corresponding null output columns.
-   * @param limit the maximum size of the array resulting from splitting the input string,
-   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
-   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
-   * @return New table of strings columns.
-   */
-  public final Table stringSplit(int limit) {
-    String emptyString = "";
-    return stringSplit(emptyString, limit, false);
-  }
-
-  /**
-   * Returns a list of columns by splitting each string using whitespace as the delimiter.
-   * The number of rows in the output columns will be the same as the input column.
-   * Null entries are added for a row where split results have been exhausted.
-   * Null string entries return corresponding null output columns.
-   * @return New table of strings columns.
-   */
-  public final Table stringSplit() {
-    return stringSplit(-1);
-  }
-
   /**
    * Returns a column that is a list of strings. Each string list is made by splitting each input
    * string using the specified delimiter.
@@ -2449,10 +2423,10 @@ public final Table stringSplit() {
    * @return New table of strings columns.
    */
   public final ColumnVector stringSplitRecord(String delimiter, int limit, boolean splitByRegex) {
-    assert delimiter != null : "delimiter is null";
-    assert delimiter.length() > 0 || !splitByRegex : "cannot split by empty regex";
     assert type.equals(DType.STRING) : "column type must be String";
-    assert limit != 0 && limit != 1;
+    assert delimiter != null : "delimiter is null";
+    assert delimiter.length() > 0 : "empty delimiter is not supported";
+    assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported";
     return new ColumnVector(
         stringSplitRecord(this.getNativeView(), delimiter, limit, splitByRegex));
   }
@@ -2491,24 +2465,6 @@ public final ColumnVector stringSplitRecord(String delimiter) {
     return stringSplitRecord(delimiter, -1, false);
   }
 
-  /**
-   * Returns a column of lists of strings by splitting each string using whitespace as the delimiter.
-   * @param limit the maximum size of the array resulting from splitting the input string,
-   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
-   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
-   */
-  public final ColumnVector stringSplitRecord(int limit) {
-    String emptyString = "";
-    return stringSplitRecord(emptyString, limit, false);
-  }
-
-  /**
-   * Returns a column of lists of strings by splitting each string using whitespace as the delimiter.
-   */
-  public final ColumnVector stringSplitRecord() {
-    return stringSplitRecord(-1);
-  }
-
   /**
    * Returns a new strings column that contains substrings of the strings in the provided column.
    * Overloading subString to support if end index is not provided. Appending -1 to indicate to
diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp
index f01d832eb19..83202213d3e 100644
--- a/java/src/main/native/src/ColumnVectorJni.cpp
+++ b/java/src/main/native/src/ColumnVectorJni.cpp
@@ -32,6 +32,7 @@
 #include <cudf/strings/combine.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/utilities/bit.hpp>
+#include <nvtext/tokenize.hpp>
 
 #include "cudf_jni_apis.hpp"
 #include "dtype_utils.hpp"
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 6b0172acd99..2209dc347d2 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -70,6 +70,7 @@
 #include <cudf/types.hpp>
 #include <cudf/unary.hpp>
 #include <cudf/utilities/bit.hpp>
+#include <nvtext/tokenize.hpp>
 
 #include "cudf_jni_apis.hpp"
 #include "dtype_utils.hpp"
@@ -567,6 +568,13 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *
                                                                         jint limit,
                                                                         jboolean split_by_regex) {
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
+
+  // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1.
+  if (limit == 0 || limit == 1) {
+    JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
+                  "limit == 0 and limit == 1 are not supported", 0);
+  }
+
   try {
     cudf::jni::auto_set_device(env);
 
@@ -575,14 +583,16 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *
 
     auto const delimiter_chars = env->GetStringUTFChars(delimiter, nullptr);
     auto const delimiter_size = env->GetStringUTFLength(delimiter);
-    auto const str_delimiter = std::string(delimiter_chars, delimiter_size);
-    env->ReleaseStringUTFChars(delimiter, delimiter_chars);
 
-    if (limit == 0 || limit == 1) {
-      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
-                    "limit == 0 and limit == 1 are not supported", 0);
+    // Java's split API produces different behaviors than cudf when splitting with empty delimiter.
+    if (delimiter_size == 0) {
+      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty delimiter is not supported",
+                    0);
     }
+
     auto const max_split = limit > 1 ? limit - 1 : limit;
+    auto const str_delimiter = std::string(delimiter_chars, delimiter_size);
+    env->ReleaseStringUTFChars(delimiter, delimiter_chars);
 
     auto result =
         split_by_regex ?
@@ -599,6 +609,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv
                                                                          jint limit,
                                                                          jboolean split_by_regex) {
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
+
+  // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1.
+  if (limit == 0 || limit == 1) {
+    JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
+                  "limit == 0 and limit == 1 are not supported", 0);
+  }
+
   try {
     cudf::jni::auto_set_device(env);
 
@@ -607,14 +624,16 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv
 
     auto const delimiter_chars = env->GetStringUTFChars(delimiter, nullptr);
     auto const delimiter_size = env->GetStringUTFLength(delimiter);
-    auto const str_delimiter = std::string(delimiter_chars, delimiter_size);
-    env->ReleaseStringUTFChars(delimiter, delimiter_chars);
 
-    if (limit == 0 || limit == 1) {
-      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
-                    "limit == 0 and limit == 1 are not supported", 0);
+    // Java's split API produces different behaviors than cudf when splitting with empty delimiter.
+    if (delimiter_size == 0) {
+      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty delimiter is not supported",
+                    0);
     }
+
     auto const max_split = limit > 1 ? limit - 1 : limit;
+    auto const str_delimiter = std::string(delimiter_chars, delimiter_size);
+    env->ReleaseStringUTFChars(delimiter, delimiter_chars);
 
     auto result =
         split_by_regex ?

From af16edd1874a472459971081a3929d0eac600d8f Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Thu, 10 Feb 2022 15:26:56 -0700
Subject: [PATCH 28/39] Update Java tests

---
 .../java/ai/rapids/cudf/ColumnVectorTest.java | 105 +++++++++++++-----
 1 file changed, 79 insertions(+), 26 deletions(-)

diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index bda1a6bda55..ebbf73a8033 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -4739,28 +4739,11 @@ void testListSortRowsWithStringChild() {
     }
   }
 
-  @Test
-  void testStringSplitRecord() {
-    String pattern = " ";
-    try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", "null", "", "ARé some", "test strings");
-         ColumnVector expected = ColumnVector.fromLists(
-                 new HostColumnVector.ListType(true,
-                     new HostColumnVector.BasicType(true, DType.STRING)),
-                 Arrays.asList("Héllo", "there"),
-                 Arrays.asList("thésé"),
-                 Arrays.asList("null"),
-                 Arrays.asList(""),
-                 Arrays.asList("ARé", "some"),
-                 Arrays.asList("test", "strings"));
-         ColumnVector result = v.stringSplitRecord(pattern, -1)) {
-        assertColumnsAreEqual(expected, result);
-    }
-  }
-
   @Test
   void testStringSplit() {
     String pattern = " ";
-    try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", "ARé some things", "test strings here");
+    try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "",
+        "ARé some things", "test strings here");
          Table expectedSplitLimit2 = new Table.TestBuilder()
          .column("Héllo", "thésé", null, "", "ARé", "test")
          .column("there all", null, null, null, "some things", "strings here")
@@ -4778,13 +4761,83 @@ void testStringSplit() {
   }
 
   @Test
-  void testStringSplitWhiteSpace() {
-    try (ColumnVector v = ColumnVector.fromStrings("Héllo thesé", null, "are\tsome", "tést\nString", " ");
-         Table expected = new Table.TestBuilder().column("Héllo", null, "are", "tést", null)
-         .column("thesé", null, "some", "String", null)
-         .build();
-         Table result = v.stringSplit()) {
-      assertTablesAreEqual(expected, result);
+  void testStringSplitByRegularExpression() {
+    String pattern = "[_ ]";
+    try (ColumnVector v = ColumnVector.fromStrings("Héllo_there all", "thésé", null, "",
+        "ARé some_things", "test_strings_here");
+         Table expectedSplitLimit2 = new Table.TestBuilder()
+             .column("Héllo", "thésé", null, "", "ARé", "test")
+             .column("there all", null, null, null, "some_things", "strings_here")
+             .build();
+         Table expectedSplitAll = new Table.TestBuilder()
+             .column("Héllo", "thésé", null, "", "ARé", "test")
+             .column("there", null, null, null, "some", "strings")
+             .column("all", null, null, null, "things", "here")
+             .build();
+         Table resultSplitLimit2 = v.stringSplit(pattern, 2, true);
+         Table resultSplitAll = v.stringSplit(pattern, true)) {
+      assertTablesAreEqual(expectedSplitLimit2, resultSplitLimit2);
+      assertTablesAreEqual(expectedSplitAll, resultSplitAll);
+    }
+  }
+
+  @Test
+  void testStringSplitRecord() {
+    String pattern = " ";
+    try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "",
+        "ARé some things", "test strings here");
+         ColumnVector expectedSplitLimit2 = ColumnVector.fromLists(
+             new HostColumnVector.ListType(true,
+                 new HostColumnVector.BasicType(true, DType.STRING)),
+             Arrays.asList("Héllo", "there all"),
+             Arrays.asList("thésé"),
+             Arrays.asList((Object) null),
+             Arrays.asList(""),
+             Arrays.asList("ARé", "some things"),
+             Arrays.asList("test", "strings here"));
+         ColumnVector expectedSplitAll = ColumnVector.fromLists(
+             new HostColumnVector.ListType(true,
+                 new HostColumnVector.BasicType(true, DType.STRING)),
+             Arrays.asList("Héllo", "there", "all"),
+             Arrays.asList("thésé"),
+             Arrays.asList((Object) null),
+             Arrays.asList(""),
+             Arrays.asList("ARé", "some", "things"),
+             Arrays.asList("test", "strings", "here"));
+         ColumnVector resultSplitLimit2 = v.stringSplitRecord(pattern, 2);
+         ColumnVector resultSplitAll = v.stringSplitRecord(pattern)) {
+      assertColumnsAreEqual(expectedSplitLimit2, resultSplitLimit2);
+      assertColumnsAreEqual(expectedSplitAll, resultSplitAll);
+    }
+  }
+
+  @Test
+  void testStringSplitRecordByRegularExpression() {
+    String pattern = "[_ ]";
+    try (ColumnVector v = ColumnVector.fromStrings("Héllo_there all", "thésé", null, "",
+        "ARé some_things", "test_strings_here");
+         ColumnVector expectedSplitLimit2 = ColumnVector.fromLists(
+             new HostColumnVector.ListType(true,
+                 new HostColumnVector.BasicType(true, DType.STRING)),
+             Arrays.asList("Héllo", "there all"),
+             Arrays.asList("thésé"),
+             Arrays.asList((Object) null),
+             Arrays.asList(""),
+             Arrays.asList("ARé", "some_things"),
+             Arrays.asList("test", "strings_here"));
+         ColumnVector expectedSplitAll = ColumnVector.fromLists(
+             new HostColumnVector.ListType(true,
+                 new HostColumnVector.BasicType(true, DType.STRING)),
+             Arrays.asList("Héllo", "there", "all"),
+             Arrays.asList("thésé"),
+             Arrays.asList((Object) null),
+             Arrays.asList(""),
+             Arrays.asList("ARé", "some", "things"),
+             Arrays.asList("test", "strings", "here"));
+         ColumnVector resultSplitLimit2 = v.stringSplitRecord(pattern, 2, true);
+         ColumnVector resultSplitAll = v.stringSplitRecord(pattern, true)) {
+      assertColumnsAreEqual(expectedSplitLimit2, resultSplitLimit2);
+      assertColumnsAreEqual(expectedSplitAll, resultSplitAll);
     }
   }
 

From cac2637805bb17151fad6508d71401dfd0ae9af6 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Thu, 10 Feb 2022 16:10:20 -0700
Subject: [PATCH 29/39] Fix Java tests

---
 .../test/java/ai/rapids/cudf/ColumnVectorTest.java   | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 999e9a36267..b759c746735 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -4364,10 +4364,10 @@ void testExtractListElements() {
            ColumnVector expected = ColumnVector.fromStrings("Héllo",
                    "thésé",
                    null,
-                   null,
+                   "",
                    "ARé",
                    "test");
-           ColumnVector tmp = v.stringSplitRecord();
+           ColumnVector tmp = v.stringSplitRecord(" ");
            ColumnVector result = tmp.extractListElement(0)) {
           assertColumnsAreEqual(expected, result);
       }
@@ -4813,7 +4813,7 @@ void testStringSplitRecord() {
                  new HostColumnVector.BasicType(true, DType.STRING)),
              Arrays.asList("Héllo", "there all"),
              Arrays.asList("thésé"),
-             Arrays.asList((Object) null),
+             null,
              Arrays.asList(""),
              Arrays.asList("ARé", "some things"),
              Arrays.asList("test", "strings here"));
@@ -4822,7 +4822,7 @@ void testStringSplitRecord() {
                  new HostColumnVector.BasicType(true, DType.STRING)),
              Arrays.asList("Héllo", "there", "all"),
              Arrays.asList("thésé"),
-             Arrays.asList((Object) null),
+             null,
              Arrays.asList(""),
              Arrays.asList("ARé", "some", "things"),
              Arrays.asList("test", "strings", "here"));
@@ -4843,7 +4843,7 @@ void testStringSplitRecordByRegularExpression() {
                  new HostColumnVector.BasicType(true, DType.STRING)),
              Arrays.asList("Héllo", "there all"),
              Arrays.asList("thésé"),
-             Arrays.asList((Object) null),
+             null,
              Arrays.asList(""),
              Arrays.asList("ARé", "some_things"),
              Arrays.asList("test", "strings_here"));
@@ -4852,7 +4852,7 @@ void testStringSplitRecordByRegularExpression() {
                  new HostColumnVector.BasicType(true, DType.STRING)),
              Arrays.asList("Héllo", "there", "all"),
              Arrays.asList("thésé"),
-             Arrays.asList((Object) null),
+             null,
              Arrays.asList(""),
              Arrays.asList("ARé", "some", "things"),
              Arrays.asList("test", "strings", "here"));

From 2fade8da4952da9cf4d1c5c9e1a739aee51a4e11 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 11 Feb 2022 06:25:37 -0700
Subject: [PATCH 30/39] Reverse change

---
 java/src/main/native/src/ColumnVectorJni.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp
index 83202213d3e..f01d832eb19 100644
--- a/java/src/main/native/src/ColumnVectorJni.cpp
+++ b/java/src/main/native/src/ColumnVectorJni.cpp
@@ -32,7 +32,6 @@
 #include <cudf/strings/combine.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/utilities/bit.hpp>
-#include <nvtext/tokenize.hpp>
 
 #include "cudf_jni_apis.hpp"
 #include "dtype_utils.hpp"

From a65d358cfda73457c35bc6cb318942e8714bb14f Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 11 Feb 2022 11:17:07 -0700
Subject: [PATCH 31/39] Rewrite Javadoc

---
 .../main/java/ai/rapids/cudf/ColumnView.java  | 152 +++++++++---------
 1 file changed, 78 insertions(+), 74 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index c364643373d..770368791f7 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -2345,121 +2345,125 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) {
   }
 
   /**
-   * Returns a list of columns by splitting each string using the specified delimiter.
-   * The number of rows in the output columns will be the same as the input column.
-   * Null entries are added for a row where split results have been exhausted.
-   * Null string entries return corresponding null output columns.
-   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
-   *                  An empty string indicates split on whitespace.
-   * @param limit the maximum size of the array resulting from splitting the input string,
+   * Returns a list of columns by splitting each string using the specified pattern. The number of
+   * rows in the output columns will be the same as the input column. Null entries are added for a
+   * row where split results have been exhausted. Null input entries result in all nulls in the
+   * corresponding rows of the output columns.
+   *
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
    *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
    *              trailing empty strings) and limit = 1 (no split at all) are not supported.
-   * @param splitByRegex a boolean flag indicating whether the input string will be split by a
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
    *                     regular expression pattern or just by a string literal delimiter.
-   * @return New table of strings columns.
+   * @return list of strings columns as a table.
    */
-  public final Table stringSplit(String delimiter, int limit, boolean splitByRegex) {
+  public final Table stringSplit(String pattern, int limit, boolean splitByRegex) {
     assert type.equals(DType.STRING) : "column type must be a String";
-    assert delimiter != null : "delimiter is null";
-    assert delimiter.length() > 0 : "empty delimiter is not supported";
+    assert pattern != null : "pattern is null";
+    assert pattern.length() > 0 : "empty pattern is not supported";
     assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported";
-    return new Table(stringSplit(this.getNativeView(), delimiter, limit, splitByRegex));
+    return new Table(stringSplit(this.getNativeView(), pattern, limit, splitByRegex));
   }
 
   /**
-   * Returns a list of columns by splitting each string using the specified string literal delimiter.
-   * The number of rows in the output columns will be the same as the input column.
-   * Null entries are added for a row where split results have been exhausted.
-   * Null string entries return corresponding null output columns.
-   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
-   *                  An empty string indicates split on whitespace.
-   * @param limit the maximum size of the array resulting from splitting the input string,
-   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
-   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
-   * @return New table of strings columns.
+   * Returns a list of columns by splitting each string using the specified pattern. The number of
+   * rows in the output columns will be the same as the input column. Null entries are added for a
+   * row where split results have been exhausted. Null input entries result in all nulls in the
+   * corresponding rows of the output columns.
+   *
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   * @return list of strings columns as a table.
    */
-  public final Table stringSplit(String delimiter, int limit) {
-    return stringSplit(delimiter, limit, false);
+  public final Table stringSplit(String pattern, boolean splitByRegex) {
+    return stringSplit(pattern, -1, splitByRegex);
   }
 
   /**
-   * Returns a list of columns by splitting each string using the specified delimiter.
-   * The number of rows in the output columns will be the same as the input column.
-   * Null entries are added for a row where split results have been exhausted.
-   * Null string entries return corresponding null output columns.
-   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
-   *                  An empty string indicates split on whitespace.
-   * @param splitByRegex a boolean flag indicating whether the input string will be split by a
-   *                     regular expression pattern or just by a string literal delimiter.
-   * @return New table of strings columns.
+   * Returns a list of columns by splitting each string using the specified string literal
+   * delimiter. The number of rows in the output columns will be the same as the input column.
+   * Null entries are added for a row where split results have been exhausted. Null input entries
+   * result in all nulls in the corresponding rows of the output columns.
+   *
+   * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @return list of strings columns as a table.
    */
-  public final Table stringSplit(String delimiter, boolean splitByRegex) {
-    return stringSplit(delimiter, -1, splitByRegex);
+  public final Table stringSplit(String delimiter, int limit) {
+    return stringSplit(delimiter, limit, false);
   }
 
   /**
-   * Returns a list of columns by splitting each string using the specified string literal delimiter.
-   * The number of rows in the output columns will be the same as the input column.
-   * Null entries are added for a row where split results have been exhausted.
-   * Null string entries return corresponding null output columns.
-   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
-   *                  An empty string indicates split on whitespace.
-   * @return New table of strings columns.
+   * Returns a list of columns by splitting each string using the specified string literal
+   * delimiter. The number of rows in the output columns will be the same as the input column.
+   * Null entries are added for a row where split results have been exhausted. Null input entries
+   * result in all nulls in the corresponding rows of the output columns.
+   *
+   * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string.
+   * @return list of strings columns as a table.
    */
   public final Table stringSplit(String delimiter) {
     return stringSplit(delimiter, -1, false);
   }
 
   /**
-   * Returns a column that is a list of strings. Each string list is made by splitting each input
-   * string using the specified delimiter.
-   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
-   *                  An empty string indicates split on whitespace.
-   * @param limit the maximum size of the array resulting from splitting the input string,
+   * Returns a column of lists of strings in which each list is made by splitting the
+   * corresponding input string using the specified pattern.
+   *
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
    *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
    *              trailing empty strings) and limit = 1 (no split at all) are not supported.
-   * @param splitByRegex a boolean flag indicating whether the input string will be split by a
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
    *                     regular expression pattern or just by a string literal delimiter.
-   * @return New table of strings columns.
+   * @return a LIST column of string elements.
    */
-  public final ColumnVector stringSplitRecord(String delimiter, int limit, boolean splitByRegex) {
+  public final ColumnVector stringSplitRecord(String pattern, int limit, boolean splitByRegex) {
     assert type.equals(DType.STRING) : "column type must be String";
-    assert delimiter != null : "delimiter is null";
-    assert delimiter.length() > 0 : "empty delimiter is not supported";
+    assert pattern != null : "pattern is null";
+    assert pattern.length() > 0 : "empty pattern is not supported";
     assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported";
     return new ColumnVector(
-        stringSplitRecord(this.getNativeView(), delimiter, limit, splitByRegex));
+        stringSplitRecord(this.getNativeView(), pattern, limit, splitByRegex));
   }
 
   /**
-   * Returns a column that is a list of strings. Each string list is made by splitting each input
-   * string using the specified string literal delimiter.
-   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
-   *                  An empty string indicates split on whitespace.
-   * @param limit the maximum size of the array resulting from splitting the input string,
+   * Returns a column of lists of strings in which each list is made by splitting the
+   * corresponding input string using the specified pattern.
+   *
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   * @return a LIST column of string elements.
+   */
+  public final ColumnVector stringSplitRecord(String pattern, boolean splitByRegex) {
+    return stringSplitRecord(pattern, -1, splitByRegex);
+  }
+
+  /**
+   * Returns a column of lists of strings in which each list is made by splitting the
+   * corresponding input string using the specified string literal delimiter.
+   *
+   * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
    *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
    *              trailing empty strings) and limit = 1 (no split at all) are not supported.
-   * @return New table of strings columns.
+   * @return a LIST column of string elements.
    */
   public final ColumnVector stringSplitRecord(String delimiter, int limit) {
     return stringSplitRecord(delimiter, limit, false);
   }
-  /**
-   * Returns a column of lists of strings by splitting each string using the specified delimiter.
-   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
-   *                  An empty string indicates split on whitespace.
-   * @param splitByRegex a boolean flag indicating whether the input string will be split by a
-   *                     regular expression pattern or just by a string literal delimiter.
-   */
-  public final ColumnVector stringSplitRecord(String delimiter, boolean splitByRegex) {
-    return stringSplitRecord(delimiter, -1, splitByRegex);
-  }
 
   /**
-   * Returns a column of lists of strings by splitting each string using the specified string
-   * literal delimiter.
-   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
-   *                  An empty string indicates split on whitespace.
+   * Returns a column of lists of strings in which each list is made by splitting the
+   * corresponding input string using the specified string literal delimiter.
+   *
+   * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string.
+   * @return a LIST column of string elements.
    */
   public final ColumnVector stringSplitRecord(String delimiter) {
     return stringSplitRecord(delimiter, -1, false);

From 7f0fee7edb252991228533462227b392a82b9f33 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 11 Feb 2022 11:21:22 -0700
Subject: [PATCH 32/39] Fix Javadoc for the native methods

---
 .../main/java/ai/rapids/cudf/ColumnView.java  | 32 +++++++++++--------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 770368791f7..f91ee5535b1 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -3571,31 +3571,35 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle
   private static native long substringLocate(long columnView, long substringScalar, int start, int end);
 
   /**
-   * Native method which returns an array of columns by splitting each string using the specified
-   * delimiter.
-   * @param nativeHandle native handle of the cudf::column_view being operated on.
-   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
-   * @param limit the maximum size of the array resulting from splitting the input string,
+   * Returns a list of columns by splitting each string using the specified pattern. The number of
+   * rows in the output columns will be the same as the input column. Null entries are added for a
+   * row where split results have been exhausted. Null input entries result in all nulls in the
+   * corresponding rows of the output columns.
+   *
+   * @param nativeHandle native handle of the input strings column that is being operated on.
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
    *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
    *              trailing empty strings) and limit = 1 (no split at all) are not supported.
-   * @param splitByRegex a boolean flag indicating whether the input string will be split by a
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
    *                     regular expression pattern or just by a string literal delimiter.
    */
-  private static native long[] stringSplit(long nativeHandle, String delimiter, int limit,
+  private static native long[] stringSplit(long nativeHandle, String pattern, int limit,
                                            boolean splitByRegex);
 
   /**
-   * Native method which returns a LIST column by splitting each string into a list of strings
-   * using the specified delimiter.
-   * @param nativeHandle native handle of the cudf::column_view being operated on.
-   * @param delimiter UTF-8 string identifying the split points or split pattern in each string.
-   * @param limit the maximum size of the array resulting from splitting the input string,
+   * Returns a column of lists of strings in which each list is made by splitting the
+   * corresponding input string using the specified pattern.
+   *
+   * @param nativeHandle native handle of the input strings column that is being operated on.
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
    *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
    *              trailing empty strings) and limit = 1 (no split at all) are not supported.
-   * @param splitByRegex a boolean flag indicating whether the input string will be split by a
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
    *                     regular expression pattern or just by a string literal delimiter.
    */
-  private static native long stringSplitRecord(long nativeHandle, String delimiter, int limit,
+  private static native long stringSplitRecord(long nativeHandle, String pattern, int limit,
                                                boolean splitByRegex);
 
   /**

From 1b4cd51b9f7ebb3ac784227e85d175f9cf7ebcbc Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 11 Feb 2022 12:33:14 -0700
Subject: [PATCH 33/39] Rewrite JNI

---
 java/src/main/native/src/ColumnViewJni.cpp | 61 +++++++++-------------
 1 file changed, 26 insertions(+), 35 deletions(-)

diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 2209dc347d2..89896df515b 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -564,81 +564,72 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env,
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass,
                                                                         jlong input_handle,
-                                                                        jstring delimiter,
-                                                                        jint limit,
+                                                                        jstring pattern, jint limit,
                                                                         jboolean split_by_regex) {
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
 
-  // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1.
   if (limit == 0 || limit == 1) {
+    // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1.
     JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
                   "limit == 0 and limit == 1 are not supported", 0);
   }
 
   try {
     cudf::jni::auto_set_device(env);
-
     auto const input = reinterpret_cast<cudf::column_view *>(input_handle);
     auto const strs_input = cudf::strings_column_view{*input};
 
-    auto const delimiter_chars = env->GetStringUTFChars(delimiter, nullptr);
-    auto const delimiter_size = env->GetStringUTFLength(delimiter);
-
-    // Java's split API produces different behaviors than cudf when splitting with empty delimiter.
-    if (delimiter_size == 0) {
-      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty delimiter is not supported",
-                    0);
+    auto const pattern_size = env->GetStringUTFLength(pattern);
+    if (pattern_size == 0) {
+      // Java's split API produces different behaviors than cudf when splitting with empty
+      // pattern.
+      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0);
     }
 
     auto const max_split = limit > 1 ? limit - 1 : limit;
-    auto const str_delimiter = std::string(delimiter_chars, delimiter_size);
-    env->ReleaseStringUTFChars(delimiter, delimiter_chars);
+    auto const pattern_chars = env->GetStringUTFChars(pattern, nullptr);
+    auto const pattern_str = std::string(pattern_chars, pattern_size);
+    env->ReleaseStringUTFChars(pattern, pattern_chars);
 
-    auto result =
-        split_by_regex ?
-            cudf::strings::split_re(strs_input, str_delimiter, max_split) :
-            cudf::strings::split(strs_input, cudf::string_scalar{str_delimiter}, max_split);
+    auto result = split_by_regex ?
+                      cudf::strings::split_re(strs_input, pattern_str, max_split) :
+                      cudf::strings::split(strs_input, cudf::string_scalar{pattern_str}, max_split);
     return cudf::jni::convert_table_for_return(env, std::move(result));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass,
-                                                                         jlong input_handle,
-                                                                         jstring delimiter,
-                                                                         jint limit,
-                                                                         jboolean split_by_regex) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(
+    JNIEnv *env, jclass, jlong input_handle, jstring pattern, jint limit, jboolean split_by_regex) {
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
 
-  // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1.
   if (limit == 0 || limit == 1) {
+    // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1.
     JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
                   "limit == 0 and limit == 1 are not supported", 0);
   }
 
   try {
     cudf::jni::auto_set_device(env);
-
     auto const input = reinterpret_cast<cudf::column_view *>(input_handle);
     auto const strs_input = cudf::strings_column_view{*input};
 
-    auto const delimiter_chars = env->GetStringUTFChars(delimiter, nullptr);
-    auto const delimiter_size = env->GetStringUTFLength(delimiter);
-
-    // Java's split API produces different behaviors than cudf when splitting with empty delimiter.
-    if (delimiter_size == 0) {
-      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty delimiter is not supported",
-                    0);
+    auto const pattern_size = env->GetStringUTFLength(pattern);
+    if (pattern_size == 0) {
+      // Java's split API produces different behaviors than cudf when splitting with empty
+      // pattern.
+      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0);
     }
 
     auto const max_split = limit > 1 ? limit - 1 : limit;
-    auto const str_delimiter = std::string(delimiter_chars, delimiter_size);
-    env->ReleaseStringUTFChars(delimiter, delimiter_chars);
+    auto const pattern_chars = env->GetStringUTFChars(pattern, nullptr);
+    auto const pattern_str = std::string(pattern_chars, pattern_size);
+    env->ReleaseStringUTFChars(pattern, pattern_chars);
 
     auto result =
         split_by_regex ?
-            cudf::strings::split_record_re(strs_input, str_delimiter, max_split) :
-            cudf::strings::split_record(strs_input, cudf::string_scalar{str_delimiter}, max_split);
+            cudf::strings::split_record_re(strs_input, pattern_str, max_split) :
+            cudf::strings::split_record(strs_input, cudf::string_scalar{pattern_str}, max_split);
     return release_as_jlong(result);
   }
   CATCH_STD(env, 0);

From 00d7d8ba4f06455f33f1917d7fc6aaed765a0af9 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 11 Feb 2022 13:45:39 -0700
Subject: [PATCH 34/39] Add a function to construct std::string from
 native_jstring

---
 java/src/main/native/include/jni_utils.hpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp
index a45716a89b3..e8346e5ef1e 100644
--- a/java/src/main/native/include/jni_utils.hpp
+++ b/java/src/main/native/include/jni_utils.hpp
@@ -17,6 +17,7 @@
 
 #include <algorithm>
 #include <memory>
+#include <string>
 #include <vector>
 
 #include <jni.h>
@@ -524,7 +525,7 @@ class native_jstring {
   void init_cstr() const {
     if (orig != NULL && cstr == NULL) {
       cstr_length = env->GetStringUTFLength(orig);
-      cstr = env->GetStringUTFChars(orig, 0);
+      cstr = env->GetStringUTFChars(orig, 0); // not guarantee to have null terminated.
       check_java_exception(env);
     }
   }
@@ -555,6 +556,7 @@ class native_jstring {
 
   bool is_null() const noexcept { return orig == NULL; }
 
+  // Note that the char* return by this function is not guaranteed to be null-terminated.
   const char *get() const {
     init_cstr();
     return cstr;
@@ -565,6 +567,12 @@ class native_jstring {
     return cstr_length;
   }
 
+  // Note that the char* return by `get()` is not guaranteed to be null-terminated.
+  // Thus, constructing an std::string should be performed with a string size supplied.
+  std::string get_cpp_str() const { return std::string(get(), size_bytes()); }
+
+  jstring get_jstring() const { return orig; }
+
   bool is_empty() const {
     if (cstr != NULL) {
       return cstr_length <= 0;
@@ -576,8 +584,6 @@ class native_jstring {
     return true;
   }
 
-  const jstring get_jstring() const { return orig; }
-
   ~native_jstring() {
     if (orig != NULL && cstr != NULL) {
       env->ReleaseStringUTFChars(orig, cstr);

From 69bb7a0ca3b3357d8f35c8a8017313676dabb7c0 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 11 Feb 2022 13:45:45 -0700
Subject: [PATCH 35/39] Update JNI

---
 java/src/main/native/src/ColumnViewJni.cpp | 34 ++++++++++------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 89896df515b..b8d62940cae 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -564,7 +564,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env,
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass,
                                                                         jlong input_handle,
-                                                                        jstring pattern, jint limit,
+                                                                        jstring pattern_obj,
+                                                                        jint limit,
                                                                         jboolean split_by_regex) {
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
 
@@ -579,28 +580,27 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *
     auto const input = reinterpret_cast<cudf::column_view *>(input_handle);
     auto const strs_input = cudf::strings_column_view{*input};
 
-    auto const pattern_size = env->GetStringUTFLength(pattern);
-    if (pattern_size == 0) {
+    auto const pattern = cudf::jni::native_jstring(env, pattern_obj).get_cpp_str();
+    if (pattern.empty()) {
       // Java's split API produces different behaviors than cudf when splitting with empty
       // pattern.
       JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0);
     }
 
     auto const max_split = limit > 1 ? limit - 1 : limit;
-    auto const pattern_chars = env->GetStringUTFChars(pattern, nullptr);
-    auto const pattern_str = std::string(pattern_chars, pattern_size);
-    env->ReleaseStringUTFChars(pattern, pattern_chars);
-
     auto result = split_by_regex ?
-                      cudf::strings::split_re(strs_input, pattern_str, max_split) :
-                      cudf::strings::split(strs_input, cudf::string_scalar{pattern_str}, max_split);
+                      cudf::strings::split_re(strs_input, pattern, max_split) :
+                      cudf::strings::split(strs_input, cudf::string_scalar{pattern}, max_split);
     return cudf::jni::convert_table_for_return(env, std::move(result));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(
-    JNIEnv *env, jclass, jlong input_handle, jstring pattern, jint limit, jboolean split_by_regex) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass,
+                                                                         jlong input_handle,
+                                                                         jstring pattern_obj,
+                                                                         jint limit,
+                                                                         jboolean split_by_regex) {
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
 
   if (limit == 0 || limit == 1) {
@@ -614,22 +614,18 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(
     auto const input = reinterpret_cast<cudf::column_view *>(input_handle);
     auto const strs_input = cudf::strings_column_view{*input};
 
-    auto const pattern_size = env->GetStringUTFLength(pattern);
-    if (pattern_size == 0) {
+    auto const pattern = cudf::jni::native_jstring(env, pattern_obj).get_cpp_str();
+    if (pattern.empty()) {
       // Java's split API produces different behaviors than cudf when splitting with empty
       // pattern.
       JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0);
     }
 
     auto const max_split = limit > 1 ? limit - 1 : limit;
-    auto const pattern_chars = env->GetStringUTFChars(pattern, nullptr);
-    auto const pattern_str = std::string(pattern_chars, pattern_size);
-    env->ReleaseStringUTFChars(pattern, pattern_chars);
-
     auto result =
         split_by_regex ?
-            cudf::strings::split_record_re(strs_input, pattern_str, max_split) :
-            cudf::strings::split_record(strs_input, cudf::string_scalar{pattern_str}, max_split);
+            cudf::strings::split_record_re(strs_input, pattern, max_split) :
+            cudf::strings::split_record(strs_input, cudf::string_scalar{pattern}, max_split);
     return release_as_jlong(result);
   }
   CATCH_STD(env, 0);

From a1b0e37f7beecbacd67457b1f3df0115a66e0e7e Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 11 Feb 2022 13:50:24 -0700
Subject: [PATCH 36/39] Revert "Add a function to construct std::string from
 native_jstring"

This reverts commit 00d7d8ba4f06455f33f1917d7fc6aaed765a0af9.
---
 java/src/main/native/include/jni_utils.hpp | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp
index e8346e5ef1e..a45716a89b3 100644
--- a/java/src/main/native/include/jni_utils.hpp
+++ b/java/src/main/native/include/jni_utils.hpp
@@ -17,7 +17,6 @@
 
 #include <algorithm>
 #include <memory>
-#include <string>
 #include <vector>
 
 #include <jni.h>
@@ -525,7 +524,7 @@ class native_jstring {
   void init_cstr() const {
     if (orig != NULL && cstr == NULL) {
       cstr_length = env->GetStringUTFLength(orig);
-      cstr = env->GetStringUTFChars(orig, 0); // not guarantee to have null terminated.
+      cstr = env->GetStringUTFChars(orig, 0);
       check_java_exception(env);
     }
   }
@@ -556,7 +555,6 @@ class native_jstring {
 
   bool is_null() const noexcept { return orig == NULL; }
 
-  // Note that the char* return by this function is not guaranteed to be null-terminated.
   const char *get() const {
     init_cstr();
     return cstr;
@@ -567,12 +565,6 @@ class native_jstring {
     return cstr_length;
   }
 
-  // Note that the char* return by `get()` is not guaranteed to be null-terminated.
-  // Thus, constructing an std::string should be performed with a string size supplied.
-  std::string get_cpp_str() const { return std::string(get(), size_bytes()); }
-
-  jstring get_jstring() const { return orig; }
-
   bool is_empty() const {
     if (cstr != NULL) {
       return cstr_length <= 0;
@@ -584,6 +576,8 @@ class native_jstring {
     return true;
   }
 
+  const jstring get_jstring() const { return orig; }
+
   ~native_jstring() {
     if (orig != NULL && cstr != NULL) {
       env->ReleaseStringUTFChars(orig, cstr);

From 3d55e34751d5f048ac00e709160ce6232830ee6e Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 11 Feb 2022 13:52:21 -0700
Subject: [PATCH 37/39] Update JNI

---
 java/src/main/native/src/ColumnViewJni.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index b8d62940cae..997a4feaada 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -580,13 +580,14 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *
     auto const input = reinterpret_cast<cudf::column_view *>(input_handle);
     auto const strs_input = cudf::strings_column_view{*input};
 
-    auto const pattern = cudf::jni::native_jstring(env, pattern_obj).get_cpp_str();
-    if (pattern.empty()) {
+    auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj);
+    if (pattern_jstr.is_empty()) {
       // Java's split API produces different behaviors than cudf when splitting with empty
       // pattern.
       JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0);
     }
 
+    auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes());
     auto const max_split = limit > 1 ? limit - 1 : limit;
     auto result = split_by_regex ?
                       cudf::strings::split_re(strs_input, pattern, max_split) :
@@ -614,13 +615,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv
     auto const input = reinterpret_cast<cudf::column_view *>(input_handle);
     auto const strs_input = cudf::strings_column_view{*input};
 
-    auto const pattern = cudf::jni::native_jstring(env, pattern_obj).get_cpp_str();
-    if (pattern.empty()) {
+    auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj);
+    if (pattern_jstr.is_empty()) {
       // Java's split API produces different behaviors than cudf when splitting with empty
       // pattern.
       JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0);
     }
 
+    auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes());
     auto const max_split = limit > 1 ? limit - 1 : limit;
     auto result =
         split_by_regex ?

From 2ba4039916c91a23159ee813dcb32e4d447c1f3c Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 11 Feb 2022 14:13:55 -0700
Subject: [PATCH 38/39] Update comments to clarify why we don't support
 limit==0 and limit==1

---
 java/src/main/native/src/ColumnViewJni.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 997a4feaada..5784be25008 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -570,7 +570,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
 
   if (limit == 0 || limit == 1) {
-    // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1.
+    // Cannot achieve the results of splitting with limit == 0 or limit == 1.
+    // This is because cudf operates on a different parameter (`max_split`) which is converted from
+    // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an
+    // unlimited split.
     JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
                   "limit == 0 and limit == 1 are not supported", 0);
   }
@@ -605,7 +608,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
 
   if (limit == 0 || limit == 1) {
-    // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1.
+    // Cannot achieve the results of splitting with limit == 0 or limit == 1.
+    // This is because cudf operates on a different parameter (`max_split`) which is converted from
+    // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an
+    // unlimited split.
     JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
                   "limit == 0 and limit == 1 are not supported", 0);
   }

From bad6e6771f6139a053d7cdfea45c938f6425c6cc Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiatruong.vn@gmail.com>
Date: Fri, 11 Feb 2022 14:24:33 -0700
Subject: [PATCH 39/39] Remove unused header that was added by accident

---
 java/src/main/native/src/ColumnViewJni.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 5784be25008..548844aa0d3 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -70,7 +70,6 @@
 #include <cudf/types.hpp>
 #include <cudf/unary.hpp>
 #include <cudf/utilities/bit.hpp>
-#include <nvtext/tokenize.hpp>
 
 #include "cudf_jni_apis.hpp"
 #include "dtype_utils.hpp"