From 56758d88ad5665dfd0d4c5542ec25c240db67ece Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 10 Feb 2022 14:20:57 -0500
Subject: [PATCH 01/10] Add nvtext::byte_pair_encoding API

---
 conda/recipes/libcudf/meta.yaml             |   3 +-
 cpp/CMakeLists.txt                          |   2 +
 cpp/include/cudf/strings/detail/combine.hpp |  16 +-
 cpp/include/cudf/strings/detail/split.hpp   |  38 ++
 cpp/include/nvtext/bpe_tokenize.hpp         |  99 +++++
 cpp/src/strings/split/split_record.cu       |  31 +-
 cpp/src/text/subword/bpe_tokenizer.cu       | 439 ++++++++++++++++++++
 cpp/src/text/subword/load_merges_file.cu    |  93 +++++
 cpp/tests/CMakeLists.txt                    |   1 +
 cpp/tests/text/bpe_tests.cpp                |  87 ++++
 10 files changed, 800 insertions(+), 9 deletions(-)
 create mode 100644 cpp/include/cudf/strings/detail/split.hpp
 create mode 100644 cpp/include/nvtext/bpe_tokenize.hpp
 create mode 100644 cpp/src/text/subword/bpe_tokenizer.cu
 create mode 100644 cpp/src/text/subword/load_merges_file.cu
 create mode 100644 cpp/tests/text/bpe_tests.cpp
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 70c020d4abd..8388ad4afbf 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
 
 {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
 {% set minor_version =  version.split('.')[0] + '.' + version.split('.')[1] %}
@@ -203,6 +203,7 @@ test:
     - test -f $PREFIX/include/cudf/strings/detail/fill.hpp
     - test -f $PREFIX/include/cudf/strings/detail/json.hpp
     - test -f $PREFIX/include/cudf/strings/detail/replace.hpp
+    - test -f $PREFIX/include/cudf/strings/detail/split.hpp
     - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
     - test -f $PREFIX/include/cudf/strings/extract.hpp
     - test -f $PREFIX/include/cudf/strings/findall.hpp
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 90e94ffcc7b..4d165b470b2 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -468,8 +468,10 @@ add_library(
   src/text/normalize.cu
   src/text/replace.cu
   src/text/stemmer.cu
+  src/text/subword/bpe_tokenizer.cu
   src/text/subword/data_normalizer.cu
   src/text/subword/load_hash_file.cu
+  src/text/subword/load_merges_file.cu
   src/text/subword/subword_tokenize.cu
   src/text/subword/wordpiece_tokenizer.cu
   src/text/tokenize.cu
diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp
index d6bdf398886..50f9a70e21c 100644
--- a/cpp/include/cudf/strings/detail/combine.hpp
+++ b/cpp/include/cudf/strings/detail/combine.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -54,6 +54,20 @@ std::unique_ptr<column> join_strings(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @copydoc join_list_elements(lists_column_view const&,string_scalar const&,string_scalar
+ * const&,separator_on_nulls,output_if_empty_list,rmm::mr::device_memory_resource*)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<column> join_list_elements(lists_column_view const& lists_strings_column,
+                                           string_scalar const& separator,
+                                           string_scalar const& narep,
+                                           separator_on_nulls separate_nulls,
+                                           output_if_empty_list empty_list_policy,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::mr::device_memory_resource* mr);
+
 }  // namespace detail
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/include/cudf/strings/detail/split.hpp b/cpp/include/cudf/strings/detail/split.hpp
new file mode 100644
index 00000000000..2f00a82678b
--- /dev/null
+++ b/cpp/include/cudf/strings/detail/split.hpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/strings/split/split.hpp>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+
+/**
+ * @copydoc split_record(strings_column_view const&,string_scalar const&,size_type,
+ * rmm::mr::device_memory_resource*)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<column> split_record(strings_column_view const& input,
+                                     string_scalar const& delimiter,
+                                     size_type maxsplit,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr);
+
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp
new file mode 100644
index 00000000000..a8f971d97b2
--- /dev/null
+++ b/cpp/include/nvtext/bpe_tokenize.hpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+namespace nvtext {
+
+/**
+ * @addtogroup nvtext_tokenize
+ * @{
+ * @file
+ */
+
+/**
+ * @brief The table of merge pairs for the BPE encoder.
+ *
+ * To create an instance, call nvtext::load_merge_pairs_file
+ */
+struct bpe_merge_pairs {
+  std::unique_ptr<cudf::column> merge_pairs;  // strings column; row index is the pair's rank
+};
+
+/**
+ * @brief Create a nvtext::bpe_merge_pairs from an input file.
+ *
+ * The file should contain a pair of strings per line separated by
+ * a single space.
+ *
+ * Example:
+ * @code{.txt}
+ * e n
+ * i t
+ * i s
+ * ...
+ * @endcode
+ *
+ * The pairs are expected to be ordered in the file by their rank
+ * relative to each other. A pair has priority over
+ * any pairs below it.
+ *
+ * @param filename_merges Local file path of pairs encoded in UTF-8
+ * @param mr Memory resource to allocate any returned objects.
+ */
+std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
+  std::string const& filename_merges,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Byte pair encode the input strings.
+ *
+ * This will split each string on whitespace, perform the encoding,
+ * and then build the output using the given `separator`.
+ *
+ * The encoding algorithm rebuilds each string by matching substrings
+ * in the `merges_pairs` table and iteratively removing the minimum ranked pair
+ * until no pairs are left. Then, a space is inserted between the remaining
+ * pairs before the result is joined to make the output string.
+ *
+ * @code{.pseudo}
+ * mps = load_merge_pairs_file("merges.txt")
+ * input = ["test sentence", "this is test"]
+ * result = byte_pair_encoding(input, mps)
+ * result is now ["test Ġsent ence", "this Ġis Ġtest"]
+ * @endcode
+ *
+ * @throw cudf::logic_error if `merges_pairs` is empty
+ * @throw cudf::logic_error if `separator` is invalid
+ *
+ * @param input Strings to encode.
+ * @param merges_pairs Created by a call to nvtext::load_merge_pairs_file.
+ * @param separator String used to build the output after encoding.
+ *                  Default is a space followed by `Ġ`.
+ * @param mr Memory resource to allocate any returned objects.
+ */
+std::unique_ptr<cudf::column> byte_pair_encoding(
+  cudf::strings_column_view const& input,
+  bpe_merge_pairs const& merges_pairs,
+  cudf::string_scalar const& separator = cudf::string_scalar(" Ġ"),
+  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+
+/** @} */  // end of group
+}  // namespace nvtext
diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu
index 929d21a024c..832fd00725a 100644
--- a/cpp/src/strings/split/split_record.cu
+++ b/cpp/src/strings/split/split_record.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -217,8 +217,6 @@ struct whitespace_token_reader_fn {
   }
 };
 
-}  // namespace
-
 // The output is one list item per string
 template <typename TokenCounter, typename TokenReader>
 std::unique_ptr<column> split_record_fn(strings_column_view const& strings,
@@ -289,6 +287,27 @@ std::unique_ptr<column> split_record(
                            mr);
   }
 }
+}  // namespace
+
+std::unique_ptr<column> split_record(strings_column_view const& strings,
+                                     string_scalar const& delimiter,
+                                     size_type maxsplit,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
+{
+  // forward the caller's stream instead of forcing the default stream
+  return split_record<detail::Dir::FORWARD>(strings, delimiter, maxsplit, stream, mr);
+}
+
+std::unique_ptr<column> rsplit_record(strings_column_view const& strings,
+                                      string_scalar const& delimiter,
+                                      size_type maxsplit,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::mr::device_memory_resource* mr)
+{
+  // forward the caller's stream instead of forcing the default stream
+  return split_record<detail::Dir::BACKWARD>(strings, delimiter, maxsplit, stream, mr);
+}
 
 }  // namespace detail
 
@@ -300,8 +319,7 @@ std::unique_ptr<column> split_record(strings_column_view const& strings,
                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::split_record<detail::Dir::FORWARD>(
-    strings, delimiter, maxsplit, rmm::cuda_stream_default, mr);
+  return detail::split_record(strings, delimiter, maxsplit, rmm::cuda_stream_default, mr);
 }
 
 std::unique_ptr<column> rsplit_record(strings_column_view const& strings,
@@ -310,8 +328,7 @@ std::unique_ptr<column> rsplit_record(strings_column_view const& strings,
                                       rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::split_record<detail::Dir::BACKWARD>(
-    strings, delimiter, maxsplit, rmm::cuda_stream_default, mr);
+  return detail::rsplit_record(strings, delimiter, maxsplit, rmm::cuda_stream_default, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
new file mode 100644
index 00000000000..a47be57126f
--- /dev/null
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <nvtext/bpe_tokenize.hpp>
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/get_value.cuh>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/strings/detail/combine.hpp>
+#include <cudf/strings/detail/split.hpp>
+#include <cudf/strings/detail/utilities.cuh>
+#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/utilities/error.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/for_each.h>
+#include <thrust/reduce.h>
+
+namespace nvtext {
+namespace detail {
+
+namespace {
+
+/**
+ * @brief Initialize the byte indices of each string and reset its minimum pair rank.
+ */
+struct initialize_indices_fn {
+  cudf::column_device_view const d_merges;  // not read by this functor
+  cudf::column_device_view const d_strings;
+  cudf::size_type* d_byte_indices;
+  cudf::size_type* d_min_ranks;
+
+  __device__ void operator()(cudf::size_type idx)
+  {
+    d_min_ranks[idx] = cuda::std::numeric_limits<cudf::size_type>::max();
+
+    if (d_strings.is_null(idx)) { return; }
+
+    auto const d_str = d_strings.element<cudf::string_view>(idx);
+    if (d_str.empty()) { return; }
+
+    auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index)
+                          .element<cudf::offset_type>(idx);
+    auto d_indices = d_byte_indices + offset;
+
+    // the first byte of each character records its own byte position in the string
+    for (auto i = 0; i < d_str.size_bytes(); ++i) {
+      auto const byte = static_cast<uint8_t>(d_str.data()[i]);
+      // intermediate UTF-8 bytes get 0 so they are never treated as a substring start
+      d_indices[i] = cudf::strings::detail::is_begin_utf8_char(byte) ? i : 0;
+    }
+  }
+};
+
+/**
+ * @brief Parse the merge pair into components.
+ *
+ * The halves are separated by a single space; with no space the right half is empty.
+ * @param d_pair String to dissect
+ * @return The left and right halves of the input pair.
+ */
+__device__ thrust::pair<cudf::string_view, cudf::string_view> dissect_merge_pair(
+  cudf::string_view const& d_pair)
+{
+  auto const lhs      = d_pair.data();
+  auto const end_str  = d_pair.data() + d_pair.size_bytes();
+  auto const space    = thrust::find(thrust::seq, lhs, end_str, ' ');
+  auto const rhs      = (space == end_str) ? end_str : space + 1;  // never step past the end
+  auto const lhs_size = static_cast<cudf::size_type>(thrust::distance(lhs, space));
+  auto const rhs_size = static_cast<cudf::size_type>(thrust::distance(rhs, end_str));
+  return thrust::make_pair(cudf::string_view(lhs, lhs_size), cudf::string_view(rhs, rhs_size));
+}
+
+/**
+ * @brief Get the next substring of the given string.
+ *
+ * The substring starts at byte position `*begin` of `d_str` and runs up to
+ * the first non-zero index value found in the range (begin,end) exclusive.
+ * If no non-zero value is found there, the substring runs through the
+ * position that corresponds to `end`.
+ *
+ * @tparam Iterator The byte indices iterator type
+ * @param begin Start of indices to check
+ * @param end End of indices to check
+ * @param d_str String to substring
+ * @return The substring found.
+ */
+template <typename Iterator>
+__device__ cudf::string_view next_substr(Iterator begin,
+                                         Iterator end,
+                                         cudf::string_view const& d_str)
+{
+  auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; });
+  auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));
+  return cudf::string_view(d_str.data() + *begin, size);
+}
+
+/**
+ * @brief Iterate over the merge pairs to find the minimum rank in each string.
+ *
+ * Each call handles one merge pair and scans every string for it. When the pair
+ * is found, its rank is folded into that string's entry in `d_min_ranks` with an
+ * atomic minimum. The rank is the position of the merge pair in `d_merges`.
+ */
+struct find_minimum_pair_fn {
+  cudf::column_device_view const d_merges;
+  cudf::column_device_view const d_strings;
+  cudf::size_type* d_byte_indices;
+  cudf::size_type* d_min_ranks;
+
+  // `index` is the row of the pair in the merges table, which is also its rank
+  __device__ void operator()(cudf::size_type index)
+  {
+    auto const d_pair = dissect_merge_pair(d_merges.element<cudf::string_view>(index));
+
+    // locate this pair in each string
+    for (auto idx = 0; idx < d_strings.size(); ++idx) {
+      if (d_strings.is_null(idx)) continue;
+      auto const d_str = d_strings.element<cudf::string_view>(idx);
+      if (d_str.empty()) continue;
+
+      auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index)
+                            .element<cudf::offset_type>(idx);
+      auto d_indices = d_byte_indices + offset;
+
+      auto const begin = d_indices;
+      auto const end   = d_indices + d_str.size_bytes();
+
+      // walk adjacent substrings (lhs,rhs) looking for the merge-pair in this string
+      auto lhs = next_substr(begin, end, d_str);
+      auto itr = begin + lhs.size_bytes();
+      while (itr < end) {
+        auto rhs = next_substr(itr, end, d_str);
+        if (rhs.empty()) break;
+
+        if (d_pair.first == lhs && d_pair.second == rhs) {
+          // found a match; keep the lowest rank reported by any pair for this string
+          atomicMin(d_min_ranks + idx, index);
+          break;  // done with this string
+        }
+
+        // slide the window: rhs becomes the new lhs
+        lhs = rhs;
+        itr += rhs.size_bytes();
+      }
+    }
+  }
+};
+
+/**
+ * @brief Remove merge pair from each string.
+ *
+ * The minimum rank found for each string is used to identify the pair(s)
+ * to be removed. The pairs are removed by just zeroing the byte index
+ * found between the adjacent substrings.
+ *
+ * @code{.txt}
+ * d_strings =        ["helloworld", "testisthis"]
+ * d_byte_indices =   [ 0123456789    0123456789 ]
+ * d_merges[d_min_ranks] = [ "l l", "i s" ]
+ *
+ * d_byte_indices ->  [ 0120456789    0123406780 ]
+ * d_min_ranks is reset to [ max, max ]
+ * @endcode
+ *
+ */
+struct remove_pair_fn {
+  cudf::column_device_view const d_merges;
+  cudf::column_device_view const d_strings;
+  cudf::size_type* d_byte_indices;
+  cudf::size_type* d_min_ranks;
+
+  __device__ void operator()(cudf::size_type idx)
+  {
+    if (d_strings.is_null(idx)) return;
+    auto const d_str = d_strings.element<cudf::string_view>(idx);
+    if (d_str.empty()) return;
+
+    auto rank = d_min_ranks[idx];
+    if (rank == cuda::std::numeric_limits<cudf::size_type>::max()) return;  // nothing to remove
+
+    auto const d_pair = dissect_merge_pair(d_merges.element<cudf::string_view>(rank));
+
+    // resolve byte indices for this string
+    auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index)
+                          .element<cudf::offset_type>(idx);
+    auto d_indices = d_byte_indices + offset;
+
+    auto const begin = d_indices;
+    auto const end   = d_indices + d_str.size_bytes();
+
+    // locate every occurrence of d_pair and remove it from this string
+    auto lhs = next_substr(begin, end, d_str);
+    auto itr = begin + lhs.size_bytes();
+    while (itr < end) {
+      auto rhs = next_substr(itr, end, d_str);
+      if (d_pair.first == lhs && d_pair.second == rhs) {
+        *itr = 0;  // zero the boundary so lhs and rhs now read as one substring
+        itr += rhs.size_bytes();
+        if (itr < end) {
+          rhs = next_substr(itr, end, d_str);  // continue with the substring after the match
+        } else {
+          break;  // done with this string
+        }
+      }
+      // next substring
+      lhs = rhs;
+      itr += rhs.size_bytes();
+    }
+
+    // reset for next iteration
+    d_min_ranks[idx] = cuda::std::numeric_limits<cudf::size_type>::max();
+  }
+};
+
+/**
+ * @brief Computes the output size of each string.
+ *
+ * The output size is the byte size of the current string plus one
+ * space for each boundary left between adjacent substrings.
+ * The number of spaces equals the number of non-zero byte indices
+ * for the string. A null row has an output size of 0.
+ */
+struct compute_sizes_fn {
+  cudf::column_device_view const d_strings;
+  cudf::size_type* d_byte_indices;
+
+  __device__ cudf::size_type operator()(cudf::size_type idx)
+  {
+    if (d_strings.is_null(idx)) return 0;
+    auto const d_str = d_strings.element<cudf::string_view>(idx);
+    auto offset      = d_strings.child(cudf::strings_column_view::offsets_column_index)
+                    .element<cudf::offset_type>(idx);
+    auto d_indices = d_byte_indices + offset;
+    return d_str.size_bytes() + thrust::count_if(  // one space per non-zero byte index
+                                  thrust::seq,
+                                  d_indices,
+                                  d_indices + d_str.size_bytes(),
+                                  [](auto v) { return v != 0; });
+  }
+};
+
+/**
+ * @brief Build the output string encoding.
+ *
+ * Copies each string to the output, inserting a space before each byte with a non-zero index.
+ *
+ * @code{.txt}
+ * d_strings =      ["helloworld", "testthis"]
+ * d_byte_indices = [ 0000050000    00004000]
+ * result is ["hello world", "test this"]
+ * @endcode
+ */
+struct build_encoding_fn {
+  cudf::column_device_view const d_strings;
+  cudf::size_type* d_byte_indices;
+  cudf::offset_type const* d_offsets;  // start of each string within d_chars
+  char* d_chars{};                     // destination for the encoded bytes
+
+  __device__ void operator()(cudf::size_type idx)
+  {
+    if (d_strings.is_null(idx)) return;
+    auto const d_str = d_strings.element<cudf::string_view>(idx);
+    if (d_str.empty()) return;
+
+    auto offset = d_strings.child(cudf::strings_column_view::offsets_column_index)
+                    .element<cudf::offset_type>(idx);
+    auto d_indices = d_byte_indices + offset;
+    auto d_output  = d_chars ? d_chars + d_offsets[idx] : nullptr;
+
+    // copy bytes one at a time; a non-zero index emits a space before that byte
+    auto begin   = d_indices;
+    auto end     = d_indices + d_str.size_bytes();
+    auto d_input = d_str.data();
+    *d_output++  = *d_input++;  // the first byte is copied without a leading space
+    auto itr     = begin + 1;
+    while (itr < end) {
+      if (*itr++) *d_output++ = ' ';
+      *d_output++ = *d_input++;
+    }
+  }
+};
+
+/**
+ * @brief Perform byte pair encoding on each string in the input column.
+ *
+ * The result is a strings column of the same size where each string has been encoded.
+ *
+ * The encoding is performed iteratively. Each pass determines the string's lowest
+ * ranked merge pair as determined by the strings in `merges_table`. This pair
+ * is then removed (virtually) from each string before starting the next iteration.
+ *
+ * Once all pairs have been exhausted for all strings, the output is constructed from
+ * the results by adding spaces between each remaining pair in each string.
+ */
+std::unique_ptr<cudf::column> byte_pair_encoding(
+  cudf::strings_column_view const& input,
+  bpe_merge_pairs const& merges_table,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+{
+  // working vectors: one index per input byte and one minimum rank per string
+  rmm::device_uvector<cudf::size_type> d_byte_indices(input.chars().size(), stream);
+  rmm::device_uvector<cudf::size_type> d_min_ranks(input.size(), stream);
+
+  auto d_merges  = cudf::column_device_view::create(merges_table.merge_pairs->view(), stream);
+  auto d_strings = cudf::column_device_view::create(input.parent(), stream);
+
+  thrust::for_each_n(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator<cudf::size_type>(0),
+    input.size(),
+    initialize_indices_fn{*d_merges, *d_strings, d_byte_indices.data(), d_min_ranks.data()});
+
+  cudf::size_type min_rank = 0;
+  while (min_rank < std::numeric_limits<cudf::size_type>::max()) {
+    // find minimum merge pair for each string (one functor call per merge pair)
+    thrust::for_each_n(
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator<cudf::size_type>(0),
+      d_merges->size(),
+      find_minimum_pair_fn{*d_merges, *d_strings, d_byte_indices.data(), d_min_ranks.data()});
+
+    // get the minimum rank over all strings;
+    // this is only used to see if we are finished
+    min_rank = thrust::reduce(rmm::exec_policy(stream),
+                              d_min_ranks.begin(),
+                              d_min_ranks.end(),
+                              std::numeric_limits<cudf::size_type>::max(),
+                              thrust::minimum<cudf::size_type>{});
+
+    // check if any pairs have been found;
+    // if so, remove each string's own minimum pair (this also resets d_min_ranks)
+    if (min_rank < std::numeric_limits<cudf::size_type>::max()) {
+      thrust::for_each_n(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator<cudf::size_type>(0),
+        input.size(),
+        remove_pair_fn{*d_merges, *d_strings, d_byte_indices.data(), d_min_ranks.data()});
+    }
+  }
+
+  // build the output:
+  // add spaces between the remaining pairs in each string
+  auto offsets_itr =
+    thrust::make_transform_iterator(thrust::make_counting_iterator<cudf::size_type>(0),
+                                    compute_sizes_fn{*d_strings, d_byte_indices.data()});
+  auto offsets = cudf::strings::detail::make_offsets_child_column(
+    offsets_itr, offsets_itr + input.size(), stream, mr);
+  auto d_offsets = offsets->view().data<cudf::offset_type>();
+
+  auto const bytes = cudf::detail::get_value<int32_t>(offsets->view(), input.size(), stream);
+  auto chars       = cudf::strings::detail::create_chars_child_column(bytes, stream, mr);
+  auto d_chars     = chars->mutable_view().data<char>();
+  thrust::for_each_n(rmm::exec_policy(stream),
+                     thrust::make_counting_iterator<cudf::size_type>(0),
+                     input.size(),
+                     build_encoding_fn{*d_strings, d_byte_indices.data(), d_offsets, d_chars});
+
+  return make_strings_column(input.size(),
+                             std::move(offsets),
+                             std::move(chars),
+                             input.null_count(),
+                             cudf::detail::copy_bitmask(input.parent(), stream, mr));
+}
+
+}  // namespace
+
+std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const& input,
+                                                 bpe_merge_pairs const& merges_table,
+                                                 cudf::string_scalar const& separator,
+                                                 rmm::cuda_stream_view stream,
+                                                 rmm::mr::device_memory_resource* mr)
+{
+  auto const strings_count = input.size();
+  if (strings_count == 0 || input.chars_size() == 0)
+    return cudf::make_empty_column(cudf::type_id::STRING);
+  CUDF_EXPECTS(!merges_table.merge_pairs->view().is_empty(), "Merge pairs table must not be empty");
+
+  // split input on whitespace; this intermediate uses the current device resource, not `mr`
+  auto split_result = cudf::strings::detail::split_record(
+    input, cudf::string_scalar(""), -1, stream, rmm::mr::get_current_device_resource());
+  auto split_view = cudf::lists_column_view(split_result->view());
+
+  // run BPE on the strings child column, i.e. on all of the split tokens at once
+  auto bpe_column = byte_pair_encoding(split_view.child(), merges_table, stream);
+
+  // recombine the result:
+  // use the offsets from split_record and the strings from byte_pair_encoding
+  // to build a lists column_view
+  auto list_join = cudf::column_view(cudf::data_type{cudf::type_id::LIST},
+                                     strings_count,
+                                     nullptr,  // no parent data in list column
+                                     split_view.null_mask(),
+                                     split_view.null_count(),
+                                     0,
+                                     {split_view.offsets(), bpe_column->view()});
+
+  // use join_list_elements to build the output strings column, allocated with `mr`
+  return cudf::strings::detail::join_list_elements(
+    cudf::lists_column_view(list_join),
+    separator,
+    cudf::string_scalar(""),
+    cudf::strings::separator_on_nulls::NO,
+    cudf::strings::output_if_empty_list::EMPTY_STRING,
+    stream,
+    mr);
+}
+
+}  // namespace detail
+
+std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const& input,
+                                                 bpe_merge_pairs const& merges_table,
+                                                 cudf::string_scalar const& separator,
+                                                 rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::byte_pair_encoding(input, merges_table, separator, rmm::cuda_stream_default, mr);
+}
+
+}  // namespace nvtext
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
new file mode 100644
index 00000000000..eaeaec03a96
--- /dev/null
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <nvtext/bpe_tokenize.hpp>
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/utilities/error.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+namespace nvtext {
+namespace detail {
+
+/**
+ * @brief Loads a text file of merge-pairs into a strings column.
+ *
+ * The line position in the file is the pair's rank; a leading `#version` line is skipped.
+ * @code{.pseudo}
+ * Format of the file:
+ * #version ..
+ * a1 a2
+ * b1 b2
+ * c1 c2
+ * ...
+ * @endcode
+ * @param filename_merges Path to text file containing merge-pairs.
+ * @param stream CUDA stream used for the host-to-device copies and the column creation.
+ * @param mr Memory resource used to allocate the returned column.
+ * @return object containing table elements for the BPE function
+ */
+std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(std::string const& filename_merges,
+                                                       rmm::cuda_stream_view stream,
+                                                       rmm::mr::device_memory_resource* mr)
+{
+  bpe_merge_pairs result;
+  std::ifstream merges_file(filename_merges);
+  CUDF_EXPECTS(merges_file.good(), "Could not open " + filename_merges);
+
+  std::vector<char> chars{};
+  std::vector<cudf::offset_type> offsets(1, 0);
+
+  std::string line;
+  std::getline(merges_file, line);
+  std::string version = "#version";
+  if (line.substr(0, version.size()).compare(version) == 0) { std::getline(merges_file, line); }
+
+  // Each line becomes one row; reading stops at the first empty line or at end of file.
+  // Look into using the CSV reader to load the strings column instead.
+  while (!line.empty()) {
+    chars.insert(chars.end(), std::cbegin(line), std::cend(line));
+    offsets.push_back(offsets.back() + line.length());
+    std::getline(merges_file, line);
+  }
+
+  CUDF_EXPECTS(!chars.empty(), "No data found in " + filename_merges);
+
+  auto d_chars       = cudf::detail::make_device_uvector_async(chars, stream, mr);
+  auto d_offsets     = cudf::detail::make_device_uvector_async(offsets, stream, mr);
+  result.merge_pairs = cudf::make_strings_column(d_chars, d_offsets, {}, 0, stream, mr);
+
+  return std::make_unique<bpe_merge_pairs>(std::move(result));
+}
+
+}  // namespace detail
+
+std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(std::string const& filename_merges,
+                                                       rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::load_merge_pairs_file(filename_merges, rmm::cuda_stream_default, mr);
+}
+
+}  // namespace nvtext
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 6b5670630ec..e5313c1d061 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -390,6 +390,7 @@ ConfigureTest(STRUCTS_TEST structs/structs_column_tests.cpp structs/utilities_te
 # * nvtext test -----------------------------------------------------------------------------------
 ConfigureTest(
   TEXT_TEST
+  text/bpe_tests.cpp
   text/edit_distance_tests.cpp
   text/ngrams_tests.cpp
   text/ngrams_tokenize_tests.cpp
diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp
new file mode 100644
index 00000000000..a009919fa21
--- /dev/null
+++ b/cpp/tests/text/bpe_tests.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <nvtext/bpe_tokenize.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+struct TextBPETokenize : public cudf::test::BaseFixture {
+};  // adds no state of its own; it only names the fixture used by the TEST_F cases below
+
+TEST_F(TextBPETokenize, BytePairEncoding)
+{
+  // fake table based on values from https://huggingface.co/gpt2/raw/main/merges.txt
+  auto mpt = cudf::test::strings_column_wrapper({
+    "e n",    // 12
+    "i t",    // 14
+    "i s",    // 15
+    "e s",    // 18
+    "en t",   // 42
+    "c e",    // 88
+    "es t",   // 139
+    "en ce",  // 338
+    "T h",    // 561
+    "Th is",  // 956
+    "t est",  // 9032
+    "s ent",  // 33830
+  });
+  nvtext::bpe_merge_pairs merge_pairs{mpt.release()};  // the released column becomes the table
+
+  auto validity = cudf::test::iterators::null_at(4);  // row 4 is null; row 5 is an empty string
+  cudf::test::strings_column_wrapper input({"This is it",
+                                            "This is test-sentence-1",
+                                            "This is test sentence-2",
+                                            "This-is test sentence 3",
+                                            "",
+                                            ""},
+                                           validity);
+  auto sv = cudf::strings_column_view(input);
+
+  auto results = nvtext::byte_pair_encoding(sv, merge_pairs);  // separator argument is defaulted
+
+  auto expected = cudf::test::strings_column_wrapper({"This Ġis Ġit",
+                                                      "This Ġis Ġtest - sent ence - 1",
+                                                      "This Ġis Ġtest Ġsent ence - 2",
+                                                      "This - is Ġtest Ġsent ence Ġ3",
+                                                      "",
+                                                      ""},
+                                                     validity);  // same null pattern as the input
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
+}
+
+TEST_F(TextBPETokenize, BPE_Empty)
+{
+  auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"});  // non-empty merge-pairs table
+  nvtext::bpe_merge_pairs merge_pairs{mpt.release()};
+  auto empty   = cudf::make_empty_column(cudf::type_id::STRING);  // zero-row strings input
+  auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(empty->view()), merge_pairs);
+  EXPECT_EQ(0, results->size());  // zero rows in, zero rows out
+}
+
+TEST_F(TextBPETokenize, BPE_Error)
+{
+  auto empty = cudf::make_empty_column(cudf::type_id::STRING);
+  nvtext::bpe_merge_pairs merge_pairs{std::move(empty)};  // merge-pairs table with no rows
+  cudf::test::strings_column_wrapper input({"isit"});
+  EXPECT_THROW(nvtext::byte_pair_encoding(cudf::strings_column_view(input), merge_pairs),
+               cudf::logic_error);  // encoding against an empty table is expected to throw
+}

From ae2baa04d790722a5d81e0992b20b99501296682 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 11 Feb 2022 09:24:32 -0500
Subject: [PATCH 02/10] fix call to detail::rsplit_record

---
 cpp/src/strings/split/split_record.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu
index 832fd00725a..8834ece8734 100644
--- a/cpp/src/strings/split/split_record.cu
+++ b/cpp/src/strings/split/split_record.cu
@@ -328,7 +328,7 @@ std::unique_ptr<column> rsplit_record(strings_column_view const& strings,
                                       rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::split_record(strings, delimiter, maxsplit, rmm::cuda_stream_default, mr);
+  return detail::rsplit_record(strings, delimiter, maxsplit, rmm::cuda_stream_default, mr);
 }
 
 }  // namespace strings

From aa6f8e896019cf61445e2c57856a9407e46a790c Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 17 Feb 2022 16:47:37 -0500
Subject: [PATCH 03/10] change algorithm to use cuco::static-map

---
 conda/recipes/libcudf/meta.yaml           |   1 -
 cpp/include/cudf/strings/detail/split.hpp |  38 --
 cpp/include/nvtext/bpe_tokenize.hpp       |  24 +-
 cpp/src/text/subword/bpe_tokenizer.cu     | 617 ++++++++++++----------
 cpp/src/text/subword/bpe_tokenizer.cuh    |  59 +++
 cpp/src/text/subword/load_merges_file.cu  | 111 +++-
 cpp/tests/text/bpe_tests.cpp              |  13 +-
 7 files changed, 530 insertions(+), 333 deletions(-)
 delete mode 100644 cpp/include/cudf/strings/detail/split.hpp
 create mode 100644 cpp/src/text/subword/bpe_tokenizer.cuh

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 4dfacb76a95..4e20c979f6c 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -203,7 +203,6 @@ test:
     - test -f $PREFIX/include/cudf/strings/detail/fill.hpp
     - test -f $PREFIX/include/cudf/strings/detail/json.hpp
     - test -f $PREFIX/include/cudf/strings/detail/replace.hpp
-    - test -f $PREFIX/include/cudf/strings/detail/split.hpp
     - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
     - test -f $PREFIX/include/cudf/strings/extract.hpp
     - test -f $PREFIX/include/cudf/strings/findall.hpp
diff --git a/cpp/include/cudf/strings/detail/split.hpp b/cpp/include/cudf/strings/detail/split.hpp
deleted file mode 100644
index 2f00a82678b..00000000000
--- a/cpp/include/cudf/strings/detail/split.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cudf/strings/split/split.hpp>
-
-namespace cudf {
-namespace strings {
-namespace detail {
-
-/**
- * @copydoc split_record(strings_column_view const&,string_scalar const&,size_type,
- * rmm::mr::device_memory_resource*)
- *
- * @param stream CUDA stream used for device memory operations and kernel launches.
- */
-std::unique_ptr<column> split_record(strings_column_view const& input,
-                                     string_scalar const& delimiter,
-                                     size_type maxsplit,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
-
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp
index a8f971d97b2..53eda236d0b 100644
--- a/cpp/include/nvtext/bpe_tokenize.hpp
+++ b/cpp/include/nvtext/bpe_tokenize.hpp
@@ -34,7 +34,21 @@ namespace nvtext {
  * To create an instance, call nvtext::load_merges_table
  */
 struct bpe_merge_pairs {
-  std::unique_ptr<cudf::column> merge_pairs;  // strings
+  struct bpe_merge_pairs_impl;                   // only declared here; defined elsewhere
+  std::unique_ptr<bpe_merge_pairs_impl> impl{};  // opaque implementation state
+  // Build from a strings column passed as an rvalue unique_ptr.
+  bpe_merge_pairs(std::unique_ptr<cudf::column>&& input,
+                  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+                  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  // Build from a view of a strings column.
+  bpe_merge_pairs(cudf::strings_column_view const& input,
+                  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+                  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  // Declared here and defined out of line: bpe_merge_pairs_impl is an incomplete type here.
+  ~bpe_merge_pairs();
+  // NOTE(review): presumably the merge-pair count and the map size; neither is const -- confirm.
+  cudf::size_type get_size();
+  std::size_t get_map_size();
 };
 
 /**
@@ -75,9 +89,9 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
  *
  * @code{.pseudo}
  * mps = load_merges_file("merges.txt")
- * input = ["test sentence", "this is test"]
+ * input = ["test sentence", "thisis test"]
  * result = byte_pair_encoding(input, mps)
- * result is now ["test Ġsent tence", "this Ġis Ġtest"]
+ * result is now ["test sent ence", "this is test"]
  * @endcode
  *
  * @throw cudf::logic_error if `merge_pairs` is empty
@@ -86,13 +100,13 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
  * @param input Strings to encode.
  * @param merge_pairs Created by a call to nvtext::load_merges_file.
  * @param separator String used to build the output after encoding.
- *                  Default is a space followed by `Ġ`.
+ *                  Default is a space.
  * @param mr Memory resource to allocate any returned objects.
  */
 std::unique_ptr<cudf::column> byte_pair_encoding(
   cudf::strings_column_view const& input,
   bpe_merge_pairs const& merges_pairs,
-  cudf::string_scalar const& separator = cudf::string_scalar(" Ġ"),
+  cudf::string_scalar const& separator = cudf::string_scalar(" "),
   rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index a47be57126f..20837716845 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -14,16 +14,15 @@
  * limitations under the License.
  */
 
+#include <text/subword/bpe_tokenizer.cuh>
+
 #include <nvtext/bpe_tokenize.hpp>
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/get_value.cuh>
-#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/detail/combine.hpp>
-#include <cudf/strings/detail/split.hpp>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/utilities/error.hpp>
@@ -31,230 +30,245 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <thrust/copy.h>
+#include <thrust/count.h>
 #include <thrust/for_each.h>
-#include <thrust/reduce.h>
+#include <thrust/merge.h>
+#include <thrust/transform.h>
 
 namespace nvtext {
 namespace detail {
 
 namespace {
 
-/**
- * @brief Initialize the byte indices and the pair rank for each string.
- */
-struct initialize_indices_fn {
-  cudf::column_device_view const d_merges;
-  cudf::column_device_view const d_strings;
-  cudf::size_type* d_byte_indices;
-  cudf::size_type* d_min_ranks;
-
-  __device__ void operator()(cudf::size_type idx)
-  {
-    d_min_ranks[idx] = cuda::std::numeric_limits<cudf::size_type>::max();
-
-    if (d_strings.is_null(idx)) { return; }
-
-    auto const d_str = d_strings.element<cudf::string_view>(idx);
-    if (d_str.empty()) { return; }
-
-    auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index)
-                          .element<cudf::offset_type>(idx);
-    auto d_indices = d_byte_indices + offset;
-
-    // set the index value for each byte
-    for (auto i = 0; i < d_str.size_bytes(); ++i) {
-      auto const byte = static_cast<uint8_t>(d_str.data()[i]);
-      // for intermediate UTF-8 bytes set the index value to 0
-      d_indices[i] = cudf::strings::detail::is_begin_utf8_char(byte) ? i : 0;
-    }
-  }
-};
-
-/**
- * @brief Parse the merge pair into components.
- *
- * The two substrings are separated by a single space.
- *
- * @param d_pair String to dissect
- * @return The left and right halves of the input pair.
- */
-__device__ thrust::pair<cudf::string_view, cudf::string_view> dissect_merge_pair(
-  cudf::string_view const& d_pair)
+template <typename CharType>
+constexpr bool is_whitespace(CharType ch)  // true only for ASCII space and control characters
 {
-  auto const lhs      = d_pair.data();
-  auto const end_str  = d_pair.data() + d_pair.size_bytes();
-  auto const rhs      = thrust::find(thrust::seq, lhs, end_str, ' ') + 1;
-  auto const lhs_size = static_cast<cudf::size_type>(thrust::distance(lhs, rhs - 1));
-  auto const rhs_size = static_cast<cudf::size_type>(thrust::distance(rhs, end_str));
-  return thrust::make_pair(cudf::string_view(lhs, lhs_size), cudf::string_view(rhs, rhs_size));
+  return static_cast<uint32_t>(ch) <= static_cast<uint32_t>(' ');  // UTF-8 bytes >= 0x80: false
 }
 
 /**
- * @brief Get the next substring of the given string.
+ * @brief Resolve a truncated string from a full string.
  *
- * This will find the next sequence of characters identified by the
- * given byte indices iterator values. The beginning of the sequence
- * starts at `begin` and the end of the sequence is the first non-zero
- * index found between (begin,end) exclusive.
+ * This will return a substring of the input starting with the first byte
+ * upto the first whitespace character is found or the end of the string.
+ * Any whitespace is expected only at the end of the string.
  *
- * @tparam Iterator The byte indices iterator type
- * @param begin Start of indices to check
- * @param end End of indices to check
- * @param d_str String to substring
- * @return The substring found.
+ * @param d_str Input string to resolve.
+ * @return Substring of the input excluding trailing whitespace.
  */
-template <typename Iterator>
-__device__ cudf::string_view next_substr(Iterator begin,
-                                         Iterator end,
-                                         cudf::string_view const& d_str)
+__device__ cudf::string_view resolve_string(cudf::string_view const& d_str)
 {
-  auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; });
-  auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));
-  return cudf::string_view(d_str.data() + *begin, size);
+  auto const begin = d_str.data();  // first byte of the input
+  auto end         = thrust::find_if(
+    thrust::seq, begin, begin + d_str.size_bytes(), [](auto ch) { return is_whitespace(ch); });
+  auto size = static_cast<cudf::size_type>(thrust::distance(begin, end));  // bytes kept
+  return cudf::string_view(begin, size);  // everything from the first whitespace on is dropped
 }
 
 /**
- * @brief Iterate over the merge pairs and to find the minimum rank in each string.
+ * @brief Main byte pair encoding algorithm function for each string.
  *
- * As a merge pair is located in each string, the minimum rank is accumulated in
- * the output `d_min_ranks`. The rank is simply the position of the merge pair
- * in the `d_merges` column.
+ * @see The byte_pair_encoding_fn::operator() function below for details.
  */
-struct find_minimum_pair_fn {
+struct byte_pair_encoding_fn {
   cudf::column_device_view const d_merges;
   cudf::column_device_view const d_strings;
+  merge_pairs_map_type::device_view const d_map;
+  cudf::size_type* d_sizes;  // output size of encoded string
+  string_hasher_type const hasher;
   cudf::size_type* d_byte_indices;
-  cudf::size_type* d_min_ranks;
 
-  // index is over the merges table
-  __device__ void operator()(cudf::size_type index)
+  /**
+   * @brief Parse the merge pair into components.
+   *
+   * The two substrings are separated by a single space.
+   *
+   * @param idx Index of merge pair to dissect.
+   * @return The left and right halves of the merge pair.
+   */
+  __device__ thrust::pair<cudf::string_view, cudf::string_view> dissect_merge_pair(
+    cudf::size_type idx)
   {
-    auto const d_pair = dissect_merge_pair(d_merges.element<cudf::string_view>(index));
+    auto const d_pair  = d_merges.element<cudf::string_view>(idx);
+    auto const lhs     = d_pair.data();
+    auto const end_str = lhs + d_pair.size_bytes();
+    auto const space   = thrust::find(thrust::seq, lhs, end_str, ' ');
+    auto const rhs     = (space == end_str) ? end_str : space + 1;  // no space: rhs is empty
+    return thrust::make_pair(cudf::string_view(lhs, static_cast<cudf::size_type>(space - lhs)),
+                             cudf::string_view(rhs, static_cast<cudf::size_type>(end_str - rhs)));
+  }
 
-    // locate this pair in each string
-    for (auto idx = 0; idx < d_strings.size(); ++idx) {
-      if (d_strings.is_null(idx)) continue;
-      auto const d_str = d_strings.element<cudf::string_view>(idx);
-      if (d_str.empty()) continue;
+  /**
+   * @brief Get the next substring of the given string.
+   *
+   * This will find the next sequence of characters identified by the
+   * given byte indices iterator values. The beginning of the sequence
+   * starts at `begin` and the end of the sequence is the first non-zero
+   * index found between (begin,end) exclusive.
+   *
+   * @tparam Iterator The byte indices iterator type
+   * @param begin Start of indices to check
+   * @param end End of indices to check
+   * @param d_str String to substring
+   * @return The substring found.
+   */
+  template <typename Iterator>
+  __device__ cudf::string_view next_substr(Iterator begin,
+                                           Iterator end,
+                                           cudf::string_view const& d_str)
+  {
+    auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; });
+    auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));  // byte count
+    return cudf::string_view(d_str.data() + *begin, size);  // *begin is used as the byte offset
+  }
 
-      auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index)
-                            .element<cudf::offset_type>(idx);
-      auto d_indices = d_byte_indices + offset;
+  /**
+   * @brief Compute the hash over the input strings.
+   *
+   * The input strings are combined with a space to produce hash for matching
+   * a merge pair within the `d_map`.
+   *
+   * @param lhs First string.
+   * @param rhs Second string.
+   * @return The hash value to match with `d_map`.
+   */
+  __device__ hash_value_type compute_hash(cudf::string_view const& lhs,
+                                          cudf::string_view const& rhs)
   {
+    __shared__ char shmem[48 * 1024];  // max for Pascal; split evenly among the block's threads
+    auto const total_size         = lhs.size_bytes() + rhs.size_bytes() + 1;
+    auto const thread_memory_size = static_cast<cudf::size_type>(sizeof(shmem) / blockDim.x);
 
-      auto const begin = d_indices;
-      auto const end   = d_indices + d_str.size_bytes();
+    // Edge case check: the combined string does not fit in this thread's share of shmem.
+    // Empirically found only two merge pair strings that were greater than 70 bytes
+    // and they both looked like ignorable errors. Double check this analysis with Vibhu.
+    if (thread_memory_size < total_size) { return 0; }  // NOTE(review): caller still probes 0
 
-      // check for the merge-pair in this string
-      auto lhs = next_substr(begin, end, d_str);
-      auto itr = begin + lhs.size_bytes();
-      while (itr < end) {
-        auto rhs = next_substr(itr, end, d_str);
-        if (rhs.empty()) break;
+    // this thread's slice of shmem; only threadIdx.x is used, so a 1-D block is assumed
+    char* ptr = &shmem[threadIdx.x * thread_memory_size];
 
-        if (d_pair.first == lhs && d_pair.second == rhs) {
-          // found a match, record the rank
-          atomicMin(d_min_ranks + idx, index);
-          break;  // done with this string
-        }
+    // build a temp string like:  temp = lhs + ' ' + rhs
+    memcpy(ptr, lhs.data(), lhs.size_bytes());
+    memcpy(ptr + lhs.size_bytes(), " ", 1);
+    memcpy(ptr + lhs.size_bytes() + 1, rhs.data(), rhs.size_bytes());
 
-      // next substring
-      lhs = rhs;
-      itr += rhs.size_bytes();
-      }
-    }
+    auto const d_hash_str = cudf::string_view(ptr, total_size);  // view over the temp string
+    return hasher(d_hash_str);  // return the hash for the temp string
   }
-};
-
-/**
- * @brief Remove merge pair from each string.
- *
- * The minimum rank found for each string used to identify the pair(s)
- * to be removed. The pairs are removed by just zeroing the byte index
- * found between the adjacent substrings.
- *
- * @code{.txt}
- * d_strings =        ["helloworld", "testisthis"]
- * d_byte_indices =   [ 0123456789    01234567]
- * d_merges[d_min_ranks] = [ "ll o", "i s" ]
- *
- * d_bytes_indices -> [ 0123056789 01234060 ]
- * d_min_ranks is reset to [ max, max ]
- * @endcode
- *
- */
-struct remove_pair_fn {
-  cudf::column_device_view const d_merges;
-  cudf::column_device_view const d_strings;
-  cudf::size_type* d_byte_indices;
-  cudf::size_type* d_min_ranks;
 
+  /**
+   * @brief Byte encode each string.
+   *
+   * Each string is iteratively scanned for the minimum rank of adjacent substring pairs
+   * as found within the `d_map` table. Once the minimum pair is located, that pair
+   * is removed -- virtually by zero-ing the index value between any matching adjacent pairs.
+   *
+   * The iteration ends once there are no more adjacent pairs or there are no more
+   * matches found in `d_map`. At the end, the indices for each string reflect the
+   * encoding pattern and can be used to build the output.
+   *
+   * This function also computes the size of the encoded output of each string
+   * by simply counting the number of non-zero indices values remaining. This saves
+   * an extra kernel launch normally required to compute the offsets of the output column.
+   *
+   * @param idx The index of the string in `d_strings` to encode
+   */
   __device__ void operator()(cudf::size_type idx)
   {
-    if (d_strings.is_null(idx)) return;
-    auto const d_str = d_strings.element<cudf::string_view>(idx);
-    if (d_str.empty()) return;
-
-    auto rank = d_min_ranks[idx];
-    if (rank == cuda::std::numeric_limits<cudf::size_type>::max()) return;
-
-    auto const d_pair = dissect_merge_pair(d_merges.element<cudf::string_view>(rank));
+    d_sizes[idx] = 0;  // null/empty rows have no output bytes; overwritten at the end otherwise
+    if (d_strings.is_null(idx)) { return; }
+    auto const d_str = resolve_string(d_strings.element<cudf::string_view>(idx));
+    if (d_str.empty()) { return; }
 
-    // resolve byte indices for this string
     auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index)
                           .element<cudf::offset_type>(idx);
-    auto d_indices = d_byte_indices + offset;
+    auto const d_indices = d_byte_indices + offset;
+
+    // initialize this string's byte indices; intermediate UTF-8 bytes get the value 0
+    thrust::transform(thrust::seq,
+                      thrust::make_counting_iterator<cudf::size_type>(0),
+                      thrust::make_counting_iterator<cudf::size_type>(d_str.size_bytes()),
+                      d_indices,
+                      [data = d_str.data()](auto idx) {
+                        auto const byte = static_cast<uint8_t>(data[idx]);
+                        return cudf::strings::detail::is_begin_utf8_char(byte) ? idx : 0;
+                      });
 
     auto const begin = d_indices;
     auto const end   = d_indices + d_str.size_bytes();
 
-    // locate d_pair and remove it from this string
-    auto lhs = next_substr(begin, end, d_str);
-    auto itr = begin + lhs.size_bytes();
-    while (itr < end) {
-      auto rhs = next_substr(itr, end, d_str);
-      if (d_pair.first == lhs && d_pair.second == rhs) {
-        *itr = 0;  // removes the pair from this string
+    // keep processing the string until there are no more adjacent pairs found in d_map
+    cudf::size_type min_rank = 0;
+    while (min_rank < cuda::std::numeric_limits<cudf::size_type>::max()) {
+      // initialize working variables
+      min_rank = cuda::std::numeric_limits<cudf::size_type>::max();
+
+      auto lhs = next_substr(begin, end, d_str);
+      auto itr = begin + lhs.size_bytes();
+
+      auto min_itr  = itr;               // these are set along with
+      auto min_size = lhs.size_bytes();  // the min_rank variable
+
+      // check each adjacent pair against the d_map
+      while (itr < end) {
+        auto const rhs = next_substr(itr, end, d_str);
+        if (rhs.empty()) break;  // no more adjacent pairs
+
+        auto const hash    = compute_hash(lhs, rhs);
+        auto const map_itr = d_map.find(hash);
+        if (map_itr != d_map.end()) {
+          // found a match; record the rank (and other min_ vars)
+          auto const rank = static_cast<cudf::size_type>(map_itr->second);
+          if (rank < min_rank) {
+            min_rank = rank;
+            min_itr  = itr;
+            min_size = rhs.size_bytes();
+          }
+        }
+        // next substring
+        lhs = rhs;
         itr += rhs.size_bytes();
+      }
+
+      // if any pair matched, remove every occurrence from the string
+      if (min_rank < cuda::std::numeric_limits<cudf::size_type>::max()) {
+        // remove the first pair we found
+        itr  = min_itr;
+        *itr = 0;
+
+        // continue scanning for other occurrences in the remainder of the string
+        itr += min_size;
         if (itr < end) {
-          rhs = next_substr(itr, end, d_str);  // skip to the next pair
-        } else {
-          break;  // done with this string
+          auto const d_pair = dissect_merge_pair(min_rank);
+
+          lhs = next_substr(itr, end, d_str);
+          itr += lhs.size_bytes();
+          while (itr < end) {
+            auto rhs = next_substr(itr, end, d_str);
+            if (d_pair.first == lhs && d_pair.second == rhs) {
+              *itr = 0;  // removes the pair from this string
+              itr += rhs.size_bytes();
+              if (itr >= end) { break; }  // done checking for pairs
+              // skip to the next adjacent pair
+              rhs = next_substr(itr, end, d_str);
+            }
+            // next substring
+            lhs = rhs;
+            itr += rhs.size_bytes();
+          }
         }
       }
-      // next substring
-      lhs = rhs;
-      itr += rhs.size_bytes();
     }
 
-    // reset for next iteration
-    d_min_ranks[idx] = cuda::std::numeric_limits<cudf::size_type>::max();
-  }
-};
-
-/**
- * @brief Computes the output size of each string.
- *
- * The output size is the size of the current string plus the
- * number of spaces to be added between adjacent substrings.
- * The number of spaces will equal the number of non-zero byte indices
- * for the string.
- */
-struct compute_sizes_fn {
-  cudf::column_device_view const d_strings;
-  cudf::size_type* d_byte_indices;
-
-  __device__ cudf::size_type operator()(cudf::size_type idx)
-  {
-    if (d_strings.is_null(idx)) return 0;
-    auto const d_str = d_strings.element<cudf::string_view>(idx);
-    auto offset      = d_strings.child(cudf::strings_column_view::offsets_column_index)
-                    .element<cudf::offset_type>(idx);
-    auto d_indices = d_byte_indices + offset;
-    return d_str.size_bytes() + thrust::count_if(  // number of non-zero byte indices
-                                  thrust::seq,
-                                  d_indices,
-                                  d_indices + d_str.size_bytes(),
-                                  [](auto v) { return v != 0; });
+    // compute and store the output size for this string's encoding
+    auto const encoded_size = d_str.size_bytes() +  // number of original bytes +
+                              thrust::count_if(     // number of non-zero byte indices
+                                thrust::seq,
+                                d_indices,
+                                d_indices + d_str.size_bytes(),
+                                [](auto v) { return v != 0; });
+    d_sizes[idx] = static_cast<cudf::size_type>(encoded_size);
   }
 };
 
@@ -271,27 +285,28 @@ struct compute_sizes_fn {
  */
 struct build_encoding_fn {
   cudf::column_device_view const d_strings;
-  cudf::size_type* d_byte_indices;
+  cudf::size_type const* d_byte_indices;
   cudf::offset_type const* d_offsets;
   char* d_chars{};
 
   __device__ void operator()(cudf::size_type idx)
   {
-    if (d_strings.is_null(idx)) return;
-    auto const d_str = d_strings.element<cudf::string_view>(idx);
-    if (d_str.empty()) return;
-
-    auto offset = d_strings.child(cudf::strings_column_view::offsets_column_index)
-                    .element<cudf::offset_type>(idx);
-    auto d_indices = d_byte_indices + offset;
-    auto d_output  = d_chars ? d_chars + d_offsets[idx] : nullptr;
-
-    // copy chars while indices==0, add space each time indices!=0
-    auto begin   = d_indices;
-    auto end     = d_indices + d_str.size_bytes();
-    auto d_input = d_str.data();
-    *d_output++  = *d_input++;
-    auto itr     = begin + 1;
+    if (d_strings.is_null(idx)) { return; }
+    auto const d_str = resolve_string(d_strings.element<cudf::string_view>(idx));
+    if (d_str.empty()) { return; }
+
+    auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index)
+                          .element<cudf::offset_type>(idx);
+    auto const d_indices = d_byte_indices + offset;
+    auto d_output        = d_chars ? d_chars + d_offsets[idx] : nullptr;
+
+    // copy chars while indices[i]==0,
+    // insert space each time indices[i]!=0
+    auto const begin = d_indices;
+    auto const end   = d_indices + d_str.size_bytes();
+    auto d_input     = d_str.data();
+    *d_output++      = *d_input++;
+    auto itr         = begin + 1;
     while (itr < end) {
       if (*itr++) *d_output++ = ' ';
       *d_output++ = *d_input++;
@@ -310,119 +325,173 @@ struct build_encoding_fn {
  *
  * Once all pairs have exhausted for all strings, the output is constructed from
  * the results by adding spaces between each remaining pair in each string.
+ *
+ * @param input Strings to encode.
+ * @param merge_pairs Merge pairs data and map used for encoding.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  */
 std::unique_ptr<cudf::column> byte_pair_encoding(
   cudf::strings_column_view const& input,
-  bpe_merge_pairs const& merges_table,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  bpe_merge_pairs::bpe_merge_pairs_impl const& merge_pairs,
+  rmm::cuda_stream_view stream)
 {
-  // build working vectors
-  rmm::device_uvector<cudf::size_type> d_byte_indices(input.chars().size(), stream);
-  rmm::device_uvector<cudf::size_type> d_min_ranks(input.size(), stream);
+  CUDF_EXPECTS(!merge_pairs.get_merge_pairs().is_empty(), "Merge pairs table must not be empty");
 
-  auto d_merges  = cudf::column_device_view::create(merges_table.merge_pairs->view(), stream);
-  auto d_strings = cudf::column_device_view::create(input.parent(), stream);
+  // build working vector to hold index values per byte
+  rmm::device_uvector<cudf::size_type> d_byte_indices(input.chars().size(), stream);
 
+  auto const d_merges  = cudf::column_device_view::create(merge_pairs.get_merge_pairs(), stream);
+  auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
+
+  auto offsets   = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32},
+                                           static_cast<cudf::size_type>(input.size() + 1),
+                                           cudf::mask_state::UNALLOCATED,
+                                           stream,
+                                           rmm::mr::get_current_device_resource());
+  auto d_offsets = offsets->mutable_view().data<cudf::offset_type>();
+
+  byte_pair_encoding_fn fn{*d_merges,
+                           *d_strings,
+                           merge_pairs.get_merge_pairs_map(),
+                           d_offsets,
+                           string_hasher_type{},
+                           d_byte_indices.data()};
   thrust::for_each_n(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<cudf::size_type>(0),
-    input.size(),
-    initialize_indices_fn{*d_merges, *d_strings, d_byte_indices.data(), d_min_ranks.data()});
-
-  cudf::size_type min_rank = 0;
-  while (min_rank < std::numeric_limits<cudf::size_type>::max()) {
-    // find minimum merge pair for each string
-    thrust::for_each_n(
-      rmm::exec_policy(stream),
-      thrust::make_counting_iterator<cudf::size_type>(0),
-      d_merges->size(),
-      find_minimum_pair_fn{*d_merges, *d_strings, d_byte_indices.data(), d_min_ranks.data()});
-
-    // get the minimum rank over all strings;
-    // this is only used to see if we are finished
-    min_rank = thrust::reduce(rmm::exec_policy(stream),
-                              d_min_ranks.begin(),
-                              d_min_ranks.end(),
-                              std::numeric_limits<cudf::size_type>::max(),
-                              thrust::minimum<cudf::size_type>{});
-
-    // check if any pairs have been found;
-    // if so, remove that pair from each string
-    if (min_rank < std::numeric_limits<cudf::size_type>::max()) {
-      thrust::for_each_n(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator<cudf::size_type>(0),
-        input.size(),
-        remove_pair_fn{*d_merges, *d_strings, d_byte_indices.data(), d_min_ranks.data()});
-    }
-  }
+    rmm::exec_policy(stream), thrust::make_counting_iterator<cudf::size_type>(0), input.size(), fn);
 
-  // build the output:
-  // add spaces between the remaining pairs in each string
-  auto offsets_itr =
-    thrust::make_transform_iterator(thrust::make_counting_iterator<cudf::size_type>(0),
-                                    compute_sizes_fn{*d_strings, d_byte_indices.data()});
-  auto offsets = cudf::strings::detail::make_offsets_child_column(
-    offsets_itr, offsets_itr + input.size(), stream, mr);
-  auto d_offsets = offsets->view().data<cudf::offset_type>();
+  // build the output: add spaces between the remaining pairs in each string
+  thrust::exclusive_scan(
+    rmm::exec_policy(stream), d_offsets, d_offsets + input.size() + 1, d_offsets);
 
   auto const bytes = cudf::detail::get_value<int32_t>(offsets->view(), input.size(), stream);
-  auto chars       = cudf::strings::detail::create_chars_child_column(bytes, stream, mr);
-  auto d_chars     = chars->mutable_view().data<char>();
+  auto chars       = cudf::strings::detail::create_chars_child_column(
+    bytes, stream, rmm::mr::get_current_device_resource());
+  auto d_chars = chars->mutable_view().data<char>();
+
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::make_counting_iterator<cudf::size_type>(0),
                      input.size(),
                      build_encoding_fn{*d_strings, d_byte_indices.data(), d_offsets, d_chars});
 
-  return make_strings_column(input.size(),
-                             std::move(offsets),
-                             std::move(chars),
-                             input.null_count(),
-                             cudf::detail::copy_bitmask(input.parent(), stream, mr));
+  return make_strings_column(
+    input.size(), std::move(offsets), std::move(chars), 0, rmm::device_buffer{});
+}
+
+/**
+ * @brief Create new offsets by identifying substrings by whitespace.
+ *
+ * This is similar to cudf::strings::split_record but does not fully split
+ * and only returns new offsets. The behavior is more like a view-only slice
+ * of the chars child with the result still including trailing delimiters.
+ *
+ * The encoding algorithm knows to ignore the trailing whitespace of each string.
+ *
+ * @param input Strings to tokenize.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+std::unique_ptr<cudf::column> space_offsets(cudf::strings_column_view const& input,
+                                            rmm::cuda_stream_view stream)
+{
+  // count space offsets
+  // TODO: does not yet account for sliced input column
+  auto const begin   = thrust::make_counting_iterator<cudf::size_type>(1);
+  auto const end     = thrust::make_counting_iterator<cudf::size_type>(input.chars().size());
+  auto edge_of_space = [d_chars = input.chars().data<char>()] __device__(auto idx) -> bool {
+    return !is_whitespace(d_chars[idx]) && is_whitespace(d_chars[idx - 1]);
+  };
+  auto space_count = thrust::count_if(rmm::exec_policy(stream), begin, end, edge_of_space);
+
+  // copy space offsets
+  rmm::device_uvector<cudf::offset_type> space_offsets(space_count, stream);
+  thrust::copy_if(rmm::exec_policy(stream), begin, end, space_offsets.data(), edge_of_space);
+
+  // create output offsets
+  auto split_offsets =
+    cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32},
+                              static_cast<cudf::size_type>(space_count + input.size() + 1),
+                              cudf::mask_state::UNALLOCATED,
+                              stream,
+                              rmm::mr::get_current_device_resource());
+
+  // combine current offsets with space offsets
+  thrust::merge(rmm::exec_policy(stream),
+                input.offsets_begin(),
+                input.offsets_end(),
+                space_offsets.begin(),
+                space_offsets.end(),
+                split_offsets->mutable_view().begin<cudf::offset_type>());
+
+  return split_offsets;
 }
 
+/**
+ * @brief Build new offsets that can be used to build a list column for calling join.
+ *
+ * This essentially returns the number of tokens for each string.
+ */
+struct split_offsets_fn {
+  cudf::column_device_view const d_strings;
+  __device__ cudf::size_type operator()(cudf::size_type idx)
+  {
+    if (d_strings.is_null(idx)) return 0;
+    auto const d_str = d_strings.element<cudf::string_view>(idx);
+    // TODO: does not correctly account for adjacent whitespace
+    auto const result = thrust::count_if(
+      thrust::seq, d_str.begin(), d_str.end(), [](auto ch) { return is_whitespace(ch); });
+    return static_cast<cudf::size_type>(result) + 1;
+  }
+};
+
 }  // namespace
 
 std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const& input,
-                                                 bpe_merge_pairs const& merges_table,
+                                                 bpe_merge_pairs const& merge_pairs,
                                                  cudf::string_scalar const& separator,
                                                  rmm::cuda_stream_view stream,
                                                  rmm::mr::device_memory_resource* mr)
 {
-  auto const strings_count = input.size();
-  if (strings_count == 0 || input.chars_size() == 0)
+  if (input.is_empty() || input.chars_size() == 0)
     return cudf::make_empty_column(cudf::type_id::STRING);
-  CUDF_EXPECTS(!merges_table.merge_pairs->view().is_empty(), "Merge pairs table must not be empty");
 
-  // split input on whitespace
-  auto split_result = cudf::strings::detail::split_record(
-    input, cudf::string_scalar(""), -1, stream, rmm::mr::get_current_device_resource());
-  auto split_view = cudf::lists_column_view(split_result->view());
+  auto d_strings   = cudf::column_device_view::create(input.parent(), stream);
+  auto offsets_itr = thrust::make_transform_iterator(
+    thrust::make_counting_iterator<cudf::size_type>(0), split_offsets_fn{*d_strings});
+  auto split_offsets = cudf::strings::detail::make_offsets_child_column(
+    offsets_itr, offsets_itr + input.size(), stream, rmm::mr::get_current_device_resource());
+
+  auto offsets = space_offsets(input, stream);
+
+  auto const split_view = cudf::column_view(cudf::data_type{cudf::type_id::STRING},
+                                            offsets->size() - 1,
+                                            nullptr,  // no parent data
+                                            nullptr,  // null-mask
+                                            0,        // null-count
+                                            0,        // offset
+                                            {offsets->view(), input.chars()});
 
   // run BPE on the strings child column
-  auto bpe_column = byte_pair_encoding(split_view.child(), merges_table, stream);
+  auto bpe_column =
+    byte_pair_encoding(cudf::strings_column_view(split_view), *(merge_pairs.impl), stream);
 
-  // recombine the result:
-  // use the offsets from split_record and the strings from byte_pair_encoding
-  // to build a lists column_view
+  // recombine the result
   auto list_join = cudf::column_view(cudf::data_type{cudf::type_id::LIST},
-                                     strings_count,
+                                     input.size(),
                                      nullptr,  // no parent data in list column
-                                     split_view.null_mask(),
-                                     split_view.null_count(),
+                                     input.null_mask(),
+                                     input.null_count(),
                                      0,
-                                     {split_view.offsets(), bpe_column->view()});
+                                     {split_offsets->view(), bpe_column->view()});
 
   // use join_list_elements to build the output strings column
-  return cudf::strings::detail::join_list_elements(
-    cudf::lists_column_view(list_join),
-    separator,
-    cudf::string_scalar(""),
-    cudf::strings::separator_on_nulls::NO,
-    cudf::strings::output_if_empty_list::EMPTY_STRING,
-    stream,
-    mr);
+  auto result =
+    cudf::strings::detail::join_list_elements(cudf::lists_column_view(list_join),
+                                              separator,
+                                              cudf::string_scalar(""),
+                                              cudf::strings::separator_on_nulls::NO,
+                                              cudf::strings::output_if_empty_list::EMPTY_STRING,
+                                              stream,
+                                              mr);
+  return result;
 }
 
 }  // namespace detail
diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
new file mode 100644
index 00000000000..ac21eea0261
--- /dev/null
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <hash/hash_allocator.cuh>
+
+#include <nvtext/bpe_tokenize.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/detail/utilities/hash_functions.cuh>
+
+#include <cuco/static_map.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/polymorphic_allocator.hpp>
+
+#include <cstdint>
+
+namespace nvtext {
+namespace detail {
+
+using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;
+
+using merge_pairs_map_type = cuco::static_map<hash_value_type,
+                                              cudf::size_type,
+                                              cuda::thread_scope_device,
+                                              hash_table_allocator_type>;
+
+using string_hasher_type = MurmurHash3_32<cudf::string_view>;
+
+}  // namespace detail
+
+struct bpe_merge_pairs::bpe_merge_pairs_impl {
+  std::unique_ptr<cudf::column> const merge_pairs;
+  std::unique_ptr<detail::merge_pairs_map_type> merge_pairs_map;
+
+  bpe_merge_pairs_impl(std::unique_ptr<cudf::column>&& merge_pairs,
+                       std::unique_ptr<detail::merge_pairs_map_type>&& merge_pairs_map);
+
+  auto get_merge_pairs() const { return merge_pairs->view(); }
+  auto get_merge_pairs_map() const { return merge_pairs_map->get_device_view(); }
+};
+
+}  // namespace nvtext
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index eaeaec03a96..1ea2bdcaba5 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -14,9 +14,12 @@
  * limitations under the License.
  */
 
+#include <text/subword/bpe_tokenizer.cuh>
+
 #include <nvtext/bpe_tokenize.hpp>
 
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/error.hpp>
@@ -31,6 +34,22 @@
 namespace nvtext {
 namespace detail {
 
+namespace {
+
+struct make_pair_function {
+  /**
+   * @brief Hash the merge pair entry
+   */
+  __device__ cuco::pair_type<hash_value_type, cudf::size_type> operator()(cudf::size_type idx)
+  {
+    auto const result = _hasher(d_strings.element<cudf::string_view>(idx));
+    return cuco::make_pair(result, idx);
+  }
+
+  string_hasher_type const _hasher;
+  cudf::column_device_view const d_strings;
+};
+
 /**
  * @brief Loads a text file of merge-pairs into a strings column.
  *
@@ -41,18 +60,17 @@ namespace detail {
  * #version ..
  * a1 a2
  * b1 b2
- * c1 c3
+ * c1 c2
  * ...
  * @endcode
  *
  * @param filename_merges Path to text file containing merge-pairs.
  * @return object containing table elements for the BPE function
  */
-std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(std::string const& filename_merges,
-                                                       rmm::cuda_stream_view stream,
-                                                       rmm::mr::device_memory_resource* mr)
+std::unique_ptr<cudf::column> load_file_to_column(std::string const& filename_merges,
+                                                  rmm::cuda_stream_view stream,
+                                                  rmm::mr::device_memory_resource* mr)
 {
-  bpe_merge_pairs result;
   std::ifstream merges_file(filename_merges);
   CUDF_EXPECTS(merges_file.good(), "Could not open " + filename_merges);
 
@@ -74,11 +92,60 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(std::string const& filena
 
   CUDF_EXPECTS(!chars.empty(), "No data found in " + filename_merges);
 
-  auto d_chars       = cudf::detail::make_device_uvector_async(chars, stream, mr);
-  auto d_offsets     = cudf::detail::make_device_uvector_async(offsets, stream, mr);
-  result.merge_pairs = cudf::make_strings_column(d_chars, d_offsets);
+  auto d_chars   = cudf::detail::make_device_uvector_async(chars, stream, mr);
+  auto d_offsets = cudf::detail::make_device_uvector_async(offsets, stream, mr);
+  return cudf::make_strings_column(d_chars, d_offsets);
+}
 
-  return std::make_unique<bpe_merge_pairs>(std::move(result));
+std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
+  cudf::strings_column_view const& input, rmm::cuda_stream_view stream)
+{
+  auto merge_pairs_map = std::make_unique<merge_pairs_map_type>(
+    static_cast<size_t>(input.size() * 2),  // ensure capacity is at least (size*10/7)
+    std::numeric_limits<hash_value_type>::max(),
+    -1,  // empty-value is not used
+    hash_table_allocator_type{default_allocator<char>{}, stream},
+    stream.value());
+
+  auto d_strings = cudf::column_device_view::create(input.parent(), stream);
+  make_pair_function pair_func{string_hasher_type{}, *d_strings};
+  auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func);
+
+  merge_pairs_map->insert(iter,
+                          iter + input.size(),
+                          cuco::detail::MurmurHash3_32<hash_value_type>{},
+                          thrust::equal_to<hash_value_type>{},
+                          stream.value());
+
+  return merge_pairs_map;
+}
+
+std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_impl(
+  std::unique_ptr<cudf::column>&& input, rmm::cuda_stream_view stream)
+{
+  auto merge_pairs = initialize_merge_pairs_map(cudf::strings_column_view(input->view()), stream);
+  auto result      = std::make_unique<nvtext::bpe_merge_pairs::bpe_merge_pairs_impl>(
+    std::move(input), std::move(merge_pairs));
+  return result;
+}
+
+std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_impl(
+  cudf::strings_column_view const& input,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  return create_bpe_merge_pairs_impl(std::make_unique<cudf::column>(input.parent(), stream, mr),
+                                     stream);
+}
+
+}  // namespace
+
+std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(std::string const& filename_merges,
+                                                       rmm::cuda_stream_view stream,
+                                                       rmm::mr::device_memory_resource* mr)
+{
+  auto input_column = load_file_to_column(filename_merges, stream, mr);
+  return std::make_unique<bpe_merge_pairs>(std::move(input_column), stream, mr);
 }
 
 }  // namespace detail
@@ -90,4 +157,30 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(std::string const& filena
   return detail::load_merge_pairs_file(filename_merges, rmm::cuda_stream_default, mr);
 }
 
+bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl(
+  std::unique_ptr<cudf::column>&& merge_pairs,
+  std::unique_ptr<detail::merge_pairs_map_type>&& merge_pairs_map)
+  : merge_pairs(std::move(merge_pairs)), merge_pairs_map(std::move(merge_pairs_map))
+{
+}
+
+bpe_merge_pairs::bpe_merge_pairs(std::unique_ptr<cudf::column>&& input,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource*)
+  : impl(detail::create_bpe_merge_pairs_impl(std::move(input), stream))
+{
+}
+
+bpe_merge_pairs::bpe_merge_pairs(cudf::strings_column_view const& input,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
+  : impl(detail::create_bpe_merge_pairs_impl(input, stream, mr))
+{
+}
+
+bpe_merge_pairs::~bpe_merge_pairs() = default;
+
+cudf::size_type bpe_merge_pairs::get_size() { return impl->merge_pairs->size(); }
+std::size_t bpe_merge_pairs::get_map_size() { return impl->merge_pairs_map->get_size(); }
+
 }  // namespace nvtext
diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp
index a009919fa21..a0bb926ee9d 100644
--- a/cpp/tests/text/bpe_tests.cpp
+++ b/cpp/tests/text/bpe_tests.cpp
@@ -29,7 +29,7 @@ struct TextBPETokenize : public cudf::test::BaseFixture {
 
 TEST_F(TextBPETokenize, BytePairEncoding)
 {
-  // fake table based on values from https://huggingface.co/gpt2/raw/main/merges.txt
+  // partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt
   auto mpt = cudf::test::strings_column_wrapper({
     "e n",    // 12
     "i t",    // 14
@@ -44,7 +44,8 @@ TEST_F(TextBPETokenize, BytePairEncoding)
     "t est",  // 9032
     "s ent",  // 33830
   });
-  nvtext::bpe_merge_pairs merge_pairs{mpt.release()};
+
+  nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)};
 
   auto validity = cudf::test::iterators::null_at(4);
   cudf::test::strings_column_wrapper input({"This is it",
@@ -58,10 +59,10 @@ TEST_F(TextBPETokenize, BytePairEncoding)
 
   auto results = nvtext::byte_pair_encoding(sv, merge_pairs);
 
-  auto expected = cudf::test::strings_column_wrapper({"This Ġis Ġit",
-                                                      "This Ġis Ġtest - sent ence - 1",
-                                                      "This Ġis Ġtest Ġsent ence - 2",
-                                                      "This - is Ġtest Ġsent ence Ġ3",
+  auto expected = cudf::test::strings_column_wrapper({"This is it",
+                                                      "This is test - sent ence - 1",
+                                                      "This is test sent ence - 2",
+                                                      "This - is test sent ence 3",
                                                       "",
                                                       ""},
                                                      validity);

From 3df89a0dd8df7bd603887d0fb448048705dfd2cd Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 18 Feb 2022 17:10:14 -0500
Subject: [PATCH 04/10] handle sliced input column

---
 cpp/src/text/subword/bpe_tokenizer.cu | 124 +++++++++++++++++---------
 cpp/tests/text/bpe_tests.cpp          |   8 +-
 2 files changed, 91 insertions(+), 41 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 20837716845..49c53b7547c 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -30,6 +30,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <thrust/binary_search.h>
 #include <thrust/copy.h>
 #include <thrust/count.h>
 #include <thrust/for_each.h>
@@ -60,9 +61,9 @@ constexpr bool is_whitespace(CharType ch)
 __device__ cudf::string_view resolve_string(cudf::string_view const& d_str)
 {
   auto const begin = d_str.data();
-  auto end         = thrust::find_if(
+  auto const end   = thrust::find_if(
     thrust::seq, begin, begin + d_str.size_bytes(), [](auto ch) { return is_whitespace(ch); });
-  auto size = static_cast<cudf::size_type>(thrust::distance(begin, end));
+  auto const size = static_cast<cudf::size_type>(thrust::distance(begin, end));
   return cudf::string_view(begin, size);
 }
 
@@ -363,8 +364,9 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
   thrust::exclusive_scan(
     rmm::exec_policy(stream), d_offsets, d_offsets + input.size() + 1, d_offsets);
 
-  auto const bytes = cudf::detail::get_value<int32_t>(offsets->view(), input.size(), stream);
-  auto chars       = cudf::strings::detail::create_chars_child_column(
+  auto const bytes =
+    cudf::detail::get_value<cudf::size_type>(offsets->view(), input.size(), stream);
+  auto chars = cudf::strings::detail::create_chars_child_column(
     bytes, stream, rmm::mr::get_current_device_resource());
   auto d_chars = chars->mutable_view().data<char>();
 
@@ -377,6 +379,37 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
     input.size(), std::move(offsets), std::move(chars), 0, rmm::device_buffer{});
 }
 
+/**
+ * @brief Detect space to not-space transitions inside each string.
+ *
+ * This handles sliced input and null strings as well.
+ * It is parallelized over bytes and returns true only for valid left edges
+ * -- non-space preceded by a space.
+ */
+struct edge_of_space_fn {
+  cudf::column_device_view const d_strings;
+  __device__ bool operator()(cudf::offset_type offset)
+  {
+    auto const d_chars =
+      d_strings.child(cudf::strings_column_view::chars_column_index).data<char>();
+    if (is_whitespace(d_chars[offset]) || !is_whitespace(d_chars[offset - 1])) { return false; }
+
+    auto const offsets   = d_strings.child(cudf::strings_column_view::offsets_column_index);
+    auto const d_offsets = offsets.data<cudf::offset_type>() + d_strings.offset();
+    // ignore offsets outside sliced range
+    if (offset < d_offsets[0] || offset >= d_offsets[d_strings.size()]) { return false; }
+
+    auto itr =
+      thrust::lower_bound(thrust::seq, d_offsets, d_offsets + d_strings.size() + 1, offset);
+    // ignore offsets at existing string boundaries
+    if (*itr == offset) { return false; }
+
+    // count only edges for valid strings
+    auto const index = static_cast<cudf::size_type>(thrust::distance(d_offsets, itr)) - 1;
+    return d_strings.is_valid(index);
+  }
+};
+
 /**
  * @brief Create new offsets by identifying substrings by whitespace.
  *
@@ -388,25 +421,24 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
  *
  * @param input Strings to tokenize.
  * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return New offsets including those at the edge of each space.
  */
 std::unique_ptr<cudf::column> space_offsets(cudf::strings_column_view const& input,
+                                            cudf::column_device_view const& d_strings,
                                             rmm::cuda_stream_view stream)
 {
   // count space offsets
-  // TODO: does not yet account for sliced input column
-  auto const begin   = thrust::make_counting_iterator<cudf::size_type>(1);
-  auto const end     = thrust::make_counting_iterator<cudf::size_type>(input.chars().size());
-  auto edge_of_space = [d_chars = input.chars().data<char>()] __device__(auto idx) -> bool {
-    return !is_whitespace(d_chars[idx]) && is_whitespace(d_chars[idx - 1]);
-  };
-  auto space_count = thrust::count_if(rmm::exec_policy(stream), begin, end, edge_of_space);
+  auto const begin = thrust::make_counting_iterator<cudf::size_type>(1);
+  auto const end   = thrust::make_counting_iterator<cudf::size_type>(input.chars().size());
+  edge_of_space_fn edge_of_space{d_strings};
+  auto const space_count = thrust::count_if(rmm::exec_policy(stream), begin, end, edge_of_space);
 
   // copy space offsets
   rmm::device_uvector<cudf::offset_type> space_offsets(space_count, stream);
   thrust::copy_if(rmm::exec_policy(stream), begin, end, space_offsets.data(), edge_of_space);
 
   // create output offsets
-  auto split_offsets =
+  auto result =
     cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32},
                               static_cast<cudf::size_type>(space_count + input.size() + 1),
                               cudf::mask_state::UNALLOCATED,
@@ -419,9 +451,9 @@ std::unique_ptr<cudf::column> space_offsets(cudf::strings_column_view const& inp
                 input.offsets_end(),
                 space_offsets.begin(),
                 space_offsets.end(),
-                split_offsets->mutable_view().begin<cudf::offset_type>());
+                result->mutable_view().begin<cudf::offset_type>());
 
-  return split_offsets;
+  return result;
 }
 
 /**
@@ -429,15 +461,22 @@ std::unique_ptr<cudf::column> space_offsets(cudf::strings_column_view const& inp
  *
  * This essentially returns the number of tokens for each string.
  */
-struct split_offsets_fn {
+struct list_offsets_fn {
   cudf::column_device_view const d_strings;
   __device__ cudf::size_type operator()(cudf::size_type idx)
   {
     if (d_strings.is_null(idx)) return 0;
     auto const d_str = d_strings.element<cudf::string_view>(idx);
-    // TODO: does not correctly account for adjacent whitespace
-    auto const result = thrust::count_if(
-      thrust::seq, d_str.begin(), d_str.end(), [](auto ch) { return is_whitespace(ch); });
+    if (d_str.empty()) return 1;  // empty is a single valid result
+
+    auto const begin = thrust::make_counting_iterator<cudf::size_type>(1);
+    auto const end   = thrust::make_counting_iterator<cudf::size_type>(d_str.size_bytes());
+
+    // count whitespace to non-whitespace transitions; adjacent whitespace counts once
+    auto const result =
+      thrust::count_if(thrust::seq, begin, end, [data = d_str.data()](auto chidx) {
+        return !is_whitespace(data[chidx]) && is_whitespace(data[chidx - 1]);
+      });
     return static_cast<cudf::size_type>(result) + 1;
   }
 };
@@ -453,15 +492,11 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
   if (input.is_empty() || input.chars_size() == 0)
     return cudf::make_empty_column(cudf::type_id::STRING);
 
-  auto d_strings   = cudf::column_device_view::create(input.parent(), stream);
-  auto offsets_itr = thrust::make_transform_iterator(
-    thrust::make_counting_iterator<cudf::size_type>(0), split_offsets_fn{*d_strings});
-  auto split_offsets = cudf::strings::detail::make_offsets_child_column(
-    offsets_itr, offsets_itr + input.size(), stream, rmm::mr::get_current_device_resource());
-
-  auto offsets = space_offsets(input, stream);
+  auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
+  auto const offsets   = space_offsets(input, *d_strings, stream);
 
-  auto const split_view = cudf::column_view(cudf::data_type{cudf::type_id::STRING},
+  // build a view using the new offsets and the current input chars column
+  auto const input_view = cudf::column_view(cudf::data_type{cudf::type_id::STRING},
                                             offsets->size() - 1,
                                             nullptr,  // no parent data
                                             nullptr,  // null-mask
@@ -469,20 +504,29 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
                                             0,        // offset
                                             {offsets->view(), input.chars()});
 
-  // run BPE on the strings child column
-  auto bpe_column =
-    byte_pair_encoding(cudf::strings_column_view(split_view), *(merge_pairs.impl), stream);
-
-  // recombine the result
-  auto list_join = cudf::column_view(cudf::data_type{cudf::type_id::LIST},
-                                     input.size(),
-                                     nullptr,  // no parent data in list column
-                                     input.null_mask(),
-                                     input.null_count(),
-                                     0,
-                                     {split_offsets->view(), bpe_column->view()});
-
-  // use join_list_elements to build the output strings column
+  // run BPE on this view
+  auto const bpe_column =
+    byte_pair_encoding(cudf::strings_column_view(input_view), *(merge_pairs.impl), stream);
+
+  // recombine the result:
+  // compute the offsets needed to build a list view
+  auto const list_offsets = [d_strings = *d_strings, stream] {
+    auto offsets_itr = thrust::make_transform_iterator(
+      thrust::make_counting_iterator<cudf::size_type>(0), list_offsets_fn{d_strings});
+    return cudf::strings::detail::make_offsets_child_column(
+      offsets_itr, offsets_itr + d_strings.size(), stream, rmm::mr::get_current_device_resource());
+  }();
+
+  // build a list column_view using the BPE output and the list_offsets
+  auto const list_join = cudf::column_view(cudf::data_type{cudf::type_id::LIST},
+                                           input.size(),
+                                           nullptr,  // no parent data in list column
+                                           input.null_mask(),
+                                           input.null_count(),
+                                           0,
+                                           {list_offsets->view(), bpe_column->view()});
+
+  // build the output strings column
   auto result =
     cudf::strings::detail::join_list_elements(cudf::lists_column_view(list_join),
                                               separator,
diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp
index a0bb926ee9d..1edf94c6c61 100644
--- a/cpp/tests/text/bpe_tests.cpp
+++ b/cpp/tests/text/bpe_tests.cpp
@@ -48,7 +48,7 @@ TEST_F(TextBPETokenize, BytePairEncoding)
   nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)};
 
   auto validity = cudf::test::iterators::null_at(4);
-  cudf::test::strings_column_wrapper input({"This is it",
+  cudf::test::strings_column_wrapper input({"This\tis  it\n",
                                             "This is test-sentence-1",
                                             "This is test sentence-2",
                                             "This-is test sentence 3",
@@ -67,6 +67,12 @@ TEST_F(TextBPETokenize, BytePairEncoding)
                                                       ""},
                                                      validity);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
+
+  auto sliced          = cudf::slice(input, {1, 4}).front();
+  auto sliced_expected = cudf::slice(expected, {1, 4}).front();
+
+  results = nvtext::byte_pair_encoding(cudf::strings_column_view(sliced), merge_pairs);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), sliced_expected);
 }
 
 TEST_F(TextBPETokenize, BPE_Empty)

From 6eb61713a9887093a051661e45ac7bca1c36a310 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 22 Feb 2022 19:38:24 -0500
Subject: [PATCH 05/10] add leading space to test

---
 cpp/tests/text/bpe_tests.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp
index 1edf94c6c61..c9f61a16964 100644
--- a/cpp/tests/text/bpe_tests.cpp
+++ b/cpp/tests/text/bpe_tests.cpp
@@ -48,7 +48,7 @@ TEST_F(TextBPETokenize, BytePairEncoding)
   nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)};
 
   auto validity = cudf::test::iterators::null_at(4);
-  cudf::test::strings_column_wrapper input({"This\tis  it\n",
+  cudf::test::strings_column_wrapper input({" This\tis  it\n",
                                             "This is test-sentence-1",
                                             "This is test sentence-2",
                                             "This-is test sentence 3",
@@ -59,7 +59,7 @@ TEST_F(TextBPETokenize, BytePairEncoding)
 
   auto results = nvtext::byte_pair_encoding(sv, merge_pairs);
 
-  auto expected = cudf::test::strings_column_wrapper({"This is it",
+  auto expected = cudf::test::strings_column_wrapper({" This is it",
                                                       "This is test - sent ence - 1",
                                                       "This is test sent ence - 2",
                                                       "This - is test sent ence 3",

From 84a2cbece465d455fe458cdea7eb4a6120cb28ee Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 23 Feb 2022 09:39:20 -0500
Subject: [PATCH 06/10] add separator test

---
 cpp/tests/text/bpe_tests.cpp | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp
index c9f61a16964..07f3a41f0e2 100644
--- a/cpp/tests/text/bpe_tests.cpp
+++ b/cpp/tests/text/bpe_tests.cpp
@@ -66,13 +66,30 @@ TEST_F(TextBPETokenize, BytePairEncoding)
                                                       "",
                                                       ""},
                                                      validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 
   auto sliced          = cudf::slice(input, {1, 4}).front();
   auto sliced_expected = cudf::slice(expected, {1, 4}).front();
 
   results = nvtext::byte_pair_encoding(cudf::strings_column_view(sliced), merge_pairs);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), sliced_expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected);
+}
+
+TEST_F(TextBPETokenize, BytePairEncodingSeparator)
+{
+  auto mpt = cudf::test::strings_column_wrapper(
+    {"e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"});
+  nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)};
+
+  cudf::test::strings_column_wrapper input(
+    {"test-sentence-1", "test sentence-2", "test sentence 3", " test sentence 4 "});
+  auto sv = cudf::strings_column_view(input);
+
+  auto results = nvtext::byte_pair_encoding(sv, merge_pairs, std::string(" Ġ"));
+
+  auto expected = cudf::test::strings_column_wrapper(
+    {"test - sent ence - 1", "test Ġsent ence - 2", "test Ġsent ence Ġ3", " Ġtest Ġsent ence Ġ4"});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 }
 
 TEST_F(TextBPETokenize, BPE_Empty)

From d282330b885aa21bbb2106e1cb55ef67ae8c273e Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 7 Mar 2022 11:00:37 -0500
Subject: [PATCH 07/10] fix typos in and clarify comments

---
 cpp/include/nvtext/bpe_tokenize.hpp      |  8 +++---
 cpp/src/strings/split/split_record.cu    | 31 ++++++------------------
 cpp/src/text/subword/bpe_tokenizer.cuh   |  4 +--
 cpp/src/text/subword/load_merges_file.cu | 10 +++++---
 4 files changed, 19 insertions(+), 34 deletions(-)

diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp
index 53eda236d0b..23af9731268 100644
--- a/cpp/include/nvtext/bpe_tokenize.hpp
+++ b/cpp/include/nvtext/bpe_tokenize.hpp
@@ -66,10 +66,10 @@ struct bpe_merge_pairs {
  * @endcode
  *
  * The pairs are expected to be ordered in the file by their rank
- * relative to each other. A pair will be is in priority over
+ * relative to each other. A pair earlier in the file has priority over
  * any pairs below it.
  *
- * @param filename_merges Local file path of pairs encoded in UTF-8
+ * @param filename_merges Local file path of pairs encoded in UTF-8.
  * @param mr Memory resource to allocate any returned objects.
  */
 std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
@@ -80,7 +80,7 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
  * @brief Byte pair encode the input strings.
  *
  * This will split each string on whitespace, perform the encoding,
- * and then build the output using the given `separator`.
+ * and then build the output column using the given `separator`.
  *
  * The encoding algorithm rebuilds each string by matching substrings
  * in the `merge_pairs` table and iteratively removing the minimum ranked pair
@@ -91,7 +91,7 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
  * mps = load_merges_file("merges.txt")
  * input = ["test sentence", "thisis test"]
  * result = byte_pair_encoding(input, mps)
- * result is now ["test sent tence", "this is test"]
+ * result is now ["test sent ence", "this is test"]
  * @endcode
  *
  * @throw cudf::logic_error if `merge_pairs` is empty
diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu
index 8834ece8734..929d21a024c 100644
--- a/cpp/src/strings/split/split_record.cu
+++ b/cpp/src/strings/split/split_record.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -217,6 +217,8 @@ struct whitespace_token_reader_fn {
   }
 };
 
+}  // namespace
+
 // The output is one list item per string
 template <typename TokenCounter, typename TokenReader>
 std::unique_ptr<column> split_record_fn(strings_column_view const& strings,
@@ -287,27 +289,6 @@ std::unique_ptr<column> split_record(
                            mr);
   }
 }
-}  // namespace
-
-std::unique_ptr<column> split_record(strings_column_view const& strings,
-                                     string_scalar const& delimiter,
-                                     size_type maxsplit,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
-{
-  return split_record<detail::Dir::FORWARD>(
-    strings, delimiter, maxsplit, rmm::cuda_stream_default, mr);
-}
-
-std::unique_ptr<column> rsplit_record(strings_column_view const& strings,
-                                      string_scalar const& delimiter,
-                                      size_type maxsplit,
-                                      rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
-{
-  return split_record<detail::Dir::BACKWARD>(
-    strings, delimiter, maxsplit, rmm::cuda_stream_default, mr);
-}
 
 }  // namespace detail
 
@@ -319,7 +300,8 @@ std::unique_ptr<column> split_record(strings_column_view const& strings,
                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::split_record(strings, delimiter, maxsplit, rmm::cuda_stream_default, mr);
+  return detail::split_record<detail::Dir::FORWARD>(
+    strings, delimiter, maxsplit, rmm::cuda_stream_default, mr);
 }
 
 std::unique_ptr<column> rsplit_record(strings_column_view const& strings,
@@ -328,7 +310,8 @@ std::unique_ptr<column> rsplit_record(strings_column_view const& strings,
                                       rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::rsplit_record(strings, delimiter, maxsplit, rmm::cuda_stream_default, mr);
+  return detail::split_record<detail::Dir::BACKWARD>(
+    strings, delimiter, maxsplit, rmm::cuda_stream_default, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index ac21eea0261..31cc29a8d8a 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -16,10 +16,10 @@
 
 #pragma once
 
-#include <hash/hash_allocator.cuh>
-
 #include <nvtext/bpe_tokenize.hpp>
 
+#include <hash/hash_allocator.cuh>
+
 #include <cudf/column/column.hpp>
 #include <cudf/detail/utilities/hash_functions.cuh>
 
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index 1ea2bdcaba5..bd206e30a27 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -83,7 +83,7 @@ std::unique_ptr<cudf::column> load_file_to_column(std::string const& filename_me
   if (line.substr(0, version.size()).compare(version) == 0) { std::getline(merges_file, line); }
 
   // This is a text file delimited only by CR/LF.
-  // Look into using the CSV reader to load the strings column instead.
+  // TODO: Look into using the CSV reader to load the strings column instead.
   while (!line.empty()) {
     chars.insert(chars.end(), std::cbegin(line), std::cend(line));
     offsets.push_back(offsets.back() + line.length());
@@ -100,10 +100,12 @@ std::unique_ptr<cudf::column> load_file_to_column(std::string const& filename_me
 std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
   cudf::strings_column_view const& input, rmm::cuda_stream_view stream)
 {
+  // Ensure capacity is at least (size*10/7) as documented here:
+  // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182
   auto merge_pairs_map = std::make_unique<merge_pairs_map_type>(
-    static_cast<size_t>(input.size() * 2),  // ensure capacity is at least (size*10/7)
-    std::numeric_limits<hash_value_type>::max(),
-    -1,  // empty-value is not used
+    static_cast<size_t>(input.size() * 2),        // capacity is 2x;
+    std::numeric_limits<hash_value_type>::max(),  // empty key;
+    -1,                                           // empty value is not used
     hash_table_allocator_type{default_allocator<char>{}, stream},
     stream.value());
 

From 93b0842134ded39bd5aae33617ffaf4c778ca7e2 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 15 Mar 2022 09:54:20 -0400
Subject: [PATCH 08/10] fix grammar and typos

---
 cpp/include/nvtext/bpe_tokenize.hpp      |  4 ++--
 cpp/src/text/subword/bpe_tokenizer.cu    | 23 ++++++++++++-----------
 cpp/src/text/subword/load_merges_file.cu |  9 ++++-----
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp
index 23af9731268..3af521a4fc5 100644
--- a/cpp/include/nvtext/bpe_tokenize.hpp
+++ b/cpp/include/nvtext/bpe_tokenize.hpp
@@ -31,7 +31,7 @@ namespace nvtext {
 /**
  * @brief The table of merge pairs for the BPE encoder.
  *
- * To create an instance, call nvtext::load_merges_table
+ * To create an instance, call @ref nvtext::load_merge_pairs_file
  */
 struct bpe_merge_pairs {
   struct bpe_merge_pairs_impl;
@@ -98,7 +98,7 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
  * @throw cudf::logic_error if `separator` is invalid
  *
  * @param input Strings to encode.
- * @param merge_pairs Created by a call to nvtext::load_merges_file.
+ * @param merge_pairs Created by a call to @ref nvtext::load_merge_pairs_file.
  * @param separator String used to build the output after encoding.
  *                  Default is a space.
  * @param mr Memory resource to allocate any returned objects.
diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 49c53b7547c..92f2e483bc6 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -49,16 +49,16 @@ constexpr bool is_whitespace(CharType ch)
 }
 
 /**
- * @brief Resolve a truncated string from a full string.
+ * @brief Resolve a substring up to the first whitespace character.
  *
  * This will return a substring of the input starting with the first byte
- * upto the first whitespace character is found or the end of the string.
+ * up to the first whitespace character found or the end of the string.
  * Any whitespace is expected only at the end of the string.
  *
  * @param d_str Input string to resolve.
- * @return Substring of the input excluding trailing whitespace.
+ * @return Substring of the input excluding any trailing whitespace.
  */
-__device__ cudf::string_view resolve_string(cudf::string_view const& d_str)
+__device__ cudf::string_view get_first_token(cudf::string_view const& d_str)
 {
   auto const begin = d_str.data();
   auto const end   = thrust::find_if(
@@ -178,7 +178,7 @@ struct byte_pair_encoding_fn {
   __device__ void operator()(cudf::size_type idx)
   {
     if (d_strings.is_null(idx)) { return; }
-    auto const d_str = resolve_string(d_strings.element<cudf::string_view>(idx));
+    auto const d_str = get_first_token(d_strings.element<cudf::string_view>(idx));
     if (d_str.empty()) { return; }
 
     auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index)
@@ -293,7 +293,7 @@ struct build_encoding_fn {
   __device__ void operator()(cudf::size_type idx)
   {
     if (d_strings.is_null(idx)) { return; }
-    auto const d_str = resolve_string(d_strings.element<cudf::string_view>(idx));
+    auto const d_str = get_first_token(d_strings.element<cudf::string_view>(idx));
     if (d_str.empty()) { return; }
 
     auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index)
@@ -312,6 +312,7 @@ struct build_encoding_fn {
       if (*itr++) *d_output++ = ' ';
       *d_output++ = *d_input++;
     }
+    // See review discussion: https://github.com/rapidsai/cudf/pull/10270/files#r826319405
   }
 };
 
@@ -322,7 +323,7 @@ struct build_encoding_fn {
  *
  * The encoding is performed iteratively. Each pass determines the string's lowest
  * ranked merge pair as determined by the strings in `merges_table`. This pair
- * is the removed (virtually) from each string before starting the next iteration.
+ * is removed (virtually) from each string before starting the next iteration.
  *
  * Once all pairs have exhausted for all strings, the output is constructed from
  * the results by adding spaces between each remaining pair in each string.
@@ -344,7 +345,7 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
   auto const d_merges  = cudf::column_device_view::create(merge_pairs.get_merge_pairs(), stream);
   auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
 
-  auto offsets   = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32},
+  auto offsets   = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<cudf::offset_type>()},
                                            static_cast<cudf::size_type>(input.size() + 1),
                                            cudf::mask_state::UNALLOCATED,
                                            stream,
@@ -384,7 +385,7 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
  *
  * This handles sliced input and null strings as well.
  * It is parallelized over bytes and returns true only for valid left edges
- * -- non-space proceeded by a space.
+ * -- non-space preceded by a space.
  */
 struct edge_of_space_fn {
   cudf::column_device_view const d_strings;
@@ -417,7 +418,7 @@ struct edge_of_space_fn {
  * and only returns new offsets. The behavior is more like a view-only slice
  * of the chars child with the result still including trailing delimiters.
  *
- * The encoding algorithm knows to ignore the trailing whitespace of each string.
+ * The encoding algorithm ignores the trailing whitespace of each string.
  *
  * @param input Strings to tokenize.
  * @param stream CUDA stream used for device memory operations and kernel launches
@@ -439,7 +440,7 @@ std::unique_ptr<cudf::column> space_offsets(cudf::strings_column_view const& inp
 
   // create output offsets
   auto result =
-    cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32},
+    cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<cudf::offset_type>()},
                               static_cast<cudf::size_type>(space_count + input.size() + 1),
                               cudf::mask_state::UNALLOCATED,
                               stream,
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index bd206e30a27..bdcbe45df64 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -64,7 +64,7 @@ struct make_pair_function {
  * ...
  * @endcode
  *
- * @param filename_merges Path to text file containing merge-pairs.
+ * @param filename_merges Path to text file containing merge-pairs
  * @return object containing table elements for the BPE function
  */
 std::unique_ptr<cudf::column> load_file_to_column(std::string const& filename_merges,
@@ -100,7 +100,7 @@ std::unique_ptr<cudf::column> load_file_to_column(std::string const& filename_me
 std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
   cudf::strings_column_view const& input, rmm::cuda_stream_view stream)
 {
-  // Ensure capacity is at least (size*10/7) as documented here:
+  // Ensure capacity is at least (size/0.7) as documented here:
   // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182
   auto merge_pairs_map = std::make_unique<merge_pairs_map_type>(
     static_cast<size_t>(input.size() * 2),        // capacity is 2x;
@@ -126,9 +126,8 @@ std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_im
   std::unique_ptr<cudf::column>&& input, rmm::cuda_stream_view stream)
 {
   auto merge_pairs = initialize_merge_pairs_map(cudf::strings_column_view(input->view()), stream);
-  auto result      = std::make_unique<nvtext::bpe_merge_pairs::bpe_merge_pairs_impl>(
-    std::move(input), std::move(merge_pairs));
-  return result;
+  return std::make_unique<nvtext::bpe_merge_pairs::bpe_merge_pairs_impl>(std::move(input),
+                                                                         std::move(merge_pairs));
 }
 
 std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_impl(

From 845a414b705f362ca186051e4f92633afe412ff2 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 15 Mar 2022 17:18:53 -0400
Subject: [PATCH 09/10] add more entries in load_merge_pairs_file doxygen
 example

---
 cpp/include/nvtext/bpe_tokenize.hpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp
index 3af521a4fc5..23fcd3acd03 100644
--- a/cpp/include/nvtext/bpe_tokenize.hpp
+++ b/cpp/include/nvtext/bpe_tokenize.hpp
@@ -62,6 +62,15 @@ struct bpe_merge_pairs {
  * e n
  * i t
  * i s
+ * e s
+ * en t
+ * c e
+ * es t
+ * en ce
+ * T h
+ * Th is
+ * t est
+ * s ent
  * ...
  * @endcode
  *
@@ -88,7 +97,7 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
  * pairs before the result is joined to make the output string.
  *
  * @code{.pseudo}
- * mps = load_merges_file("merges.txt")
+ * mps = load_merge_pairs_file("merges.txt") // see its doxygen for example file contents
  * input = ["test sentence", "thisis test"]
  * result = byte_pair_encoding(input, mps)
  * result is now ["test sent ence", "this is test"]

From 060077bdb9ed0e3ce64d878e1f1681684ba738ee Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 15 Mar 2022 18:32:26 -0400
Subject: [PATCH 10/10] add check for unexpected data format

---
 cpp/src/text/subword/bpe_tokenizer.cu | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 92f2e483bc6..c9a1d685f2e 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -91,13 +91,16 @@ struct byte_pair_encoding_fn {
   __device__ thrust::pair<cudf::string_view, cudf::string_view> dissect_merge_pair(
     cudf::size_type idx)
   {
-    auto const d_pair   = d_merges.element<cudf::string_view>(idx);
-    auto const lhs      = d_pair.data();
-    auto const end_str  = d_pair.data() + d_pair.size_bytes();
-    auto const rhs      = thrust::find(thrust::seq, lhs, end_str, ' ') + 1;
-    auto const lhs_size = static_cast<cudf::size_type>(thrust::distance(lhs, rhs - 1));
-    auto const rhs_size = static_cast<cudf::size_type>(thrust::distance(rhs, end_str));
-    return thrust::make_pair(cudf::string_view(lhs, lhs_size), cudf::string_view(rhs, rhs_size));
+    auto const d_pair  = d_merges.element<cudf::string_view>(idx);
+    auto const lhs     = d_pair.data();
+    auto const end_str = d_pair.data() + d_pair.size_bytes();
+    auto const rhs     = thrust::find(thrust::seq, lhs, end_str, ' ');  // space always expected
+    // check for malformed pair entry (no space found) to prevent an out-of-bounds read
+    if (rhs == end_str) { return thrust::make_pair(cudf::string_view{}, cudf::string_view{}); }
+    auto const lhs_size = static_cast<cudf::size_type>(thrust::distance(lhs, rhs));
+    auto const rhs_size = static_cast<cudf::size_type>(thrust::distance(rhs + 1, end_str));
+    return thrust::make_pair(cudf::string_view(lhs, lhs_size),
+                             cudf::string_view(rhs + 1, rhs_size));
   }
 
   /**