From c53c47bf70113328f884f89725074d13b24a8702 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 16 Aug 2023 13:48:23 -0400
Subject: [PATCH 01/28] Add BytePairEncoder class to cuDF

---
 cpp/benchmarks/CMakeLists.txt                 |  4 +-
 cpp/benchmarks/text/bpe_tokenize.cpp          | 65 +++++++++++++++++++
 cpp/include/nvtext/bpe_tokenize.hpp           | 17 +----
 cpp/src/text/subword/load_merges_file.cu      | 10 ++-
 .../cudf/_lib/cpp/nvtext/bpe_tokenize.pxd     | 22 +++++++
 python/cudf/cudf/_lib/nvtext/CMakeLists.txt   |  4 +-
 python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx | 39 +++++++++++
 python/cudf/cudf/core/byte_pair_encoding.py   | 54 +++++++++++++++
 8 files changed, 191 insertions(+), 24 deletions(-)
 create mode 100644 cpp/benchmarks/text/bpe_tokenize.cpp
 create mode 100644 python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd
 create mode 100644 python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx
 create mode 100644 python/cudf/cudf/core/byte_pair_encoding.py

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 96e24efac8a..210ab415c53 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -276,8 +276,8 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.c
 ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)
 
 ConfigureNVBench(
-  TEXT_NVBENCH text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp text/normalize.cpp
-  text/replace.cpp text/tokenize.cpp
+  TEXT_NVBENCH text/bpe_tokenize.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
+  text/normalize.cpp text/replace.cpp text/tokenize.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/text/bpe_tokenize.cpp b/cpp/benchmarks/text/bpe_tokenize.cpp
new file mode 100644
index 00000000000..a80e7e5d84d
--- /dev/null
+++ b/cpp/benchmarks/text/bpe_tokenize.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvtext/bpe_tokenize.hpp>
+#include <nvtext/subword_tokenize.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+static void bench_tokenize(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+
+  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+  }
+
+  data_profile const profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
+  cudf::strings_column_view input(column->view());
+
+  auto mps        = nvtext::load_merge_pairs_file("merges.txt");
+  auto vocab      = nvtext::load_vocabulary_file("hashed_vocab.txt");
+  auto seq_len    = 64;
+  auto stride     = 48;
+  auto lower_case = true;
+  auto truncate   = false;
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+
+  auto chars_size = input.chars_size();
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto bpe    = nvtext::byte_pair_encoding(input, *mps);
+    auto result = nvtext::subword_tokenize(input, *vocab, seq_len, stride, lower_case, truncate);
+  });
+}
+
+NVBENCH_BENCH(bench_tokenize)
+  .set_name("bpe_tokenize")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
+  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp
index b93d93b07c6..b4400f51ce9 100644
--- a/cpp/include/nvtext/bpe_tokenize.hpp
+++ b/cpp/include/nvtext/bpe_tokenize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,7 +36,7 @@ namespace nvtext {
  */
 struct bpe_merge_pairs {
   struct bpe_merge_pairs_impl;
-  std::unique_ptr<bpe_merge_pairs_impl> impl{};  ///< Implementation of the BPE merge pairs table.
+  bpe_merge_pairs_impl* impl{};  ///< Implementation of the BPE merge pairs table.
 
   /**
    * @brief Construct a new bpe merge pairs object
@@ -62,18 +62,7 @@ struct bpe_merge_pairs {
 
   ~bpe_merge_pairs();
 
-  /**
-   * @brief Returns the number of merge pairs in the table.
-   *
-   * @return The number of merge pairs in the table
-   */
-  cudf::size_type get_size();
-  /**
-   * @brief  Returns the number of unique merge pairs in the table.
-   *
-   * @return The number of unique merge pairs in the table
-   */
-  std::size_t get_map_size();
+  bpe_merge_pairs();
 };
 
 /**
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index b39413af98f..1dbde21e113 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -171,20 +171,18 @@ bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl(
 bpe_merge_pairs::bpe_merge_pairs(std::unique_ptr<cudf::column>&& input,
                                  rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource*)
-  : impl(detail::create_bpe_merge_pairs_impl(std::move(input), stream))
+  : impl(detail::create_bpe_merge_pairs_impl(std::move(input), stream).release())
 {
 }
 
 bpe_merge_pairs::bpe_merge_pairs(cudf::strings_column_view const& input,
                                  rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
-  : impl(detail::create_bpe_merge_pairs_impl(input, stream, mr))
+  : impl(detail::create_bpe_merge_pairs_impl(input, stream, mr).release())
 {
 }
 
-bpe_merge_pairs::~bpe_merge_pairs() = default;
-
-cudf::size_type bpe_merge_pairs::get_size() { return impl->merge_pairs->size(); }
-std::size_t bpe_merge_pairs::get_map_size() { return impl->merge_pairs_map->get_size(); }
+bpe_merge_pairs::bpe_merge_pairs() = default;
+bpe_merge_pairs::~bpe_merge_pairs() { delete impl; }
 
 }  // namespace nvtext
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd b/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd
new file mode 100644
index 00000000000..9dc1a0ad9c0
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd
@@ -0,0 +1,22 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
+
+
+cdef extern from "nvtext/bpe_tokenize.hpp" namespace "nvtext" nogil:
+
+    cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs":
+        pass
+
+    cdef unique_ptr[bpe_merge_pairs] load_merge_pairs_file(
+        const string &filename_merges
+    ) except +
+
+    cdef unique_ptr[column] byte_pair_encoding(
+        const column_view &strings,
+        const bpe_merge_pairs &merge_pairs
+    ) except +
diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
index 515b9c1d6e4..86d5bc22858 100644
--- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
@@ -13,8 +13,8 @@
 # =============================================================================
 
 set(cython_sources
-    edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx
-    replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
+    bpe_tokenize.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
+    ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
 )
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
diff --git a/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx
new file mode 100644
index 00000000000..00b6779f144
--- /dev/null
+++ b/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx
@@ -0,0 +1,39 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+
+from cudf.core.buffer import acquire_spill_lock
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+
+from cudf._lib.column cimport Column
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.nvtext.bpe_tokenize cimport (
+    bpe_merge_pairs as cpp_bpe_merge_pairs,
+    byte_pair_encoding as cpp_byte_pair_encoding,
+    load_merge_pairs_file as cpp_load_merge_pairs_file,
+)
+
+
+cdef class BPE_Merge_Pairs:
+    cdef unique_ptr[cpp_bpe_merge_pairs] c_obj
+
+    def __cinit__(self, merges_file):
+        cdef string c_merges_file = <string>str(merges_file).encode()
+        with nogil:
+            self.c_obj = move(cpp_load_merge_pairs_file(c_merges_file))
+
+
+@acquire_spill_lock()
+def byte_pair_encoding(
+    Column strings,
+    BPE_Merge_Pairs merge_pairs
+):
+    cdef column_view c_strings = strings.view()
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = move(cpp_byte_pair_encoding(c_strings, merge_pairs.c_obj.get()[0]))
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
new file mode 100644
index 00000000000..a254f5ac621
--- /dev/null
+++ b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from __future__ import annotations
+
+import cudf
+from cudf._lib.nvtext.bpe_tokenize import (
+    BPE_Merge_Pairs as cpp_merge_pairs,
+    byte_pair_encoding as cpp_byte_pair_encoding,
+)
+
+
+class BytePairEncoder:
+    """
+
+    Parameters
+    ----------
+    merges_file : str
+        Path to file containing merge pairs.
+
+    Returns
+    -------
+    BytePairEncoder
+    """
+
+    def __init__(self, merges_file: str):
+        self.merge_pairs = cpp_merge_pairs(merges_file)
+
+    def __call__(self, text):
+        """
+
+        Parameters
+        ----------
+        text : cudf string series
+            The strings to be encoded.
+
+        Returns
+        -------
+        Encoded strings
+
+        Examples
+        --------
+        >>> import cudf
+        >>> from cudf.core.byte_pair_encoding import BytePairEncoder
+        >>> bpe = BytePairEncoder('merges.txt')
+        >>> str_series = cudf.Series(['This is the sentence', 'thisisit'])
+        >>> bpe(str_series)
+        0    This is a sent ence
+        1             this is it
+        dtype: object
+        """
+
+        result = cpp_byte_pair_encoding(text._column, self.merge_pairs)
+
+        return cudf.Series(result)

From 370086fe62a062c96de272de7590d77c3a77b4b0 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 16 Aug 2023 16:56:40 -0400
Subject: [PATCH 02/28] nvbench experiment

---
 cpp/benchmarks/text/bpe_tokenize.cpp | 29 +++++++++++++---------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/cpp/benchmarks/text/bpe_tokenize.cpp b/cpp/benchmarks/text/bpe_tokenize.cpp
index a80e7e5d84d..6de87eeaae1 100644
--- a/cpp/benchmarks/text/bpe_tokenize.cpp
+++ b/cpp/benchmarks/text/bpe_tokenize.cpp
@@ -17,7 +17,9 @@
 #include <benchmarks/common/generate_input.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 
+#include <cudf/io/csv.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
 #include <nvtext/bpe_tokenize.hpp>
@@ -25,20 +27,18 @@
 
 #include <nvbench/nvbench.cuh>
 
-static void bench_tokenize(nvbench::state& state)
+static cudf::io::table_with_metadata read_csv(std::string const& file_path)
 {
-  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
-
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
+  auto source_info = cudf::io::source_info(file_path);
+  auto builder     = cudf::io::csv_reader_options::builder(source_info);
+  auto options     = builder.build();
+  return cudf::io::read_csv(options);
+}
 
-  data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
-  cudf::strings_column_view input(column->view());
+static void bench_tokenize(nvbench::state& state)
+{
+  auto csv_metadata = read_csv("input_strings.csv");
+  cudf::strings_column_view input(csv_metadata.tbl->view().column(0));
 
   auto mps        = nvtext::load_merge_pairs_file("merges.txt");
   auto vocab      = nvtext::load_vocabulary_file("hashed_vocab.txt");
@@ -59,7 +59,4 @@ static void bench_tokenize(nvbench::state& state)
   });
 }
 
-NVBENCH_BENCH(bench_tokenize)
-  .set_name("bpe_tokenize")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
+NVBENCH_BENCH(bench_tokenize).set_name("bpe_tokenize");

From c1673691824aa9ece5fc87737299985cc9cd34e4 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 17 Aug 2023 15:10:22 -0400
Subject: [PATCH 03/28] add separator parameter

---
 .../cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd   |  4 +++-
 python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx    | 16 ++++++++++++++--
 python/cudf/cudf/core/byte_pair_encoding.py      |  6 +++---
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd b/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd
index 9dc1a0ad9c0..add6d62e85b 100644
--- a/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd
+++ b/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd
@@ -5,6 +5,7 @@ from libcpp.string cimport string
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.scalar.scalar cimport string_scalar
 
 
 cdef extern from "nvtext/bpe_tokenize.hpp" namespace "nvtext" nogil:
@@ -18,5 +19,6 @@ cdef extern from "nvtext/bpe_tokenize.hpp" namespace "nvtext" nogil:
 
     cdef unique_ptr[column] byte_pair_encoding(
         const column_view &strings,
-        const bpe_merge_pairs &merge_pairs
+        const bpe_merge_pairs &merge_pairs,
+        const string_scalar &separator
     ) except +
diff --git a/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx
index 00b6779f144..9ea7fa2c51e 100644
--- a/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx
@@ -15,6 +15,8 @@ from cudf._lib.cpp.nvtext.bpe_tokenize cimport (
     byte_pair_encoding as cpp_byte_pair_encoding,
     load_merge_pairs_file as cpp_load_merge_pairs_file,
 )
+from cudf._lib.cpp.scalar.scalar cimport string_scalar
+from cudf._lib.scalar cimport DeviceScalar
 
 
 cdef class BPE_Merge_Pairs:
@@ -29,11 +31,21 @@ cdef class BPE_Merge_Pairs:
 @acquire_spill_lock()
 def byte_pair_encoding(
     Column strings,
-    BPE_Merge_Pairs merge_pairs
+    BPE_Merge_Pairs merge_pairs,
+    object separator
 ):
     cdef column_view c_strings = strings.view()
+    cdef DeviceScalar d_separator = separator.device_value
+    cdef const string_scalar* c_separator = <const string_scalar*>d_separator\
+        .get_raw_ptr()
     cdef unique_ptr[column] c_result
     with nogil:
-        c_result = move(cpp_byte_pair_encoding(c_strings, merge_pairs.c_obj.get()[0]))
+        c_result = move(
+            cpp_byte_pair_encoding(
+                c_strings,
+                merge_pairs.c_obj.get()[0],
+                c_separator[0]
+            )
+        )
 
     return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
index a254f5ac621..db220ea6da4 100644
--- a/python/cudf/cudf/core/byte_pair_encoding.py
+++ b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -25,7 +25,7 @@ class BytePairEncoder:
     def __init__(self, merges_file: str):
         self.merge_pairs = cpp_merge_pairs(merges_file)
 
-    def __call__(self, text):
+    def __call__(self, text, separator: str = " "):
         """
 
         Parameters
@@ -48,7 +48,7 @@ def __call__(self, text):
         1             this is it
         dtype: object
         """
-
-        result = cpp_byte_pair_encoding(text._column, self.merge_pairs)
+        sep = cudf.Scalar(separator, dtype="str")
+        result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep)
 
         return cudf.Series(result)

From c0f04472ad186af318c067472e6c1c5ce6bfa0f6 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 23 Aug 2023 14:48:56 -0400
Subject: [PATCH 04/28] deprecate loading merge-pairs from a file

---
 cpp/benchmarks/text/bpe_tokenize.cpp          | 69 ++++++++++++-----
 cpp/include/nvtext/bpe_tokenize.hpp           | 74 ++++++++++++++++++-
 cpp/src/text/subword/bpe_tokenizer.cu         | 10 +++
 cpp/src/text/subword/load_merges_file.cu      | 17 +++++
 cpp/tests/text/bpe_tests.cpp                  | 33 ++++++---
 .../cudf/_lib/cpp/nvtext/bpe_tokenize.pxd     |  4 +-
 python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx |  9 +--
 python/cudf/cudf/core/byte_pair_encoding.py   | 13 ++--
 8 files changed, 185 insertions(+), 44 deletions(-)

diff --git a/cpp/benchmarks/text/bpe_tokenize.cpp b/cpp/benchmarks/text/bpe_tokenize.cpp
index 6de87eeaae1..17f4bda81b3 100644
--- a/cpp/benchmarks/text/bpe_tokenize.cpp
+++ b/cpp/benchmarks/text/bpe_tokenize.cpp
@@ -17,35 +17,64 @@
 #include <benchmarks/common/generate_input.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 
+#include <cudf/filling.hpp>
 #include <cudf/io/csv.hpp>
+#include <cudf/strings/combine.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
+#include <cudf_test/column_wrapper.hpp>
+
 #include <nvtext/bpe_tokenize.hpp>
 #include <nvtext/subword_tokenize.hpp>
 
 #include <nvbench/nvbench.cuh>
 
-static cudf::io::table_with_metadata read_csv(std::string const& file_path)
+static void bench_bpe(nvbench::state& state)
 {
-  auto source_info = cudf::io::source_info(file_path);
-  auto builder     = cudf::io::csv_reader_options::builder(source_info);
-  auto options     = builder.build();
-  return cudf::io::read_csv(options);
-}
+  auto const n_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(row_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+  }
 
-static void bench_tokenize(nvbench::state& state)
-{
-  auto csv_metadata = read_csv("input_strings.csv");
-  cudf::strings_column_view input(csv_metadata.tbl->view().column(0));
+  auto raw_data = cudf::test::strings_column_wrapper({"test sentence ",
+                                                      "thisis test ",
+                                                      "this is sentence ",
+                                                      "this istest ",
+                                                      "thisistest ",
+                                                      "sentence is test ",
+                                                      "this sentence is test ",
+                                                      "test test test ",
+                                                      "this this test this ",
+                                                      "sentence "})
+                    .release();
+
+  if (row_width / 20 > 1) {
+    std::vector<cudf::column_view> columns;
+    for (int i = 0; i < row_width / 20; ++i) {
+      columns.push_back(raw_data->view());
+    }
+    raw_data = cudf::strings::concatenate(cudf::table_view(columns));
+  }
+  auto data_view = raw_data->view();
+
+  // Create a randomized gather-map to build a column out of the raw strings in data.
+  data_profile gather_profile =
+    data_profile_builder().cardinality(0).null_probability(0.0).distribution(
+      cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1);
+  auto gather_table =
+    create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile);
+  gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
+  auto gather_map  = gather_table->view().column(0);
+  auto table_input = cudf::gather(cudf::table_view({data_view}), gather_map);
+  auto input       = cudf::strings_column_view(table_input->view().column(0));
 
-  auto mps        = nvtext::load_merge_pairs_file("merges.txt");
-  auto vocab      = nvtext::load_vocabulary_file("hashed_vocab.txt");
-  auto seq_len    = 64;
-  auto stride     = 48;
-  auto lower_case = true;
-  auto truncate   = false;
+  cudf::test::strings_column_wrapper merge_pairs(
+    {"e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"});
+  auto mps = nvtext::load_merge_pairs(cudf::strings_column_view(merge_pairs));
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
 
@@ -54,9 +83,11 @@ static void bench_tokenize(nvbench::state& state)
   state.add_global_memory_writes<nvbench::int8_t>(chars_size);
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto bpe    = nvtext::byte_pair_encoding(input, *mps);
-    auto result = nvtext::subword_tokenize(input, *vocab, seq_len, stride, lower_case, truncate);
+    auto result = nvtext::byte_pair_encoding(input, *mps);
   });
 }
 
-NVBENCH_BENCH(bench_tokenize).set_name("bpe_tokenize");
+NVBENCH_BENCH(bench_bpe)
+  .set_name("byte_pair_encoding")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216});
diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp
index b4400f51ce9..258ee15ff87 100644
--- a/cpp/include/nvtext/bpe_tokenize.hpp
+++ b/cpp/include/nvtext/bpe_tokenize.hpp
@@ -68,6 +68,8 @@ struct bpe_merge_pairs {
 /**
  * @brief Create a nvtext::bpe_merge_pairs from an input file.
  *
+ * @deprecated Since 23.10
+ *
  * The file should contain a pair of strings per line separated by
  * a single space.
  *
@@ -96,10 +98,40 @@ struct bpe_merge_pairs {
  * @param mr Memory resource to allocate any returned objects.
  * @return A nvtext::bpe_merge_pairs object
  */
-std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
+[[deprecated]] std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
   std::string const& filename_merges,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Create a nvtext::bpe_merge_pairs from a strings column
+ *
+ * The input column should contain a unique pair of strings per line separated by
+ * a single space. An incorrect format or non-unique entries will result in
+ * undefined behavior.
+ *
+ * Example:
+ * @code{.pseudo}
+ * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"]
+ * mps = load_merge_pairs(merge_pairs)
+ * // the mps object can be passed to the byte_pair_encoding API
+ * @endcode
+ *
+ * The pairs are expected to be ordered in the file by their rank
+ * relative to each other. A pair earlier in the file has priority over
+ * any pairs below it.
+ *
+ * @throw cudf::logic_error if `merge_pairs` is empty or contains nulls
+ *
+ * @param merge_pairs Column containing the unique merge pairs
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Memory resource to allocate any returned objects
+ * @return A nvtext::bpe_merge_pairs object
+ */
+std::unique_ptr<bpe_merge_pairs> load_merge_pairs(
+  cudf::strings_column_view const& merge_pairs,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Byte pair encode the input strings.
  *
@@ -134,5 +166,45 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
   cudf::string_scalar const& separator = cudf::string_scalar(" "),
   rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Byte pair encode the input strings
+ *
+ * This will split each string on whitespace, perform the encoding,
+ * and then build the output column using the given `separator`.
+ *
+ * The encoding algorithm rebuilds each string by matching substrings
+ * in the `merge_pairs` column and iteratively removing the minimum ranked pair
+ * until no pairs are left. Then, a space is inserted between the remaining
+ * pairs before the result is joined to make the output string.
+ *
+ * @code{.pseudo}
+ * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"]
+ * input = ["test sentence", "thisis test"]
+ * result = byte_pair_encoding(input, merge_pairs)
+ * result is now ["test sent ence", "this is test"]
+ * @endcode
+ *
+ * The `merges_pairs` column should contain a unique pair of strings per line separated by
+ * a single space. An incorrect format or non-unique entries will result in
+ * undefined behavior.
+ *
+ * @throw cudf::logic_error if `merge_pairs` is empty or contains nulls
+ * @throw cudf::logic_error if `separator` is invalid
+ *
+ * @param input Strings to encode.
+ * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file.
+ * @param separator String used to build the output after encoding.
+ *                  Default is a space.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Memory resource to allocate any returned objects.
+ * @return An encoded column of strings.
+ */
+std::unique_ptr<cudf::column> byte_pair_encoding(
+  cudf::strings_column_view const& input,
+  cudf::strings_column_view const& merges_pairs,
+  cudf::string_scalar const& separator = cudf::string_scalar(" "),
+  rmm::cuda_stream_view stream         = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace nvtext
diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 4c4f5b3a4b1..6407bfdf7c5 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -561,4 +561,14 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
   return detail::byte_pair_encoding(input, merges_table, separator, cudf::get_default_stream(), mr);
 }
 
+std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const& input,
+                                                 cudf::strings_column_view const& merges_table,
+                                                 cudf::string_scalar const& separator,
+                                                 rmm::cuda_stream_view stream,
+                                                 rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::byte_pair_encoding(input, merges_table, separator, stream, mr);
+}
+
 }  // namespace nvtext
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index f382b8ba64d..ece9f9a2967 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -135,6 +135,15 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(std::string const& filena
   return std::make_unique<bpe_merge_pairs>(std::move(input_column), stream, mr);
 }
 
+std::unique_ptr<bpe_merge_pairs> load_merge_pairs(cudf::strings_column_view const& merge_pairs,
+                                                  rmm::cuda_stream_view stream,
+                                                  rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(!merge_pairs.is_empty(), "Merge pairs must not be empty");
+  CUDF_EXPECTS(!merge_pairs.has_nulls(), "Merge pairs may not contain nulls");
+  return std::make_unique<bpe_merge_pairs>(merge_pairs, stream, mr);
+}
+
 }  // namespace detail
 
 std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(std::string const& filename_merges,
@@ -144,6 +153,14 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(std::string const& filena
   return detail::load_merge_pairs_file(filename_merges, cudf::get_default_stream(), mr);
 }
 
+std::unique_ptr<bpe_merge_pairs> load_merge_pairs(cudf::strings_column_view const& merge_pairs,
+                                                  rmm::cuda_stream_view stream,
+                                                  rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::load_merge_pairs(merge_pairs, stream, mr);
+}
+
 bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl(
   std::unique_ptr<cudf::column>&& merge_pairs,
   std::unique_ptr<cudf::column_device_view, std::function<void(cudf::column_device_view*)>>&&
diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp
index 234d8c4fecc..1d9ad12225e 100644
--- a/cpp/tests/text/bpe_tests.cpp
+++ b/cpp/tests/text/bpe_tests.cpp
@@ -44,7 +44,7 @@ TEST_F(TextBPETokenize, BytePairEncoding)
     "s ent",  // 33830
   });
 
-  nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)};
+  auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt));
 
   auto validity = cudf::test::iterators::null_at(4);
   cudf::test::strings_column_wrapper input({" This\tis  it\n",
@@ -56,7 +56,7 @@ TEST_F(TextBPETokenize, BytePairEncoding)
                                            validity);
   auto sv = cudf::strings_column_view(input);
 
-  auto results = nvtext::byte_pair_encoding(sv, merge_pairs);
+  auto results = nvtext::byte_pair_encoding(sv, *merge_pairs);
 
   auto expected = cudf::test::strings_column_wrapper({" This is it",
                                                       "This is test - sent ence - 1",
@@ -66,11 +66,16 @@ TEST_F(TextBPETokenize, BytePairEncoding)
                                                       ""},
                                                      validity);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+  results = nvtext::byte_pair_encoding(sv, cudf::strings_column_view(mpt));
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 
   auto sliced          = cudf::slice(input, {1, 4}).front();
   auto sliced_expected = cudf::slice(expected, {1, 4}).front();
+  sv                   = cudf::strings_column_view(sliced);
 
-  results = nvtext::byte_pair_encoding(cudf::strings_column_view(sliced), merge_pairs);
+  results = nvtext::byte_pair_encoding(sv, *merge_pairs);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected);
+  results = nvtext::byte_pair_encoding(sv, cudf::strings_column_view(mpt));
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected);
 }
 
@@ -78,13 +83,13 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator)
 {
   auto mpt = cudf::test::strings_column_wrapper(
     {"e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"});
-  nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)};
+  auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt));
 
   cudf::test::strings_column_wrapper input(
     {"test-sentence-1", "test sentence-2", "test sentence 3", " test sentence 4 "});
   auto sv = cudf::strings_column_view(input);
 
-  auto results = nvtext::byte_pair_encoding(sv, merge_pairs, std::string(" Ġ"));
+  auto results = nvtext::byte_pair_encoding(sv, *merge_pairs, std::string(" Ġ"));
 
   auto expected = cudf::test::strings_column_wrapper(
     {"test - sent ence - 1", "test Ġsent ence - 2", "test Ġsent ence Ġ3", " Ġtest Ġsent ence Ġ4"});
@@ -93,18 +98,22 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator)
 
 TEST_F(TextBPETokenize, BPE_Empty)
 {
-  auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"});
-  nvtext::bpe_merge_pairs merge_pairs{mpt.release()};
-  auto empty   = cudf::make_empty_column(cudf::type_id::STRING);
-  auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(empty->view()), merge_pairs);
+  auto mpt         = cudf::test::strings_column_wrapper({"i s", "i t"});
+  auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt));
+  auto empty       = cudf::make_empty_column(cudf::type_id::STRING);
+  auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(empty->view()), *merge_pairs);
   EXPECT_EQ(0, results->size());
 }
 
 TEST_F(TextBPETokenize, BPE_Error)
 {
   auto empty = cudf::make_empty_column(cudf::type_id::STRING);
-  nvtext::bpe_merge_pairs merge_pairs{std::move(empty)};
-  cudf::test::strings_column_wrapper input({"isit"});
-  EXPECT_THROW(nvtext::byte_pair_encoding(cudf::strings_column_view(input), merge_pairs),
+  EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(*empty)), cudf::logic_error);
+  auto null_pairs = cudf::test::strings_column_wrapper({"", ""}, {1, 0});
+  EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(null_pairs)), cudf::logic_error);
+  auto input     = cudf::test::strings_column_wrapper({"isit"});
+  auto separator = cudf::string_scalar("", false);
+  EXPECT_THROW(nvtext::byte_pair_encoding(
+                 cudf::strings_column_view(input), cudf::strings_column_view(input), separator),
                cudf::logic_error);
 }
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd b/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd
index add6d62e85b..45a8574d805 100644
--- a/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd
+++ b/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd
@@ -13,8 +13,8 @@ cdef extern from "nvtext/bpe_tokenize.hpp" namespace "nvtext" nogil:
     cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs":
         pass
 
-    cdef unique_ptr[bpe_merge_pairs] load_merge_pairs_file(
-        const string &filename_merges
+    cdef unique_ptr[bpe_merge_pairs] load_merge_pairs(
+        const column_view &merge_pairs
     ) except +
 
     cdef unique_ptr[column] byte_pair_encoding(
diff --git a/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx
index 9ea7fa2c51e..cad33060eaf 100644
--- a/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx
@@ -4,7 +4,6 @@
 from cudf.core.buffer import acquire_spill_lock
 
 from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
@@ -13,7 +12,7 @@ from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.nvtext.bpe_tokenize cimport (
     bpe_merge_pairs as cpp_bpe_merge_pairs,
     byte_pair_encoding as cpp_byte_pair_encoding,
-    load_merge_pairs_file as cpp_load_merge_pairs_file,
+    load_merge_pairs as cpp_load_merge_pairs,
 )
 from cudf._lib.cpp.scalar.scalar cimport string_scalar
 from cudf._lib.scalar cimport DeviceScalar
@@ -22,10 +21,10 @@ from cudf._lib.scalar cimport DeviceScalar
 cdef class BPE_Merge_Pairs:
     cdef unique_ptr[cpp_bpe_merge_pairs] c_obj
 
-    def __cinit__(self, merges_file):
-        cdef string c_merges_file = <string>str(merges_file).encode()
+    def __cinit__(self, Column merge_pairs):
+        cdef column_view c_pairs = merge_pairs.view()
         with nogil:
-            self.c_obj = move(cpp_load_merge_pairs_file(c_merges_file))
+            self.c_obj = move(cpp_load_merge_pairs(c_pairs))
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
index db220ea6da4..a1674a099c5 100644
--- a/python/cudf/cudf/core/byte_pair_encoding.py
+++ b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -14,16 +14,16 @@ class BytePairEncoder:
 
     Parameters
     ----------
-    merges_file : str
-        Path to file containing merge pairs.
+    merges_pairs : str
+        Strings column of merge pairs
 
     Returns
     -------
     BytePairEncoder
     """
 
-    def __init__(self, merges_file: str):
-        self.merge_pairs = cpp_merge_pairs(merges_file)
+    def __init__(self, merges_pair: "cudf.Series"):
+        self.merge_pairs = cpp_merge_pairs(merges_pair._column)
 
     def __call__(self, text, separator: str = " "):
         """
@@ -41,7 +41,10 @@ def __call__(self, text, separator: str = " "):
         --------
         >>> import cudf
         >>> from cudf.core.byte_pair_encoding import BytePairEncoder
-        >>> bpe = BytePairEncoder('merges.txt')
+        >>> mps = cudf.Series(["e n", "i t", "i s", "e s", "en t",
+        ...                    "c e", "es t", "en ce", "T h", "Th is",
+        ...                    "t est", "s ent", "t h", "th is"])
+        >>> bpe = BytePairEncoder(mps)
         >>> str_series = cudf.Series(['This is the sentence', 'thisisit'])
         >>> bpe(str_series)
         0    This is a sent ence

From e6a08483e938204dde9969ec74b68d8fd8b6c847 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 24 Aug 2023 12:26:16 -0400
Subject: [PATCH 05/28] use u_char for is-whitespace fn

---
 cpp/src/text/subword/bpe_tokenizer.cu | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 6407bfdf7c5..1afad89571a 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -50,11 +50,7 @@ namespace detail {
 
 namespace {
 
-template <typename CharType>
-constexpr bool is_whitespace(CharType ch)
-{
-  return ch <= ' ';
-}
+constexpr bool is_whitespace(u_char ch) { return ch <= ' '; }
 
 /**
  * @brief Resolve a substring up to the first whitespace character.
@@ -69,9 +65,10 @@ constexpr bool is_whitespace(CharType ch)
 __device__ cudf::string_view get_first_token(cudf::string_view const& d_str)
 {
   auto const begin = d_str.data();
-  auto const end   = thrust::find_if(
-    thrust::seq, begin, begin + d_str.size_bytes(), [](auto ch) { return is_whitespace(ch); });
-  auto const size = static_cast<cudf::size_type>(thrust::distance(begin, end));
+  auto const end   = thrust::find_if(thrust::seq, begin, begin + d_str.size_bytes(), [](auto ch) {
+    return is_whitespace(static_cast<u_char>(ch));
+  });
+  auto const size  = static_cast<cudf::size_type>(thrust::distance(begin, end));
   return cudf::string_view(begin, size);
 }
 
@@ -402,7 +399,7 @@ struct edge_of_space_fn {
   __device__ bool operator()(cudf::size_type offset)
   {
     auto const d_chars =
-      d_strings.child(cudf::strings_column_view::chars_column_index).data<char>();
+      d_strings.child(cudf::strings_column_view::chars_column_index).data<u_char>();
     if (is_whitespace(d_chars[offset]) || !is_whitespace(d_chars[offset - 1])) { return false; }
 
     auto const offsets   = d_strings.child(cudf::strings_column_view::offsets_column_index);
@@ -484,10 +481,10 @@ struct list_offsets_fn {
     auto const end   = thrust::make_counting_iterator<cudf::size_type>(d_str.size_bytes());
 
     // this counts the number of non-adjacent delimiters
-    auto const result =
-      thrust::count_if(thrust::seq, begin, end, [data = d_str.data()](auto chidx) {
-        return !is_whitespace(data[chidx]) && is_whitespace(data[chidx - 1]);
-      });
+    auto const data   = reinterpret_cast<u_char const*>(d_str.data());
+    auto const result = thrust::count_if(thrust::seq, begin, end, [data](auto chidx) {
+      return !is_whitespace(data[chidx]) && is_whitespace(data[chidx - 1]);
+    });
     return static_cast<cudf::size_type>(result) + 1;
   }
 };

From a1735ac3f0e584d0f9580f1618ec459b6a0a2b00 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 25 Aug 2023 17:05:31 -0400
Subject: [PATCH 06/28] use separator in final encoding step

---
 cpp/include/nvtext/bpe_tokenize.hpp   | 40 -------------------
 cpp/src/text/subword/bpe_tokenizer.cu | 56 ++++++++++++++-------------
 cpp/tests/text/bpe_tests.cpp          | 15 ++-----
 3 files changed, 33 insertions(+), 78 deletions(-)

diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp
index 258ee15ff87..7a362847a7c 100644
--- a/cpp/include/nvtext/bpe_tokenize.hpp
+++ b/cpp/include/nvtext/bpe_tokenize.hpp
@@ -166,45 +166,5 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
   cudf::string_scalar const& separator = cudf::string_scalar(" "),
   rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
 
-/**
- * @brief Byte pair encode the input strings
- *
- * This will split each string on whitespace, perform the encoding,
- * and then build the output column using the given `separator`.
- *
- * The encoding algorithm rebuilds each string by matching substrings
- * in the `merge_pairs` column and iteratively removing the minimum ranked pair
- * until no pairs are left. Then, a space is inserted between the remaining
- * pairs before the result is joined to make the output string.
- *
- * @code{.pseudo}
- * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"]
- * input = ["test sentence", "thisis test"]
- * result = byte_pair_encoding(input, merge_pairs)
- * result is now ["test sent ence", "this is test"]
- * @endcode
- *
- * The `merges_pairs` column should contain a unique pair of strings per line separated by
- * a single space. An incorrect format or non-unique entries will result in
- * undefined behavior.
- *
- * @throw cudf::logic_error if `merge_pairs` is empty or contains nulls
- * @throw cudf::logic_error if `separator` is invalid
- *
- * @param input Strings to encode.
- * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file.
- * @param separator String used to build the output after encoding.
- *                  Default is a space.
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Memory resource to allocate any returned objects.
- * @return An encoded column of strings.
- */
-std::unique_ptr<cudf::column> byte_pair_encoding(
-  cudf::strings_column_view const& input,
-  cudf::strings_column_view const& merges_pairs,
-  cudf::string_scalar const& separator = cudf::string_scalar(" "),
-  rmm::cuda_stream_view stream         = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
-
 /** @} */  // end of group
 }  // namespace nvtext
diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 1afad89571a..bbf07f96b00 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -24,6 +24,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/combine.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/utilities.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -81,6 +82,7 @@ template <typename MapRefType>
 struct byte_pair_encoding_fn {
   cudf::column_device_view const d_merges;
   cudf::column_device_view const d_strings;
+  cudf::string_view const d_separator;
   MapRefType const d_map;
   cudf::size_type* d_sizes;  // output size of encoded string
   string_hasher_type const hasher;
@@ -273,13 +275,11 @@ struct byte_pair_encoding_fn {
     }
 
     // compute and store the output size for this string's encoding
-    auto const encoded_size = d_str.size_bytes() +  // number of original bytes +
-                              thrust::count_if(     // number of non-zero byte indices
-                                thrust::seq,
-                                d_indices,
-                                d_indices + d_str.size_bytes(),
-                                [](auto v) { return v != 0; });
-    d_sizes[idx] = static_cast<cudf::size_type>(encoded_size);
+    auto separators_size =
+      thrust::count_if(
+        thrust::seq, d_indices, d_indices + d_str.size_bytes(), [](auto v) { return v != 0; }) *
+      d_separator.size_bytes();
+    d_sizes[idx] = static_cast<cudf::size_type>(d_str.size_bytes() + separators_size);
   }
 };
 
@@ -296,6 +296,7 @@ struct byte_pair_encoding_fn {
  */
 struct build_encoding_fn {
   cudf::column_device_view const d_strings;
+  cudf::string_view const d_separator;
   cudf::size_type const* d_byte_indices;
   cudf::size_type const* d_offsets;
   char* d_chars{};
@@ -319,7 +320,7 @@ struct build_encoding_fn {
     *d_output++      = *d_input++;
     auto itr         = begin + 1;
     while (itr < end) {
-      if (*itr++) *d_output++ = ' ';
+      if (*itr++) { d_output = cudf::strings::detail::copy_string(d_output, d_separator); }
       *d_output++ = *d_input++;
     }
     // https://github.com/rapidsai/cudf/pull/10270/files#r826319405
@@ -345,6 +346,7 @@ struct build_encoding_fn {
 std::unique_ptr<cudf::column> byte_pair_encoding(
   cudf::strings_column_view const& input,
   bpe_merge_pairs::bpe_merge_pairs_impl const& merge_pairs,
+  cudf::string_view d_separator,
   rmm::cuda_stream_view stream)
 {
   auto const d_merges = merge_pairs.get_merge_pairs();
@@ -363,8 +365,13 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
   auto d_offsets = offsets->mutable_view().data<cudf::size_type>();
 
   auto map_ref = merge_pairs.get_merge_pairs_ref();
-  byte_pair_encoding_fn<decltype(map_ref)> fn{
-    d_merges, *d_strings, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()};
+  byte_pair_encoding_fn<decltype(map_ref)> fn{d_merges,
+                                              *d_strings,
+                                              d_separator,
+                                              map_ref,
+                                              d_offsets,
+                                              string_hasher_type{},
+                                              d_byte_indices.data()};
   thrust::for_each_n(
     rmm::exec_policy(stream), thrust::make_counting_iterator<cudf::size_type>(0), input.size(), fn);
 
@@ -378,10 +385,11 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
     bytes, stream, rmm::mr::get_current_device_resource());
   auto d_chars = chars->mutable_view().data<char>();
 
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<cudf::size_type>(0),
-                     input.size(),
-                     build_encoding_fn{*d_strings, d_byte_indices.data(), d_offsets, d_chars});
+  thrust::for_each_n(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator<cudf::size_type>(0),
+    input.size(),
+    build_encoding_fn{*d_strings, d_separator, d_byte_indices.data(), d_offsets, d_chars});
 
   return make_strings_column(
     input.size(), std::move(offsets), std::move(chars), 0, rmm::device_buffer{});
@@ -497,8 +505,12 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
                                                  rmm::cuda_stream_view stream,
                                                  rmm::mr::device_memory_resource* mr)
 {
-  if (input.is_empty() || input.chars_size() == 0)
+  if (input.is_empty() || input.chars_size() == 0) {
     return cudf::make_empty_column(cudf::type_id::STRING);
+  }
+
+  CUDF_EXPECTS(separator.is_valid(stream), "separator parameter must be valid");
+  auto const d_separator = separator.value(stream);
 
   auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
   auto const offsets   = space_offsets(input, *d_strings, stream);
@@ -513,8 +525,8 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
                                             {offsets->view(), input.chars()});
 
   // run BPE on this view
-  auto const bpe_column =
-    byte_pair_encoding(cudf::strings_column_view(input_view), *(merge_pairs.impl), stream);
+  auto const bpe_column = byte_pair_encoding(
+    cudf::strings_column_view(input_view), *(merge_pairs.impl), d_separator, stream);
 
   // recombine the result:
   // compute the offsets needed to build a list view
@@ -558,14 +570,4 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
   return detail::byte_pair_encoding(input, merges_table, separator, cudf::get_default_stream(), mr);
 }
 
-std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const& input,
-                                                 cudf::strings_column_view const& merges_table,
-                                                 cudf::string_scalar const& separator,
-                                                 rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::byte_pair_encoding(input, merges_table, separator, stream, mr);
-}
-
 }  // namespace nvtext
diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp
index 1d9ad12225e..fbb2335f343 100644
--- a/cpp/tests/text/bpe_tests.cpp
+++ b/cpp/tests/text/bpe_tests.cpp
@@ -66,8 +66,6 @@ TEST_F(TextBPETokenize, BytePairEncoding)
                                                       ""},
                                                      validity);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
-  results = nvtext::byte_pair_encoding(sv, cudf::strings_column_view(mpt));
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 
   auto sliced          = cudf::slice(input, {1, 4}).front();
   auto sliced_expected = cudf::slice(expected, {1, 4}).front();
@@ -75,8 +73,6 @@ TEST_F(TextBPETokenize, BytePairEncoding)
 
   results = nvtext::byte_pair_encoding(sv, *merge_pairs);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected);
-  results = nvtext::byte_pair_encoding(sv, cudf::strings_column_view(mpt));
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected);
 }
 
 TEST_F(TextBPETokenize, BytePairEncodingSeparator)
@@ -91,8 +87,10 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator)
 
   auto results = nvtext::byte_pair_encoding(sv, *merge_pairs, std::string(" Ġ"));
 
-  auto expected = cudf::test::strings_column_wrapper(
-    {"test - sent ence - 1", "test Ġsent ence - 2", "test Ġsent ence Ġ3", " Ġtest Ġsent ence Ġ4"});
+  auto expected = cudf::test::strings_column_wrapper({"test Ġ- Ġsent Ġence Ġ- Ġ1",
+                                                      "test Ġsent Ġence Ġ- Ġ2",
+                                                      "test Ġsent Ġence Ġ3",
+                                                      " Ġtest Ġsent Ġence Ġ4"});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 }
 
@@ -111,9 +109,4 @@ TEST_F(TextBPETokenize, BPE_Error)
   EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(*empty)), cudf::logic_error);
   auto null_pairs = cudf::test::strings_column_wrapper({"", ""}, {1, 0});
   EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(null_pairs)), cudf::logic_error);
-  auto input     = cudf::test::strings_column_wrapper({"isit"});
-  auto separator = cudf::string_scalar("", false);
-  EXPECT_THROW(nvtext::byte_pair_encoding(
-                 cudf::strings_column_view(input), cudf::strings_column_view(input), separator),
-               cudf::logic_error);
 }

From f3926176ea1751745ddae54ffda6445f212c43b4 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 28 Aug 2023 15:26:16 -0400
Subject: [PATCH 07/28] remove whitespace checks

---
 cpp/src/text/subword/bpe_tokenizer.cu  | 306 +++----------------------
 cpp/src/text/subword/bpe_tokenizer.cuh |  40 +++-
 cpp/tests/text/bpe_tests.cpp           |  46 ++--
 3 files changed, 85 insertions(+), 307 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index bbf07f96b00..ce5b6523854 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -21,8 +21,8 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/get_value.cuh>
+#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/strings/detail/combine.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/utilities/default_stream.hpp>
@@ -31,18 +31,11 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <thrust/binary_search.h>
 #include <thrust/copy.h>
 #include <thrust/count.h>
-#include <thrust/distance.h>
-#include <thrust/execution_policy.h>
-#include <thrust/find.h>
 #include <thrust/for_each.h>
-#include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
-#include <thrust/merge.h>
-#include <thrust/pair.h>
 #include <thrust/scan.h>
 #include <thrust/transform.h>
 
@@ -51,28 +44,6 @@ namespace detail {
 
 namespace {
 
-constexpr bool is_whitespace(u_char ch) { return ch <= ' '; }
-
-/**
- * @brief Resolve a substring up to the first whitespace character.
- *
- * This will return a substring of the input starting with the first byte
- * up to the first whitespace character found or the end of the string.
- * Any whitespace is expected only at the end of the string.
- *
- * @param d_str Input string to resolve.
- * @return Substring of the input excluding any trailing whitespace.
- */
-__device__ cudf::string_view get_first_token(cudf::string_view const& d_str)
-{
-  auto const begin = d_str.data();
-  auto const end   = thrust::find_if(thrust::seq, begin, begin + d_str.size_bytes(), [](auto ch) {
-    return is_whitespace(static_cast<u_char>(ch));
-  });
-  auto const size  = static_cast<cudf::size_type>(thrust::distance(begin, end));
-  return cudf::string_view(begin, size);
-}
-
 /**
  * @brief Main byte pair encoding algorithm function for each string.
  *
@@ -88,29 +59,6 @@ struct byte_pair_encoding_fn {
   string_hasher_type const hasher;
   cudf::size_type* d_byte_indices;
 
-  /**
-   * @brief Parse the merge pair into components.
-   *
-   * The two substrings are separated by a single space.
-   *
-   * @param idx Index of merge pair to dissect.
-   * @return The left and right halves of the merge pair.
-   */
-  __device__ thrust::pair<cudf::string_view, cudf::string_view> dissect_merge_pair(
-    cudf::size_type idx)
-  {
-    auto const d_pair  = d_merges.element<cudf::string_view>(idx);
-    auto const lhs     = d_pair.data();
-    auto const end_str = d_pair.data() + d_pair.size_bytes();
-    auto const rhs     = thrust::find(thrust::seq, lhs, end_str, ' ');  // space always expected
-    // check for malformed pair entry to prevent segfault
-    if (rhs == end_str) { return thrust::make_pair(cudf::string_view{}, cudf::string_view{}); }
-    auto const lhs_size = static_cast<cudf::size_type>(thrust::distance(lhs, rhs));
-    auto const rhs_size = static_cast<cudf::size_type>(thrust::distance(rhs + 1, end_str));
-    return thrust::make_pair(cudf::string_view(lhs, lhs_size),
-                             cudf::string_view(rhs + 1, rhs_size));
-  }
-
   /**
    * @brief Get the next substring of the given string.
    *
@@ -144,25 +92,8 @@ struct byte_pair_encoding_fn {
    */
   __device__ auto get_merge_pair(cudf::string_view const& lhs, cudf::string_view const& rhs)
   {
-    __shared__ char shmem[48 * 1024];  // max for Pascal
-    auto const total_size         = lhs.size_bytes() + rhs.size_bytes() + 1;
-    auto const thread_memory_size = static_cast<cudf::size_type>(sizeof(shmem) / blockDim.x);
-
-    // Edge case check.
-    // Empirically found only two merge pair strings that were greater than 70 bytes
-    // and they both looked like ignorable errors.
-    if (thread_memory_size < total_size) { return d_map.end(); }
-
-    // build the target string in shared memory
-    char* ptr = &shmem[threadIdx.x * thread_memory_size];
-
-    // build a temp string like:  temp = lhs + ' ' + rhs
-    memcpy(ptr, lhs.data(), lhs.size_bytes());
-    memcpy(ptr + lhs.size_bytes(), " ", 1);
-    memcpy(ptr + lhs.size_bytes() + 1, rhs.data(), rhs.size_bytes());
-
-    auto const d_str = cudf::string_view(ptr, total_size);
-    return d_map.find(d_str);
+    auto const mp = merge_pair_type{lhs, rhs};
+    return d_map.find(mp);
   }
 
   /**
@@ -188,14 +119,14 @@ struct byte_pair_encoding_fn {
       d_sizes[idx] = 0;
       return;
     }
-    auto const d_str = get_first_token(d_strings.element<cudf::string_view>(idx));
+    auto const d_str = d_strings.element<cudf::string_view>(idx);
     if (d_str.empty()) {
       d_sizes[idx] = 0;
       return;
     }
 
     auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index)
-                          .element<cudf::size_type>(idx);
+                          .element<cudf::size_type>(idx + d_strings.offset());
     auto const d_indices = d_byte_indices + offset;
 
     // initialize the byte indices for this string;
@@ -227,7 +158,7 @@ struct byte_pair_encoding_fn {
       // check each adjacent pair against the d_map
       while (itr < end) {
         auto const rhs = next_substr(itr, end, d_str);
-        if (rhs.empty()) break;  // no more adjacent pairs
+        if (rhs.empty()) { break; }  // no more adjacent pairs
 
         auto const map_itr = get_merge_pair(lhs, rhs);
         if (map_itr != d_map.end()) {
@@ -253,11 +184,12 @@ struct byte_pair_encoding_fn {
         // continue scanning for other occurrences in the remainder of the string
         itr += min_size;
         if (itr < end) {
-          auto const d_pair = dissect_merge_pair(min_rank);
+          // auto const d_pair = dissect_merge_pair(min_rank);
+          auto const d_pair = dissect_merge_pair(d_merges.element<cudf::string_view>(min_rank));
 
           lhs = next_substr(itr, end, d_str);
           itr += lhs.size_bytes();
-          while (itr < end) {
+          while (itr < end && !lhs.empty()) {
             auto rhs = next_substr(itr, end, d_str);
             if (d_pair.first == lhs && d_pair.second == rhs) {
               *itr = 0;                   // removes the pair from this string
@@ -304,13 +236,13 @@ struct build_encoding_fn {
   __device__ void operator()(cudf::size_type idx)
   {
     if (d_strings.is_null(idx)) { return; }
-    auto const d_str = get_first_token(d_strings.element<cudf::string_view>(idx));
+    auto const d_str = d_strings.element<cudf::string_view>(idx);
     if (d_str.empty()) { return; }
 
     auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index)
-                          .element<cudf::size_type>(idx);
+                          .element<cudf::size_type>(idx + d_strings.offset());
     auto const d_indices = d_byte_indices + offset;
-    auto d_output        = d_chars ? d_chars + d_offsets[idx] : nullptr;
+    auto d_output        = d_chars + d_offsets[idx];
 
     // copy chars while indices[i]==0,
     // insert space each time indices[i]!=0
@@ -327,36 +259,26 @@ struct build_encoding_fn {
   }
 };
 
-/**
- * @brief Perform byte pair encoding on each string in the input column.
- *
- * The result is a strings column of the same size where each string has been encoded.
- *
- * The encoding is performed iteratively. Each pass determines the string's lowest
- * ranked merge pair as determined by the strings in `merges_table`. This pair
- * is removed (virtually) from each string before starting the next iteration.
- *
- * Once all pairs have exhausted for all strings, the output is constructed from
- * the results by adding spaces between each remaining pair in each string.
- *
- * @param input Strings to encode.
- * @param merge_pairs Merge pairs data and map used for encoding.
- * @param stream CUDA stream used for device memory operations and kernel launches
- */
-std::unique_ptr<cudf::column> byte_pair_encoding(
-  cudf::strings_column_view const& input,
-  bpe_merge_pairs::bpe_merge_pairs_impl const& merge_pairs,
-  cudf::string_view d_separator,
-  rmm::cuda_stream_view stream)
+}  // namespace
+
+std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const& input,
+                                                 bpe_merge_pairs const& merge_pairs,
+                                                 cudf::string_scalar const& separator,
+                                                 rmm::cuda_stream_view stream,
+                                                 rmm::mr::device_memory_resource* mr)
 {
-  auto const d_merges = merge_pairs.get_merge_pairs();
-  CUDF_EXPECTS(d_merges.size() > 0, "Merge pairs table must not be empty");
+  if (input.is_empty() || input.chars_size() == 0) {
+    return cudf::make_empty_column(cudf::type_id::STRING);
+  }
 
-  // build working vector to hold index values per byte
-  rmm::device_uvector<cudf::size_type> d_byte_indices(input.chars().size(), stream);
+  CUDF_EXPECTS(separator.is_valid(stream), "separator parameter must be valid");
+  auto const d_separator = separator.value(stream);
 
   auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
 
+  // build working vector to hold index values per byte
+  rmm::device_uvector<cudf::size_type> d_byte_indices(input.chars().size(), stream);
+
   auto offsets   = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()},
                                            static_cast<cudf::size_type>(input.size() + 1),
                                            cudf::mask_state::UNALLOCATED,
@@ -364,7 +286,8 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
                                            rmm::mr::get_current_device_resource());
   auto d_offsets = offsets->mutable_view().data<cudf::size_type>();
 
-  auto map_ref = merge_pairs.get_merge_pairs_ref();
+  auto const d_merges = merge_pairs.impl->get_merge_pairs();
+  auto const map_ref  = merge_pairs.impl->get_merge_pairs_ref();
   byte_pair_encoding_fn<decltype(map_ref)> fn{d_merges,
                                               *d_strings,
                                               d_separator,
@@ -391,172 +314,11 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
     input.size(),
     build_encoding_fn{*d_strings, d_separator, d_byte_indices.data(), d_offsets, d_chars});
 
-  return make_strings_column(
-    input.size(), std::move(offsets), std::move(chars), 0, rmm::device_buffer{});
-}
-
-/**
- * @brief Detect space to not-space transitions inside each string.
- *
- * This handles sliced input and null strings as well.
- * It is parallelized over bytes and returns true only for valid left edges
- * -- non-space preceded by a space.
- */
-struct edge_of_space_fn {
-  cudf::column_device_view const d_strings;
-  __device__ bool operator()(cudf::size_type offset)
-  {
-    auto const d_chars =
-      d_strings.child(cudf::strings_column_view::chars_column_index).data<u_char>();
-    if (is_whitespace(d_chars[offset]) || !is_whitespace(d_chars[offset - 1])) { return false; }
-
-    auto const offsets   = d_strings.child(cudf::strings_column_view::offsets_column_index);
-    auto const d_offsets = offsets.data<cudf::size_type>() + d_strings.offset();
-    // ignore offsets outside sliced range
-    if (offset < d_offsets[0] || offset >= d_offsets[d_strings.size()]) { return false; }
-
-    auto itr =
-      thrust::lower_bound(thrust::seq, d_offsets, d_offsets + d_strings.size() + 1, offset);
-    // ignore offsets at existing string boundaries
-    if (*itr == offset) { return false; }
-
-    // count only edges for valid strings
-    auto const index = static_cast<cudf::size_type>(thrust::distance(d_offsets, itr)) - 1;
-    return d_strings.is_valid(index);
-  }
-};
-
-/**
- * @brief Create new offsets by identifying substrings by whitespace.
- *
- * This is similar to cudf::strings::split_record but does not fully split
- * and only returns new offsets. The behavior is more like a view-only slice
- * of the chars child with the result still including trailing delimiters.
- *
- * The encoding algorithm ignores the trailing whitespace of each string.
- *
- * @param input Strings to tokenize.
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @return New offsets including those at the edge of each space.
- */
-std::unique_ptr<cudf::column> space_offsets(cudf::strings_column_view const& input,
-                                            cudf::column_device_view const& d_strings,
-                                            rmm::cuda_stream_view stream)
-{
-  // count space offsets
-  auto const begin = thrust::make_counting_iterator<cudf::size_type>(1);
-  auto const end   = thrust::make_counting_iterator<cudf::size_type>(input.chars().size());
-  edge_of_space_fn edge_of_space{d_strings};
-  auto const space_count = thrust::count_if(rmm::exec_policy(stream), begin, end, edge_of_space);
-
-  // copy space offsets
-  rmm::device_uvector<cudf::size_type> space_offsets(space_count, stream);
-  thrust::copy_if(rmm::exec_policy(stream), begin, end, space_offsets.data(), edge_of_space);
-
-  // create output offsets
-  auto result =
-    cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()},
-                              static_cast<cudf::size_type>(space_count + input.size() + 1),
-                              cudf::mask_state::UNALLOCATED,
-                              stream,
-                              rmm::mr::get_current_device_resource());
-
-  // combine current offsets with space offsets
-  thrust::merge(rmm::exec_policy(stream),
-                input.offsets_begin(),
-                input.offsets_end(),
-                space_offsets.begin(),
-                space_offsets.end(),
-                result->mutable_view().begin<cudf::size_type>());
-
-  return result;
-}
-
-/**
- * @brief Build new offsets that can be used to build a list column for calling join.
- *
- * This essentially returns the number of tokens for each string.
- */
-struct list_offsets_fn {
-  cudf::column_device_view const d_strings;
-  __device__ cudf::size_type operator()(cudf::size_type idx)
-  {
-    if (d_strings.is_null(idx)) return 0;
-    auto const d_str = d_strings.element<cudf::string_view>(idx);
-    if (d_str.empty()) return 1;  // empty is a single valid result
-
-    auto const begin = thrust::make_counting_iterator<cudf::size_type>(1);
-    auto const end   = thrust::make_counting_iterator<cudf::size_type>(d_str.size_bytes());
-
-    // this counts the number of non-adjacent delimiters
-    auto const data   = reinterpret_cast<u_char const*>(d_str.data());
-    auto const result = thrust::count_if(thrust::seq, begin, end, [data](auto chidx) {
-      return !is_whitespace(data[chidx]) && is_whitespace(data[chidx - 1]);
-    });
-    return static_cast<cudf::size_type>(result) + 1;
-  }
-};
-
-}  // namespace
-
-std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const& input,
-                                                 bpe_merge_pairs const& merge_pairs,
-                                                 cudf::string_scalar const& separator,
-                                                 rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr)
-{
-  if (input.is_empty() || input.chars_size() == 0) {
-    return cudf::make_empty_column(cudf::type_id::STRING);
-  }
-
-  CUDF_EXPECTS(separator.is_valid(stream), "separator parameter must be valid");
-  auto const d_separator = separator.value(stream);
-
-  auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
-  auto const offsets   = space_offsets(input, *d_strings, stream);
-
-  // build a view using the new offsets and the current input chars column
-  auto const input_view = cudf::column_view(cudf::data_type{cudf::type_id::STRING},
-                                            offsets->size() - 1,
-                                            nullptr,  // no parent data
-                                            nullptr,  // null-mask
-                                            0,        // null-count
-                                            0,        // offset
-                                            {offsets->view(), input.chars()});
-
-  // run BPE on this view
-  auto const bpe_column = byte_pair_encoding(
-    cudf::strings_column_view(input_view), *(merge_pairs.impl), d_separator, stream);
-
-  // recombine the result:
-  // compute the offsets needed to build a list view
-  auto const list_offsets = [d_strings = *d_strings, stream] {
-    auto offsets_itr = thrust::make_transform_iterator(
-      thrust::make_counting_iterator<cudf::size_type>(0), list_offsets_fn{d_strings});
-    auto offsets_column = std::get<0>(cudf::detail::make_offsets_child_column(
-      offsets_itr, offsets_itr + d_strings.size(), stream, rmm::mr::get_current_device_resource()));
-    return offsets_column;
-  }();
-
-  // build a list column_view using the BPE output and the list_offsets
-  auto const list_join = cudf::column_view(cudf::data_type{cudf::type_id::LIST},
-                                           input.size(),
-                                           nullptr,  // no parent data in list column
-                                           input.null_mask(),
-                                           input.null_count(),
-                                           0,
-                                           {list_offsets->view(), bpe_column->view()});
-
-  // build the output strings column
-  auto result =
-    cudf::strings::detail::join_list_elements(cudf::lists_column_view(list_join),
-                                              separator,
-                                              cudf::string_scalar(""),
-                                              cudf::strings::separator_on_nulls::NO,
-                                              cudf::strings::output_if_empty_list::EMPTY_STRING,
-                                              stream,
-                                              mr);
-  return result;
+  return cudf::make_strings_column(input.size(),
+                                   std::move(offsets),
+                                   std::move(chars),
+                                   input.null_count(),
+                                   cudf::detail::copy_bitmask(input.parent(), stream, mr));
 }
 
 }  // namespace detail
diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index 83aa22aaae9..c24fb8a745e 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -22,6 +22,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/hashing/detail/hashing.hpp>
 #include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
 #include <cudf/strings/string_view.cuh>
 
@@ -31,6 +32,11 @@
 
 #include <cuco/static_map.cuh>
 
+#include <thrust/distance.h>
+#include <thrust/execution_policy.h>
+#include <thrust/find.h>
+#include <thrust/pair.h>
+
 #include <cstdint>
 #include <type_traits>
 
@@ -39,6 +45,27 @@ namespace detail {
 
 using hash_value_type    = uint32_t;
 using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
+using merge_pair_type    = thrust::pair<cudf::string_view, cudf::string_view>;
+
+/**
+ * @brief Parse the merge pair into components.
+ *
+ * The two substrings are separated by a single space.
+ *
+ * @param idx Index of merge pair to dissect.
+ * @return The left and right halves of the merge pair.
+ */
+__device__ __inline__ merge_pair_type dissect_merge_pair(cudf::string_view d_pair)
+{
+  auto const lhs     = d_pair.data();
+  auto const end_str = d_pair.data() + d_pair.size_bytes();
+  auto const rhs     = thrust::find(thrust::seq, lhs, end_str, ' ');  // space always expected
+  // check for malformed pair entry to prevent segfault
+  if (rhs == end_str) { return merge_pair_type{cudf::string_view{}, cudf::string_view{}}; }
+  auto const lhs_size = static_cast<cudf::size_type>(thrust::distance(lhs, rhs));
+  auto const rhs_size = static_cast<cudf::size_type>(thrust::distance(rhs + 1, end_str));
+  return merge_pair_type{cudf::string_view(lhs, lhs_size), cudf::string_view(rhs + 1, rhs_size)};
+}
 
 /**
  * @brief Hasher function used for building and using the cuco static-map
@@ -52,10 +79,14 @@ struct bpe_hasher {
   // used by insert
   __device__ hash_value_type operator()(cudf::size_type index) const
   {
-    return hasher(d_strings.element<cudf::string_view>(index));
+    auto const [lhs, rhs] = dissect_merge_pair(d_strings.element<cudf::string_view>(index));
+    return cudf::hashing::detail::hash_combine(hasher(lhs), hasher(rhs));
   }
   // used by find
-  __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); }
+  __device__ hash_value_type operator()(merge_pair_type const& mp) const
+  {
+    return cudf::hashing::detail::hash_combine(hasher(mp.first), hasher(mp.second));
+  }
 };
 
 /**
@@ -72,9 +103,10 @@ struct bpe_equal {
     return d_strings.element<cudf::string_view>(lhs) == d_strings.element<cudf::string_view>(rhs);
   }
   // used by find
-  __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept
+  __device__ bool operator()(cudf::size_type lhs, merge_pair_type const& rhs) const noexcept
   {
-    return d_strings.element<cudf::string_view>(lhs) == rhs;
+    auto const d_pair = dissect_merge_pair(d_strings.element<cudf::string_view>(lhs));
+    return d_pair.first == rhs.first && d_pair.second == rhs.second;
   }
 };
 
diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp
index fbb2335f343..f2fd487e7f9 100644
--- a/cpp/tests/text/bpe_tests.cpp
+++ b/cpp/tests/text/bpe_tests.cpp
@@ -29,39 +29,23 @@ struct TextBPETokenize : public cudf::test::BaseFixture {};
 TEST_F(TextBPETokenize, BytePairEncoding)
 {
   // partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt
-  auto mpt = cudf::test::strings_column_wrapper({
-    "e n",    // 12
-    "i t",    // 14
-    "i s",    // 15
-    "e s",    // 18
-    "en t",   // 42
-    "c e",    // 88
-    "es t",   // 139
-    "en ce",  // 338
-    "T h",    // 561
-    "Th is",  // 956
-    "t est",  // 9032
-    "s ent",  // 33830
-  });
+  auto mpt = cudf::test::strings_column_wrapper(
+    {"e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "T h", "Th is", "t est", "s ent"});
 
   auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt));
 
   auto validity = cudf::test::iterators::null_at(4);
-  cudf::test::strings_column_wrapper input({" This\tis  it\n",
-                                            "This is test-sentence-1",
-                                            "This is test sentence-2",
-                                            "This-is test sentence 3",
-                                            "",
-                                            ""},
-                                           validity);
+  cudf::test::strings_column_wrapper input(
+    {"Thisisit", "Thisis test-sentence-1", "Thisistestsentence-2", "This-istestsentence 3", "", ""},
+    validity);
   auto sv = cudf::strings_column_view(input);
 
   auto results = nvtext::byte_pair_encoding(sv, *merge_pairs);
 
-  auto expected = cudf::test::strings_column_wrapper({" This is it",
-                                                      "This is test - sent ence - 1",
+  auto expected = cudf::test::strings_column_wrapper({"This is it",
+                                                      "This is   test - sent ence - 1",
                                                       "This is test sent ence - 2",
-                                                      "This - is test sent ence 3",
+                                                      "This - is test sent ence   3",
                                                       "",
                                                       ""},
                                                      validity);
@@ -78,19 +62,19 @@ TEST_F(TextBPETokenize, BytePairEncoding)
 TEST_F(TextBPETokenize, BytePairEncodingSeparator)
 {
   auto mpt = cudf::test::strings_column_wrapper(
-    {"e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"});
+    {"Ġ t", "Ġt he", "h e", "e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"});
   auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt));
 
   cudf::test::strings_column_wrapper input(
-    {"test-sentence-1", "test sentence-2", "test sentence 3", " test sentence 4 "});
+    {"Ġthe test sentence", "test Ġthe sentence", "Ġthetest sentence", "testĠthesentence"});
   auto sv = cudf::strings_column_view(input);
 
-  auto results = nvtext::byte_pair_encoding(sv, *merge_pairs, std::string(" Ġ"));
+  auto results = nvtext::byte_pair_encoding(sv, *merge_pairs, std::string("$"));
 
-  auto expected = cudf::test::strings_column_wrapper({"test Ġ- Ġsent Ġence Ġ- Ġ1",
-                                                      "test Ġsent Ġence Ġ- Ġ2",
-                                                      "test Ġsent Ġence Ġ3",
-                                                      " Ġtest Ġsent Ġence Ġ4"});
+  auto expected = cudf::test::strings_column_wrapper({"Ġthe$ $test$ $sent$ence",
+                                                      "test$ $Ġthe$ $sent$ence",
+                                                      "Ġthe$test$ $sent$ence",
+                                                      "test$Ġthe$sent$ence"});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 }
 

From cb02a25f582f0982fb920fe5de0b0cae0ba29b3d Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 29 Aug 2023 17:22:25 -0400
Subject: [PATCH 08/28] more efficient pair lookup

---
 cpp/include/nvtext/bpe_tokenize.hpp      |  7 +--
 cpp/src/text/subword/bpe_tokenizer.cu    | 59 +++++++++---------------
 cpp/src/text/subword/bpe_tokenizer.cuh   | 42 ++++++-----------
 cpp/src/text/subword/load_merges_file.cu | 14 +++---
 4 files changed, 48 insertions(+), 74 deletions(-)

diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp
index 7a362847a7c..4e361e3c130 100644
--- a/cpp/include/nvtext/bpe_tokenize.hpp
+++ b/cpp/include/nvtext/bpe_tokenize.hpp
@@ -32,7 +32,7 @@ namespace nvtext {
 /**
  * @brief The table of merge pairs for the BPE encoder.
  *
- * To create an instance, call @ref nvtext::load_merge_pairs_file
+ * To create an instance, call @ref nvtext::load_merge_pairs
  */
 struct bpe_merge_pairs {
   struct bpe_merge_pairs_impl;
@@ -144,7 +144,8 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs(
  * pairs before the result is joined to make the output string.
  *
  * @code{.pseudo}
- * mps = load_merges_file("merges.txt") // see doxygen for example contents
+ * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"]
+ * mps = load_merge_pairs(merge_pairs)
  * input = ["test sentence", "thisis test"]
  * result = byte_pair_encoding(input, mps)
  * result is now ["test sent ence", "this is test"]
@@ -154,7 +155,7 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs(
  * @throw cudf::logic_error if `separator` is invalid
  *
  * @param input Strings to encode.
- * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file.
+ * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs.
  * @param separator String used to build the output after encoding.
  *                  Default is a space.
  * @param mr Memory resource to allocate any returned objects.
diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index ce5b6523854..e30dfa7aed6 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -55,9 +55,9 @@ struct byte_pair_encoding_fn {
   cudf::column_device_view const d_strings;
   cudf::string_view const d_separator;
   MapRefType const d_map;
-  cudf::size_type* d_sizes;  // output size of encoded string
+  cudf::size_type* d_sizes;         // output size of encoded string
   string_hasher_type const hasher;
-  cudf::size_type* d_byte_indices;
+  cudf::size_type* d_byte_indices;  // output indices per string
 
   /**
    * @brief Get the next substring of the given string.
@@ -76,26 +76,13 @@ struct byte_pair_encoding_fn {
   template <typename Iterator>
   __device__ cudf::string_view next_substr(Iterator begin,
                                            Iterator end,
-                                           cudf::string_view const& d_str)
+                                           cudf::string_view const& d_str) const
   {
     auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; });
     auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));
     return cudf::string_view(d_str.data() + *begin, size);
   }
 
-  /**
-   * @brief Look up the pair of strings in the d_map/d_merges
-   *
-   * @param lhs Left half of the string
-   * @param rhs Right half of the string
-   * @return Position of merge pair within d_map
-   */
-  __device__ auto get_merge_pair(cudf::string_view const& lhs, cudf::string_view const& rhs)
-  {
-    auto const mp = merge_pair_type{lhs, rhs};
-    return d_map.find(mp);
-  }
-
   /**
    * @brief Byte encode each string.
    *
@@ -113,7 +100,7 @@ struct byte_pair_encoding_fn {
    *
    * @param idx The index of the string in `d_strings` to encode
    */
-  __device__ void operator()(cudf::size_type idx)
+  __device__ void operator()(cudf::size_type idx) const
   {
     if (d_strings.is_null(idx)) {
       d_sizes[idx] = 0;
@@ -154,13 +141,15 @@ struct byte_pair_encoding_fn {
 
       auto min_itr  = itr;               // these are set along with
       auto min_size = lhs.size_bytes();  // the min_rank variable
+      auto min_mp   = merge_pair_type{};
 
       // check each adjacent pair against the d_map
       while (itr < end) {
         auto const rhs = next_substr(itr, end, d_str);
         if (rhs.empty()) { break; }  // no more adjacent pairs
 
-        auto const map_itr = get_merge_pair(lhs, rhs);
+        auto const mp      = merge_pair_type{lhs, rhs};
+        auto const map_itr = d_map.find(mp);
         if (map_itr != d_map.end()) {
           // found a match; record the rank (and other min_ vars)
           auto const rank = map_itr->second;
@@ -168,6 +157,7 @@ struct byte_pair_encoding_fn {
             min_rank = rank;
             min_itr  = itr;
             min_size = rhs.size_bytes();
+            min_mp   = mp;
           }
         }
         // next substring
@@ -184,14 +174,11 @@ struct byte_pair_encoding_fn {
         // continue scanning for other occurrences in the remainder of the string
         itr += min_size;
         if (itr < end) {
-          // auto const d_pair = dissect_merge_pair(min_rank);
-          auto const d_pair = dissect_merge_pair(d_merges.element<cudf::string_view>(min_rank));
-
           lhs = next_substr(itr, end, d_str);
           itr += lhs.size_bytes();
           while (itr < end && !lhs.empty()) {
             auto rhs = next_substr(itr, end, d_str);
-            if (d_pair.first == lhs && d_pair.second == rhs) {
+            if ((min_mp.first == lhs) && (min_mp.second == rhs)) {
               *itr = 0;                   // removes the pair from this string
               itr += rhs.size_bytes();
               if (itr >= end) { break; }  // done checking for pairs
@@ -233,7 +220,7 @@ struct build_encoding_fn {
   cudf::size_type const* d_offsets;
   char* d_chars{};
 
-  __device__ void operator()(cudf::size_type idx)
+  __device__ void operator()(cudf::size_type idx) const
   {
     if (d_strings.is_null(idx)) { return; }
     auto const d_str = d_strings.element<cudf::string_view>(idx);
@@ -288,15 +275,15 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
 
   auto const d_merges = merge_pairs.impl->get_merge_pairs();
   auto const map_ref  = merge_pairs.impl->get_merge_pairs_ref();
-  byte_pair_encoding_fn<decltype(map_ref)> fn{d_merges,
-                                              *d_strings,
-                                              d_separator,
-                                              map_ref,
-                                              d_offsets,
-                                              string_hasher_type{},
-                                              d_byte_indices.data()};
-  thrust::for_each_n(
-    rmm::exec_policy(stream), thrust::make_counting_iterator<cudf::size_type>(0), input.size(), fn);
+  auto const bpe_fn   = byte_pair_encoding_fn<decltype(map_ref)>{d_merges,
+                                                                 *d_strings,
+                                                                 d_separator,
+                                                                 map_ref,
+                                                                 d_offsets,
+                                                                 string_hasher_type{},
+                                                                 d_byte_indices.data()};
+  auto const zero_itr = thrust::counting_iterator<cudf::size_type>(0);
+  thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), bpe_fn);
 
   // build the output: add spaces between the remaining pairs in each string
   thrust::exclusive_scan(
@@ -308,11 +295,9 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
     bytes, stream, rmm::mr::get_current_device_resource());
   auto d_chars = chars->mutable_view().data<char>();
 
-  thrust::for_each_n(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<cudf::size_type>(0),
-    input.size(),
-    build_encoding_fn{*d_strings, d_separator, d_byte_indices.data(), d_offsets, d_chars});
+  auto const result_fn =
+    build_encoding_fn{*d_strings, d_separator, d_byte_indices.data(), d_offsets, d_chars};
+  thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), result_fn);
 
   return cudf::make_strings_column(input.size(),
                                    std::move(offsets),
diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index c24fb8a745e..7632d879b67 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -47,26 +47,6 @@ using hash_value_type    = uint32_t;
 using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
 using merge_pair_type    = thrust::pair<cudf::string_view, cudf::string_view>;
 
-/**
- * @brief Parse the merge pair into components.
- *
- * The two substrings are separated by a single space.
- *
- * @param idx Index of merge pair to dissect.
- * @return The left and right halves of the merge pair.
- */
-__device__ __inline__ merge_pair_type dissect_merge_pair(cudf::string_view d_pair)
-{
-  auto const lhs     = d_pair.data();
-  auto const end_str = d_pair.data() + d_pair.size_bytes();
-  auto const rhs     = thrust::find(thrust::seq, lhs, end_str, ' ');  // space always expected
-  // check for malformed pair entry to prevent segfault
-  if (rhs == end_str) { return merge_pair_type{cudf::string_view{}, cudf::string_view{}}; }
-  auto const lhs_size = static_cast<cudf::size_type>(thrust::distance(lhs, rhs));
-  auto const rhs_size = static_cast<cudf::size_type>(thrust::distance(rhs + 1, end_str));
-  return merge_pair_type{cudf::string_view(lhs, lhs_size), cudf::string_view(rhs + 1, rhs_size)};
-}
-
 /**
  * @brief Hasher function used for building and using the cuco static-map
  *
@@ -79,7 +59,9 @@ struct bpe_hasher {
   // used by insert
   __device__ hash_value_type operator()(cudf::size_type index) const
   {
-    auto const [lhs, rhs] = dissect_merge_pair(d_strings.element<cudf::string_view>(index));
+    index *= 2;
+    auto const lhs = d_strings.element<cudf::string_view>(index);
+    auto const rhs = d_strings.element<cudf::string_view>(index + 1);
     return cudf::hashing::detail::hash_combine(hasher(lhs), hasher(rhs));
   }
   // used by find
@@ -100,13 +82,20 @@ struct bpe_equal {
   // used by insert
   __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept
   {
-    return d_strings.element<cudf::string_view>(lhs) == d_strings.element<cudf::string_view>(rhs);
+    lhs *= 2;
+    rhs *= 2;
+    return (d_strings.element<cudf::string_view>(lhs) ==
+            d_strings.element<cudf::string_view>(rhs)) &&
+           (d_strings.element<cudf::string_view>(lhs + 1) ==
+            d_strings.element<cudf::string_view>(rhs + 1));
   }
   // used by find
   __device__ bool operator()(cudf::size_type lhs, merge_pair_type const& rhs) const noexcept
   {
-    auto const d_pair = dissect_merge_pair(d_strings.element<cudf::string_view>(lhs));
-    return d_pair.first == rhs.first && d_pair.second == rhs.second;
+    lhs *= 2;
+    auto const left  = d_strings.element<cudf::string_view>(lhs);
+    auto const right = d_strings.element<cudf::string_view>(lhs + 1);
+    return (left == rhs.first) && (right == rhs.second);
   }
 };
 
@@ -124,9 +113,8 @@ using merge_pairs_map_type = cuco::experimental::static_map<cudf::size_type,
 
 }  // namespace detail
 
-// since column_device_view::create returns is a little more than
-// std::unique_ptr<column_device_view> this helper simplifies the return type in a more maintainable
-// way
+// since column_device_view::create() returns is a little more than
+// std::unique_ptr<column_device_view> this helper simplifies the return type for us
 using col_device_view = std::invoke_result_t<decltype(&cudf::column_device_view::create),
                                              cudf::column_view,
                                              rmm::cuda_stream_view>;
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index ece9f9a2967..b2c552e49bc 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/strings/split/split.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -88,12 +89,10 @@ std::unique_ptr<cudf::column> load_file_to_column(std::string const& filename_me
 std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
   cudf::column_device_view const& input, rmm::cuda_stream_view stream)
 {
-  // Ensure capacity is at least (size/0.7) as documented here:
-  // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182
   auto merge_pairs_map = std::make_unique<merge_pairs_map_type>(
-    static_cast<size_t>(input.size() * 2),  // capacity is 2x;
+    static_cast<size_t>(input.size()),
     cuco::empty_key{-1},
-    cuco::empty_value{-1},                  // empty value is not used
+    cuco::empty_value{-1},
     bpe_equal{input},
     probe_scheme{bpe_hasher{input}},
     hash_table_allocator_type{default_allocator<char>{}, stream},
@@ -102,7 +101,7 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
   auto iter = cudf::detail::make_counting_transform_iterator(
     0, [] __device__(cudf::size_type idx) { return cuco::make_pair(idx, idx); });
 
-  merge_pairs_map->insert_async(iter, iter + input.size(), stream.value());
+  merge_pairs_map->insert_async(iter, iter + (input.size() / 2), stream.value());
 
   return merge_pairs_map;
 }
@@ -121,8 +120,9 @@ std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_im
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
-  return create_bpe_merge_pairs_impl(std::make_unique<cudf::column>(input.parent(), stream, mr),
-                                     stream);
+  auto pairs   = cudf::strings::split_record(input);  // Fix once 13997 is merged
+  auto content = pairs->release();
+  return create_bpe_merge_pairs_impl(std::move(content.children.back()), stream);
 }
 
 }  // namespace

From fb111166bb0baacc39cede35e72c266ec076576d Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 31 Aug 2023 11:30:48 -0400
Subject: [PATCH 09/28] try adding zlib to dependencies.yaml

---
 dependencies.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dependencies.yaml b/dependencies.yaml
index a1d928797b0..fc024a67946 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -179,6 +179,7 @@ dependencies:
           - c-compiler
           - cxx-compiler
           - dlpack>=0.5,<0.6.0a0
+          - zlib
     specific:
       - output_types: conda
         matrices:

From ea4150d640b4a9e5015c9b314b58185bf0c32d56 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 31 Aug 2023 11:38:13 -0400
Subject: [PATCH 10/28] add zlib to conda env yamls too

---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 1 +
 conda/environments/all_cuda-120_arch-x86_64.yaml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index e4a9b2f1d29..e2648090147 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -95,6 +95,7 @@ dependencies:
 - tokenizers==0.13.1
 - transformers==4.24.0
 - typing_extensions>=4.0.0
+- zlib
 - pip:
   - git+https://github.com/python-streamz/streamz.git@master
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index d03c4364435..e6ee24651fb 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -92,6 +92,7 @@ dependencies:
 - tokenizers==0.13.1
 - transformers==4.24.0
 - typing_extensions>=4.0.0
+- zlib
 - pip:
   - git+https://github.com/python-streamz/streamz.git@master
 name: all_cuda-120_arch-x86_64

From 346b0cbee2325ff6f6e10baab5fb5c70fe4a9e7b Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 31 Aug 2023 13:11:52 -0400
Subject: [PATCH 11/28] undo temp changes

---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 1 -
 conda/environments/all_cuda-120_arch-x86_64.yaml | 1 -
 dependencies.yaml                                | 1 -
 3 files changed, 3 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index e2648090147..e4a9b2f1d29 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -95,7 +95,6 @@ dependencies:
 - tokenizers==0.13.1
 - transformers==4.24.0
 - typing_extensions>=4.0.0
-- zlib
 - pip:
   - git+https://github.com/python-streamz/streamz.git@master
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index e6ee24651fb..d03c4364435 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -92,7 +92,6 @@ dependencies:
 - tokenizers==0.13.1
 - transformers==4.24.0
 - typing_extensions>=4.0.0
-- zlib
 - pip:
   - git+https://github.com/python-streamz/streamz.git@master
 name: all_cuda-120_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index fc024a67946..a1d928797b0 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -179,7 +179,6 @@ dependencies:
           - c-compiler
           - cxx-compiler
           - dlpack>=0.5,<0.6.0a0
-          - zlib
     specific:
       - output_types: conda
         matrices:

From 6ad41876cb707c28ae5ebb781af6460b13e31626 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 31 Aug 2023 19:15:35 -0400
Subject: [PATCH 12/28] use segmented reduce

---
 cpp/src/text/subword/bpe_tokenizer.cu    | 67 ++++++++++++------------
 cpp/src/text/subword/bpe_tokenizer.cuh   |  7 +--
 cpp/src/text/subword/load_merges_file.cu |  6 +--
 3 files changed, 38 insertions(+), 42 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index e30dfa7aed6..298907ccb27 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -23,6 +23,7 @@
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/reduction/detail/segmented_reduction.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/utilities/default_stream.hpp>
@@ -51,7 +52,6 @@ namespace {
  */
 template <typename MapRefType>
 struct byte_pair_encoding_fn {
-  cudf::column_device_view const d_merges;
   cudf::column_device_view const d_strings;
   cudf::string_view const d_separator;
   MapRefType const d_map;
@@ -74,13 +74,16 @@ struct byte_pair_encoding_fn {
    * @return The substring found.
    */
   template <typename Iterator>
-  __device__ cudf::string_view next_substr(Iterator begin,
-                                           Iterator end,
-                                           cudf::string_view const& d_str) const
+  __device__ __inline__ cudf::string_view next_substr(Iterator begin,
+                                                      Iterator end,
+                                                      cudf::string_view const& d_str) const
   {
     auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; });
     auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));
-    return cudf::string_view(d_str.data() + *begin, size);
+    return cudf::string_view(
+      d_strings.child(cudf::strings_column_view::chars_column_index).data<char>() +
+        thrust::distance(d_byte_indices, begin),
+      size);
   }
 
   /**
@@ -116,20 +119,11 @@ struct byte_pair_encoding_fn {
                           .element<cudf::size_type>(idx + d_strings.offset());
     auto const d_indices = d_byte_indices + offset;
 
-    // initialize the byte indices for this string;
-    // set the index value to 0 for any intermediate UTF-8 bytes
-    thrust::transform(thrust::seq,
-                      thrust::make_counting_iterator<cudf::size_type>(0),
-                      thrust::make_counting_iterator<cudf::size_type>(d_str.size_bytes()),
-                      d_indices,
-                      [data = d_str.data()](auto idx) {
-                        auto const byte = static_cast<uint8_t>(data[idx]);
-                        return cudf::strings::detail::is_begin_utf8_char(byte) ? idx : 0;
-                      });
-
     auto const begin = d_indices;
     auto const end   = d_indices + d_str.size_bytes();
 
+    *begin = d_str.size_bytes();  // init first char
+
     // keep processing the string until there are no more adjacent pairs found in d_map
     cudf::size_type min_rank = 0;
     while (min_rank < cuda::std::numeric_limits<cudf::size_type>::max()) {
@@ -192,13 +186,6 @@ struct byte_pair_encoding_fn {
         }
       }
     }
-
-    // compute and store the output size for this string's encoding
-    auto separators_size =
-      thrust::count_if(
-        thrust::seq, d_indices, d_indices + d_str.size_bytes(), [](auto v) { return v != 0; }) *
-      d_separator.size_bytes();
-    d_sizes[idx] = static_cast<cudf::size_type>(d_str.size_bytes() + separators_size);
   }
 };
 
@@ -265,6 +252,17 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
 
   // build working vector to hold index values per byte
   rmm::device_uvector<cudf::size_type> d_byte_indices(input.chars().size(), stream);
+  // initialize the byte indices for all strings;
+  // set the index value to 0 for any intermediate UTF-8 bytes
+  auto const zero_itr = thrust::counting_iterator<cudf::size_type>(0);
+  thrust::transform(
+    rmm::exec_policy(stream),
+    zero_itr,
+    thrust::counting_iterator<cudf::size_type>(input.chars().size()),
+    d_byte_indices.begin(),
+    [data = input.chars().data<uint8_t>(), d_separator] __device__(auto idx) {
+      return cudf::strings::detail::is_begin_utf8_char(data[idx]) ? d_separator.size_bytes() : 0;
+    });
 
   auto offsets   = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()},
                                            static_cast<cudf::size_type>(input.size() + 1),
@@ -273,18 +271,21 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
                                            rmm::mr::get_current_device_resource());
   auto d_offsets = offsets->mutable_view().data<cudf::size_type>();
 
-  auto const d_merges = merge_pairs.impl->get_merge_pairs();
-  auto const map_ref  = merge_pairs.impl->get_merge_pairs_ref();
-  auto const bpe_fn   = byte_pair_encoding_fn<decltype(map_ref)>{d_merges,
-                                                                 *d_strings,
-                                                                 d_separator,
-                                                                 map_ref,
-                                                                 d_offsets,
-                                                                 string_hasher_type{},
-                                                                 d_byte_indices.data()};
-  auto const zero_itr = thrust::counting_iterator<cudf::size_type>(0);
+  // auto const d_merges = merge_pairs.impl->get_merge_pairs();
+  auto const map_ref = merge_pairs.impl->get_merge_pairs_ref();
+  auto const bpe_fn  = byte_pair_encoding_fn<decltype(map_ref)>{
+    *d_strings, d_separator, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()};
   thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), bpe_fn);
 
+  // compute and store the output size for this string's encoding
+  cudf::reduction::detail::segmented_reduce(d_byte_indices.begin(),
+                                            input.offsets_begin(),
+                                            input.offsets_end(),
+                                            d_offsets,
+                                            thrust::plus{},
+                                            0,
+                                            stream);
+
   // build the output: add spaces between the remaining pairs in each string
   thrust::exclusive_scan(
     rmm::exec_policy(stream), d_offsets, d_offsets + input.size() + 1, d_offsets);
diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index 7632d879b67..55b192c27d4 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -82,12 +82,7 @@ struct bpe_equal {
   // used by insert
   __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept
   {
-    lhs *= 2;
-    rhs *= 2;
-    return (d_strings.element<cudf::string_view>(lhs) ==
-            d_strings.element<cudf::string_view>(rhs)) &&
-           (d_strings.element<cudf::string_view>(lhs + 1) ==
-            d_strings.element<cudf::string_view>(rhs + 1));
+    return lhs == rhs;  // all rows are unique
   }
   // used by find
   __device__ bool operator()(cudf::size_type lhs, merge_pair_type const& rhs) const noexcept
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index b2c552e49bc..4271f3bd3ad 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -29,8 +29,6 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 
-#include <thrust/functional.h>
-
 #include <fstream>
 #include <iostream>
 #include <vector>
@@ -120,7 +118,9 @@ std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_im
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
-  auto pairs   = cudf::strings::split_record(input);  // Fix once 13997 is merged
+  auto pairs =
+    cudf::strings::split_record(input, cudf::string_scalar(" "));  // Fix once 13997 is merged
+  // perhaps check the pairs are valid?
   auto content = pairs->release();
   return create_bpe_merge_pairs_impl(std::move(content.children.back()), stream);
 }

From 511a076adc2f2ed773d50ff72b6507a2777f68cc Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 7 Sep 2023 12:00:48 -0400
Subject: [PATCH 13/28] block per string

---
 cpp/src/text/subword/bpe_tokenizer.cu    | 395 ++++++++++-------------
 cpp/src/text/subword/load_merges_file.cu |   4 +-
 cpp/tests/text/bpe_tests.cpp             |  24 ++
 3 files changed, 202 insertions(+), 221 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 298907ccb27..338f7164a9b 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -23,6 +23,8 @@
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/sizes_to_offsets_iterator.cuh>
+#include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/reduction/detail/segmented_reduction.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
@@ -33,206 +35,134 @@
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/for_each.h>
+#include <thrust/distance.h>
+#include <thrust/execution_policy.h>
+#include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
-#include <thrust/scan.h>
+#include <thrust/merge.h>
 #include <thrust/transform.h>
 
 namespace nvtext {
 namespace detail {
-
 namespace {
 
-/**
- * @brief Main byte pair encoding algorithm function for each string.
- *
- * @see The byte_pair_encoding_fn::operator() function below for details.
- */
+constexpr int block_size = 512;
+
+__device__ __inline__ cudf::string_view next_substr(cudf::column_device_view const& d_strings,
+                                                    int* d_spaces,
+                                                    int* begin,
+                                                    int* end,
+                                                    cudf::string_view const& d_str)
+{
+  auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; });
+  auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));
+  return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size);
+}
+
 template <typename MapRefType>
-struct byte_pair_encoding_fn {
-  cudf::column_device_view const d_strings;
-  cudf::string_view const d_separator;
-  MapRefType const d_map;
-  cudf::size_type* d_sizes;         // output size of encoded string
-  string_hasher_type const hasher;
-  cudf::size_type* d_byte_indices;  // output indices per string
-
-  /**
-   * @brief Get the next substring of the given string.
-   *
-   * This will find the next sequence of characters identified by the
-   * given byte indices iterator values. The beginning of the sequence
-   * starts at `begin` and the end of the sequence is the first non-zero
-   * index found between (begin,end) exclusive.
-   *
-   * @tparam Iterator The byte indices iterator type
-   * @param begin Start of indices to check
-   * @param end End of indices to check
-   * @param d_str String to substring
-   * @return The substring found.
-   */
-  template <typename Iterator>
-  __device__ __inline__ cudf::string_view next_substr(Iterator begin,
-                                                      Iterator end,
-                                                      cudf::string_view const& d_str) const
-  {
-    auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; });
-    auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));
-    return cudf::string_view(
-      d_strings.child(cudf::strings_column_view::chars_column_index).data<char>() +
-        thrust::distance(d_byte_indices, begin),
-      size);
+__global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
+                                MapRefType const d_map,
+                                cudf::size_type* d_sizes,      // output size of encoded string
+                                cudf::size_type* d_spaces_in,  // output per string
+                                cudf::size_type* d_working     // working memory
+)
+{
+  // string per block
+  auto const str_idx =
+    static_cast<cudf::size_type>(cudf::detail::grid_1d::global_thread_id() / block_size);
+  auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
+
+  if (d_strings.is_null(str_idx)) {
+    d_sizes[str_idx] = 0;
+    return;
+  }
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  if (d_str.empty()) {
+    d_sizes[str_idx] = 0;
+    return;
   }
 
-  /**
-   * @brief Byte encode each string.
-   *
-   * Each string is iteratively scanned for the minimum rank of adjacent substring pairs
-   * as found within the `d_map` table. Once the minimum pair is located, that pair
-   * is removed -- virtually by zero-ing the index value between any matching adjacent pairs.
-   *
-   * The iteration ends once there are no more adjacent pairs or there are no more
-   * matches found in `d_map`. At the end, the indices for each string reflect the
-   * encoding pattern and can be used to build the output.
-   *
-   * This function also computes the size of the encoded output of each string
-   * by simply counting the number of non-zero indices values remaining. This saves
-   * an extra kernel launch normally required to compute the offsets of the output column.
-   *
-   * @param idx The index of the string in `d_strings` to encode
-   */
-  __device__ void operator()(cudf::size_type idx) const
-  {
-    if (d_strings.is_null(idx)) {
-      d_sizes[idx] = 0;
-      return;
-    }
-    auto const d_str = d_strings.element<cudf::string_view>(idx);
-    if (d_str.empty()) {
-      d_sizes[idx] = 0;
-      return;
+  auto const offsets =
+    d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
+  auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
+
+  auto const d_spaces    = d_spaces_in + offset;
+  auto const end_spaces  = d_spaces + d_str.size_bytes();
+  auto const d_min_ranks = d_working + offset;
+  auto const end_ranks   = d_min_ranks + d_str.size_bytes();
+  auto const max_rank    = cuda::std::numeric_limits<cudf::size_type>::max();
+
+  __shared__ cudf::size_type block_min_rank;
+  using block_reduce = cub::BlockReduce<cudf::size_type, block_size>;
+  __shared__ typename block_reduce::TempStorage temp_storage;
+
+  if (lane_idx == 0) {
+    // the first character is free so we store the string's size here
+    // to help compute the encoded output size later
+    *d_spaces      = d_str.size_bytes();
+    block_min_rank = 0;
+  }
+  __syncthreads();
+
+  // each thread processes their part of the string and records its min_rank
+  while (block_min_rank < max_rank) {
+    auto min_rank = max_rank;
+    // initialize min ranks
+    // future optimization: only invalidate ranks where mins were
+    // found in the previous run
+    for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
+      *itr = max_rank;
     }
-
-    auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index)
-                          .element<cudf::size_type>(idx + d_strings.offset());
-    auto const d_indices = d_byte_indices + offset;
-
-    auto const begin = d_indices;
-    auto const end   = d_indices + d_str.size_bytes();
-
-    *begin = d_str.size_bytes();  // init first char
-
-    // keep processing the string until there are no more adjacent pairs found in d_map
-    cudf::size_type min_rank = 0;
-    while (min_rank < cuda::std::numeric_limits<cudf::size_type>::max()) {
-      // initialize working variables
-      min_rank = cuda::std::numeric_limits<cudf::size_type>::max();
-
-      auto lhs = next_substr(begin, end, d_str);
-      auto itr = begin + lhs.size_bytes();
-
-      auto min_itr  = itr;               // these are set along with
-      auto min_size = lhs.size_bytes();  // the min_rank variable
-      auto min_mp   = merge_pair_type{};
-
-      // check each adjacent pair against the d_map
-      while (itr < end) {
-        auto const rhs = next_substr(itr, end, d_str);
-        if (rhs.empty()) { break; }  // no more adjacent pairs
-
-        auto const mp      = merge_pair_type{lhs, rhs};
-        auto const map_itr = d_map.find(mp);
-        if (map_itr != d_map.end()) {
-          // found a match; record the rank (and other min_ vars)
-          auto const rank = map_itr->second;
-          if (rank < min_rank) {
-            min_rank = rank;
-            min_itr  = itr;
-            min_size = rhs.size_bytes();
-            min_mp   = mp;
-          }
-        }
-        // next substring
-        lhs = rhs;
-        itr += rhs.size_bytes();
-      }
-
-      // if any pair matched, remove every occurrence from the string
-      if (min_rank < cuda::std::numeric_limits<cudf::size_type>::max()) {
-        // remove the first pair we found
-        itr  = min_itr;
-        *itr = 0;
-
-        // continue scanning for other occurrences in the remainder of the string
-        itr += min_size;
-        if (itr < end) {
-          lhs = next_substr(itr, end, d_str);
-          itr += lhs.size_bytes();
-          while (itr < end && !lhs.empty()) {
-            auto rhs = next_substr(itr, end, d_str);
-            if ((min_mp.first == lhs) && (min_mp.second == rhs)) {
-              *itr = 0;                   // removes the pair from this string
-              itr += rhs.size_bytes();
-              if (itr >= end) { break; }  // done checking for pairs
-              // skip to the next adjacent pair
-              rhs = next_substr(itr, end, d_str);
+    __syncthreads();
+
+    for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) {
+      if (*itr == 0) { continue; }  // start on valid bytes only
+
+      // get left half of the pair
+      auto const lhs      = next_substr(d_strings, d_spaces, itr, end_spaces, d_str);
+      auto const next_itr = itr + lhs.size_bytes();
+      if (next_itr < end_spaces) {
+        // get the right half of the pair
+        auto const rhs = next_substr(d_strings, d_spaces, next_itr, end_spaces, d_str);
+        if (!rhs.empty()) {
+          auto const index = static_cast<int>(thrust::distance(d_spaces, next_itr));
+          // this is setup for future optimization mentioned above;
+          // we only want to hash/lookup if the rank is new for this pair
+          auto rank = d_min_ranks[index];
+          if (rank == max_rank) {
+            // lookup pair in merge-pairs table
+            auto const mp      = merge_pair_type{lhs, rhs};
+            auto const map_itr = d_map.find(mp);
+            if (map_itr != d_map.end()) {  // found a match
+              rank = map_itr->second;
             }
-            // next substring
-            lhs = rhs;
-            itr += rhs.size_bytes();
           }
+          if (rank < min_rank) { min_rank = rank; }
+          d_min_ranks[index] = rank;  // store the rank
         }
       }
     }
-  }
-};
-
-/**
- * @brief Build the output string encoding.
- *
- * This copies each string to the output inserting a space at each non-zero byte index.
- *
- * @code{.txt}
- * d_strings =      ["helloworld", "testthis"]
- * d_byte_indices = [ 0000050000    00004000]
- * result is ["hello world", "test this"]
- * @endcode
- */
-struct build_encoding_fn {
-  cudf::column_device_view const d_strings;
-  cudf::string_view const d_separator;
-  cudf::size_type const* d_byte_indices;
-  cudf::size_type const* d_offsets;
-  char* d_chars{};
-
-  __device__ void operator()(cudf::size_type idx) const
-  {
-    if (d_strings.is_null(idx)) { return; }
-    auto const d_str = d_strings.element<cudf::string_view>(idx);
-    if (d_str.empty()) { return; }
-
-    auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index)
-                          .element<cudf::size_type>(idx + d_strings.offset());
-    auto const d_indices = d_byte_indices + offset;
-    auto d_output        = d_chars + d_offsets[idx];
-
-    // copy chars while indices[i]==0,
-    // insert space each time indices[i]!=0
-    auto const begin = d_indices;
-    auto const end   = d_indices + d_str.size_bytes();
-    auto d_input     = d_str.data();
-    *d_output++      = *d_input++;
-    auto itr         = begin + 1;
-    while (itr < end) {
-      if (*itr++) { d_output = cudf::strings::detail::copy_string(d_output, d_separator); }
-      *d_output++ = *d_input++;
+    __syncthreads();
+
+    // once all threads are completed, find the min-rank across the block
+    block_min_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min());
+    __syncthreads();
+
+    if (block_min_rank < max_rank) {
+      // search the d_min_ranks for all the places where the rank matches block_min_rank
+      for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
+        auto const index = static_cast<int>(thrust::distance(d_min_ranks, itr));
+        if (*itr == block_min_rank) {
+          // set the output value to 0 at this position
+          if (index > 0 && *(itr - 1) != block_min_rank) { d_spaces[index] = 0; }
+        }
+      }
+      __syncthreads();
     }
-    // https://github.com/rapidsai/cudf/pull/10270/files#r826319405
-  }
-};
-
+  }  // if no mins were found we are done, otherwise start again
+}
 }  // namespace
 
 std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const& input,
@@ -247,58 +177,87 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
 
   CUDF_EXPECTS(separator.is_valid(stream), "separator parameter must be valid");
   auto const d_separator = separator.value(stream);
+  CUDF_EXPECTS(d_separator.size_bytes() == 1, "for now, separator must be a single-byte character");
 
   auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
 
-  // build working vector to hold index values per byte
-  rmm::device_uvector<cudf::size_type> d_byte_indices(input.chars().size(), stream);
-  // initialize the byte indices for all strings;
-  // set the index value to 0 for any intermediate UTF-8 bytes
-  auto const zero_itr = thrust::counting_iterator<cudf::size_type>(0);
-  thrust::transform(
-    rmm::exec_policy(stream),
-    zero_itr,
-    thrust::counting_iterator<cudf::size_type>(input.chars().size()),
-    d_byte_indices.begin(),
-    [data = input.chars().data<uint8_t>(), d_separator] __device__(auto idx) {
-      return cudf::strings::detail::is_begin_utf8_char(data[idx]) ? d_separator.size_bytes() : 0;
-    });
-
-  auto offsets   = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()},
-                                           static_cast<cudf::size_type>(input.size() + 1),
-                                           cudf::mask_state::UNALLOCATED,
-                                           stream,
-                                           rmm::mr::get_current_device_resource());
+  auto const first_offset  = (input.offset() == 0) ? 0
+                                                   : cudf::detail::get_value<cudf::size_type>(
+                                                      input.offsets(), input.offset(), stream);
+  auto const last_offset   = (input.offset() == 0 && input.size() == input.offsets().size() - 1)
+                               ? input.chars().size()
+                               : cudf::detail::get_value<cudf::size_type>(
+                                 input.offsets(), input.size() + input.offset(), stream);
+  auto const chars_size    = last_offset - first_offset;
+  auto const d_input_chars = input.chars().data<char>() + first_offset;
+
+  auto const offset_data_type = cudf::data_type{cudf::type_to_id<cudf::size_type>()};
+  auto offsets                = cudf::make_numeric_column(
+    offset_data_type, input.size() + 1, cudf::mask_state::UNALLOCATED, stream, mr);
   auto d_offsets = offsets->mutable_view().data<cudf::size_type>();
 
-  // auto const d_merges = merge_pairs.impl->get_merge_pairs();
+  // initialize the spaces vector which will hold encoding information
+  rmm::device_uvector<cudf::size_type> d_spaces(chars_size, stream);
+  auto const zero_itr  = thrust::counting_iterator<cudf::size_type>(0);
+  auto const chars_end = thrust::counting_iterator<cudf::size_type>(chars_size);
+  thrust::transform(rmm::exec_policy(stream),
+                    zero_itr,
+                    chars_end,
+                    d_spaces.begin(),
+                    [d_input_chars] __device__(auto idx) {
+                      return static_cast<cudf::size_type>(
+                        cudf::strings::detail::is_begin_utf8_char(d_input_chars[idx]));
+                    });
+
+  rmm::device_uvector<cudf::size_type> d_working(chars_size, stream);
   auto const map_ref = merge_pairs.impl->get_merge_pairs_ref();
-  auto const bpe_fn  = byte_pair_encoding_fn<decltype(map_ref)>{
-    *d_strings, d_separator, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()};
-  thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), bpe_fn);
 
+  // encoding step produces values in d_spaces that indicate where the separator is inserted
+  // and can also be reduced to compute the output size of each row
+  bpe_parallel_fn<decltype(map_ref)><<<input.size(), block_size, 0, stream.value()>>>(
+    *d_strings, map_ref, d_offsets, d_spaces.data(), d_working.data());
   // compute and store the output size for this string's encoding
-  cudf::reduction::detail::segmented_reduce(d_byte_indices.begin(),
-                                            input.offsets_begin(),
-                                            input.offsets_end(),
+  auto const input_offsets = thrust::make_transform_iterator(
+    input.offsets_begin(),
+    [first_offset] __device__(auto offset) { return offset - first_offset; });
+  cudf::reduction::detail::segmented_reduce(d_spaces.begin(),
+                                            input_offsets,
+                                            input_offsets + input.size() + 1,
                                             d_offsets,
                                             thrust::plus{},
                                             0,
                                             stream);
+  // convert sizes to offsets
+  auto const bytes =
+    cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream);
+  CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
+               "Size of output exceeds the column size limit",
+               std::overflow_error);
 
   // build the output: add spaces between the remaining pairs in each string
-  thrust::exclusive_scan(
-    rmm::exec_policy(stream), d_offsets, d_offsets + input.size() + 1, d_offsets);
-
-  auto const bytes =
-    cudf::detail::get_value<cudf::size_type>(offsets->view(), input.size(), stream);
-  auto chars = cudf::strings::detail::create_chars_child_column(
-    bytes, stream, rmm::mr::get_current_device_resource());
+  auto chars   = cudf::strings::detail::create_chars_child_column(bytes, stream, mr);
   auto d_chars = chars->mutable_view().data<char>();
 
-  auto const result_fn =
-    build_encoding_fn{*d_strings, d_separator, d_byte_indices.data(), d_offsets, d_chars};
-  thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), result_fn);
+  // we can reuse the d_working memory to store some temporary offsets now
+  auto const d_inserts = d_working.data();
+  // create offsets where separators will be inserted
+  auto offsets_at_one = [d_spaces = d_spaces.data()] __device__(auto idx) {
+    return d_spaces[idx] == 1;  // this fails if any input string is a single byte
+  };
+  auto const copy_end =
+    thrust::copy_if(rmm::exec_policy(stream), zero_itr + 1, chars_end, d_inserts, offsets_at_one);
+
+  // this will insert the single-byte separator in positions specified in d_inserts
+  auto const sep_char = thrust::constant_iterator<char>(separator.to_string(stream)[0]);
+  thrust::merge_by_key(rmm::exec_policy(stream),
+                       d_inserts,  // where separator is inserted
+                       copy_end,
+                       zero_itr,   // all positions
+                       chars_end,
+                       sep_char,   // byte to insert
+                       d_input_chars,
+                       thrust::make_discard_iterator(),
+                       d_chars);  // result
 
   return cudf::make_strings_column(input.size(),
                                    std::move(offsets),
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index 4271f3bd3ad..c402ff7cc80 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -118,9 +118,7 @@ std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_im
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
-  auto pairs =
-    cudf::strings::split_record(input, cudf::string_scalar(" "));  // Fix once 13997 is merged
-  // perhaps check the pairs are valid?
+  auto pairs   = cudf::strings::split_record(input, cudf::string_scalar(" "), 1, stream, mr);
   auto content = pairs->release();
   return create_bpe_merge_pairs_impl(std::move(content.children.back()), stream);
 }
diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp
index f2fd487e7f9..869b0cc9dbf 100644
--- a/cpp/tests/text/bpe_tests.cpp
+++ b/cpp/tests/text/bpe_tests.cpp
@@ -78,6 +78,30 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 }
 
+TEST_F(TextBPETokenize, DISABLED_BPEAdjacentPairs)
+{
+  auto mpt         = cudf::test::strings_column_wrapper({
+    "▁ H",    //    157
+    "m m",    //  10742
+    "? !",    //  50675
+    "▁H mm",  // 174381
+    "mm m",   // 262776
+    "?! !",   // 352313
+    "? !?",   // 352314
+    "mm mm",  // 387733
+    "▁H m",   // 471269
+    "?! ?!",  // 506981
+    "?!? !",  // 506982
+  });
+  auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt));
+
+  cudf::test::strings_column_wrapper input({"▁Hmmmmm", "?!?!?!"});
+
+  auto results  = nvtext::byte_pair_encoding(cudf::strings_column_view(input), *merge_pairs);
+  auto expected = cudf::test::strings_column_wrapper({"▁Hmm mmm", "?!?! ?!"});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+}
+
 TEST_F(TextBPETokenize, BPE_Empty)
 {
   auto mpt         = cudf::test::strings_column_wrapper({"i s", "i t"});

From 56c967fe6e330617645eb306ce3b73f86ac6750f Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 12 Sep 2023 16:23:31 -0400
Subject: [PATCH 14/28] add rerank working memory

---
 cpp/benchmarks/text/bpe_tokenize.cpp  |  20 +--
 cpp/src/text/subword/bpe_tokenizer.cu | 227 +++++++++++++++-----------
 cpp/tests/text/bpe_tests.cpp          |  31 +++-
 3 files changed, 169 insertions(+), 109 deletions(-)

diff --git a/cpp/benchmarks/text/bpe_tokenize.cpp b/cpp/benchmarks/text/bpe_tokenize.cpp
index 17f4bda81b3..10b384ca47b 100644
--- a/cpp/benchmarks/text/bpe_tokenize.cpp
+++ b/cpp/benchmarks/text/bpe_tokenize.cpp
@@ -40,16 +40,16 @@ static void bench_bpe(nvbench::state& state)
     state.skip("Skip benchmarks greater than size_type limit");
   }
 
-  auto raw_data = cudf::test::strings_column_wrapper({"test sentence ",
-                                                      "thisis test ",
-                                                      "this is sentence ",
-                                                      "this istest ",
-                                                      "thisistest ",
-                                                      "sentence is test ",
-                                                      "this sentence is test ",
-                                                      "test test test ",
-                                                      "this this test this ",
-                                                      "sentence "})
+  auto raw_data = cudf::test::strings_column_wrapper({"test sentence",
+                                                      "thisis it",
+                                                      "this is sentence",
+                                                      "this isit",
+                                                      "thisisit",
+                                                      "sentenceisit",
+                                                      "this sentence is test",
+                                                      "isitthis",
+                                                      "this this it this",
+                                                      "sentence"})
                     .release();
 
   if (row_width / 20 > 1) {
diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 338f7164a9b..2ce763fbf94 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -25,7 +25,6 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
-#include <cudf/reduction/detail/segmented_reduction.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/utilities/default_stream.hpp>
@@ -50,23 +49,13 @@ namespace {
 
 constexpr int block_size = 512;
 
-__device__ __inline__ cudf::string_view next_substr(cudf::column_device_view const& d_strings,
-                                                    int* d_spaces,
-                                                    int* begin,
-                                                    int* end,
-                                                    cudf::string_view const& d_str)
-{
-  auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; });
-  auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));
-  return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size);
-}
-
 template <typename MapRefType>
 __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
                                 MapRefType const d_map,
-                                cudf::size_type* d_sizes,      // output size of encoded string
-                                cudf::size_type* d_spaces_in,  // output per string
-                                cudf::size_type* d_working     // working memory
+                                cudf::size_type* d_sizes,  // output size of encoded string
+                                int8_t* d_spaces_in,       // working memory
+                                cudf::size_type* d_ranks,  // more working memory
+                                int8_t* d_rerank_in        // and one more working memory
 )
 {
   // string per block
@@ -90,78 +79,142 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
 
   auto const d_spaces    = d_spaces_in + offset;
   auto const end_spaces  = d_spaces + d_str.size_bytes();
-  auto const d_min_ranks = d_working + offset;
+  auto const d_min_ranks = d_ranks + offset;
   auto const end_ranks   = d_min_ranks + d_str.size_bytes();
+  auto const d_rerank    = d_rerank_in + offset;
+  auto const end_rerank  = d_rerank + d_str.size_bytes();
   auto const max_rank    = cuda::std::numeric_limits<cudf::size_type>::max();
+  auto const num_valid   = block_size < d_str.size_bytes() ? block_size : d_str.size_bytes();
 
   __shared__ cudf::size_type block_min_rank;
   using block_reduce = cub::BlockReduce<cudf::size_type, block_size>;
   __shared__ typename block_reduce::TempStorage temp_storage;
 
-  if (lane_idx == 0) {
-    // the first character is free so we store the string's size here
-    // to help compute the encoded output size later
-    *d_spaces      = d_str.size_bytes();
-    block_min_rank = 0;
+  if (lane_idx == 0) { block_min_rank = 0; }
+  // init all ranks to max
+  for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
+    *itr = max_rank;
+  }
+  __syncthreads();
+
+  auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) {
+    auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; });
+    auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));
+    return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size);
+  };
+
+  // store all the initial ranks for each pair in the string for this block
+  for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) {
+    if (*itr == 0) { continue; }  // start on valid bytes only
+    // resolve pair and lookup its rank
+    auto const lhs      = next_substr(itr);  // retrieve lhs of the pair
+    auto const next_itr = itr + lhs.size_bytes();
+    if (next_itr < end_spaces) {
+      auto const rhs = next_substr(next_itr);  // retrieve rhs of the pair
+      if (!rhs.empty()) {
+        auto rank          = max_rank;
+        auto const mp      = merge_pair_type{lhs, rhs};
+        auto const map_itr = d_map.find(mp);                       // lookup pair in merges table;
+        if (map_itr != d_map.end()) { rank = map_itr->second; }    // found a match;
+        d_min_ranks[thrust::distance(d_spaces, next_itr)] = rank;  // store the rank
+      }
+    }
   }
   __syncthreads();
 
-  // each thread processes their part of the string and records its min_rank
+  // loop through the ranks finding the current minimum until there are no more
   while (block_min_rank < max_rank) {
+    // find new minimum rank
     auto min_rank = max_rank;
-    // initialize min ranks
-    // future optimization: only invalidate ranks where mins were
-    // found in the previous run
     for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
-      *itr = max_rank;
+      auto const rank = *itr;
+      if (rank < min_rank) { min_rank = rank; }
     }
     __syncthreads();
 
-    for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) {
-      if (*itr == 0) { continue; }  // start on valid bytes only
-
-      // get left half of the pair
-      auto const lhs      = next_substr(d_strings, d_spaces, itr, end_spaces, d_str);
-      auto const next_itr = itr + lhs.size_bytes();
-      if (next_itr < end_spaces) {
-        // get the right half of the pair
-        auto const rhs = next_substr(d_strings, d_spaces, next_itr, end_spaces, d_str);
-        if (!rhs.empty()) {
-          auto const index = static_cast<int>(thrust::distance(d_spaces, next_itr));
-          // this is setup for future optimization mentioned above;
-          // we only want to hash/lookup if the rank is new for this pair
-          auto rank = d_min_ranks[index];
-          if (rank == max_rank) {
-            // lookup pair in merge-pairs table
-            auto const mp      = merge_pair_type{lhs, rhs};
-            auto const map_itr = d_map.find(mp);
-            if (map_itr != d_map.end()) {  // found a match
-              rank = map_itr->second;
-            }
-          }
-          if (rank < min_rank) { min_rank = rank; }
-          d_min_ranks[index] = rank;  // store the rank
-        }
-      }
-    }
-    __syncthreads();
-
-    // once all threads are completed, find the min-rank across the block
-    block_min_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min());
+    // compute the min rank across the block
+    auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid);
+    if (lane_idx == 0) { block_min_rank = reduce_rank; }
     __syncthreads();
 
     if (block_min_rank < max_rank) {
+      // (re)initialize all the re-rank identifiers to zero
+      for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) {
+        *itr = 0;
+      }
+
       // search the d_min_ranks for all the places where the rank matches block_min_rank
       for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
-        auto const index = static_cast<int>(thrust::distance(d_min_ranks, itr));
         if (*itr == block_min_rank) {
-          // set the output value to 0 at this position
-          if (index > 0 && *(itr - 1) != block_min_rank) { d_spaces[index] = 0; }
+          auto ptr = itr - 1;  // check for adjacent min-rank edge-case
+          while (ptr > d_min_ranks && *ptr == max_rank) {
+            --ptr;
+          }
+          // set the output value to 0 at this position (erases separator)
+          if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_min_ranks, itr)] = 0; }
+        }
+      }
+      __syncthreads();
+
+      auto find_prev = [begin = d_spaces](int8_t* ptr) {
+        while (ptr > begin && *ptr == 0) {
+          --ptr;
+        }
+        return ptr;
+      };
+      auto find_next = [end = end_spaces](int8_t* ptr) {
+        while (ptr < end && *ptr == 0) {
+          ++ptr;
+        }
+        return ptr;
+      };
+
+      // identify the re-rank locations
+      for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
+        auto const index = thrust::distance(d_min_ranks, itr);
+        if (*itr == block_min_rank && d_spaces[index] == 0) {
+          auto ptr = find_prev(d_spaces + index - 1);  // find previous pair mid-point
+          if (ptr > d_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; }  // atomicExch
+          ptr = find_next(d_spaces + index + 1);  // find next pair mid-point
+          if (ptr < end_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; }  // atomicExch
+          *itr = max_rank;  // reset this rank
+        }
+      }
+      __syncthreads();
+
+      // compute the ranks for the newly created pairs
+      for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) {
+        if (*itr) {
+          auto const index = thrust::distance(d_rerank, itr);
+          // build lhs of pair
+          auto const ptr  = find_prev(d_spaces + index - 1);
+          auto const size = static_cast<cudf::size_type>(thrust::distance(ptr, d_spaces + index));
+          auto const lhs  = cudf::string_view(d_str.data() + thrust::distance(d_spaces, ptr), size);
+          // retrieve rhs of pair
+          auto const rhs = next_substr(d_spaces + index);
+          auto rank      = max_rank;
+          if (!rhs.empty()) {
+            auto const mp      = merge_pair_type{lhs, rhs};
+            auto const map_itr = d_map.find(mp);                     // lookup in merges;
+            if (map_itr != d_map.end()) { rank = map_itr->second; }  // found a match
+          }
+          d_min_ranks[index] = rank;
         }
       }
       __syncthreads();
     }
   }  // if no mins were found we are done, otherwise start again
+
+  // reset the first position -- no separator to be added here
+  if (lane_idx == 0) { *d_spaces = 0; }
+
+  // compute the output size for this string by counting the resulting separator positions
+  auto bytes = 0;
+  for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) {
+    bytes += *itr;
+  }
+  auto const size = block_reduce(temp_storage).Sum(bytes, num_valid);
+  if (lane_idx == 0) { d_sizes[str_idx] = size + d_str.size_bytes(); }
 }
 }  // namespace
 
@@ -196,53 +249,45 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
     offset_data_type, input.size() + 1, cudf::mask_state::UNALLOCATED, stream, mr);
   auto d_offsets = offsets->mutable_view().data<cudf::size_type>();
 
-  // initialize the spaces vector which will hold encoding information
-  rmm::device_uvector<cudf::size_type> d_spaces(chars_size, stream);
+  // initialize the spaces vector which will hold the separators locations
+  rmm::device_uvector<int8_t> d_spaces(chars_size, stream);
   auto const zero_itr  = thrust::counting_iterator<cudf::size_type>(0);
   auto const chars_end = thrust::counting_iterator<cudf::size_type>(chars_size);
-  thrust::transform(rmm::exec_policy(stream),
-                    zero_itr,
-                    chars_end,
-                    d_spaces.begin(),
-                    [d_input_chars] __device__(auto idx) {
-                      return static_cast<cudf::size_type>(
-                        cudf::strings::detail::is_begin_utf8_char(d_input_chars[idx]));
-                    });
-
-  rmm::device_uvector<cudf::size_type> d_working(chars_size, stream);
+  thrust::transform(
+    rmm::exec_policy(stream),
+    zero_itr,
+    chars_end,
+    d_spaces.begin(),
+    [d_input_chars] __device__(auto idx) {
+      return static_cast<int8_t>(cudf::strings::detail::is_begin_utf8_char(d_input_chars[idx]));
+    });
+
+  rmm::device_uvector<cudf::size_type> d_ranks(chars_size, stream);  // rank per string pair;
+  rmm::device_uvector<int8_t> d_rerank(chars_size, stream);          // re-ranking identifiers
   auto const map_ref = merge_pairs.impl->get_merge_pairs_ref();
 
-  // encoding step produces values in d_spaces that indicate where the separator is inserted
-  // and can also be reduced to compute the output size of each row
+  // the main kernel here produces the sizes of the encoded output string
+  // as well as the d_spaces values which identify where to insert the separators
   bpe_parallel_fn<decltype(map_ref)><<<input.size(), block_size, 0, stream.value()>>>(
-    *d_strings, map_ref, d_offsets, d_spaces.data(), d_working.data());
-  // compute and store the output size for this string's encoding
-  auto const input_offsets = thrust::make_transform_iterator(
-    input.offsets_begin(),
-    [first_offset] __device__(auto offset) { return offset - first_offset; });
-  cudf::reduction::detail::segmented_reduce(d_spaces.begin(),
-                                            input_offsets,
-                                            input_offsets + input.size() + 1,
-                                            d_offsets,
-                                            thrust::plus{},
-                                            0,
-                                            stream);
-  // convert sizes to offsets
+    *d_strings, map_ref, d_offsets, d_spaces.data(), d_ranks.data(), d_rerank.data());
+
+  // convert sizes to offsets in-place
   auto const bytes =
     cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream);
   CUDF_EXPECTS(bytes <= static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
                "Size of output exceeds the column size limit",
                std::overflow_error);
 
-  // build the output: add spaces between the remaining pairs in each string
+  // build the output: adding separators between the remaining pairs in each string
   auto chars   = cudf::strings::detail::create_chars_child_column(bytes, stream, mr);
   auto d_chars = chars->mutable_view().data<char>();
 
-  // we can reuse the d_working memory to store some temporary offsets now
-  auto const d_inserts = d_working.data();
+  // we can reuse the ranks working memory to store some temporary offsets now;
+  // the offsets are produced by the index of the d_spaces values
+  auto const d_inserts = d_ranks.data();
   // create offsets where separators will be inserted
   auto offsets_at_one = [d_spaces = d_spaces.data()] __device__(auto idx) {
-    return d_spaces[idx] == 1;  // this fails if any input string is a single byte
+    return d_spaces[idx] > 0;  // separator to be inserted here
   };
   auto const copy_end =
     thrust::copy_if(rmm::exec_policy(stream), zero_itr + 1, chars_end, d_inserts, offsets_at_one);
diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp
index 869b0cc9dbf..d670492fb01 100644
--- a/cpp/tests/text/bpe_tests.cpp
+++ b/cpp/tests/text/bpe_tests.cpp
@@ -29,23 +29,37 @@ struct TextBPETokenize : public cudf::test::BaseFixture {};
 TEST_F(TextBPETokenize, BytePairEncoding)
 {
   // partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt
-  auto mpt = cudf::test::strings_column_wrapper(
-    {"e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "T h", "Th is", "t est", "s ent"});
+  auto mpt = cudf::test::strings_column_wrapper({
+    "e n",    // 14
+    "i t",    // 16
+    "i s",    // 17
+    "e s",    // 20
+    "en t",   // 44
+    "c e",    // 90
+    "es t",   // 141
+    "en ce",  // 340
+    "t h",    // 146
+    "h i",    // 5049
+    "th is",  // 5407
+    "t est",  // 9034
+    "s i",    // 13142
+    "s ent"   // 33832
+  });
 
   auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt));
 
   auto validity = cudf::test::iterators::null_at(4);
   cudf::test::strings_column_wrapper input(
-    {"Thisisit", "Thisis test-sentence-1", "Thisistestsentence-2", "This-istestsentence 3", "", ""},
+    {"thisisit", "thisis test-sentence-1", "thisistestsentence-2", "this-istestsentence 3", "", ""},
     validity);
   auto sv = cudf::strings_column_view(input);
 
   auto results = nvtext::byte_pair_encoding(sv, *merge_pairs);
 
-  auto expected = cudf::test::strings_column_wrapper({"This is it",
-                                                      "This is   test - sent ence - 1",
-                                                      "This is test sent ence - 2",
-                                                      "This - is test sent ence   3",
+  auto expected = cudf::test::strings_column_wrapper({"this is it",
+                                                      "this is   test - sent ence - 1",
+                                                      "this is test sent ence - 2",
+                                                      "this - is test sent ence   3",
                                                       "",
                                                       ""},
                                                      validity);
@@ -63,6 +77,7 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator)
 {
   auto mpt = cudf::test::strings_column_wrapper(
     {"Ġ t", "Ġt he", "h e", "e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"});
+
   auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt));
 
   cudf::test::strings_column_wrapper input(
@@ -78,7 +93,7 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 }
 
-TEST_F(TextBPETokenize, DISABLED_BPEAdjacentPairs)
+TEST_F(TextBPETokenize, BPEAdjacentPairs)
 {
   auto mpt         = cudf::test::strings_column_wrapper({
     "▁ H",    //    157

From 038809f0374075fa88dbcf47ae402a477927436a Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 18 Sep 2023 17:55:15 -0400
Subject: [PATCH 15/28] fusing reduce and unfusing output size calc

---
 cpp/src/text/subword/bpe_tokenizer.cu | 201 +++++++++++++++-----------
 1 file changed, 114 insertions(+), 87 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 2ce763fbf94..1058aa0521b 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -52,7 +52,6 @@ constexpr int block_size = 512;
 template <typename MapRefType>
 __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
                                 MapRefType const d_map,
-                                cudf::size_type* d_sizes,  // output size of encoded string
                                 int8_t* d_spaces_in,       // working memory
                                 cudf::size_type* d_ranks,  // more working memory
                                 int8_t* d_rerank_in        // and one more working memory
@@ -63,15 +62,9 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
     static_cast<cudf::size_type>(cudf::detail::grid_1d::global_thread_id() / block_size);
   auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
 
-  if (d_strings.is_null(str_idx)) {
-    d_sizes[str_idx] = 0;
-    return;
-  }
+  if (d_strings.is_null(str_idx)) { return; }
   auto const d_str = d_strings.element<cudf::string_view>(str_idx);
-  if (d_str.empty()) {
-    d_sizes[str_idx] = 0;
-    return;
-  }
+  if (d_str.empty()) { return; }
 
   auto const offsets =
     d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
@@ -90,20 +83,21 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
   using block_reduce = cub::BlockReduce<cudf::size_type, block_size>;
   __shared__ typename block_reduce::TempStorage temp_storage;
 
-  if (lane_idx == 0) { block_min_rank = 0; }
+  auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) {
+    auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; });
+    auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));
+    return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size);
+  };
+
   // init all ranks to max
   for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
     *itr = max_rank;
   }
   __syncthreads();
 
-  auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) {
-    auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; });
-    auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));
-    return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size);
-  };
+  auto min_rank = max_rank;
 
-  // store all the initial ranks for each pair in the string for this block
+  // store all the initial ranks for each pair
   for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) {
     if (*itr == 0) { continue; }  // start on valid bytes only
     // resolve pair and lookup its rank
@@ -117,93 +111,123 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
         auto const map_itr = d_map.find(mp);                       // lookup pair in merges table;
         if (map_itr != d_map.end()) { rank = map_itr->second; }    // found a match;
         d_min_ranks[thrust::distance(d_spaces, next_itr)] = rank;  // store the rank
+        if (rank < min_rank) min_rank = rank;
       }
     }
   }
   __syncthreads();
+  // compute the min rank across the block
+  auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid);
+  if (lane_idx == 0) { block_min_rank = reduce_rank; }
+  __syncthreads();
 
   // loop through the ranks finding the current minimum until there are no more
   while (block_min_rank < max_rank) {
-    // find new minimum rank
-    auto min_rank = max_rank;
-    for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
-      auto const rank = *itr;
-      if (rank < min_rank) { min_rank = rank; }
+    // (re)initialize all the re-rank identifiers to zero
+    for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) {
+      *itr = 0;
     }
-    __syncthreads();
 
-    // compute the min rank across the block
-    auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid);
-    if (lane_idx == 0) { block_min_rank = reduce_rank; }
+    // search the d_min_ranks for all the places where the rank matches block_min_rank
+    for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
+      if (*itr == block_min_rank) {
+        auto ptr = itr - 1;  // check for adjacent min-rank edge-case
+        while (ptr > d_min_ranks && *ptr == max_rank) {
+          --ptr;
+        }
+        // set the output value to 0 at this position (erases separator)
+        if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_min_ranks, itr)] = 0; }
+      }
+    }
     __syncthreads();
 
-    if (block_min_rank < max_rank) {
-      // (re)initialize all the re-rank identifiers to zero
-      for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) {
-        *itr = 0;
+    auto find_prev = [begin = d_spaces](int8_t* ptr) {
+      while (ptr > begin && *ptr == 0) {
+        --ptr;
       }
-
-      // search the d_min_ranks for all the places where the rank matches block_min_rank
-      for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
-        if (*itr == block_min_rank) {
-          auto ptr = itr - 1;  // check for adjacent min-rank edge-case
-          while (ptr > d_min_ranks && *ptr == max_rank) {
-            --ptr;
-          }
-          // set the output value to 0 at this position (erases separator)
-          if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_min_ranks, itr)] = 0; }
-        }
+      return ptr;
+    };
+    auto find_next = [end = end_spaces](int8_t* ptr) {
+      while (ptr < end && *ptr == 0) {
+        ++ptr;
       }
-      __syncthreads();
+      return ptr;
+    };
 
-      auto find_prev = [begin = d_spaces](int8_t* ptr) {
-        while (ptr > begin && *ptr == 0) {
-          --ptr;
-        }
-        return ptr;
-      };
-      auto find_next = [end = end_spaces](int8_t* ptr) {
-        while (ptr < end && *ptr == 0) {
-          ++ptr;
-        }
-        return ptr;
-      };
-
-      // identify the re-rank locations
-      for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
-        auto const index = thrust::distance(d_min_ranks, itr);
-        if (*itr == block_min_rank && d_spaces[index] == 0) {
-          auto ptr = find_prev(d_spaces + index - 1);  // find previous pair mid-point
-          if (ptr > d_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; }  // atomicExch
-          ptr = find_next(d_spaces + index + 1);  // find next pair mid-point
-          if (ptr < end_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; }  // atomicExch
-          *itr = max_rank;  // reset this rank
-        }
+    // identify the re-rank locations
+    for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
+      auto const index = thrust::distance(d_min_ranks, itr);
+      if (*itr == block_min_rank && d_spaces[index] == 0) {
+        auto ptr = find_prev(d_spaces + index - 1);  // find previous pair mid-point
+        if (ptr > d_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; }
+        ptr = find_next(d_spaces + index + 1);       // find next pair mid-point
+        if (ptr < end_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; }
+        *itr = max_rank;                             // reset this rank
       }
-      __syncthreads();
-
-      // compute the ranks for the newly created pairs
-      for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) {
-        if (*itr) {
-          auto const index = thrust::distance(d_rerank, itr);
-          // build lhs of pair
-          auto const ptr  = find_prev(d_spaces + index - 1);
-          auto const size = static_cast<cudf::size_type>(thrust::distance(ptr, d_spaces + index));
-          auto const lhs  = cudf::string_view(d_str.data() + thrust::distance(d_spaces, ptr), size);
-          // retrieve rhs of pair
-          auto const rhs = next_substr(d_spaces + index);
-          auto rank      = max_rank;
-          if (!rhs.empty()) {
-            auto const mp      = merge_pair_type{lhs, rhs};
-            auto const map_itr = d_map.find(mp);                     // lookup in merges;
-            if (map_itr != d_map.end()) { rank = map_itr->second; }  // found a match
-          }
-          d_min_ranks[index] = rank;
+    }
+    __syncthreads();
+
+    // compute the ranks for the newly created pairs
+    min_rank = max_rank;  // and record new minimum
+    for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) {
+      auto const index = thrust::distance(d_rerank, itr);
+      auto rank        = d_min_ranks[index];
+      if (*itr) {
+        // build lhs of pair
+        auto const ptr  = find_prev(d_spaces + index - 1);
+        auto const size = static_cast<cudf::size_type>(thrust::distance(ptr, d_spaces + index));
+        auto const lhs  = cudf::string_view(d_str.data() + thrust::distance(d_spaces, ptr), size);
+        // retrieve rhs of pair
+        auto const rhs = next_substr(d_spaces + index);
+        rank           = max_rank;
+        if (!rhs.empty()) {
+          auto const mp      = merge_pair_type{lhs, rhs};
+          auto const map_itr = d_map.find(mp);                     // lookup in merges;
+          if (map_itr != d_map.end()) { rank = map_itr->second; }  // found a match
         }
+        d_min_ranks[index] = rank;
       }
-      __syncthreads();
+      if (rank < min_rank) min_rank = rank;
     }
+    __syncthreads();
+
+    // compute the min rank across the block
+    auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid);
+    if (lane_idx == 0) { block_min_rank = reduce_rank; }
+    __syncthreads();
   }  // if no mins were found we are done, otherwise start again
+}
+
+__global__ void bpe_finalize(cudf::column_device_view const d_strings,
+                             int8_t* d_spaces_in,      // where separators are inserted
+                             cudf::size_type* d_sizes  // output sizes of encoded strings
+)
+{
+  // string per block
+  auto const str_idx =
+    static_cast<cudf::size_type>(cudf::detail::grid_1d::global_thread_id() / block_size);
+  auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
+
+  if (d_strings.is_null(str_idx)) {
+    d_sizes[str_idx] = 0;
+    return;
+  }
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  if (d_str.empty()) {
+    d_sizes[str_idx] = 0;
+    return;
+  }
+
+  auto const offsets =
+    d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
+  auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
+
+  auto const d_spaces   = d_spaces_in + offset;
+  auto const end_spaces = d_spaces + d_str.size_bytes();
+  auto const num_valid  = block_size < d_str.size_bytes() ? block_size : d_str.size_bytes();
+
+  using block_reduce = cub::BlockReduce<cudf::size_type, block_size>;
+  __shared__ typename block_reduce::TempStorage temp_storage;
 
   // reset the first position -- no separator to be added here
   if (lane_idx == 0) { *d_spaces = 0; }
@@ -216,6 +240,7 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
   auto const size = block_reduce(temp_storage).Sum(bytes, num_valid);
   if (lane_idx == 0) { d_sizes[str_idx] = size + d_str.size_bytes(); }
 }
+
 }  // namespace
 
 std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const& input,
@@ -250,6 +275,7 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
   auto d_offsets = offsets->mutable_view().data<cudf::size_type>();
 
   // initialize the spaces vector which will hold the separators locations
+  // (this is about 20% of the run: look into vector loading)
   rmm::device_uvector<int8_t> d_spaces(chars_size, stream);
   auto const zero_itr  = thrust::counting_iterator<cudf::size_type>(0);
   auto const chars_end = thrust::counting_iterator<cudf::size_type>(chars_size);
@@ -266,10 +292,11 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
   rmm::device_uvector<int8_t> d_rerank(chars_size, stream);          // re-ranking identifiers
   auto const map_ref = merge_pairs.impl->get_merge_pairs_ref();
 
-  // the main kernel here produces the sizes of the encoded output string
-  // as well as the d_spaces values which identify where to insert the separators
   bpe_parallel_fn<decltype(map_ref)><<<input.size(), block_size, 0, stream.value()>>>(
-    *d_strings, map_ref, d_offsets, d_spaces.data(), d_ranks.data(), d_rerank.data());
+    *d_strings, map_ref, d_spaces.data(), d_ranks.data(), d_rerank.data());
+  // this could probably be re-fused into the above kernel
+  bpe_finalize<<<input.size(), block_size, 0, stream.value()>>>(
+    *d_strings, d_spaces.data(), d_offsets);
 
   // convert sizes to offsets in-place
   auto const bytes =

From 4528e6f2ecc4eeb84c2f2022badf1042fe7cec73 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 19 Sep 2023 18:00:11 -0400
Subject: [PATCH 16/28] move spaces init to main kernel

---
 cpp/src/text/subword/bpe_tokenizer.cu | 34 +++++++++++----------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index c15355b64c4..854e0e7407f 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -25,6 +25,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/utilities/default_stream.hpp>
@@ -76,9 +77,10 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
   auto const end_ranks   = d_min_ranks + d_str.size_bytes();
   auto const d_rerank    = d_rerank_in + offset;
   auto const end_rerank  = d_rerank + d_str.size_bytes();
-  auto const max_rank    = cuda::std::numeric_limits<cudf::size_type>::max();
   auto const num_valid   = block_size < d_str.size_bytes() ? block_size : d_str.size_bytes();
 
+  auto constexpr max_rank = cuda::std::numeric_limits<cudf::size_type>::max();
+
   __shared__ cudf::size_type block_min_rank;
   using block_reduce = cub::BlockReduce<cudf::size_type, block_size>;
   __shared__ typename block_reduce::TempStorage temp_storage;
@@ -93,6 +95,11 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
   for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
     *itr = max_rank;
   }
+  // init all spaces to 1 as appropriate
+  for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) {
+    auto const index = thrust::distance(d_spaces, itr);
+    *itr = static_cast<int8_t>(cudf::strings::detail::is_begin_utf8_char(d_str.data()[index]));
+  }
   __syncthreads();
 
   auto min_rank = max_rank;
@@ -115,7 +122,6 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
       }
     }
   }
-  __syncthreads();
   // compute the min rank across the block
   auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid);
   if (lane_idx == 0) { block_min_rank = reduce_rank; }
@@ -189,7 +195,6 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
       }
       if (rank < min_rank) min_rank = rank;
     }
-    __syncthreads();
 
     // compute the min rank across the block
     auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid);
@@ -235,7 +240,7 @@ __global__ void bpe_finalize(cudf::column_device_view const d_strings,
   // compute the output size for this string by counting the resulting separator positions
   auto bytes = 0;
   for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) {
-    bytes += *itr;
+    bytes += (*itr > 0);
   }
   auto const size = block_reduce(temp_storage).Sum(bytes, num_valid);
   if (lane_idx == 0) { d_sizes[str_idx] = size + d_str.size_bytes(); }
@@ -274,20 +279,7 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
     offset_data_type, input.size() + 1, cudf::mask_state::UNALLOCATED, stream, mr);
   auto d_offsets = offsets->mutable_view().data<cudf::size_type>();
 
-  // initialize the spaces vector which will hold the separators locations
-  // (this is about 20% of the run: look into vector loading)
   rmm::device_uvector<int8_t> d_spaces(chars_size, stream);
-  auto const zero_itr  = thrust::counting_iterator<cudf::size_type>(0);
-  auto const chars_end = thrust::counting_iterator<cudf::size_type>(chars_size);
-  thrust::transform(
-    rmm::exec_policy(stream),
-    zero_itr,
-    chars_end,
-    d_spaces.begin(),
-    [d_input_chars] __device__(auto idx) {
-      return static_cast<int8_t>(cudf::strings::detail::is_begin_utf8_char(d_input_chars[idx]));
-    });
-
   rmm::device_uvector<cudf::size_type> d_ranks(chars_size, stream);  // rank per string pair;
   rmm::device_uvector<int8_t> d_rerank(chars_size, stream);          // re-ranking identifiers
   auto const map_ref = merge_pairs.impl->get_merge_pairs_ref();
@@ -313,11 +305,13 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
   // the offsets are produced by the index of the d_spaces values
   auto const d_inserts = d_ranks.data();
   // create offsets where separators will be inserted
-  auto offsets_at_one = [d_spaces = d_spaces.data()] __device__(auto idx) {
+  auto offsets_at_non_zero = [d_spaces = d_spaces.data()] __device__(auto idx) {
     return d_spaces[idx] > 0;  // separator to be inserted here
   };
-  auto const copy_end =
-    thrust::copy_if(rmm::exec_policy(stream), zero_itr + 1, chars_end, d_inserts, offsets_at_one);
+  auto const zero_itr  = thrust::counting_iterator<cudf::size_type>(0);
+  auto const chars_end = thrust::counting_iterator<cudf::size_type>(chars_size);
+  auto const copy_end  = thrust::copy_if(
+    rmm::exec_policy(stream), zero_itr + 1, chars_end, d_inserts, offsets_at_non_zero);
 
   // this will insert the single-byte separator in positions specified in d_inserts
   auto const sep_char = thrust::constant_iterator<char>(separator.to_string(stream)[0]);

From 5e27d5af7e15f9b696e0938ac1c3f776e674903c Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 20 Sep 2023 16:45:36 -0400
Subject: [PATCH 17/28] limit adjacent pair search

---
 cpp/src/text/subword/bpe_tokenizer.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 854e0e7407f..6118c2aa453 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -137,9 +137,11 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
     // search the d_min_ranks for all the places where the rank matches block_min_rank
     for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
       if (*itr == block_min_rank) {
-        auto ptr = itr - 1;  // check for adjacent min-rank edge-case
-        while (ptr > d_min_ranks && *ptr == max_rank) {
+        auto ptr   = itr - 1;  // check for adjacent min-rank edge-case
+        auto count = 8;
+        while (ptr > d_min_ranks && *ptr == max_rank && count > 0) {
           --ptr;
+          --count;
         }
         // set the output value to 0 at this position (erases separator)
         if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_min_ranks, itr)] = 0; }

From 75432d223df3905c81d09de1b551eb963b82f2d9 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 26 Sep 2023 19:44:04 -0400
Subject: [PATCH 18/28] fix replace bug

---
 cpp/src/strings/replace/replace.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu
index a622d1a742d..acc1502f4d6 100644
--- a/cpp/src/strings/replace/replace.cu
+++ b/cpp/src/strings/replace/replace.cu
@@ -97,7 +97,7 @@ struct replace_row_parallel_fn {
       } else {
         bytes += d_repl.size_bytes() - d_target.size_bytes();
       }
-      position = d_str.find(d_target, position + d_target.size_bytes());
+      position = d_str.find(d_target, position + d_target.length());
       --max_n;
     }
     if (out_ptr)  // copy whats left (or right depending on your point of view)

From 521cd1b6820b528e6b316ac91828d64e3eff902a Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 4 Oct 2023 17:17:50 -0400
Subject: [PATCH 19/28] exploit unpairable boundaries

---
 cpp/src/text/subword/bpe_tokenizer.cu    | 90 ++++++++++++++++++++++--
 cpp/src/text/subword/bpe_tokenizer.cuh   | 61 +++++++++++++++-
 cpp/src/text/subword/load_merges_file.cu | 33 +++++++--
 cpp/tests/text/bpe_tests.cpp             |  5 +-
 4 files changed, 173 insertions(+), 16 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 6118c2aa453..e36d53e17f4 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -42,6 +42,7 @@
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/merge.h>
+#include <thrust/remove.h>
 #include <thrust/transform.h>
 
 namespace nvtext {
@@ -50,6 +51,45 @@ namespace {
 
 constexpr int block_size = 512;
 
+template <typename MapRefType>
+__global__ void bpe_up_offsets_fn(char const* d_chars,
+                                  cudf::size_type chars_size,
+                                  cudf::size_type offset,
+                                  MapRefType const d_map,
+                                  cudf::size_type* d_offsets)
+{
+  auto const idx = static_cast<cudf::size_type>(cudf::detail::grid_1d::global_thread_id());
+  if (idx >= chars_size) { return; }
+  if (!cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) {
+    d_offsets[idx] = 0;
+    return;
+  }
+
+  auto next_substr = [d_chars, end = d_chars + chars_size](char const* begin) {
+    auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) {
+      return cudf::strings::detail::is_begin_utf8_char(v);
+    });
+    auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));
+    return cudf::string_view(begin, size);
+  };
+
+  auto const itr      = d_chars + idx;
+  auto const end      = d_chars + chars_size;
+  auto const lhs      = next_substr(itr);
+  auto const next_itr = itr + lhs.size_bytes();
+  auto output         = 0;
+  if (next_itr < end) {
+    auto const rhs = next_substr(next_itr);
+    if (!rhs.empty()) {
+      // see if both halves exist anywhere in the table
+      if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) {
+        output = idx + lhs.size_bytes() + offset;  // candidate for artificial boundary
+      }
+    }
+  }
+  d_offsets[idx] = output;
+}
+
 template <typename MapRefType>
 __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
                                 MapRefType const d_map,
@@ -77,13 +117,13 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
   auto const end_ranks   = d_min_ranks + d_str.size_bytes();
   auto const d_rerank    = d_rerank_in + offset;
   auto const end_rerank  = d_rerank + d_str.size_bytes();
-  auto const num_valid   = block_size < d_str.size_bytes() ? block_size : d_str.size_bytes();
 
   auto constexpr max_rank = cuda::std::numeric_limits<cudf::size_type>::max();
 
   __shared__ cudf::size_type block_min_rank;
   using block_reduce = cub::BlockReduce<cudf::size_type, block_size>;
   __shared__ typename block_reduce::TempStorage temp_storage;
+  auto const num_valid = block_size < d_str.size_bytes() ? block_size : d_str.size_bytes();
 
   auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) {
     auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; });
@@ -284,11 +324,51 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
   rmm::device_uvector<int8_t> d_spaces(chars_size, stream);
   rmm::device_uvector<cudf::size_type> d_ranks(chars_size, stream);  // rank per string pair;
   rmm::device_uvector<int8_t> d_rerank(chars_size, stream);          // re-ranking identifiers
-  auto const map_ref = merge_pairs.impl->get_merge_pairs_ref();
+  auto const map_ref  = merge_pairs.impl->get_merge_pairs_ref();
+  auto const map_ref2 = merge_pairs.impl->get_mp_table_ref();
+
+  if ((input.offset() == 0) && (input.size() == input.offsets().size() - 1)) {
+    // TODO: this fails for sliced columns for some reason;
+    //       we could get ride of the else{} if this was fixed
+
+    // this path locates unpairable sections of code to create artificial string row boundaries;
+    // the boundary values are recorded as offsets and stored temporarily in the d_ranks vector
+    auto const block_count = (chars_size + block_size - 1) / block_size;
+    bpe_up_offsets_fn<decltype(map_ref2)><<<block_count, block_size, 0, stream.value()>>>(
+      d_input_chars, chars_size, input.offset(), map_ref2, d_ranks.data());
+    auto const end   = thrust::remove(rmm::exec_policy(stream), d_ranks.begin(), d_ranks.end(), 0);
+    auto const total = thrust::distance(d_ranks.begin(), end);  // number of unpairables
+
+    // the new boundaries are combined with the existing offsets to build a temporary column
+    auto tmp_offsets = rmm::device_uvector<cudf::size_type>(total + input.size() + 1, stream);
+    thrust::merge(rmm::exec_policy(stream),
+                  input.offsets_begin(),
+                  input.offsets_end(),
+                  d_ranks.begin(),
+                  end,
+                  tmp_offsets.begin());
+
+    // the temp column is used for the encoding functions which is much faster
+    // on a larger number of smaller strings
+    auto const col_offsets =
+      cudf::column_view(cudf::device_span<cudf::size_type const>(tmp_offsets));
+    auto const tmp_input     = cudf::column_view(input.parent().type(),
+                                             static_cast<cudf::size_type>(input.size() + total),
+                                             nullptr,
+                                             nullptr,
+                                             0,
+                                             0,
+                                                 {col_offsets, input.chars()});
+    auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream);
+
+    bpe_parallel_fn<decltype(map_ref)><<<tmp_input.size(), block_size, 0, stream.value()>>>(
+      *d_tmp_strings, map_ref, d_spaces.data(), d_ranks.data(), d_rerank.data());
+  } else {
+    bpe_parallel_fn<decltype(map_ref)><<<input.size(), block_size, 0, stream.value()>>>(
+      *d_strings, map_ref, d_spaces.data(), d_ranks.data(), d_rerank.data());
+  }
 
-  bpe_parallel_fn<decltype(map_ref)><<<input.size(), block_size, 0, stream.value()>>>(
-    *d_strings, map_ref, d_spaces.data(), d_ranks.data(), d_rerank.data());
-  // this could probably be re-fused into the above kernel
+  // compute the output sizes into the d_offsets vector
   bpe_finalize<<<input.size(), block_size, 0, stream.value()>>>(
     *d_strings, d_spaces.data(), d_offsets);
 
diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index 8e70b2b8905..25132a57ae2 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -46,6 +46,8 @@ using hash_value_type    = uint32_t;
 using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
 using merge_pair_type    = thrust::pair<cudf::string_view, cudf::string_view>;
 
+using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;
+
 /**
  * @brief Hasher function used for building and using the cuco static-map
  *
@@ -93,8 +95,6 @@ struct bpe_equal {
   }
 };
 
-using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;
-
 using probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>;
 
 using merge_pairs_map_type = cuco::experimental::static_map<cudf::size_type,
@@ -105,6 +105,56 @@ using merge_pairs_map_type = cuco::experimental::static_map<cudf::size_type,
                                                             probe_scheme,
                                                             hash_table_allocator_type>;
 
+struct table_hasher {
+  cudf::column_device_view const d_strings;
+  string_hasher_type hasher{};
+  // used by insert
+  __device__ hash_value_type operator()(cudf::size_type index) const
+  {
+    auto const d_str = d_strings.element<cudf::string_view>(index);
+    return hasher(d_str);
+  }
+  // used by find
+  __device__ hash_value_type operator()(cudf::string_view const& d_str) const
+  {
+    return hasher(d_str);
+  }
+};
+
+/**
+ * @brief Equal function used for building and using the cuco static-map
+ *
+ * This takes advantage of heterogeneous lookup feature in cuco static-map which
+ * allows inserting with one type (index) and looking up with a different type (string).
+ */
+struct table_equal {
+  cudf::column_device_view const d_strings;
+  // used by insert
+  __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept
+  {
+    auto const left  = d_strings.element<cudf::string_view>(lhs);
+    auto const right = d_strings.element<cudf::string_view>(rhs);
+    return left == right;
+  }
+  // used by find
+  __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept
+  {
+    auto const left = d_strings.element<cudf::string_view>(lhs);
+    return left == rhs;
+  }
+};
+
+using probe_scheme2 = cuco::experimental::linear_probing<1, table_hasher>;
+
+using merge_pairs_map_type2 =
+  cuco::experimental::static_map<cudf::size_type,
+                                 cudf::size_type,
+                                 cuco::experimental::extent<std::size_t>,
+                                 cuda::thread_scope_device,
+                                 table_equal,
+                                 probe_scheme2,
+                                 hash_table_allocator_type>;
+
 }  // namespace detail
 
 // since column_device_view::create() returns is a little more than
@@ -118,12 +168,17 @@ struct bpe_merge_pairs::bpe_merge_pairs_impl {
   col_device_view const d_merge_pairs;
   std::unique_ptr<detail::merge_pairs_map_type> merge_pairs_map;
 
+  std::unique_ptr<detail::merge_pairs_map_type2> mp_table_map;
+
   bpe_merge_pairs_impl(std::unique_ptr<cudf::column>&& merge_pairs,
                        col_device_view&& d_merge_pairs,
-                       std::unique_ptr<detail::merge_pairs_map_type>&& merge_pairs_map);
+                       std::unique_ptr<detail::merge_pairs_map_type>&& merge_pairs_map,
+                       std::unique_ptr<detail::merge_pairs_map_type2>&& mp_table_map);
 
   auto const get_merge_pairs() const { return *d_merge_pairs; }
   auto get_merge_pairs_ref() const { return merge_pairs_map->ref(cuco::experimental::op::find); }
+
+  auto get_mp_table_ref() const { return mp_table_map->ref(cuco::experimental::op::find); }
 };
 
 }  // namespace nvtext
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index c402ff7cc80..b5a49d80a31 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -104,13 +104,34 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
   return merge_pairs_map;
 }
 
+std::unique_ptr<detail::merge_pairs_map_type2> initialize_mp_table_map(
+  cudf::column_device_view const& input, rmm::cuda_stream_view stream)
+{
+  auto mp_table_map = std::make_unique<merge_pairs_map_type2>(
+    static_cast<size_t>(input.size()),
+    cuco::empty_key{-1},
+    cuco::empty_value{-1},
+    table_equal{input},
+    probe_scheme2{table_hasher{input}},
+    hash_table_allocator_type{default_allocator<char>{}, stream},
+    stream.value());
+
+  auto iter = cudf::detail::make_counting_transform_iterator(
+    0, [] __device__(cudf::size_type idx) { return cuco::make_pair(idx, idx); });
+
+  mp_table_map->insert_async(iter, iter + input.size(), stream.value());
+
+  return mp_table_map;
+}
+
 std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_impl(
   std::unique_ptr<cudf::column>&& input, rmm::cuda_stream_view stream)
 {
-  auto d_input     = cudf::column_device_view::create(input->view(), stream);
-  auto merge_pairs = initialize_merge_pairs_map(*d_input, stream);
+  auto d_input      = cudf::column_device_view::create(input->view(), stream);
+  auto merge_pairs  = initialize_merge_pairs_map(*d_input, stream);
+  auto mp_table_map = initialize_mp_table_map(*d_input, stream);
   return std::make_unique<nvtext::bpe_merge_pairs::bpe_merge_pairs_impl>(
-    std::move(input), std::move(d_input), std::move(merge_pairs));
+    std::move(input), std::move(d_input), std::move(merge_pairs), std::move(mp_table_map));
 }
 
 std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_impl(
@@ -163,10 +184,12 @@ bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl(
   std::unique_ptr<cudf::column>&& merge_pairs,
   std::unique_ptr<cudf::column_device_view, std::function<void(cudf::column_device_view*)>>&&
     d_merge_pairs,
-  std::unique_ptr<detail::merge_pairs_map_type>&& merge_pairs_map)
+  std::unique_ptr<detail::merge_pairs_map_type>&& merge_pairs_map,
+  std::unique_ptr<detail::merge_pairs_map_type2>&& mp_table_map)
   : merge_pairs(std::move(merge_pairs)),
     d_merge_pairs(std::move(d_merge_pairs)),
-    merge_pairs_map(std::move(merge_pairs_map))
+    merge_pairs_map(std::move(merge_pairs_map)),
+    mp_table_map(std::move(mp_table_map))
 {
 }
 
diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp
index d670492fb01..47466417553 100644
--- a/cpp/tests/text/bpe_tests.cpp
+++ b/cpp/tests/text/bpe_tests.cpp
@@ -54,8 +54,7 @@ TEST_F(TextBPETokenize, BytePairEncoding)
     validity);
   auto sv = cudf::strings_column_view(input);
 
-  auto results = nvtext::byte_pair_encoding(sv, *merge_pairs);
-
+  auto results  = nvtext::byte_pair_encoding(sv, *merge_pairs);
   auto expected = cudf::test::strings_column_wrapper({"this is it",
                                                       "this is   test - sent ence - 1",
                                                       "this is test sent ence - 2",
@@ -67,8 +66,8 @@ TEST_F(TextBPETokenize, BytePairEncoding)
 
   auto sliced          = cudf::slice(input, {1, 4}).front();
   auto sliced_expected = cudf::slice(expected, {1, 4}).front();
-  sv                   = cudf::strings_column_view(sliced);
 
+  sv      = cudf::strings_column_view(sliced);
   results = nvtext::byte_pair_encoding(sv, *merge_pairs);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected);
 }

From b978e0112c8d3b2a4edcdebb92d2e11ad4ff1fc7 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 4 Oct 2023 23:27:00 -0400
Subject: [PATCH 20/28] fix sliced input offset parm

---
 cpp/src/text/subword/bpe_tokenizer.cu | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index e36d53e17f4..47da3ef0231 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -324,18 +324,14 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
   rmm::device_uvector<int8_t> d_spaces(chars_size, stream);
   rmm::device_uvector<cudf::size_type> d_ranks(chars_size, stream);  // rank per string pair;
   rmm::device_uvector<int8_t> d_rerank(chars_size, stream);          // re-ranking identifiers
-  auto const map_ref  = merge_pairs.impl->get_merge_pairs_ref();
-  auto const map_ref2 = merge_pairs.impl->get_mp_table_ref();
-
-  if ((input.offset() == 0) && (input.size() == input.offsets().size() - 1)) {
-    // TODO: this fails for sliced columns for some reason;
-    //       we could get ride of the else{} if this was fixed
 
+  {
+    auto const map_ref2 = merge_pairs.impl->get_mp_table_ref();
     // this path locates unpairable sections of code to create artificial string row boundaries;
     // the boundary values are recorded as offsets and stored temporarily in the d_ranks vector
     auto const block_count = (chars_size + block_size - 1) / block_size;
     bpe_up_offsets_fn<decltype(map_ref2)><<<block_count, block_size, 0, stream.value()>>>(
-      d_input_chars, chars_size, input.offset(), map_ref2, d_ranks.data());
+      d_input_chars, chars_size, first_offset, map_ref2, d_ranks.data());
     auto const end   = thrust::remove(rmm::exec_policy(stream), d_ranks.begin(), d_ranks.end(), 0);
     auto const total = thrust::distance(d_ranks.begin(), end);  // number of unpairables
 
@@ -361,11 +357,9 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
                                                  {col_offsets, input.chars()});
     auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream);
 
+    auto const map_ref = merge_pairs.impl->get_merge_pairs_ref();
     bpe_parallel_fn<decltype(map_ref)><<<tmp_input.size(), block_size, 0, stream.value()>>>(
       *d_tmp_strings, map_ref, d_spaces.data(), d_ranks.data(), d_rerank.data());
-  } else {
-    bpe_parallel_fn<decltype(map_ref)><<<input.size(), block_size, 0, stream.value()>>>(
-      *d_strings, map_ref, d_spaces.data(), d_ranks.data(), d_rerank.data());
   }
 
   // compute the output sizes into the d_offsets vector

From e158312652132d130139e083aa7bd649d35eb224 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 5 Oct 2023 15:44:19 -0400
Subject: [PATCH 21/28] remove re-rank re-init

---
 cpp/src/text/subword/bpe_tokenizer.cu    | 125 +++++++++++++++--------
 cpp/src/text/subword/bpe_tokenizer.cuh   |  53 ++++++----
 cpp/src/text/subword/load_merges_file.cu |  12 +--
 3 files changed, 123 insertions(+), 67 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 47da3ef0231..0d63e2b7507 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -51,12 +51,24 @@ namespace {
 
 constexpr int block_size = 512;
 
+/**
+ * @brief Produces offsets to unpairable locations in the given chars array
+ *
+ * Launched as a thread per byte of the chars array.
+ * The output is non-zero offsets to locations of unpairable strings.
+ *
+ * @tparam MapRefType The type of the map finder object
+ * @param d_chars Input chars memory
+ * @param chars_size Number of bytes pointed to by `d_chars`
+ * @param d_map For looking up individual string candidates
+ * @param d_offsets Output is the offset location of unpairables
+ */
 template <typename MapRefType>
-__global__ void bpe_up_offsets_fn(char const* d_chars,
-                                  cudf::size_type chars_size,
-                                  cudf::size_type offset,
-                                  MapRefType const d_map,
-                                  cudf::size_type* d_offsets)
+__global__ void bpe_unpairable_offsets_fn(char const* d_chars,
+                                          cudf::size_type chars_size,  // use device_span
+                                          cudf::size_type offset,
+                                          MapRefType const d_map,
+                                          cudf::size_type* d_offsets)
 {
   auto const idx = static_cast<cudf::size_type>(cudf::detail::grid_1d::global_thread_id());
   if (idx >= chars_size) { return; }
@@ -65,21 +77,22 @@ __global__ void bpe_up_offsets_fn(char const* d_chars,
     return;
   }
 
-  auto next_substr = [d_chars, end = d_chars + chars_size](char const* begin) {
-    auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) {
-      return cudf::strings::detail::is_begin_utf8_char(v);
-    });
-    auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));
-    return cudf::string_view(begin, size);
+  auto const itr = d_chars + idx;
+  auto const end = d_chars + chars_size;
+
+  auto next_str = [end](char const* begin) {
+    auto next = begin + (begin < end);
+    while (next < end && !cudf::strings::detail::is_begin_utf8_char(*next)) {
+      ++next;
+    }
+    return cudf::string_view(begin, static_cast<cudf::size_type>(thrust::distance(begin, next)));
   };
 
-  auto const itr      = d_chars + idx;
-  auto const end      = d_chars + chars_size;
-  auto const lhs      = next_substr(itr);
+  auto const lhs      = next_str(itr);
   auto const next_itr = itr + lhs.size_bytes();
   auto output         = 0;
   if (next_itr < end) {
-    auto const rhs = next_substr(next_itr);
+    auto const rhs = next_str(next_itr);
     if (!rhs.empty()) {
       // see if both halves exist anywhere in the table
       if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) {
@@ -90,6 +103,29 @@ __global__ void bpe_up_offsets_fn(char const* d_chars,
   d_offsets[idx] = output;
 }
 
+/**
+ * @brief Performs byte-pair-encoding
+ *
+ * Computes the locations where the separator will be inserted in `d_spaces_in`.
+ * This is launched as a string per block.
+ *
+ * The process first initializes all characters to 1 per position in `d_spaces_in`.
+ * All pairs are realized and their ranks stored in `d_ranks`.
+ *
+ * Iteratively, the minimum rank is located, the corresponding `d_spaces_in` location
+ * is set to 0 resulting in new potential pairs. The process repeats accounting for
+ * the rank of the newly formed pairs.
+ *
+ * Once there no more rankable pairs, the process finishes and the `d_spaces_in`
+ * values identify the location to insert the separator.
+ *
+ * @tparam MapRefType The type of the map finder object
+ * @param d_strings Input data
+ * @param d_map For looking up individual string candidates
+ * @param d_spaces_in Output the location where separator will be inserted
+ * @param d_ranks Working memory to hold pair ranks
+ * @param d_rerank_in Working memory to hold locations where reranking is required
+ */
 template <typename MapRefType>
 __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
                                 MapRefType const d_map,
@@ -103,7 +139,7 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
     static_cast<cudf::size_type>(cudf::detail::grid_1d::global_thread_id() / block_size);
   auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
 
-  if (d_strings.is_null(str_idx)) { return; }
+  // if (d_strings.is_null(str_idx)) { return; }  // maybe no longer needed
   auto const d_str = d_strings.element<cudf::string_view>(str_idx);
   if (d_str.empty()) { return; }
 
@@ -131,6 +167,10 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
     return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size);
   };
 
+  // init all the re-rank identifiers to zero
+  for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) {
+    *itr = 0;
+  }
   // init all ranks to max
   for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
     *itr = max_rank;
@@ -167,21 +207,14 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
   if (lane_idx == 0) { block_min_rank = reduce_rank; }
   __syncthreads();
 
-  // loop through the ranks finding the current minimum until there are no more
+  // loop through the ranks processing the current minimum until there are no more
   while (block_min_rank < max_rank) {
-    // (re)initialize all the re-rank identifiers to zero
-    for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) {
-      *itr = 0;
-    }
-
-    // search the d_min_ranks for all the places where the rank matches block_min_rank
+    // search the d_min_ranks for matches to block_min_rank
     for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
       if (*itr == block_min_rank) {
-        auto ptr   = itr - 1;  // check for adjacent min-rank edge-case
-        auto count = 8;
-        while (ptr > d_min_ranks && *ptr == max_rank && count > 0) {
+        auto ptr = itr - 1;  // check for adjacent min-rank (edge-case)
+        while (ptr > d_min_ranks && *ptr == max_rank) {
           --ptr;
-          --count;
         }
         // set the output value to 0 at this position (erases separator)
         if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_min_ranks, itr)] = 0; }
@@ -202,7 +235,7 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
       return ptr;
     };
 
-    // identify the re-rank locations
+    // identify all the re-rank locations (logic above created new pairs)
     for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
       auto const index = thrust::distance(d_min_ranks, itr);
       if (*itr == block_min_rank && d_spaces[index] == 0) {
@@ -216,11 +249,12 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
     __syncthreads();
 
     // compute the ranks for the newly created pairs
-    min_rank = max_rank;  // and record new minimum
+    min_rank = max_rank;  // and record the new minimum along the way
     for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) {
       auto const index = thrust::distance(d_rerank, itr);
       auto rank        = d_min_ranks[index];
       if (*itr) {
+        *itr = 0;  // reset re-rank
         // build lhs of pair
         auto const ptr  = find_prev(d_spaces + index - 1);
         auto const size = static_cast<cudf::size_type>(thrust::distance(ptr, d_spaces + index));
@@ -230,21 +264,32 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
         rank           = max_rank;
         if (!rhs.empty()) {
           auto const mp      = merge_pair_type{lhs, rhs};
-          auto const map_itr = d_map.find(mp);                     // lookup in merges;
+          auto const map_itr = d_map.find(mp);                     // lookup rank for this pair;
           if (map_itr != d_map.end()) { rank = map_itr->second; }  // found a match
         }
-        d_min_ranks[index] = rank;
+        d_min_ranks[index] = rank;  // store new rank
       }
-      if (rank < min_rank) min_rank = rank;
+      if (rank < min_rank) { min_rank = rank; }
     }
 
-    // compute the min rank across the block
+    // re-compute the minimum rank across the block (since new pairs are created above)
     auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid);
     if (lane_idx == 0) { block_min_rank = reduce_rank; }
     __syncthreads();
   }  // if no mins were found we are done, otherwise start again
 }
 
+/**
+ * @brief Computes the output size of each strings row
+ *
+ * This launches as a string per block.
+ * The non-zero values in `d_spaces_in` for each string is added to
+ * the current string size to produce the total output bytes.
+ *
+ * @param d_strings Input data
+ * @param d_spaces_in Output the location where separator will be inserted
+ * @param d_sizes Output sizes of each row
+ */
 __global__ void bpe_finalize(cudf::column_device_view const d_strings,
                              int8_t* d_spaces_in,      // where separators are inserted
                              cudf::size_type* d_sizes  // output sizes of encoded strings
@@ -326,12 +371,12 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
   rmm::device_uvector<int8_t> d_rerank(chars_size, stream);          // re-ranking identifiers
 
   {
-    auto const map_ref2 = merge_pairs.impl->get_mp_table_ref();
+    auto const mp_map = merge_pairs.impl->get_mp_table_ref();
     // this path locates unpairable sections of code to create artificial string row boundaries;
     // the boundary values are recorded as offsets and stored temporarily in the d_ranks vector
-    auto const block_count = (chars_size + block_size - 1) / block_size;
-    bpe_up_offsets_fn<decltype(map_ref2)><<<block_count, block_size, 0, stream.value()>>>(
-      d_input_chars, chars_size, first_offset, map_ref2, d_ranks.data());
+    cudf::detail::grid_1d grid(chars_size, block_size);
+    bpe_unpairable_offsets_fn<decltype(mp_map)><<<grid.num_blocks, block_size, 0, stream.value()>>>(
+      d_input_chars, chars_size, first_offset, mp_map, d_ranks.data());
     auto const end   = thrust::remove(rmm::exec_policy(stream), d_ranks.begin(), d_ranks.end(), 0);
     auto const total = thrust::distance(d_ranks.begin(), end);  // number of unpairables
 
@@ -357,9 +402,9 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
                                                  {col_offsets, input.chars()});
     auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream);
 
-    auto const map_ref = merge_pairs.impl->get_merge_pairs_ref();
-    bpe_parallel_fn<decltype(map_ref)><<<tmp_input.size(), block_size, 0, stream.value()>>>(
-      *d_tmp_strings, map_ref, d_spaces.data(), d_ranks.data(), d_rerank.data());
+    auto const pair_map = merge_pairs.impl->get_merge_pairs_ref();
+    bpe_parallel_fn<decltype(pair_map)><<<tmp_input.size(), block_size, 0, stream.value()>>>(
+      *d_tmp_strings, pair_map, d_spaces.data(), d_ranks.data(), d_rerank.data());
   }
 
   // compute the output sizes into the d_offsets vector
diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index 25132a57ae2..be5a4e6e7dd 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -42,8 +42,8 @@
 namespace nvtext {
 namespace detail {
 
-using hash_value_type    = uint32_t;
 using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
+using hash_value_type    = string_hasher_type::result_type;
 using merge_pair_type    = thrust::pair<cudf::string_view, cudf::string_view>;
 
 using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;
@@ -52,7 +52,10 @@ using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allo
  * @brief Hasher function used for building and using the cuco static-map
  *
  * This takes advantage of heterogeneous lookup feature in cuco static-map which
- * allows inserting with one type (index) and looking up with a different type (string).
+ * allows inserting with one type (index) and looking up with a different type (merge_pair_type).
+ *
+ * The merge-pairs are in adjacent rows so each index will access two rows of string values.
+ * The hash of each string is combined for the returned result.
  */
 struct bpe_hasher {
   cudf::column_device_view const d_strings;
@@ -76,7 +79,10 @@ struct bpe_hasher {
  * @brief Equal function used for building and using the cuco static-map
  *
  * This takes advantage of heterogeneous lookup feature in cuco static-map which
- * allows inserting with one type (index) and looking up with a different type (string).
+ * allows inserting with one type (index) and looking up with a different type (merge_pair_type).
+ *
+ * The merge-pairs are in adjacent rows so each index will access two rows of string values.
+ * All rows from the input merge-pairs are unique.
  */
 struct bpe_equal {
   cudf::column_device_view const d_strings;
@@ -95,17 +101,25 @@ struct bpe_equal {
   }
 };
 
-using probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>;
+using bpe_probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>;
 
 using merge_pairs_map_type = cuco::experimental::static_map<cudf::size_type,
                                                             cudf::size_type,
                                                             cuco::experimental::extent<std::size_t>,
                                                             cuda::thread_scope_device,
                                                             bpe_equal,
-                                                            probe_scheme,
+                                                            bpe_probe_scheme,
                                                             hash_table_allocator_type>;
 
-struct table_hasher {
+/**
+ * @brief Hasher function used for building and using the cuco static-map
+ *
+ * This takes advantage of heterogeneous lookup feature in cuco static-map which
+ * allows inserting with one type (index) and looking up with a different type (merge_pair_type).
+ *
+ * Each component of the merge-pairs (left and right) are stored individually in the map.
+ */
+struct mp_hasher {
   cudf::column_device_view const d_strings;
   string_hasher_type hasher{};
   // used by insert
@@ -127,7 +141,7 @@ struct table_hasher {
  * This takes advantage of heterogeneous lookup feature in cuco static-map which
  * allows inserting with one type (index) and looking up with a different type (string).
  */
-struct table_equal {
+struct mp_equal {
   cudf::column_device_view const d_strings;
   // used by insert
   __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept
@@ -144,16 +158,15 @@ struct table_equal {
   }
 };
 
-using probe_scheme2 = cuco::experimental::linear_probing<1, table_hasher>;
+using mp_probe_scheme = cuco::experimental::linear_probing<1, mp_hasher>;
 
-using merge_pairs_map_type2 =
-  cuco::experimental::static_map<cudf::size_type,
-                                 cudf::size_type,
-                                 cuco::experimental::extent<std::size_t>,
-                                 cuda::thread_scope_device,
-                                 table_equal,
-                                 probe_scheme2,
-                                 hash_table_allocator_type>;
+using mp_table_map_type = cuco::experimental::static_map<cudf::size_type,
+                                                         cudf::size_type,
+                                                         cuco::experimental::extent<std::size_t>,
+                                                         cuda::thread_scope_device,
+                                                         mp_equal,
+                                                         mp_probe_scheme,
+                                                         hash_table_allocator_type>;
 
 }  // namespace detail
 
@@ -166,18 +179,16 @@ using col_device_view = std::invoke_result_t<decltype(&cudf::column_device_view:
 struct bpe_merge_pairs::bpe_merge_pairs_impl {
   std::unique_ptr<cudf::column> const merge_pairs;
   col_device_view const d_merge_pairs;
-  std::unique_ptr<detail::merge_pairs_map_type> merge_pairs_map;
-
-  std::unique_ptr<detail::merge_pairs_map_type2> mp_table_map;
+  std::unique_ptr<detail::merge_pairs_map_type> merge_pairs_map;  // for BPE
+  std::unique_ptr<detail::mp_table_map_type> mp_table_map;        // for locating unpairables
 
   bpe_merge_pairs_impl(std::unique_ptr<cudf::column>&& merge_pairs,
                        col_device_view&& d_merge_pairs,
                        std::unique_ptr<detail::merge_pairs_map_type>&& merge_pairs_map,
-                       std::unique_ptr<detail::merge_pairs_map_type2>&& mp_table_map);
+                       std::unique_ptr<detail::mp_table_map_type>&& mp_table_map);
 
   auto const get_merge_pairs() const { return *d_merge_pairs; }
   auto get_merge_pairs_ref() const { return merge_pairs_map->ref(cuco::experimental::op::find); }
-
   auto get_mp_table_ref() const { return mp_table_map->ref(cuco::experimental::op::find); }
 };
 
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index b5a49d80a31..ffe8f2156dc 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -92,7 +92,7 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
     cuco::empty_key{-1},
     cuco::empty_value{-1},
     bpe_equal{input},
-    probe_scheme{bpe_hasher{input}},
+    bpe_probe_scheme{bpe_hasher{input}},
     hash_table_allocator_type{default_allocator<char>{}, stream},
     stream.value());
 
@@ -104,15 +104,15 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
   return merge_pairs_map;
 }
 
-std::unique_ptr<detail::merge_pairs_map_type2> initialize_mp_table_map(
+std::unique_ptr<detail::mp_table_map_type> initialize_mp_table_map(
   cudf::column_device_view const& input, rmm::cuda_stream_view stream)
 {
-  auto mp_table_map = std::make_unique<merge_pairs_map_type2>(
+  auto mp_table_map = std::make_unique<mp_table_map_type>(
     static_cast<size_t>(input.size()),
     cuco::empty_key{-1},
     cuco::empty_value{-1},
-    table_equal{input},
-    probe_scheme2{table_hasher{input}},
+    mp_equal{input},
+    mp_probe_scheme{mp_hasher{input}},
     hash_table_allocator_type{default_allocator<char>{}, stream},
     stream.value());
 
@@ -185,7 +185,7 @@ bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl(
   std::unique_ptr<cudf::column_device_view, std::function<void(cudf::column_device_view*)>>&&
     d_merge_pairs,
   std::unique_ptr<detail::merge_pairs_map_type>&& merge_pairs_map,
-  std::unique_ptr<detail::merge_pairs_map_type2>&& mp_table_map)
+  std::unique_ptr<detail::mp_table_map_type>&& mp_table_map)
   : merge_pairs(std::move(merge_pairs)),
     d_merge_pairs(std::move(d_merge_pairs)),
     merge_pairs_map(std::move(merge_pairs_map)),

From 0f3bc94428ad520350b689d27b1a831558a8f272 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 6 Oct 2023 18:22:37 -0400
Subject: [PATCH 22/28] minor rework: variables and comments

---
 cpp/src/text/subword/bpe_tokenizer.cu | 80 +++++++++++++++------------
 1 file changed, 45 insertions(+), 35 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 0d63e2b7507..fa4533480ac 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -25,9 +25,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
-#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
-#include <cudf/strings/detail/utilities.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -40,10 +38,8 @@
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <thrust/merge.h>
 #include <thrust/remove.h>
-#include <thrust/transform.h>
 
 namespace nvtext {
 namespace detail {
@@ -55,7 +51,10 @@ constexpr int block_size = 512;
  * @brief Produces offsets to unpairable locations in the given chars array
  *
  * Launched as a thread per byte of the chars array.
- * The output is non-zero offsets to locations of unpairable strings.
+ * The output is non-zero offsets to locations of unpairable substring.
+ * An unpairable substring does not exist in the given map and so will
+ * never be paired :-(. Fortunately, this can be used as an artificial
+ * boundary providing better parallelism in the BPE kernel.
  *
  * @tparam MapRefType The type of the map finder object
  * @param d_chars Input chars memory
@@ -96,42 +95,55 @@ __global__ void bpe_unpairable_offsets_fn(char const* d_chars,
     if (!rhs.empty()) {
       // see if both halves exist anywhere in the table
       if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) {
-        output = idx + lhs.size_bytes() + offset;  // candidate for artificial boundary
+        output = idx + lhs.size_bytes() + offset;  // offset for artificial boundary
       }
     }
   }
   d_offsets[idx] = output;
+
+  // Alternate solution that only checks one substring.
+  // No noticeable performance improvement.
+  // auto const lhs = [begin = itr, end] {
+  //   auto next = begin + (begin < end);
+  //   while (next < end && !cudf::strings::detail::is_begin_utf8_char(*next)) {
+  //     ++next;
+  //   }
+  //   return cudf::string_view(begin, static_cast<cudf::size_type>(thrust::distance(begin, next)));
+  // }();
+  // d_offsets[idx] = (((itr + lhs.size_bytes()) < end) && (d_map.find(lhs) == d_map.end()))
+  //                    ? idx + lhs.size_bytes() + offset  // offset for artificial boundary
+  //                    : 0;
 }
 
 /**
  * @brief Performs byte-pair-encoding
  *
- * Computes the locations where the separator will be inserted in `d_spaces_in`.
+ * Computes the locations where the separator will be inserted in `d_spaces_data`.
  * This is launched as a string per block.
  *
- * The process first initializes all characters to 1 per position in `d_spaces_in`.
+ * The process first initializes all characters to 1 per position in `d_spaces_data`.
  * All pairs are realized and their ranks stored in `d_ranks`.
  *
- * Iteratively, the minimum rank is located, the corresponding `d_spaces_in` location
+ * Iteratively, the minimum rank is located, the corresponding `d_spaces_data` location
  * is set to 0 resulting in new potential pairs. The process repeats accounting for
  * the rank of the newly formed pairs.
  *
- * Once there no more rankable pairs, the process finishes and the `d_spaces_in`
+ * Once there are no more rankable pairs, the process finishes and the `d_spaces_data`
  * values identify the location to insert the separator.
  *
  * @tparam MapRefType The type of the map finder object
  * @param d_strings Input data
  * @param d_map For looking up individual string candidates
- * @param d_spaces_in Output the location where separator will be inserted
+ * @param d_spaces_data Output the location where separator will be inserted
  * @param d_ranks Working memory to hold pair ranks
- * @param d_rerank_in Working memory to hold locations where reranking is required
+ * @param d_rerank_data Working memory to hold locations where reranking is required
  */
 template <typename MapRefType>
 __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
                                 MapRefType const d_map,
-                                int8_t* d_spaces_in,       // working memory
+                                int8_t* d_spaces_data,     // working memory
                                 cudf::size_type* d_ranks,  // more working memory
-                                int8_t* d_rerank_in        // and one more working memory
+                                int8_t* d_rerank_data      // and one more working memory
 )
 {
   // string per block
@@ -139,7 +151,6 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
     static_cast<cudf::size_type>(cudf::detail::grid_1d::global_thread_id() / block_size);
   auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
 
-  // if (d_strings.is_null(str_idx)) { return; }  // maybe no longer needed
   auto const d_str = d_strings.element<cudf::string_view>(str_idx);
   if (d_str.empty()) { return; }
 
@@ -147,11 +158,11 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
     d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
   auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
 
-  auto const d_spaces    = d_spaces_in + offset;
+  auto const d_spaces    = d_spaces_data + offset;
   auto const end_spaces  = d_spaces + d_str.size_bytes();
   auto const d_min_ranks = d_ranks + offset;
   auto const end_ranks   = d_min_ranks + d_str.size_bytes();
-  auto const d_rerank    = d_rerank_in + offset;
+  auto const d_rerank    = d_rerank_data + offset;
   auto const end_rerank  = d_rerank + d_str.size_bytes();
 
   auto constexpr max_rank = cuda::std::numeric_limits<cudf::size_type>::max();
@@ -283,15 +294,15 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
  * @brief Computes the output size of each strings row
  *
  * This launches as a string per block.
- * The non-zero values in `d_spaces_in` for each string is added to
+ * The non-zero values in `d_spaces_data` for each string is added to
  * the current string size to produce the total output bytes.
  *
  * @param d_strings Input data
- * @param d_spaces_in Output the location where separator will be inserted
+ * @param d_spaces_data Output the location where separator will be inserted
  * @param d_sizes Output sizes of each row
  */
 __global__ void bpe_finalize(cudf::column_device_view const d_strings,
-                             int8_t* d_spaces_in,      // where separators are inserted
+                             int8_t* d_spaces_data,    // where separators are inserted
                              cudf::size_type* d_sizes  // output sizes of encoded strings
 )
 {
@@ -314,7 +325,7 @@ __global__ void bpe_finalize(cudf::column_device_view const d_strings,
     d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
   auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
 
-  auto const d_spaces   = d_spaces_in + offset;
+  auto const d_spaces   = d_spaces_data + offset;
   auto const end_spaces = d_spaces + d_str.size_bytes();
   auto const num_valid  = block_size < d_str.size_bytes() ? block_size : d_str.size_bytes();
 
@@ -371,16 +382,16 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
   rmm::device_uvector<int8_t> d_rerank(chars_size, stream);          // re-ranking identifiers
 
   {
-    auto const mp_map = merge_pairs.impl->get_mp_table_ref();
-    // this path locates unpairable sections of code to create artificial string row boundaries;
+    // this kernel locates unpairable sections of code to create artificial string row boundaries;
     // the boundary values are recorded as offsets and stored temporarily in the d_ranks vector
+    auto const mp_map = merge_pairs.impl->get_mp_table_ref();
     cudf::detail::grid_1d grid(chars_size, block_size);
     bpe_unpairable_offsets_fn<decltype(mp_map)><<<grid.num_blocks, block_size, 0, stream.value()>>>(
       d_input_chars, chars_size, first_offset, mp_map, d_ranks.data());
     auto const end   = thrust::remove(rmm::exec_policy(stream), d_ranks.begin(), d_ranks.end(), 0);
     auto const total = thrust::distance(d_ranks.begin(), end);  // number of unpairables
 
-    // the new boundaries are combined with the existing offsets to build a temporary column
+    // the new boundaries are combined with the existing offsets
     auto tmp_offsets = rmm::device_uvector<cudf::size_type>(total + input.size() + 1, stream);
     thrust::merge(rmm::exec_policy(stream),
                   input.offsets_begin(),
@@ -389,8 +400,7 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
                   end,
                   tmp_offsets.begin());
 
-    // the temp column is used for the encoding functions which is much faster
-    // on a larger number of smaller strings
+    // temp column created for the encoding which parallelizes between the unpairable boundaries
     auto const col_offsets =
       cudf::column_view(cudf::device_span<cudf::size_type const>(tmp_offsets));
     auto const tmp_input     = cudf::column_view(input.parent().type(),
@@ -407,7 +417,7 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
       *d_tmp_strings, pair_map, d_spaces.data(), d_ranks.data(), d_rerank.data());
   }
 
-  // compute the output sizes into the d_offsets vector
+  // compute the output sizes into the output d_offsets vector
   bpe_finalize<<<input.size(), block_size, 0, stream.value()>>>(
     *d_strings, d_spaces.data(), d_offsets);
 
@@ -418,11 +428,11 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
                "Size of output exceeds the column size limit",
                std::overflow_error);
 
-  // build the output: adding separators between the remaining pairs in each string
+  // build the output: adding separators to the input character data
   auto chars   = cudf::strings::detail::create_chars_child_column(bytes, stream, mr);
   auto d_chars = chars->mutable_view().data<char>();
 
-  // we can reuse the ranks working memory to store some temporary offsets now;
+  // we can reuse the ranks working memory to store some temporary offsets;
   // the offsets are produced by the index of the d_spaces values
   auto const d_inserts = d_ranks.data();
   // create offsets where separators will be inserted
@@ -437,12 +447,12 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
   // this will insert the single-byte separator in positions specified in d_inserts
   auto const sep_char = thrust::constant_iterator<char>(separator.to_string(stream)[0]);
   thrust::merge_by_key(rmm::exec_policy(stream),
-                       d_inserts,  // where separator is inserted
-                       copy_end,
-                       zero_itr,  // all positions
-                       chars_end,
-                       sep_char,  // byte to insert
-                       d_input_chars,
+                       d_inserts,      // where to insert separator byte
+                       copy_end,       //
+                       zero_itr,       // all positions
+                       chars_end,      //
+                       sep_char,       // byte to insert
+                       d_input_chars,  // original data
                        thrust::make_discard_iterator(),
                        d_chars);  // result
 

From 747e10fd47db44664998d8282dd508e5a0217f0b Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 11 Oct 2023 14:48:40 -0400
Subject: [PATCH 23/28] remove any duplicate tmp offsets

---
 cpp/src/text/subword/bpe_tokenizer.cu | 75 ++++++++++++++-------------
 1 file changed, 38 insertions(+), 37 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index fa4533480ac..51cef6507aa 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -40,6 +40,8 @@
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/merge.h>
 #include <thrust/remove.h>
+#include <thrust/sort.h>
+#include <thrust/unique.h>
 
 namespace nvtext {
 namespace detail {
@@ -152,8 +154,6 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
   auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
 
   auto const d_str = d_strings.element<cudf::string_view>(str_idx);
-  if (d_str.empty()) { return; }
-
   auto const offsets =
     d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
   auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
@@ -172,12 +172,6 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
   __shared__ typename block_reduce::TempStorage temp_storage;
   auto const num_valid = block_size < d_str.size_bytes() ? block_size : d_str.size_bytes();
 
-  auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) {
-    auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; });
-    auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));
-    return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size);
-  };
-
   // init all the re-rank identifiers to zero
   for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) {
     *itr = 0;
@@ -193,6 +187,12 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
   }
   __syncthreads();
 
+  auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) {
+    auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; });
+    auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));
+    return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size);
+  };
+
   auto min_rank = max_rank;
 
   // store all the initial ranks for each pair
@@ -218,6 +218,20 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
   if (lane_idx == 0) { block_min_rank = reduce_rank; }
   __syncthreads();
 
+  // these are used to locate adjacent pairs after merging a pair
+  auto find_prev = [begin = d_spaces](int8_t* ptr) {
+    while (ptr > begin && *ptr == 0) {
+      --ptr;
+    }
+    return ptr;
+  };
+  auto find_next = [end = end_spaces](int8_t* ptr) {
+    while (ptr < end && *ptr == 0) {
+      ++ptr;
+    }
+    return ptr;
+  };
+
   // loop through the ranks processing the current minimum until there are no more
   while (block_min_rank < max_rank) {
     // search the d_min_ranks for matches to block_min_rank
@@ -233,19 +247,6 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
     }
     __syncthreads();
 
-    auto find_prev = [begin = d_spaces](int8_t* ptr) {
-      while (ptr > begin && *ptr == 0) {
-        --ptr;
-      }
-      return ptr;
-    };
-    auto find_next = [end = end_spaces](int8_t* ptr) {
-      while (ptr < end && *ptr == 0) {
-        ++ptr;
-      }
-      return ptr;
-    };
-
     // identify all the re-rank locations (logic above created new pairs)
     for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
       auto const index = thrust::distance(d_min_ranks, itr);
@@ -270,9 +271,8 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
         auto const ptr  = find_prev(d_spaces + index - 1);
         auto const size = static_cast<cudf::size_type>(thrust::distance(ptr, d_spaces + index));
         auto const lhs  = cudf::string_view(d_str.data() + thrust::distance(d_spaces, ptr), size);
-        // retrieve rhs of pair
-        auto const rhs = next_substr(d_spaces + index);
-        rank           = max_rank;
+        auto const rhs  = next_substr(d_spaces + index);  // retrieve rhs of pair
+        rank            = max_rank;
         if (!rhs.empty()) {
           auto const mp      = merge_pair_type{lhs, rhs};
           auto const map_itr = d_map.find(mp);                     // lookup rank for this pair;
@@ -388,32 +388,33 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
     cudf::detail::grid_1d grid(chars_size, block_size);
     bpe_unpairable_offsets_fn<decltype(mp_map)><<<grid.num_blocks, block_size, 0, stream.value()>>>(
       d_input_chars, chars_size, first_offset, mp_map, d_ranks.data());
-    auto const end   = thrust::remove(rmm::exec_policy(stream), d_ranks.begin(), d_ranks.end(), 0);
-    auto const total = thrust::distance(d_ranks.begin(), end);  // number of unpairables
+    auto const up_end = thrust::remove(rmm::exec_policy(stream), d_ranks.begin(), d_ranks.end(), 0);
+    auto const unpairables = thrust::distance(d_ranks.begin(), up_end);  // number of unpairables
 
     // the new boundaries are combined with the existing offsets
-    auto tmp_offsets = rmm::device_uvector<cudf::size_type>(total + input.size() + 1, stream);
+    auto tmp_offsets = rmm::device_uvector<cudf::size_type>(unpairables + input.size() + 1, stream);
     thrust::merge(rmm::exec_policy(stream),
                   input.offsets_begin(),
                   input.offsets_end(),
                   d_ranks.begin(),
-                  end,
+                  up_end,
                   tmp_offsets.begin());
+    // remove adjacent duplicate offsets (empty or null rows)
+    auto const offsets_end =
+      thrust::unique(rmm::exec_policy(stream), tmp_offsets.begin(), tmp_offsets.end());
+    auto const offsets_total =
+      static_cast<cudf::size_type>(thrust::distance(tmp_offsets.begin(), offsets_end));
 
     // temp column created for the encoding which parallelizes between the unpairable boundaries
     auto const col_offsets =
       cudf::column_view(cudf::device_span<cudf::size_type const>(tmp_offsets));
-    auto const tmp_input     = cudf::column_view(input.parent().type(),
-                                             static_cast<cudf::size_type>(input.size() + total),
-                                             nullptr,
-                                             nullptr,
-                                             0,
-                                             0,
-                                                 {col_offsets, input.chars()});
+    auto const tmp_size  = offsets_total - 1;
+    auto const tmp_input = cudf::column_view(
+      input.parent().type(), tmp_size, nullptr, nullptr, 0, 0, {col_offsets, input.chars()});
     auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream);
-
+    // launch the byte-pair-encoding kernel
     auto const pair_map = merge_pairs.impl->get_merge_pairs_ref();
-    bpe_parallel_fn<decltype(pair_map)><<<tmp_input.size(), block_size, 0, stream.value()>>>(
+    bpe_parallel_fn<decltype(pair_map)><<<tmp_size, block_size, 0, stream.value()>>>(
       *d_tmp_strings, pair_map, d_spaces.data(), d_ranks.data(), d_rerank.data());
   }
 

From 3c2a866eed6dc678d29192dc20a14156c1a8acd6 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 23 Oct 2023 12:59:44 -0400
Subject: [PATCH 24/28] fix tmp-offsets size

---
 cpp/src/text/subword/bpe_tokenizer.cu | 52 ++++++++++++++-------------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 51cef6507aa..6bca8544b57 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -377,35 +377,39 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
     offset_data_type, input.size() + 1, cudf::mask_state::UNALLOCATED, stream, mr);
   auto d_offsets = offsets->mutable_view().data<cudf::size_type>();
 
-  rmm::device_uvector<int8_t> d_spaces(chars_size, stream);
-  rmm::device_uvector<cudf::size_type> d_ranks(chars_size, stream);  // rank per string pair;
-  rmm::device_uvector<int8_t> d_rerank(chars_size, stream);          // re-ranking identifiers
+  rmm::device_uvector<int8_t> d_spaces(chars_size, stream);  // identifies non-merged pairs
+  rmm::device_uvector<int8_t> d_rerank(chars_size, stream);  // re-ranking identifiers
+  // used for various purposes below: unpairable-offsets, pair ranks, separator insert positions
+  rmm::device_uvector<cudf::size_type> d_working(chars_size, stream);
 
   {
-    // this kernel locates unpairable sections of code to create artificial string row boundaries;
-    // the boundary values are recorded as offsets and stored temporarily in the d_ranks vector
-    auto const mp_map = merge_pairs.impl->get_mp_table_ref();
+    // this kernel locates unpairable sections of strings to create artificial string row
+    // boundaries; the boundary values are recorded as offsets in d_up_offsets
+    auto const d_up_offsets = d_working.data();  // store unpairable offsets here
+    auto const mp_map       = merge_pairs.impl->get_mp_table_ref();  // lookup table
     cudf::detail::grid_1d grid(chars_size, block_size);
     bpe_unpairable_offsets_fn<decltype(mp_map)><<<grid.num_blocks, block_size, 0, stream.value()>>>(
-      d_input_chars, chars_size, first_offset, mp_map, d_ranks.data());
-    auto const up_end = thrust::remove(rmm::exec_policy(stream), d_ranks.begin(), d_ranks.end(), 0);
-    auto const unpairables = thrust::distance(d_ranks.begin(), up_end);  // number of unpairables
+      d_input_chars, chars_size, first_offset, mp_map, d_up_offsets);
+    auto const up_end =  // remove all but the unpairable offsets
+      thrust::remove(rmm::exec_policy(stream), d_up_offsets, d_up_offsets + chars_size, 0);
+    auto const unpairables = thrust::distance(d_up_offsets, up_end);  // number of unpairables
 
-    // the new boundaries are combined with the existing offsets
+    // the new boundaries created by combining unpairable offsets with the existing offsets
     auto tmp_offsets = rmm::device_uvector<cudf::size_type>(unpairables + input.size() + 1, stream);
     thrust::merge(rmm::exec_policy(stream),
                   input.offsets_begin(),
                   input.offsets_end(),
-                  d_ranks.begin(),
+                  d_up_offsets,
                   up_end,
                   tmp_offsets.begin());
-    // remove adjacent duplicate offsets (empty or null rows)
+    // remove any adjacent duplicate offsets (empty or null rows)
     auto const offsets_end =
       thrust::unique(rmm::exec_policy(stream), tmp_offsets.begin(), tmp_offsets.end());
     auto const offsets_total =
       static_cast<cudf::size_type>(thrust::distance(tmp_offsets.begin(), offsets_end));
+    tmp_offsets.resize(offsets_total, stream);
 
-    // temp column created for the encoding which parallelizes between the unpairable boundaries
+    // temp column created with the merged offsets and the original chars data
     auto const col_offsets =
       cudf::column_view(cudf::device_span<cudf::size_type const>(tmp_offsets));
     auto const tmp_size  = offsets_total - 1;
@@ -413,9 +417,10 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
       input.parent().type(), tmp_size, nullptr, nullptr, 0, 0, {col_offsets, input.chars()});
     auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream);
     // launch the byte-pair-encoding kernel
+    auto const d_ranks  = d_working.data();  // store pair ranks here
     auto const pair_map = merge_pairs.impl->get_merge_pairs_ref();
     bpe_parallel_fn<decltype(pair_map)><<<tmp_size, block_size, 0, stream.value()>>>(
-      *d_tmp_strings, pair_map, d_spaces.data(), d_ranks.data(), d_rerank.data());
+      *d_tmp_strings, pair_map, d_spaces.data(), d_ranks, d_rerank.data());
   }
 
   // compute the output sizes into the output d_offsets vector
@@ -429,28 +434,25 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
                "Size of output exceeds the column size limit",
                std::overflow_error);
 
-  // build the output: adding separators to the input character data
+  // build the output: inserting separators to the input character data
   auto chars   = cudf::strings::detail::create_chars_child_column(bytes, stream, mr);
   auto d_chars = chars->mutable_view().data<char>();
 
-  // we can reuse the ranks working memory to store some temporary offsets;
-  // the offsets are produced by the index of the d_spaces values
-  auto const d_inserts = d_ranks.data();
-  // create offsets where separators will be inserted
+  auto const d_inserts     = d_working.data();  // stores the insert positions
   auto offsets_at_non_zero = [d_spaces = d_spaces.data()] __device__(auto idx) {
     return d_spaces[idx] > 0;  // separator to be inserted here
   };
-  auto const zero_itr  = thrust::counting_iterator<cudf::size_type>(0);
-  auto const chars_end = thrust::counting_iterator<cudf::size_type>(chars_size);
-  auto const copy_end  = thrust::copy_if(
-    rmm::exec_policy(stream), zero_itr + 1, chars_end, d_inserts, offsets_at_non_zero);
+  auto const chars_begin = thrust::counting_iterator<cudf::size_type>(0);
+  auto const chars_end   = thrust::counting_iterator<cudf::size_type>(chars_size);
+  auto const copy_end    = thrust::copy_if(
+    rmm::exec_policy(stream), chars_begin + 1, chars_end, d_inserts, offsets_at_non_zero);
 
-  // this will insert the single-byte separator in positions specified in d_inserts
+  // this will insert the single-byte separator into positions specified in d_inserts
   auto const sep_char = thrust::constant_iterator<char>(separator.to_string(stream)[0]);
   thrust::merge_by_key(rmm::exec_policy(stream),
                        d_inserts,      // where to insert separator byte
                        copy_end,       //
-                       zero_itr,       // all positions
+                       chars_begin,    // all positions
                        chars_end,      //
                        sep_char,       // byte to insert
                        d_input_chars,  // original data

From 40bd2aee9fa1dfdf354a33c07e713e5482a3e27a Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 25 Oct 2023 14:08:23 -0400
Subject: [PATCH 25/28] replaced some device utilities with thrust functions

---
 cpp/src/text/subword/bpe_tokenizer.cu | 89 +++++++++++----------------
 1 file changed, 37 insertions(+), 52 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 6bca8544b57..e7a030dbc0e 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -53,10 +53,10 @@ constexpr int block_size = 512;
  * @brief Produces offsets to unpairable locations in the given chars array
  *
  * Launched as a thread per byte of the chars array.
- * The output is non-zero offsets to locations of unpairable substring.
+ * The output is non-zero offsets to locations of unpairable substrings.
  * An unpairable substring does not exist in the given map and so will
  * never be paired :-(. Fortunately, this can be used as an artificial
- * boundary providing better parallelism in the BPE kernel.
+ * boundary providing increased parallelism in the BPE kernel.
  *
  * @tparam MapRefType The type of the map finder object
  * @param d_chars Input chars memory
@@ -65,8 +65,8 @@ constexpr int block_size = 512;
  * @param d_offsets Output is the offset location of unpairables
  */
 template <typename MapRefType>
-__global__ void bpe_unpairable_offsets_fn(char const* d_chars,
-                                          cudf::size_type chars_size,  // use device_span
+__global__ void bpe_unpairable_offsets_fn(char const* d_chars,  // use device_span
+                                          cudf::size_type chars_size,
                                           cudf::size_type offset,
                                           MapRefType const d_map,
                                           cudf::size_type* d_offsets)
@@ -78,27 +78,16 @@ __global__ void bpe_unpairable_offsets_fn(char const* d_chars,
     return;
   }
 
-  auto const itr = d_chars + idx;
-  auto const end = d_chars + chars_size;
-
-  auto next_str = [end](char const* begin) {
-    auto next = begin + (begin < end);
-    while (next < end && !cudf::strings::detail::is_begin_utf8_char(*next)) {
-      ++next;
-    }
-    return cudf::string_view(begin, static_cast<cudf::size_type>(thrust::distance(begin, next)));
-  };
-
-  auto const lhs      = next_str(itr);
-  auto const next_itr = itr + lhs.size_bytes();
-  auto output         = 0;
-  if (next_itr < end) {
-    auto const rhs = next_str(next_itr);
-    if (!rhs.empty()) {
-      // see if both halves exist anywhere in the table
-      if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) {
-        output = idx + lhs.size_bytes() + offset;  // offset for artificial boundary
-      }
+  auto const itr  = d_chars + idx;
+  auto const end  = d_chars + chars_size;
+  auto const lhs  = cudf::string_view(itr, cudf::strings::detail::bytes_in_utf8_byte(*itr));
+  auto const next = itr + lhs.size_bytes();
+  auto output     = 0;
+  if (next < end) {
+    auto const rhs = cudf::string_view(next, cudf::strings::detail::bytes_in_utf8_byte(*next));
+    // see if both halves exist anywhere in the table, if not these are unpairable
+    if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) {
+      output = idx + lhs.size_bytes() + offset;  // offset for artificial boundary
     }
   }
   d_offsets[idx] = output;
@@ -187,17 +176,25 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
   }
   __syncthreads();
 
+  // for finding the next half of a pair
   auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) {
-    auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; });
+    auto const next = thrust::find(thrust::seq, begin + 1, end, 1);
     auto const size = static_cast<cudf::size_type>(thrust::distance(begin, next));
     return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size);
   };
+  // for locating adjacent pairs after merging a pair
+  auto find_prev = [begin = d_spaces](int8_t* ptr) {
+    while (ptr > begin && *ptr == 0) {
+      --ptr;
+    }
+    return ptr;
+  };
 
   auto min_rank = max_rank;
 
   // store all the initial ranks for each pair
   for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) {
-    if (*itr == 0) { continue; }  // start on valid bytes only
+    if (*itr == 0) { continue; }  // skips any UTF-8 continuation bytes
     // resolve pair and lookup its rank
     auto const lhs      = next_substr(itr);  // retrieve lhs of the pair
     auto const next_itr = itr + lhs.size_bytes();
@@ -218,20 +215,6 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
   if (lane_idx == 0) { block_min_rank = reduce_rank; }
   __syncthreads();
 
-  // these are used to locate adjacent pairs after merging a pair
-  auto find_prev = [begin = d_spaces](int8_t* ptr) {
-    while (ptr > begin && *ptr == 0) {
-      --ptr;
-    }
-    return ptr;
-  };
-  auto find_next = [end = end_spaces](int8_t* ptr) {
-    while (ptr < end && *ptr == 0) {
-      ++ptr;
-    }
-    return ptr;
-  };
-
   // loop through the ranks processing the current minimum until there are no more
   while (block_min_rank < max_rank) {
     // search the d_min_ranks for matches to block_min_rank
@@ -241,19 +224,21 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
         while (ptr > d_min_ranks && *ptr == max_rank) {
           --ptr;
         }
-        // set the output value to 0 at this position (erases separator)
+        // set the output value to 0 at this position (erases separator, merges pair)
         if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_min_ranks, itr)] = 0; }
       }
     }
     __syncthreads();
 
-    // identify all the re-rank locations (logic above created new pairs)
+    // identify all the re-rank locations (logic above invalidated adjacent pairs)
     for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
       auto const index = thrust::distance(d_min_ranks, itr);
       if (*itr == block_min_rank && d_spaces[index] == 0) {
-        auto ptr = find_prev(d_spaces + index - 1);  // find previous pair mid-point
+        // find previous pair mid-point
+        auto ptr = find_prev(d_spaces + index - 1);
         if (ptr > d_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; }
-        ptr = find_next(d_spaces + index + 1);  // find next pair mid-point
+        // find next pair mid-point
+        ptr = thrust::find(thrust::seq, d_spaces + index + 1, end_spaces, 1);
         if (ptr < end_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; }
         *itr = max_rank;  // reset this rank
       }
@@ -287,7 +272,7 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
     auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid);
     if (lane_idx == 0) { block_min_rank = reduce_rank; }
     __syncthreads();
-  }  // if no mins were found we are done, otherwise start again
+  }  // if no min ranks are found we are done, otherwise start again
 }
 
 /**
@@ -340,8 +325,8 @@ __global__ void bpe_finalize(cudf::column_device_view const d_strings,
   for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) {
     bytes += (*itr > 0);
   }
-  auto const size = block_reduce(temp_storage).Sum(bytes, num_valid);
-  if (lane_idx == 0) { d_sizes[str_idx] = size + d_str.size_bytes(); }
+  auto const total_bytes = block_reduce(temp_storage).Sum(bytes, num_valid);
+  if (lane_idx == 0) { d_sizes[str_idx] = total_bytes + d_str.size_bytes(); }
 }
 
 }  // namespace
@@ -394,7 +379,7 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
       thrust::remove(rmm::exec_policy(stream), d_up_offsets, d_up_offsets + chars_size, 0);
     auto const unpairables = thrust::distance(d_up_offsets, up_end);  // number of unpairables
 
-    // the new boundaries created by combining unpairable offsets with the existing offsets
+    // new string boundaries created by combining unpairable offsets with the existing offsets
     auto tmp_offsets = rmm::device_uvector<cudf::size_type>(unpairables + input.size() + 1, stream);
     thrust::merge(rmm::exec_policy(stream),
                   input.offsets_begin(),
@@ -402,7 +387,7 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
                   d_up_offsets,
                   up_end,
                   tmp_offsets.begin());
-    // remove any adjacent duplicate offsets (empty or null rows)
+    // remove any adjacent duplicate offsets (i.e. empty or null rows)
     auto const offsets_end =
       thrust::unique(rmm::exec_policy(stream), tmp_offsets.begin(), tmp_offsets.end());
     auto const offsets_total =
@@ -416,7 +401,7 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
     auto const tmp_input = cudf::column_view(
       input.parent().type(), tmp_size, nullptr, nullptr, 0, 0, {col_offsets, input.chars()});
     auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream);
-    // launch the byte-pair-encoding kernel
+    // launch the byte-pair-encoding kernel on the temp column
     auto const d_ranks  = d_working.data();  // store pair ranks here
     auto const pair_map = merge_pairs.impl->get_merge_pairs_ref();
     bpe_parallel_fn<decltype(pair_map)><<<tmp_size, block_size, 0, stream.value()>>>(
@@ -452,7 +437,7 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
   thrust::merge_by_key(rmm::exec_policy(stream),
                        d_inserts,      // where to insert separator byte
                        copy_end,       //
-                       chars_begin,    // all positions
+                       chars_begin,    // all indices
                        chars_end,      //
                        sep_char,       // byte to insert
                        d_input_chars,  // original data

From 6738159e561fd77d5c9949dfb01c906e633c99b8 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 26 Oct 2023 09:24:37 -0400
Subject: [PATCH 26/28] change custom kernel to transform

---
 cpp/src/text/subword/bpe_tokenizer.cu | 143 +++++++++++++-------------
 1 file changed, 69 insertions(+), 74 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index e7a030dbc0e..542fe69f156 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -59,52 +59,45 @@ constexpr int block_size = 512;
  * boundary providing increased parallelism in the BPE kernel.
  *
  * @tparam MapRefType The type of the map finder object
- * @param d_chars Input chars memory
- * @param chars_size Number of bytes pointed to by `d_chars`
- * @param d_map For looking up individual string candidates
- * @param d_offsets Output is the offset location of unpairables
  */
 template <typename MapRefType>
-__global__ void bpe_unpairable_offsets_fn(char const* d_chars,  // use device_span
-                                          cudf::size_type chars_size,
-                                          cudf::size_type offset,
-                                          MapRefType const d_map,
-                                          cudf::size_type* d_offsets)
-{
-  auto const idx = static_cast<cudf::size_type>(cudf::detail::grid_1d::global_thread_id());
-  if (idx >= chars_size) { return; }
-  if (!cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) {
-    d_offsets[idx] = 0;
-    return;
-  }
-
-  auto const itr  = d_chars + idx;
-  auto const end  = d_chars + chars_size;
-  auto const lhs  = cudf::string_view(itr, cudf::strings::detail::bytes_in_utf8_byte(*itr));
-  auto const next = itr + lhs.size_bytes();
-  auto output     = 0;
-  if (next < end) {
-    auto const rhs = cudf::string_view(next, cudf::strings::detail::bytes_in_utf8_byte(*next));
-    // see if both halves exist anywhere in the table, if not these are unpairable
-    if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) {
-      output = idx + lhs.size_bytes() + offset;  // offset for artificial boundary
+struct bpe_unpairable_offsets_fn {
+  cudf::device_span<char const> d_chars;
+  cudf::size_type offset;
+  MapRefType const d_map;
+  __device__ cudf::size_type operator()(cudf::size_type idx)
+  {
+    if (!cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { return 0; }
+
+    auto const itr  = d_chars.data() + idx;
+    auto const end  = d_chars.end();  // + chars_size;
+    auto const lhs  = cudf::string_view(itr, cudf::strings::detail::bytes_in_utf8_byte(*itr));
+    auto const next = itr + lhs.size_bytes();
+    auto output     = 0;
+    if (next < end) {
+      auto const rhs = cudf::string_view(next, cudf::strings::detail::bytes_in_utf8_byte(*next));
+      // see if both halves exist anywhere in the table, if not these are unpairable
+      if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) {
+        output = idx + lhs.size_bytes() + offset;  // offset for artificial boundary
+      }
     }
+    return output;
+
+    // Alternate solution that only checks one substring.
+    // No noticeable performance improvement.
+    // auto const lhs = [begin = itr, end] {
+    //   auto next = begin + (begin < end);
+    //   while (next < end && !cudf::strings::detail::is_begin_utf8_char(*next)) {
+    //     ++next;
+    //   }
+    //   return cudf::string_view(begin, static_cast<cudf::size_type>(thrust::distance(begin,
+    //   next)));
+    // }();
+    // d_offsets[idx] = (((itr + lhs.size_bytes()) < end) && (d_map.find(lhs) == d_map.end()))
+    //                    ? idx + lhs.size_bytes() + offset  // offset for artificial boundary
+    //                    : 0;
   }
-  d_offsets[idx] = output;
-
-  // Alternate solution that only checks one substring.
-  // No noticeable performance improvement.
-  // auto const lhs = [begin = itr, end] {
-  //   auto next = begin + (begin < end);
-  //   while (next < end && !cudf::strings::detail::is_begin_utf8_char(*next)) {
-  //     ++next;
-  //   }
-  //   return cudf::string_view(begin, static_cast<cudf::size_type>(thrust::distance(begin, next)));
-  // }();
-  // d_offsets[idx] = (((itr + lhs.size_bytes()) < end) && (d_map.find(lhs) == d_map.end()))
-  //                    ? idx + lhs.size_bytes() + offset  // offset for artificial boundary
-  //                    : 0;
-}
+};
 
 /**
  * @brief Performs byte-pair-encoding
@@ -113,7 +106,7 @@ __global__ void bpe_unpairable_offsets_fn(char const* d_chars,  // use device_sp
  * This is launched as a string per block.
  *
  * The process first initializes all characters to 1 per position in `d_spaces_data`.
- * All pairs are realized and their ranks stored in `d_ranks`.
+ * All pairs are realized and their ranks stored in `d_ranks_data`.
  *
  * Iteratively, the minimum rank is located, the corresponding `d_spaces_data` location
  * is set to 0 resulting in new potential pairs. The process repeats accounting for
@@ -126,15 +119,15 @@ __global__ void bpe_unpairable_offsets_fn(char const* d_chars,  // use device_sp
  * @param d_strings Input data
  * @param d_map For looking up individual string candidates
  * @param d_spaces_data Output the location where separator will be inserted
- * @param d_ranks Working memory to hold pair ranks
+ * @param d_ranks_data Working memory to hold pair ranks
  * @param d_rerank_data Working memory to hold locations where reranking is required
  */
 template <typename MapRefType>
 __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
                                 MapRefType const d_map,
-                                int8_t* d_spaces_data,     // working memory
-                                cudf::size_type* d_ranks,  // more working memory
-                                int8_t* d_rerank_data      // and one more working memory
+                                int8_t* d_spaces_data,          // working memory
+                                cudf::size_type* d_ranks_data,  // more working memory
+                                int8_t* d_rerank_data           // and one more working memory
 )
 {
   // string per block
@@ -147,12 +140,12 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
     d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
   auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
 
-  auto const d_spaces    = d_spaces_data + offset;
-  auto const end_spaces  = d_spaces + d_str.size_bytes();
-  auto const d_min_ranks = d_ranks + offset;
-  auto const end_ranks   = d_min_ranks + d_str.size_bytes();
-  auto const d_rerank    = d_rerank_data + offset;
-  auto const end_rerank  = d_rerank + d_str.size_bytes();
+  auto const d_spaces   = d_spaces_data + offset;
+  auto const end_spaces = d_spaces + d_str.size_bytes();
+  auto const d_ranks    = d_ranks_data + offset;
+  auto const end_ranks  = d_ranks + d_str.size_bytes();
+  auto const d_rerank   = d_rerank_data + offset;
+  auto const end_rerank = d_rerank + d_str.size_bytes();
 
   auto constexpr max_rank = cuda::std::numeric_limits<cudf::size_type>::max();
 
@@ -166,7 +159,7 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
     *itr = 0;
   }
   // init all ranks to max
-  for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
+  for (auto itr = d_ranks + lane_idx; itr < end_ranks; itr += block_size) {
     *itr = max_rank;
   }
   // init all spaces to 1 as appropriate
@@ -203,9 +196,9 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
       if (!rhs.empty()) {
         auto rank          = max_rank;
         auto const mp      = merge_pair_type{lhs, rhs};
-        auto const map_itr = d_map.find(mp);                       // lookup pair in merges table;
-        if (map_itr != d_map.end()) { rank = map_itr->second; }    // found a match;
-        d_min_ranks[thrust::distance(d_spaces, next_itr)] = rank;  // store the rank
+        auto const map_itr = d_map.find(mp);                     // lookup pair in merges table;
+        if (map_itr != d_map.end()) { rank = map_itr->second; }  // found a match;
+        d_ranks[thrust::distance(d_spaces, next_itr)] = rank;    // store the rank
         if (rank < min_rank) min_rank = rank;
       }
     }
@@ -217,22 +210,22 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
 
   // loop through the ranks processing the current minimum until there are no more
   while (block_min_rank < max_rank) {
-    // search the d_min_ranks for matches to block_min_rank
-    for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
+    // search the d_ranks for matches to block_min_rank
+    for (auto itr = d_ranks + lane_idx; itr < end_ranks; itr += block_size) {
       if (*itr == block_min_rank) {
         auto ptr = itr - 1;  // check for adjacent min-rank (edge-case)
-        while (ptr > d_min_ranks && *ptr == max_rank) {
+        while (ptr > d_ranks && *ptr == max_rank) {
           --ptr;
         }
         // set the output value to 0 at this position (erases separator, merges pair)
-        if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_min_ranks, itr)] = 0; }
+        if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_ranks, itr)] = 0; }
       }
     }
     __syncthreads();
 
     // identify all the re-rank locations (logic above invalidated adjacent pairs)
-    for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) {
-      auto const index = thrust::distance(d_min_ranks, itr);
+    for (auto itr = d_ranks + lane_idx; itr < end_ranks; itr += block_size) {
+      auto const index = thrust::distance(d_ranks, itr);
       if (*itr == block_min_rank && d_spaces[index] == 0) {
         // find previous pair mid-point
         auto ptr = find_prev(d_spaces + index - 1);
@@ -249,7 +242,7 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
     min_rank = max_rank;  // and record the new minimum along the way
     for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) {
       auto const index = thrust::distance(d_rerank, itr);
-      auto rank        = d_min_ranks[index];
+      auto rank        = d_ranks[index];
       if (*itr) {
         *itr = 0;  // reset re-rank
         // build lhs of pair
@@ -263,7 +256,7 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings,
           auto const map_itr = d_map.find(mp);                     // lookup rank for this pair;
           if (map_itr != d_map.end()) { rank = map_itr->second; }  // found a match
         }
-        d_min_ranks[index] = rank;  // store new rank
+        d_ranks[index] = rank;  // store new rank
       }
       if (rank < min_rank) { min_rank = rank; }
     }
@@ -363,18 +356,20 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
   auto d_offsets = offsets->mutable_view().data<cudf::size_type>();
 
   rmm::device_uvector<int8_t> d_spaces(chars_size, stream);  // identifies non-merged pairs
-  rmm::device_uvector<int8_t> d_rerank(chars_size, stream);  // re-ranking identifiers
   // used for various purposes below: unpairable-offsets, pair ranks, separator insert positions
   rmm::device_uvector<cudf::size_type> d_working(chars_size, stream);
 
+  auto const chars_begin = thrust::counting_iterator<cudf::size_type>(0);
+  auto const chars_end   = thrust::counting_iterator<cudf::size_type>(chars_size);
+
   {
     // this kernel locates unpairable sections of strings to create artificial string row
     // boundaries; the boundary values are recorded as offsets in d_up_offsets
     auto const d_up_offsets = d_working.data();  // store unpairable offsets here
     auto const mp_map       = merge_pairs.impl->get_mp_table_ref();  // lookup table
-    cudf::detail::grid_1d grid(chars_size, block_size);
-    bpe_unpairable_offsets_fn<decltype(mp_map)><<<grid.num_blocks, block_size, 0, stream.value()>>>(
-      d_input_chars, chars_size, first_offset, mp_map, d_up_offsets);
+    auto const d_chars_span = cudf::device_span<char const>(d_input_chars, chars_size);
+    auto up_fn = bpe_unpairable_offsets_fn<decltype(mp_map)>{d_chars_span, first_offset, mp_map};
+    thrust::transform(rmm::exec_policy(stream), chars_begin, chars_end, d_up_offsets, up_fn);
     auto const up_end =  // remove all but the unpairable offsets
       thrust::remove(rmm::exec_policy(stream), d_up_offsets, d_up_offsets + chars_size, 0);
     auto const unpairables = thrust::distance(d_up_offsets, up_end);  // number of unpairables
@@ -401,14 +396,16 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
     auto const tmp_input = cudf::column_view(
       input.parent().type(), tmp_size, nullptr, nullptr, 0, 0, {col_offsets, input.chars()});
     auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream);
+
     // launch the byte-pair-encoding kernel on the temp column
-    auto const d_ranks  = d_working.data();  // store pair ranks here
+    rmm::device_uvector<int8_t> d_rerank(chars_size, stream);  // more working memory;
+    auto const d_ranks  = d_working.data();                    // store pair ranks here
     auto const pair_map = merge_pairs.impl->get_merge_pairs_ref();
     bpe_parallel_fn<decltype(pair_map)><<<tmp_size, block_size, 0, stream.value()>>>(
       *d_tmp_strings, pair_map, d_spaces.data(), d_ranks, d_rerank.data());
   }
 
-  // compute the output sizes into the output d_offsets vector
+  // compute the output sizes and store them in the d_offsets vector
   bpe_finalize<<<input.size(), block_size, 0, stream.value()>>>(
     *d_strings, d_spaces.data(), d_offsets);
 
@@ -427,9 +424,7 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
   auto offsets_at_non_zero = [d_spaces = d_spaces.data()] __device__(auto idx) {
     return d_spaces[idx] > 0;  // separator to be inserted here
   };
-  auto const chars_begin = thrust::counting_iterator<cudf::size_type>(0);
-  auto const chars_end   = thrust::counting_iterator<cudf::size_type>(chars_size);
-  auto const copy_end    = thrust::copy_if(
+  auto const copy_end = thrust::copy_if(
     rmm::exec_policy(stream), chars_begin + 1, chars_end, d_inserts, offsets_at_non_zero);
 
   // this will insert the single-byte separator into positions specified in d_inserts

From f24ce0997e72a385d2821c30dba7966726127095 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 6 Nov 2023 11:04:51 -0500
Subject: [PATCH 27/28] add pytest for BytePairEncoder

---
 ...{bpe_tokenize.pxd => byte_pair_encode.pxd} |  2 +-
 python/cudf/cudf/_lib/nvtext/CMakeLists.txt   |  2 +-
 ...{bpe_tokenize.pyx => byte_pair_encode.pyx} |  2 +-
 python/cudf/cudf/core/byte_pair_encoding.py   |  2 +-
 .../cudf/cudf/tests/text/test_text_methods.py | 41 +++++++++++++++++++
 5 files changed, 45 insertions(+), 4 deletions(-)
 rename python/cudf/cudf/_lib/cpp/nvtext/{bpe_tokenize.pxd => byte_pair_encode.pxd} (89%)
 rename python/cudf/cudf/_lib/nvtext/{bpe_tokenize.pyx => byte_pair_encode.pyx} (96%)

diff --git a/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
similarity index 89%
rename from python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd
rename to python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
index 45a8574d805..e678e4e84db 100644
--- a/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd
+++ b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
@@ -8,7 +8,7 @@ from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.scalar.scalar cimport string_scalar
 
 
-cdef extern from "nvtext/bpe_tokenize.hpp" namespace "nvtext" nogil:
+cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil:
 
     cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs":
         pass
diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
index 6d4a36b0428..d7cbdeb5bda 100644
--- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
@@ -13,7 +13,7 @@
 # =============================================================================
 
 set(cython_sources
-    bpe_tokenize.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
+    byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
     ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
 )
 set(linked_libraries cudf::cudf)
diff --git a/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
similarity index 96%
rename from python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx
rename to python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
index cad33060eaf..63a653fce4e 100644
--- a/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
@@ -9,7 +9,7 @@ from libcpp.utility cimport move
 from cudf._lib.column cimport Column
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.nvtext.bpe_tokenize cimport (
+from cudf._lib.cpp.nvtext.byte_pair_encode cimport (
     bpe_merge_pairs as cpp_bpe_merge_pairs,
     byte_pair_encoding as cpp_byte_pair_encoding,
     load_merge_pairs as cpp_load_merge_pairs,
diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
index a1674a099c5..737a31b965e 100644
--- a/python/cudf/cudf/core/byte_pair_encoding.py
+++ b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import cudf
-from cudf._lib.nvtext.bpe_tokenize import (
+from cudf._lib.nvtext.byte_pair_encode import (
     BPE_Merge_Pairs as cpp_merge_pairs,
     byte_pair_encoding as cpp_byte_pair_encoding,
 )
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index 2241390a531..ba2a7db6578 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -7,6 +7,7 @@
 import pytest
 
 import cudf
+from cudf.core.byte_pair_encoding import BytePairEncoder
 from cudf.core.tokenize_vocabulary import TokenizeVocabulary
 from cudf.testing._utils import assert_eq
 
@@ -1030,3 +1031,43 @@ def test_jaccard_index_random_strings():
 
     actual = str1.str.jaccard_index(str2, jaccard_width)
     assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "separator, input, results",
+    [
+        (" ", "thetestsentence", "the test sent ence"),
+        ("_", "sentenceistest", "sent_ence_is_test"),
+        ("$", "istestsentencehere", "is$test$sent$ence$he$r$e"),
+    ],
+)
+def test_byte_pair_encoding(separator, input, results):
+    pairs_table = cudf.Series(
+        [
+            "t he",
+            "h e",
+            "e n",
+            "i t",
+            "i s",
+            "e s",
+            "en t",
+            "c e",
+            "es t",
+            "en ce",
+            "t h",
+            "h i",
+            "th is",
+            "t est",
+            "s i",
+            "s ent",
+        ]
+    )
+    encoder = BytePairEncoder(pairs_table)
+
+    strings = cudf.Series([input, None, "", input])
+
+    expected = cudf.Series([results, None, "", results])
+
+    actual = encoder(strings, separator)
+    assert type(expected) == type(actual)
+    assert_eq(expected, actual)

From 335beb1bd1801ed2eb1d5abdcce73bb3e6f4fe36 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 13 Nov 2023 14:26:22 -0500
Subject: [PATCH 28/28] change BPE_Merge_Pairs to BPEMergePairs

---
 python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx | 4 ++--
 python/cudf/cudf/core/byte_pair_encoding.py       | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
index 63a653fce4e..cfc76afa8a5 100644
--- a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
+++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
@@ -18,7 +18,7 @@ from cudf._lib.cpp.scalar.scalar cimport string_scalar
 from cudf._lib.scalar cimport DeviceScalar
 
 
-cdef class BPE_Merge_Pairs:
+cdef class BPEMergePairs:
     cdef unique_ptr[cpp_bpe_merge_pairs] c_obj
 
     def __cinit__(self, Column merge_pairs):
@@ -30,7 +30,7 @@ cdef class BPE_Merge_Pairs:
 @acquire_spill_lock()
 def byte_pair_encoding(
     Column strings,
-    BPE_Merge_Pairs merge_pairs,
+    BPEMergePairs merge_pairs,
     object separator
 ):
     cdef column_view c_strings = strings.view()
diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
index 737a31b965e..4c881022ecf 100644
--- a/python/cudf/cudf/core/byte_pair_encoding.py
+++ b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -4,13 +4,15 @@
 
 import cudf
 from cudf._lib.nvtext.byte_pair_encode import (
-    BPE_Merge_Pairs as cpp_merge_pairs,
+    BPEMergePairs as cpp_merge_pairs,
     byte_pair_encoding as cpp_byte_pair_encoding,
 )
 
 
 class BytePairEncoder:
     """
+    Given a merge pairs strings series, performs byte pair encoding on
+    a strings series using the provided separator.
 
     Parameters
     ----------