Move and rename byte-pair-encoding source files (#14284)

Moves and renames the byte-pair-encoding source files. The source files are moved from `text/subword` to `text/bpe`, and `tokenize` has been removed from the filenames since these functions only do encoding. No function names have been changed. The `nvtext::load_merge_pairs_file` API has been deprecated. Callers must load the pairs into a strings column (using the CSV or text readers in cuio) and call the new `nvtext::load_merge_pairs` API instead. Follow-on PRs will address function and performance issues. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: #14284
rapidsai · Oct 26, 2023 · d8f0790 · d8f0790
1 parent a2abdb1
commit d8f0790
Show file tree

Hide file tree

Showing 6 changed files with 50 additions and 17 deletions.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -614,10 +614,10 @@ add_library(
   src/text/normalize.cu
   src/text/replace.cu
   src/text/stemmer.cu
-  src/text/subword/bpe_tokenizer.cu
+  src/text/bpe/byte_pair_encoding.cu
+  src/text/bpe/load_merge_pairs.cu
   src/text/subword/data_normalizer.cu
   src/text/subword/load_hash_file.cu
-  src/text/subword/load_merges_file.cu
   src/text/subword/subword_tokenize.cu
   src/text/subword/wordpiece_tokenizer.cu
   src/text/tokenize.cu

diff --git a/cpp/include/nvtext/bpe_tokenize.hpp → cpp/include/nvtext/byte_pair_encoding.hpp b/cpp/include/nvtext/bpe_tokenize.hpp → cpp/include/nvtext/byte_pair_encoding.hpp
@@ -32,7 +32,7 @@ namespace nvtext {
 /**
  * @brief The table of merge pairs for the BPE encoder.
  *
- * To create an instance, call @ref nvtext::load_merge_pairs_file
+ * To create an instance, call @ref nvtext::load_merge_pairs
  */
 struct bpe_merge_pairs {
   struct bpe_merge_pairs_impl;
@@ -66,6 +66,8 @@ struct bpe_merge_pairs {
 /**
  * @brief Create a nvtext::bpe_merge_pairs from an input file.
  *
+ * @deprecated Since 23.12
+ *
  * The file should contain a pair of strings per line separated by
  * a single space.
  *
@@ -94,10 +96,40 @@ struct bpe_merge_pairs {
  * @param mr Memory resource to allocate any returned objects.
  * @return A nvtext::bpe_merge_pairs object
  */
-std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
+[[deprecated]] std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
   std::string const& filename_merges,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Create a nvtext::bpe_merge_pairs from a strings column
+ *
+ * The input column should contain a unique pair of strings per row separated by
+ * a single space. An incorrect format or non-unique entries will result in
+ * undefined behavior.
+ *
+ * Example:
+ * @code{.pseudo}
+ * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"]
+ * mps = load_merge_pairs(merge_pairs)
+ * // the mps object can be passed to the byte_pair_encoding API
+ * @endcode
+ *
+ * The pairs are expected to be ordered in the column by their rank
+ * relative to each other. A pair earlier in the column has priority over
+ * any pairs below it.
+ *
+ * @throw cudf::logic_error if `merge_pairs` is empty or contains nulls
+ *
+ * @param merge_pairs Column containing the unique merge pairs
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Memory resource to allocate any returned objects
+ * @return A nvtext::bpe_merge_pairs object
+ */
+std::unique_ptr<bpe_merge_pairs> load_merge_pairs(
+  cudf::strings_column_view const& merge_pairs,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Byte pair encode the input strings.
  *
@@ -110,7 +142,8 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
  * pairs before the result is joined to make the output string.
  *
  * @code{.pseudo}
- * mps = load_merges_file("merges.txt") // see doxygen for example contents
+ * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"]
+ * mps = load_merge_pairs(merge_pairs)
  * input = ["test sentence", "thisis test"]
  * result = byte_pair_encoding(input, mps)
  * result is now ["test sent ence", "this is test"]
@@ -120,7 +153,7 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
  * @throw cudf::logic_error if `separator` is invalid
  *
  * @param input Strings to encode.
- * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file.
+ * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs.
  * @param separator String used to build the output after encoding.
  *                  Default is a space.
  * @param mr Memory resource to allocate any returned objects.

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu → cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/subword/bpe_tokenizer.cu → cpp/src/text/bpe/byte_pair_encoding.cu
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
-#include <text/subword/bpe_tokenizer.cuh>
+#include <text/bpe/byte_pair_encoding.cuh>
 
-#include <nvtext/bpe_tokenize.hpp>
+#include <nvtext/byte_pair_encoding.hpp>
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>

diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh → cpp/src/text/bpe/byte_pair_encoding.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh → cpp/src/text/bpe/byte_pair_encoding.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <nvtext/bpe_tokenize.hpp>
+#include <nvtext/byte_pair_encoding.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>

diff --git a/cpp/src/text/subword/load_merges_file.cu → cpp/src/text/bpe/load_merge_pairs.cu b/cpp/src/text/subword/load_merges_file.cu → cpp/src/text/bpe/load_merge_pairs.cu
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
-#include <text/subword/bpe_tokenizer.cuh>
+#include <text/bpe/byte_pair_encoding.cuh>
 
-#include <nvtext/bpe_tokenize.hpp>
+#include <nvtext/byte_pair_encoding.hpp>
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>

diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <nvtext/bpe_tokenize.hpp>
+#include <nvtext/byte_pair_encoding.hpp>
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
@@ -24,9 +24,9 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
-struct TextBPETokenize : public cudf::test::BaseFixture {};
+struct TextBytePairEncoding : public cudf::test::BaseFixture {};
 
-TEST_F(TextBPETokenize, BytePairEncoding)
+TEST_F(TextBytePairEncoding, BytePairEncoding)
 {
   // partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt
   auto mpt = cudf::test::strings_column_wrapper({
@@ -74,7 +74,7 @@ TEST_F(TextBPETokenize, BytePairEncoding)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected);
 }
 
-TEST_F(TextBPETokenize, BytePairEncodingSeparator)
+TEST_F(TextBytePairEncoding, BytePairEncodingSeparator)
 {
   auto mpt = cudf::test::strings_column_wrapper(
     {"e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"});
@@ -91,7 +91,7 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 }
 
-TEST_F(TextBPETokenize, BPE_Empty)
+TEST_F(TextBytePairEncoding, BPE_Empty)
 {
   auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"});
   nvtext::bpe_merge_pairs merge_pairs{mpt.release()};
@@ -100,7 +100,7 @@ TEST_F(TextBPETokenize, BPE_Empty)
   EXPECT_EQ(0, results->size());
 }
 
-TEST_F(TextBPETokenize, BPE_Error)
+TEST_F(TextBytePairEncoding, BPE_Error)
 {
   auto empty = cudf::make_empty_column(cudf::type_id::STRING);
   nvtext::bpe_merge_pairs merge_pairs{std::move(empty)};