diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 472ee9d9fd4..f7662006cac 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -614,10 +614,10 @@ add_library( src/text/normalize.cu src/text/replace.cu src/text/stemmer.cu - src/text/subword/bpe_tokenizer.cu + src/text/bpe/byte_pair_encoding.cu + src/text/bpe/load_merge_pairs.cu src/text/subword/data_normalizer.cu src/text/subword/load_hash_file.cu - src/text/subword/load_merges_file.cu src/text/subword/subword_tokenize.cu src/text/subword/wordpiece_tokenizer.cu src/text/tokenize.cu diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/byte_pair_encoding.hpp similarity index 73% rename from cpp/include/nvtext/bpe_tokenize.hpp rename to cpp/include/nvtext/byte_pair_encoding.hpp index c67f4bd8b1c..1f4851d7057 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/byte_pair_encoding.hpp @@ -32,7 +32,7 @@ namespace nvtext { /** * @brief The table of merge pairs for the BPE encoder. * - * To create an instance, call @ref nvtext::load_merge_pairs_file + * To create an instance, call @ref nvtext::load_merge_pairs */ struct bpe_merge_pairs { struct bpe_merge_pairs_impl; @@ -66,6 +66,8 @@ struct bpe_merge_pairs { /** * @brief Create a nvtext::bpe_merge_pairs from an input file. * + * @deprecated Since 23.12; use nvtext::load_merge_pairs instead. + * * The file should contain a pair of strings per line separated by * a single space. * @@ -94,10 +96,40 @@ struct bpe_merge_pairs { * @param mr Memory resource to allocate any returned objects. * @return A nvtext::bpe_merge_pairs object */ -std::unique_ptr load_merge_pairs_file( +[[deprecated]] std::unique_ptr load_merge_pairs_file( std::string const& filename_merges, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create a nvtext::bpe_merge_pairs from a strings column + * + * The input column should contain a unique pair of strings per row separated by + * a single space. 
An incorrect format or non-unique entries will result in + * undefined behavior. + * + * Example: + * @code{.pseudo} + * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"] + * mps = load_merge_pairs(merge_pairs) + * // the mps object can be passed to the byte_pair_encoding API + * @endcode + * + * The pairs are expected to be ordered in the column by their rank + * relative to each other. A pair earlier in the column has priority over + * any pairs after it. + * + * @throw cudf::logic_error if `merge_pairs` is empty or contains nulls + * + * @param merge_pairs Column containing the unique merge pairs + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to allocate any returned objects + * @return A nvtext::bpe_merge_pairs object + */ +std::unique_ptr load_merge_pairs( + cudf::strings_column_view const& merge_pairs, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Byte pair encode the input strings. * @@ -110,7 +142,8 @@ std::unique_ptr load_merge_pairs_file( * pairs before the result is joined to make the output string. * * @code{.pseudo} - * mps = load_merges_file("merges.txt") // see doxygen for example contents + * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"] + * mps = load_merge_pairs(merge_pairs) * input = ["test sentence", "thisis test"] * result = byte_pair_encoding(input, mps) * result is now ["test sent ence", "this is test"] @@ -120,7 +153,7 @@ std::unique_ptr load_merge_pairs_file( * @throw cudf::logic_error if `separator` is invalid * * @param input Strings to encode. - * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file. + * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs. * @param separator String used to build the output after encoding. 
* Default is a space. * @param mr Memory resource to allocate any returned objects. diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/bpe/byte_pair_encoding.cu similarity index 99% rename from cpp/src/text/subword/bpe_tokenizer.cu rename to cpp/src/text/bpe/byte_pair_encoding.cu index 13c744ac6bd..42cd9bcbcbe 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -14,9 +14,9 @@ * limitations under the License. */ -#include +#include -#include +#include #include #include diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/bpe/byte_pair_encoding.cuh similarity index 99% rename from cpp/src/text/subword/bpe_tokenizer.cuh rename to cpp/src/text/bpe/byte_pair_encoding.cuh index 2fa879ea734..cefd32e8f60 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/bpe/byte_pair_encoding.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/bpe/load_merge_pairs.cu similarity index 98% rename from cpp/src/text/subword/load_merges_file.cu rename to cpp/src/text/bpe/load_merge_pairs.cu index db6ad2e2dd2..77f0ebba43f 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/bpe/load_merge_pairs.cu @@ -14,9 +14,9 @@ * limitations under the License. */ -#include +#include -#include +#include #include #include diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp index 234d8c4fecc..044c0ab0804 100644 --- a/cpp/tests/text/bpe_tests.cpp +++ b/cpp/tests/text/bpe_tests.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include #include @@ -24,9 +24,9 @@ #include #include -struct TextBPETokenize : public cudf::test::BaseFixture {}; +struct TextBytePairEncoding : public cudf::test::BaseFixture {}; -TEST_F(TextBPETokenize, BytePairEncoding) +TEST_F(TextBytePairEncoding, BytePairEncoding) { // partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt auto mpt = cudf::test::strings_column_wrapper({ @@ -74,7 +74,7 @@ TEST_F(TextBPETokenize, BytePairEncoding) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); } -TEST_F(TextBPETokenize, BytePairEncodingSeparator) +TEST_F(TextBytePairEncoding, BytePairEncodingSeparator) { auto mpt = cudf::test::strings_column_wrapper( {"e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"}); @@ -91,7 +91,7 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } -TEST_F(TextBPETokenize, BPE_Empty) +TEST_F(TextBytePairEncoding, BPE_Empty) { auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"}); nvtext::bpe_merge_pairs merge_pairs{mpt.release()}; @@ -100,7 +100,7 @@ TEST_F(TextBPETokenize, BPE_Empty) EXPECT_EQ(0, results->size()); } -TEST_F(TextBPETokenize, BPE_Error) +TEST_F(TextBytePairEncoding, BPE_Error) { auto empty = cudf::make_empty_column(cudf::type_id::STRING); nvtext::bpe_merge_pairs merge_pairs{std::move(empty)};