Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move and rename byte-pair-encoding source files #14284

Merged
merged 9 commits into from
Oct 26, 2023
4 changes: 2 additions & 2 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -614,10 +614,10 @@ add_library(
src/text/normalize.cu
src/text/replace.cu
src/text/stemmer.cu
src/text/subword/bpe_tokenizer.cu
src/text/bpe/byte_pair_encoding.cu
src/text/bpe/load_merge_pairs.cu
src/text/subword/data_normalizer.cu
src/text/subword/load_hash_file.cu
src/text/subword/load_merges_file.cu
src/text/subword/subword_tokenize.cu
src/text/subword/wordpiece_tokenizer.cu
src/text/tokenize.cu
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@ namespace nvtext {
/**
* @brief The table of merge pairs for the BPE encoder.
*
* To create an instance, call @ref nvtext::load_merge_pairs_file
* To create an instance, call @ref nvtext::load_merge_pairs
*/
struct bpe_merge_pairs {
struct bpe_merge_pairs_impl;
@@ -66,6 +66,8 @@ struct bpe_merge_pairs {
/**
* @brief Create a nvtext::bpe_merge_pairs from an input file.
*
* @deprecated Since 23.12. Use nvtext::load_merge_pairs instead.
*
* The file should contain a pair of strings per line separated by
* a single space.
*
@@ -94,10 +96,40 @@ struct bpe_merge_pairs {
* @param mr Memory resource to allocate any returned objects.
* @return A nvtext::bpe_merge_pairs object
*/
std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
[[deprecated]] std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
std::string const& filename_merges,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a nvtext::bpe_merge_pairs from a strings column
*
* The input column should contain a unique pair of strings per row separated by
* a single space. An incorrect format or non-unique entries will result in
* undefined behavior.
*
* Example:
* @code{.pseudo}
* merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"]
* mps = load_merge_pairs(merge_pairs)
* // the mps object can be passed to the byte_pair_encoding API
* @endcode
*
* The pairs are expected to be ordered in the column by their rank
* relative to each other. A pair earlier in the column has priority over
* any pairs that follow it.
*
* @throw cudf::logic_error if `merge_pairs` is empty or contains nulls
*
* @param merge_pairs Column containing the unique merge pairs
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Memory resource to allocate any returned objects
* @return A nvtext::bpe_merge_pairs object
*/
std::unique_ptr<bpe_merge_pairs> load_merge_pairs(
cudf::strings_column_view const& merge_pairs,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Byte pair encode the input strings.
*
@@ -110,7 +142,8 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
* pairs before the result is joined to make the output string.
*
* @code{.pseudo}
* mps = load_merges_file("merges.txt") // see doxygen for example contents
* merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"]
* mps = load_merge_pairs(merge_pairs)
* input = ["test sentence", "thisis test"]
* result = byte_pair_encoding(input, mps)
* result is now ["test sent ence", "this is test"]
@@ -120,7 +153,7 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
* @throw cudf::logic_error if `separator` is invalid
*
* @param input Strings to encode.
* @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file.
* @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs.
* @param separator String used to build the output after encoding.
* Default is a space.
* @param mr Memory resource to allocate any returned objects.
Original file line number Diff line number Diff line change
@@ -14,9 +14,9 @@
* limitations under the License.
*/

#include <text/subword/bpe_tokenizer.cuh>
#include <text/bpe/byte_pair_encoding.cuh>

#include <nvtext/bpe_tokenize.hpp>
#include <nvtext/byte_pair_encoding.hpp>

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
Original file line number Diff line number Diff line change
@@ -16,7 +16,7 @@

#pragma once

#include <nvtext/bpe_tokenize.hpp>
#include <nvtext/byte_pair_encoding.hpp>

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
Original file line number Diff line number Diff line change
@@ -14,9 +14,9 @@
* limitations under the License.
*/

#include <text/subword/bpe_tokenizer.cuh>
#include <text/bpe/byte_pair_encoding.cuh>

#include <nvtext/bpe_tokenize.hpp>
#include <nvtext/byte_pair_encoding.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/iterator.cuh>
12 changes: 6 additions & 6 deletions cpp/tests/text/bpe_tests.cpp
Original file line number Diff line number Diff line change
@@ -14,7 +14,7 @@
* limitations under the License.
*/

#include <nvtext/bpe_tokenize.hpp>
#include <nvtext/byte_pair_encoding.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
@@ -24,9 +24,9 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/strings/strings_column_view.hpp>

struct TextBPETokenize : public cudf::test::BaseFixture {};
struct TextBytePairEncoding : public cudf::test::BaseFixture {};

TEST_F(TextBPETokenize, BytePairEncoding)
TEST_F(TextBytePairEncoding, BytePairEncoding)
{
// partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt
auto mpt = cudf::test::strings_column_wrapper({
@@ -74,7 +74,7 @@ TEST_F(TextBPETokenize, BytePairEncoding)
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected);
}

TEST_F(TextBPETokenize, BytePairEncodingSeparator)
TEST_F(TextBytePairEncoding, BytePairEncodingSeparator)
{
auto mpt = cudf::test::strings_column_wrapper(
{"e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"});
@@ -91,7 +91,7 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator)
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
}

TEST_F(TextBPETokenize, BPE_Empty)
TEST_F(TextBytePairEncoding, BPE_Empty)
{
auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"});
nvtext::bpe_merge_pairs merge_pairs{mpt.release()};
@@ -100,7 +100,7 @@ TEST_F(TextBPETokenize, BPE_Empty)
EXPECT_EQ(0, results->size());
}

TEST_F(TextBPETokenize, BPE_Error)
TEST_F(TextBytePairEncoding, BPE_Error)
{
auto empty = cudf::make_empty_column(cudf::type_id::STRING);
nvtext::bpe_merge_pairs merge_pairs{std::move(empty)};