Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move and rename byte-pair-encoding source files #14284

Merged
merged 9 commits into from
Oct 26, 2023
4 changes: 2 additions & 2 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -614,10 +614,10 @@ add_library(
src/text/normalize.cu
src/text/replace.cu
src/text/stemmer.cu
src/text/subword/bpe_tokenizer.cu
src/text/bpe/byte_pair_encoding.cu
src/text/bpe/load_merge_pairs.cu
src/text/subword/data_normalizer.cu
src/text/subword/load_hash_file.cu
src/text/subword/load_merges_file.cu
src/text/subword/subword_tokenize.cu
src/text/subword/wordpiece_tokenizer.cu
src/text/tokenize.cu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ namespace nvtext {
/**
* @brief The table of merge pairs for the BPE encoder.
*
* To create an instance, call @ref nvtext::load_merge_pairs_file
* To create an instance, call @ref nvtext::load_merge_pairs
*/
struct bpe_merge_pairs {
struct bpe_merge_pairs_impl;
Expand Down Expand Up @@ -66,6 +66,8 @@ struct bpe_merge_pairs {
/**
* @brief Create a nvtext::bpe_merge_pairs from an input file.
*
* @deprecated Since 23.12
*
* The file should contain a pair of strings per line separated by
* a single space.
*
Expand Down Expand Up @@ -94,10 +96,40 @@ struct bpe_merge_pairs {
* @param mr Memory resource to allocate any returned objects.
* @return A nvtext::bpe_merge_pairs object
*/
std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
[[deprecated]] std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
std::string const& filename_merges,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a nvtext::bpe_merge_pairs from a strings column
*
* Each row of the input column should contain a unique pair of strings
* separated by a single space. An incorrect format or non-unique entries
* will result in undefined behavior.
*
* Example:
* @code{.pseudo}
* merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"]
* mps = load_merge_pairs(merge_pairs)
* // the mps object can be passed to the byte_pair_encoding API
* @endcode
*
* The pairs are expected to be ordered in the column by their rank
* relative to each other. A pair in an earlier row has priority over
* any pairs in the rows after it.
*
* @throw cudf::logic_error if `merge_pairs` is empty or contains nulls
*
* @param merge_pairs Column containing the unique merge pairs
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Memory resource to allocate any returned objects
* @return A nvtext::bpe_merge_pairs object
*/
std::unique_ptr<bpe_merge_pairs> load_merge_pairs(
cudf::strings_column_view const& merge_pairs,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Byte pair encode the input strings.
*
Expand All @@ -110,7 +142,8 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
* pairs before the result is joined to make the output string.
*
* @code{.pseudo}
* mps = load_merges_file("merges.txt") // see doxygen for example contents
* merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"]
* mps = load_merge_pairs(merge_pairs)
* input = ["test sentence", "thisis test"]
* result = byte_pair_encoding(input, mps)
* result is now ["test sent ence", "this is test"]
Expand All @@ -120,7 +153,7 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
* @throw cudf::logic_error if `separator` is invalid
*
* @param input Strings to encode.
* @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file.
* @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs.
* @param separator String used to build the output after encoding.
* Default is a space.
* @param mr Memory resource to allocate any returned objects.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
* limitations under the License.
*/

#include <text/subword/bpe_tokenizer.cuh>
#include <text/bpe/byte_pair_encoding.cuh>

#include <nvtext/bpe_tokenize.hpp>
#include <nvtext/byte_pair_encoding.hpp>

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#pragma once

#include <nvtext/bpe_tokenize.hpp>
#include <nvtext/byte_pair_encoding.hpp>

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
* limitations under the License.
*/

#include <text/subword/bpe_tokenizer.cuh>
#include <text/bpe/byte_pair_encoding.cuh>

#include <nvtext/bpe_tokenize.hpp>
#include <nvtext/byte_pair_encoding.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/iterator.cuh>
Expand Down
12 changes: 6 additions & 6 deletions cpp/tests/text/bpe_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

#include <nvtext/bpe_tokenize.hpp>
#include <nvtext/byte_pair_encoding.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
Expand All @@ -24,9 +24,9 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/strings/strings_column_view.hpp>

struct TextBPETokenize : public cudf::test::BaseFixture {};
struct TextBytePairEncoding : public cudf::test::BaseFixture {};

TEST_F(TextBPETokenize, BytePairEncoding)
TEST_F(TextBytePairEncoding, BytePairEncoding)
{
// partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt
auto mpt = cudf::test::strings_column_wrapper({
Expand Down Expand Up @@ -74,7 +74,7 @@ TEST_F(TextBPETokenize, BytePairEncoding)
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected);
}

TEST_F(TextBPETokenize, BytePairEncodingSeparator)
TEST_F(TextBytePairEncoding, BytePairEncodingSeparator)
{
auto mpt = cudf::test::strings_column_wrapper(
{"e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"});
Expand All @@ -91,7 +91,7 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator)
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
}

TEST_F(TextBPETokenize, BPE_Empty)
TEST_F(TextBytePairEncoding, BPE_Empty)
{
auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"});
nvtext::bpe_merge_pairs merge_pairs{mpt.release()};
Expand All @@ -100,7 +100,7 @@ TEST_F(TextBPETokenize, BPE_Empty)
EXPECT_EQ(0, results->size());
}

TEST_F(TextBPETokenize, BPE_Error)
TEST_F(TextBytePairEncoding, BPE_Error)
{
auto empty = cudf::make_empty_column(cudf::type_id::STRING);
nvtext::bpe_merge_pairs merge_pairs{std::move(empty)};
Expand Down