Skip to content

Commit

Permalink
Move and rename byte-pair-encoding source files (#14284)
Browse files Browse the repository at this point in the history
Moves and renames the byte-pair-encoding source files. The source files are moved from `text/subword` to `text/bpe`, and `tokenize` has been removed from the filenames since these functions only do encoding.
No function names have been changed. The `nvtext::load_merge_pairs_file` API has been deprecated. Callers must load the pairs into a strings column (using the CSV or text readers in cuio) and call the new `nvtext::load_merge_pairs` API instead.

Follow-on PRs will address functionality and performance issues.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #14284
  • Loading branch information
davidwendt authored Oct 26, 2023
1 parent a2abdb1 commit d8f0790
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 17 deletions.
4 changes: 2 additions & 2 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -614,10 +614,10 @@ add_library(
src/text/normalize.cu
src/text/replace.cu
src/text/stemmer.cu
src/text/subword/bpe_tokenizer.cu
src/text/bpe/byte_pair_encoding.cu
src/text/bpe/load_merge_pairs.cu
src/text/subword/data_normalizer.cu
src/text/subword/load_hash_file.cu
src/text/subword/load_merges_file.cu
src/text/subword/subword_tokenize.cu
src/text/subword/wordpiece_tokenizer.cu
src/text/tokenize.cu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ namespace nvtext {
/**
* @brief The table of merge pairs for the BPE encoder.
*
* To create an instance, call @ref nvtext::load_merge_pairs_file
* To create an instance, call @ref nvtext::load_merge_pairs
*/
struct bpe_merge_pairs {
struct bpe_merge_pairs_impl;
Expand Down Expand Up @@ -66,6 +66,8 @@ struct bpe_merge_pairs {
/**
* @brief Create a nvtext::bpe_merge_pairs from an input file.
*
* @deprecated Since 23.12
*
* The file should contain a pair of strings per line separated by
* a single space.
*
Expand Down Expand Up @@ -94,10 +96,40 @@ struct bpe_merge_pairs {
* @param mr Memory resource to allocate any returned objects.
* @return A nvtext::bpe_merge_pairs object
*/
std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
[[deprecated]] std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
std::string const& filename_merges,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a nvtext::bpe_merge_pairs from a strings column
*
* Each row of the input column should contain a unique pair of strings
* separated by a single space. An incorrect format or non-unique entries
* will result in undefined behavior.
*
* Example:
* @code{.pseudo}
* merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"]
* mps = load_merge_pairs(merge_pairs)
* // the mps object can be passed to the byte_pair_encoding API
* @endcode
*
* The pairs are expected to be ordered in the column by their rank
* relative to each other. A pair in an earlier row has priority over
* any pairs in the rows after it.
*
* @throw cudf::logic_error if `merge_pairs` is empty or contains nulls
*
* @param merge_pairs Column containing the unique merge pairs
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Memory resource to allocate any returned objects
* @return A nvtext::bpe_merge_pairs object
*/
std::unique_ptr<bpe_merge_pairs> load_merge_pairs(
cudf::strings_column_view const& merge_pairs,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Byte pair encode the input strings.
*
Expand All @@ -110,7 +142,8 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
* pairs before the result is joined to make the output string.
*
* @code{.pseudo}
* mps = load_merges_file("merges.txt") // see doxygen for example contents
* merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"]
* mps = load_merge_pairs(merge_pairs)
* input = ["test sentence", "thisis test"]
* result = byte_pair_encoding(input, mps)
* result is now ["test sent ence", "this is test"]
Expand All @@ -120,7 +153,7 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
* @throw cudf::logic_error if `separator` is invalid
*
* @param input Strings to encode.
* @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file.
* @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs.
* @param separator String used to build the output after encoding.
* Default is a space.
* @param mr Memory resource to allocate any returned objects.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
* limitations under the License.
*/

#include <text/subword/bpe_tokenizer.cuh>
#include <text/bpe/byte_pair_encoding.cuh>

#include <nvtext/bpe_tokenize.hpp>
#include <nvtext/byte_pair_encoding.hpp>

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#pragma once

#include <nvtext/bpe_tokenize.hpp>
#include <nvtext/byte_pair_encoding.hpp>

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
* limitations under the License.
*/

#include <text/subword/bpe_tokenizer.cuh>
#include <text/bpe/byte_pair_encoding.cuh>

#include <nvtext/bpe_tokenize.hpp>
#include <nvtext/byte_pair_encoding.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/iterator.cuh>
Expand Down
12 changes: 6 additions & 6 deletions cpp/tests/text/bpe_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

#include <nvtext/bpe_tokenize.hpp>
#include <nvtext/byte_pair_encoding.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
Expand All @@ -24,9 +24,9 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/strings/strings_column_view.hpp>

struct TextBPETokenize : public cudf::test::BaseFixture {};
struct TextBytePairEncoding : public cudf::test::BaseFixture {};

TEST_F(TextBPETokenize, BytePairEncoding)
TEST_F(TextBytePairEncoding, BytePairEncoding)
{
// partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt
auto mpt = cudf::test::strings_column_wrapper({
Expand Down Expand Up @@ -74,7 +74,7 @@ TEST_F(TextBPETokenize, BytePairEncoding)
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected);
}

TEST_F(TextBPETokenize, BytePairEncodingSeparator)
TEST_F(TextBytePairEncoding, BytePairEncodingSeparator)
{
auto mpt = cudf::test::strings_column_wrapper(
{"e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"});
Expand All @@ -91,7 +91,7 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator)
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
}

TEST_F(TextBPETokenize, BPE_Empty)
TEST_F(TextBytePairEncoding, BPE_Empty)
{
auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"});
nvtext::bpe_merge_pairs merge_pairs{mpt.release()};
Expand All @@ -100,7 +100,7 @@ TEST_F(TextBPETokenize, BPE_Empty)
EXPECT_EQ(0, results->size());
}

TEST_F(TextBPETokenize, BPE_Error)
TEST_F(TextBytePairEncoding, BPE_Error)
{
auto empty = cudf::make_empty_column(cudf::type_id::STRING);
nvtext::bpe_merge_pairs merge_pairs{std::move(empty)};
Expand Down

0 comments on commit d8f0790

Please sign in to comment.