Handle the punctuation definition mismatch between different Unicode versions.

PiperOrigin-RevId: 707239296
tf-text-github-robot committed Dec 17, 2024
1 parent 31f22e9 commit 316b8c0
Showing 5 changed files with 86 additions and 4 deletions.
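Context for the change: the FastWordpiece model flatbuffer is built offline against one Unicode version, while the binary that loads it classifies punctuation and whitespace with whatever Unicode tables it was compiled against, so the two can disagree about which characters are word boundaries. Below is a deliberately simplified, byte-level sketch of the fixed behavior; every name in it is invented for illustration and none of it is the library's actual API. A separator the runtime recognizes but the model cannot map is simply skipped:

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Invented stand-ins: the runtime's tables treat both '>' and '?' as
// punctuation, but the model, built against an older Unicode version, can
// only map '>' to a token. '?' plays the role of U+1B7F in the new test.
bool RuntimeIsPunct(char c) { return c == '>' || c == '?'; }
bool ModelCanMap(char c) { return c == '>'; }

int main() {
  const std::string input = "abc>abc?abc";
  std::vector<std::string> pieces;
  std::size_t word_start = 0;
  for (std::size_t cur_pos = 0; cur_pos <= input.size(); ++cur_pos) {
    const bool at_end = (cur_pos == input.size());
    if (!at_end && !RuntimeIsPunct(input[cur_pos])) continue;
    // Flush the word collected so far; in the commit this is done by
    // HandleTheRemainingStringOnTriePath.
    if (cur_pos > word_start)
      pieces.push_back(input.substr(word_start, cur_pos - word_start));
    // A separator the model knows becomes its own token; one it cannot map
    // produces nothing and is skipped, which is what the cur_str.empty()
    // check below restores.
    if (!at_end && ModelCanMap(input[cur_pos])) pieces.push_back("<unk>");
    word_start = cur_pos + 1;
  }
  for (const auto& p : pieces) std::cout << p << " ";
  std::cout << "\n";  // prints: abc <unk> abc abc (the 15.1 expectations)
}

The real tokenizer walks UTF-8 code points through a trie with failure links; the sketch only mirrors the boundary-handling contract that the new test pins down.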
2 changes: 2 additions & 0 deletions tensorflow_text/core/kernels/BUILD
@@ -403,6 +403,8 @@ cc_test(
srcs = ["fast_wordpiece_tokenizer_test.cc"],
data = [
"//tensorflow_text:python/ops/test_data/fast_wordpiece_tokenizer_model.fb",
"//tensorflow_text:python/ops/test_data/fast_wordpiece_tokenizer_model_ver_15_1.fb",
"//tensorflow_text:python/ops/test_data/fast_wordpiece_tokenizer_model_ver_16_0.fb",
],
deps = [
":fast_wordpiece_tokenizer",
10 changes: 6 additions & 4 deletions tensorflow_text/core/kernels/fast_wordpiece_tokenizer.cc
@@ -278,14 +278,16 @@ void FastWordpieceTokenizer::TokenizeTextImpl(
             prev_unicode_char))) {
       // If the current Unicode character is a valid word boundary, collect the
       // remaining tokens stored on a path on the trie.
+      absl::string_view cur_str = absl::string_view(
+          input_substr.data(), cur_pos - input_word_offset_in_text);
       HandleTheRemainingStringOnTriePath<kGetPieces, kGetIds, kGetOffsets>(
-          absl::string_view(input_substr.data(),
-                            cur_pos - input_word_offset_in_text),
-          input_word_offset_in_text, cur_node, original_num_tokens,
+          cur_str, input_word_offset_in_text, cur_node, original_num_tokens,
           cur_offset_in_input_word, output_pieces, output_ids,
           output_start_offsets, output_end_offsets);
       // Skip the whitespace.
-      if (is_white_space) cur_pos = next_pos;
+      // If the remaining tokens are empty, it means we encountered an
+      // unmappable separator, so skip to the next token.
+      if (is_white_space || cur_str.empty()) cur_pos = next_pos;
       // Continue in the outer while loop to process the remaining input.
       continue;
     }
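Reading the hunk above in isolation: the branch ends in continue, so any path through it that does not advance cur_pos re-evaluates the same position forever. A minimal mock of that control flow (invented names, bytes instead of UTF-8 code points, an iteration guard instead of a real hang) shows why the cur_str.empty() clause is needed once a separator can be a runtime word boundary yet emit no token:

#include <cstddef>
#include <iostream>
#include <string>

int main() {
  const std::string input = "?abc";  // unmappable separator at word start
  const bool kWithFix = true;        // set to false: the loop stalls on '?'
  std::size_t cur_pos = 0, word_start = 0, guard = 0;
  while (cur_pos < input.size() && ++guard < 32) {
    const std::size_t next_pos = cur_pos + 1;  // real code: next code point
    const bool is_white_space = false;         // none in this input
    if (input[cur_pos] == '?') {               // runtime calls it punctuation
      // cur_str is empty: the separator sits at the start of the current
      // word, and the older-Unicode model emitted nothing for it.
      const std::string cur_str =
          input.substr(word_start, cur_pos - word_start);
      if (is_white_space || (kWithFix && cur_str.empty())) cur_pos = next_pos;
      continue;  // without the fix, cur_pos never moves past '?'
    }
    ++cur_pos;
  }
  std::cout << (guard < 32 ? "terminated" : "stalled") << "\n";
}

This is a reading of the shown hunk only, not a claim about the exact pre-fix failure mode in production; code outside the hunk may have had other safeguards.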
78 changes: 78 additions & 0 deletions tensorflow_text/core/kernels/fast_wordpiece_tokenizer_test.cc
@@ -58,6 +58,84 @@ TEST(FastWordpieceTokenizerTest, LoadAndTokenize) {
EXPECT_THAT(output_end_offsets, ElementsAre(3, 5, 6, 9));
}

TEST(FastWordpieceTokenizerTest, PunctuationVersionMismatch) {
// The config_flatbuffer used here is built from the following config:
// * vocab = {"a", "abc", "abcdefghi", "##de", "##defgxy", "##deh", "##f",
// "##ghz", "<unk>"}
// * unk_token = "<unk>"
// * suffix_indicator = "##"
// * max_bytes_per_token = 100
// * end_to_end = True

// Built with Unicode version 15.1.
const char kTestConfigUnicode15_1Path[] =
"third_party/tensorflow_text/python/ops/test_data/"
"fast_wordpiece_tokenizer_model_ver_15_1.fb";

// Built with Unicode version 16.0.
const char kTestConfigUnicode16_0Path[] =
"third_party/tensorflow_text/python/ops/test_data/"
"fast_wordpiece_tokenizer_model_ver_16_0.fb";

// We test the new punctuation symbol "\341\255\277" (U+1B7F), which is
// available in Unicode 16.0
// (https://www.fileformat.info/info/unicode/char/1b7f/index.htm) but not
// in 15.1. We also test an existing punctuation symbol ">".
std::string input = "abc>abc\341\255\277abc";
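// Note on encoding: "\341\255\277" is the octal escape for bytes
// 0xE1 0xAD 0xBF, the UTF-8 encoding of U+1B7F, so the input bytes are
// "abc" at [0, 3), ">" at [3, 4), "abc" at [4, 7), U+1B7F at [7, 10), and
// "abc" at [10, 13). The expected byte offsets below follow this layout.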

// Read 15.1 config.
{
std::string config_flatbuffer;
auto status = tensorflow::ReadFileToString(tensorflow::Env::Default(),
kTestConfigUnicode15_1Path,
&config_flatbuffer);
ASSERT_TRUE(status.ok());

ASSERT_OK_AND_ASSIGN(auto tokenizer, FastWordpieceTokenizer::Create(
config_flatbuffer.data()));

std::vector<std::string> output_tokens;
std::vector<int> output_ids;
std::vector<int> output_start_offsets;
std::vector<int> output_end_offsets;
tokenizer.Tokenize(input, &output_tokens, &output_ids,
&output_start_offsets, &output_end_offsets);

// For 15.1, the flatbuffer does not have \341\255\277 as punctuation.
EXPECT_THAT(output_tokens, ElementsAre("abc", "<unk>", "abc", "abc"));
EXPECT_THAT(output_ids, ElementsAre(1, 8, 1, 1));
// Note that the new-version punctuation symbol is ignored.
EXPECT_THAT(output_start_offsets, ElementsAre(0, 3, 4, 10));
EXPECT_THAT(output_end_offsets, ElementsAre(3, 4, 7, 13));
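// The start offsets jump from 7 to 10: the three bytes of U+1B7F yield
// no token at all under the 15.1 model.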
}

// Read 16.0 config.
{
std::string config_flatbuffer;
auto status = tensorflow::ReadFileToString(tensorflow::Env::Default(),
kTestConfigUnicode16_0Path,
&config_flatbuffer);
ASSERT_TRUE(status.ok());

ASSERT_OK_AND_ASSIGN(auto tokenizer, FastWordpieceTokenizer::Create(
config_flatbuffer.data()));

std::vector<std::string> output_tokens;
std::vector<int> output_ids;
std::vector<int> output_start_offsets;
std::vector<int> output_end_offsets;
tokenizer.Tokenize(input, &output_tokens, &output_ids,
&output_start_offsets, &output_end_offsets);

// For 16.0, \341\255\277 is treated as punctuation.
EXPECT_THAT(output_tokens,
ElementsAre("abc", "<unk>", "abc", "<unk>", "abc"));
EXPECT_THAT(output_ids, ElementsAre(1, 8, 1, 8, 1));
EXPECT_THAT(output_start_offsets, ElementsAre(0, 3, 4, 7, 10));
EXPECT_THAT(output_end_offsets, ElementsAre(3, 4, 7, 10, 13));
}
}

template <typename T>
std::string ListToString(const std::vector<T>& list) {
return absl::StrCat("[", absl::StrJoin(list, ", "), "]");
Binary files tensorflow_text/python/ops/test_data/fast_wordpiece_tokenizer_model_ver_15_1.fb and fast_wordpiece_tokenizer_model_ver_16_0.fb added (not shown).
