From c53c47bf70113328f884f89725074d13b24a8702 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 16 Aug 2023 13:48:23 -0400 Subject: [PATCH 01/28] Add BytePairEncoder class to cuDF --- cpp/benchmarks/CMakeLists.txt | 4 +- cpp/benchmarks/text/bpe_tokenize.cpp | 65 +++++++++++++++++++ cpp/include/nvtext/bpe_tokenize.hpp | 17 +---- cpp/src/text/subword/load_merges_file.cu | 10 ++- .../cudf/_lib/cpp/nvtext/bpe_tokenize.pxd | 22 +++++++ python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 4 +- python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx | 39 +++++++++++ python/cudf/cudf/core/byte_pair_encoding.py | 54 +++++++++++++++ 8 files changed, 191 insertions(+), 24 deletions(-) create mode 100644 cpp/benchmarks/text/bpe_tokenize.cpp create mode 100644 python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd create mode 100644 python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx create mode 100644 python/cudf/cudf/core/byte_pair_encoding.py diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 96e24efac8a..210ab415c53 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -276,8 +276,8 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.c ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) ConfigureNVBench( - TEXT_NVBENCH text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp text/normalize.cpp - text/replace.cpp text/tokenize.cpp + TEXT_NVBENCH text/bpe_tokenize.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/text/bpe_tokenize.cpp b/cpp/benchmarks/text/bpe_tokenize.cpp new file mode 100644 index 00000000000..a80e7e5d84d --- /dev/null +++ b/cpp/benchmarks/text/bpe_tokenize.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include +#include + +#include + +static void bench_tokenize(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + cudf::strings_column_view input(column->view()); + + auto mps = nvtext::load_merge_pairs_file("merges.txt"); + auto vocab = nvtext::load_vocabulary_file("hashed_vocab.txt"); + auto seq_len = 64; + auto stride = 48; + auto lower_case = true; + auto truncate = false; + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + auto chars_size = input.chars_size(); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto bpe = nvtext::byte_pair_encoding(input, *mps); + auto result = nvtext::subword_tokenize(input, *vocab, seq_len, stride, lower_case, truncate); + }); +} + +NVBENCH_BENCH(bench_tokenize) + .set_name("bpe_tokenize") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp index b93d93b07c6..b4400f51ce9 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/bpe_tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,7 +36,7 @@ namespace nvtext { */ struct bpe_merge_pairs { struct bpe_merge_pairs_impl; - std::unique_ptr impl{}; ///< Implementation of the BPE merge pairs table. + bpe_merge_pairs_impl* impl{}; ///< Implementation of the BPE merge pairs table. /** * @brief Construct a new bpe merge pairs object @@ -62,18 +62,7 @@ struct bpe_merge_pairs { ~bpe_merge_pairs(); - /** - * @brief Returns the number of merge pairs in the table. - * - * @return The number of merge pairs in the table - */ - cudf::size_type get_size(); - /** - * @brief Returns the number of unique merge pairs in the table. - * - * @return The number of unique merge pairs in the table - */ - std::size_t get_map_size(); + bpe_merge_pairs(); }; /** diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index b39413af98f..1dbde21e113 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -171,20 +171,18 @@ bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl( bpe_merge_pairs::bpe_merge_pairs(std::unique_ptr&& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource*) - : impl(detail::create_bpe_merge_pairs_impl(std::move(input), stream)) + : impl(detail::create_bpe_merge_pairs_impl(std::move(input), stream).release()) { } bpe_merge_pairs::bpe_merge_pairs(cudf::strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : impl(detail::create_bpe_merge_pairs_impl(input, stream, mr)) + : impl(detail::create_bpe_merge_pairs_impl(input, stream, mr).release()) { } -bpe_merge_pairs::~bpe_merge_pairs() = default; - -cudf::size_type bpe_merge_pairs::get_size() { return impl->merge_pairs->size(); } -std::size_t bpe_merge_pairs::get_map_size() { return impl->merge_pairs_map->get_size(); } +bpe_merge_pairs::bpe_merge_pairs() = default; +bpe_merge_pairs::~bpe_merge_pairs() { delete impl; } } // namespace nvtext diff --git a/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd b/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd new file mode 100644 index 00000000000..9dc1a0ad9c0 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd @@ -0,0 +1,22 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view + + +cdef extern from "nvtext/bpe_tokenize.hpp" namespace "nvtext" nogil: + + cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs": + pass + + cdef unique_ptr[bpe_merge_pairs] load_merge_pairs_file( + const string &filename_merges + ) except + + + cdef unique_ptr[column] byte_pair_encoding( + const column_view &strings, + const bpe_merge_pairs &merge_pairs + ) except + diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt index 515b9c1d6e4..86d5bc22858 100644 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt @@ -13,8 +13,8 @@ # ============================================================================= set(cython_sources - edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx - replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx + bpe_tokenize.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx + ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx ) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx new file mode 100644 index 00000000000..00b6779f144 --- /dev/null +++ b/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx @@ -0,0 +1,39 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + + +from cudf.core.buffer import acquire_spill_lock + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.nvtext.bpe_tokenize cimport ( + bpe_merge_pairs as cpp_bpe_merge_pairs, + byte_pair_encoding as cpp_byte_pair_encoding, + load_merge_pairs_file as cpp_load_merge_pairs_file, +) + + +cdef class BPE_Merge_Pairs: + cdef unique_ptr[cpp_bpe_merge_pairs] c_obj + + def __cinit__(self, merges_file): + cdef string c_merges_file = str(merges_file).encode() + with nogil: + self.c_obj = move(cpp_load_merge_pairs_file(c_merges_file)) + + +@acquire_spill_lock() +def byte_pair_encoding( + Column strings, + BPE_Merge_Pairs merge_pairs +): + cdef column_view c_strings = strings.view() + cdef unique_ptr[column] c_result + with nogil: + c_result = move(cpp_byte_pair_encoding(c_strings, merge_pairs.c_obj.get()[0])) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py new file mode 100644 index 00000000000..a254f5ac621 --- /dev/null +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -0,0 +1,54 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from __future__ import annotations + +import cudf +from cudf._lib.nvtext.bpe_tokenize import ( + BPE_Merge_Pairs as cpp_merge_pairs, + byte_pair_encoding as cpp_byte_pair_encoding, +) + + +class BytePairEncoder: + """ + + Parameters + ---------- + merges_file : str + Path to file containing merge pairs. + + Returns + ------- + BytePairEncoder + """ + + def __init__(self, merges_file: str): + self.merge_pairs = cpp_merge_pairs(merges_file) + + def __call__(self, text): + """ + + Parameters + ---------- + text : cudf string series + The strings to be encoded. + + Returns + ------- + Encoded strings + + Examples + -------- + >>> import cudf + >>> from cudf.core.byte_pair_encoding import BytePairEncoder + >>> bpe = BytePairEncoder('merges.txt') + >>> str_series = cudf.Series(['This is the sentence', 'thisisit']) + >>> bpe(str_series) + 0 This is a sent ence + 1 this is it + dtype: object + """ + + result = cpp_byte_pair_encoding(text._column, self.merge_pairs) + + return cudf.Series(result) From 370086fe62a062c96de272de7590d77c3a77b4b0 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 16 Aug 2023 16:56:40 -0400 Subject: [PATCH 02/28] nvbench experiment --- cpp/benchmarks/text/bpe_tokenize.cpp | 29 +++++++++++++--------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/cpp/benchmarks/text/bpe_tokenize.cpp b/cpp/benchmarks/text/bpe_tokenize.cpp index a80e7e5d84d..6de87eeaae1 100644 --- a/cpp/benchmarks/text/bpe_tokenize.cpp +++ b/cpp/benchmarks/text/bpe_tokenize.cpp @@ -17,7 +17,9 @@ #include #include +#include #include +#include #include #include @@ -25,20 +27,18 @@ #include -static void bench_tokenize(nvbench::state& state) +static cudf::io::table_with_metadata read_csv(std::string const& file_path) { - auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto source_info = cudf::io::source_info(file_path); + auto builder = cudf::io::csv_reader_options::builder(source_info); + auto options = builder.build(); + return cudf::io::read_csv(options); +} - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); - auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); - cudf::strings_column_view input(column->view()); +static void bench_tokenize(nvbench::state& state) +{ + auto csv_metadata = read_csv("input_strings.csv"); + cudf::strings_column_view input(csv_metadata.tbl->view().column(0)); auto mps = nvtext::load_merge_pairs_file("merges.txt"); auto vocab = nvtext::load_vocabulary_file("hashed_vocab.txt"); @@ -59,7 +59,4 @@ static void bench_tokenize(nvbench::state& state) }); } -NVBENCH_BENCH(bench_tokenize) - .set_name("bpe_tokenize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); +NVBENCH_BENCH(bench_tokenize).set_name("bpe_tokenize"); From c1673691824aa9ece5fc87737299985cc9cd34e4 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 17 Aug 2023 15:10:22 -0400 Subject: [PATCH 03/28] add separator parameter --- .../cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd | 4 +++- python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx | 16 ++++++++++++++-- python/cudf/cudf/core/byte_pair_encoding.py | 6 +++--- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd b/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd index 9dc1a0ad9c0..add6d62e85b 100644 --- a/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd +++ b/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd @@ -5,6 +5,7 @@ from libcpp.string cimport string from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.scalar.scalar cimport string_scalar cdef extern from "nvtext/bpe_tokenize.hpp" namespace "nvtext" nogil: @@ -18,5 +19,6 @@ cdef extern from "nvtext/bpe_tokenize.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] byte_pair_encoding( const column_view &strings, - const bpe_merge_pairs &merge_pairs + const bpe_merge_pairs &merge_pairs, + const string_scalar &separator ) except + diff --git a/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx index 00b6779f144..9ea7fa2c51e 100644 --- a/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx @@ -15,6 +15,8 @@ from cudf._lib.cpp.nvtext.bpe_tokenize cimport ( byte_pair_encoding as cpp_byte_pair_encoding, load_merge_pairs_file as cpp_load_merge_pairs_file, ) +from cudf._lib.cpp.scalar.scalar cimport string_scalar +from cudf._lib.scalar cimport DeviceScalar cdef class BPE_Merge_Pairs: @@ -29,11 +31,21 @@ cdef class BPE_Merge_Pairs: @acquire_spill_lock() def byte_pair_encoding( Column strings, - BPE_Merge_Pairs merge_pairs + BPE_Merge_Pairs merge_pairs, + object separator ): cdef column_view c_strings = strings.view() + cdef DeviceScalar d_separator = separator.device_value + cdef const string_scalar* c_separator = d_separator\ + .get_raw_ptr() cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_byte_pair_encoding(c_strings, merge_pairs.c_obj.get()[0])) + c_result = move( + cpp_byte_pair_encoding( + c_strings, + merge_pairs.c_obj.get()[0], + c_separator[0] + ) + ) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index a254f5ac621..db220ea6da4 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -25,7 +25,7 @@ class BytePairEncoder: def __init__(self, merges_file: str): self.merge_pairs = cpp_merge_pairs(merges_file) - def __call__(self, text): + def __call__(self, text, separator: str = " "): """ Parameters @@ -48,7 +48,7 @@ def __call__(self, text): 1 this is it dtype: object """ - - result = cpp_byte_pair_encoding(text._column, self.merge_pairs) + sep = cudf.Scalar(separator, dtype="str") + result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) return cudf.Series(result) From c0f04472ad186af318c067472e6c1c5ce6bfa0f6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 23 Aug 2023 14:48:56 -0400 Subject: [PATCH 04/28] deprecate loading merge-pairs from a file --- cpp/benchmarks/text/bpe_tokenize.cpp | 69 ++++++++++++----- cpp/include/nvtext/bpe_tokenize.hpp | 74 ++++++++++++++++++- cpp/src/text/subword/bpe_tokenizer.cu | 10 +++ cpp/src/text/subword/load_merges_file.cu | 17 +++++ cpp/tests/text/bpe_tests.cpp | 33 ++++++--- .../cudf/_lib/cpp/nvtext/bpe_tokenize.pxd | 4 +- python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx | 9 +-- python/cudf/cudf/core/byte_pair_encoding.py | 13 ++-- 8 files changed, 185 insertions(+), 44 deletions(-) diff --git a/cpp/benchmarks/text/bpe_tokenize.cpp b/cpp/benchmarks/text/bpe_tokenize.cpp index 6de87eeaae1..17f4bda81b3 100644 --- a/cpp/benchmarks/text/bpe_tokenize.cpp +++ b/cpp/benchmarks/text/bpe_tokenize.cpp @@ -17,35 +17,64 @@ #include #include +#include #include +#include #include #include #include +#include + #include #include #include -static cudf::io::table_with_metadata read_csv(std::string const& file_path) +static void bench_bpe(nvbench::state& state) { - auto source_info = cudf::io::source_info(file_path); - auto builder = cudf::io::csv_reader_options::builder(source_info); - auto options = builder.build(); - return cudf::io::read_csv(options); -} + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + if (static_cast(n_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } -static void bench_tokenize(nvbench::state& state) -{ - auto csv_metadata = read_csv("input_strings.csv"); - cudf::strings_column_view input(csv_metadata.tbl->view().column(0)); + auto raw_data = cudf::test::strings_column_wrapper({"test sentence ", + "thisis test ", + "this is sentence ", + "this istest ", + "thisistest ", + "sentence is test ", + "this sentence is test ", + "test test test ", + "this this test this ", + "sentence "}) + .release(); + + if (row_width / 20 > 1) { + std::vector columns; + for (int i = 0; i < row_width / 20; ++i) { + columns.push_back(raw_data->view()); + } + raw_data = cudf::strings::concatenate(cudf::table_view(columns)); + } + auto data_view = raw_data->view(); + + // Create a randomized gather-map to build a column out of the raw strings in data. + data_profile gather_profile = + data_profile_builder().cardinality(0).null_probability(0.0).distribution( + cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1); + auto gather_table = + create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile); + gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); + auto gather_map = gather_table->view().column(0); + auto table_input = cudf::gather(cudf::table_view({data_view}), gather_map); + auto input = cudf::strings_column_view(table_input->view().column(0)); - auto mps = nvtext::load_merge_pairs_file("merges.txt"); - auto vocab = nvtext::load_vocabulary_file("hashed_vocab.txt"); - auto seq_len = 64; - auto stride = 48; - auto lower_case = true; - auto truncate = false; + cudf::test::strings_column_wrapper merge_pairs( + {"e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"}); + auto mps = nvtext::load_merge_pairs(cudf::strings_column_view(merge_pairs)); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -54,9 +83,11 @@ static void bench_tokenize(nvbench::state& state) state.add_global_memory_writes(chars_size); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto bpe = nvtext::byte_pair_encoding(input, *mps); - auto result = nvtext::subword_tokenize(input, *vocab, seq_len, stride, lower_case, truncate); + auto result = nvtext::byte_pair_encoding(input, *mps); }); } -NVBENCH_BENCH(bench_tokenize).set_name("bpe_tokenize"); +NVBENCH_BENCH(bench_bpe) + .set_name("byte_pair_encoding") + .add_int64_axis("row_width", {32, 64, 128, 256, 512}) + .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}); diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp index b4400f51ce9..258ee15ff87 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/bpe_tokenize.hpp @@ -68,6 +68,8 @@ struct bpe_merge_pairs { /** * @brief Create a nvtext::bpe_merge_pairs from an input file. * + * @deprecated Since 23.10 + * * The file should contain a pair of strings per line separated by * a single space. * @@ -96,10 +98,40 @@ struct bpe_merge_pairs { * @param mr Memory resource to allocate any returned objects. * @return A nvtext::bpe_merge_pairs object */ -std::unique_ptr load_merge_pairs_file( +[[deprecated]] std::unique_ptr load_merge_pairs_file( std::string const& filename_merges, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create a nvtext::bpe_merge_pairs from a strings column + * + * The input column should contain a unique pair of strings per line separated by + * a single space. An incorrect format or non-unique entries will result in + * undefined behavior. + * + * Example: + * @code{.pseudo} + * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"] + * mps = load_merge_pairs(merge_pairs) + * // the mps object can be passed to the byte_pair_encoding API + * @endcode + * + * The pairs are expected to be ordered in the file by their rank + * relative to each other. A pair earlier in the file has priority over + * any pairs below it. + * + * @throw cudf::logic_error if `merge_pairs` is empty or contains nulls + * + * @param merge_pairs Column containing the unique merge pairs + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to allocate any returned objects + * @return A nvtext::bpe_merge_pairs object + */ +std::unique_ptr load_merge_pairs( + cudf::strings_column_view const& merge_pairs, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Byte pair encode the input strings. * @@ -134,5 +166,45 @@ std::unique_ptr byte_pair_encoding( cudf::string_scalar const& separator = cudf::string_scalar(" "), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Byte pair encode the input strings + * + * This will split each string on whitespace, perform the encoding, + * and then build the output column using the given `separator`. + * + * The encoding algorithm rebuilds each string by matching substrings + * in the `merge_pairs` column and iteratively removing the minimum ranked pair + * until no pairs are left. Then, a space is inserted between the remaining + * pairs before the result is joined to make the output string. + * + * @code{.pseudo} + * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"] + * input = ["test sentence", "thisis test"] + * result = byte_pair_encoding(input, merge_pairs) + * result is now ["test sent ence", "this is test"] + * @endcode + * + * The `merges_pairs` column should contain a unique pair of strings per line separated by + * a single space. An incorrect format or non-unique entries will result in + * undefined behavior. + * + * @throw cudf::logic_error if `merge_pairs` is empty or contains nulls + * @throw cudf::logic_error if `separator` is invalid + * + * @param input Strings to encode. + * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file. + * @param separator String used to build the output after encoding. + * Default is a space. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to allocate any returned objects. + * @return An encoded column of strings. + */ +std::unique_ptr byte_pair_encoding( + cudf::strings_column_view const& input, + cudf::strings_column_view const& merges_pairs, + cudf::string_scalar const& separator = cudf::string_scalar(" "), + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace nvtext diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 4c4f5b3a4b1..6407bfdf7c5 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -561,4 +561,14 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const return detail::byte_pair_encoding(input, merges_table, separator, cudf::get_default_stream(), mr); } +std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, + cudf::strings_column_view const& merges_table, + cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::byte_pair_encoding(input, merges_table, separator, stream, mr); +} + } // namespace nvtext diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index f382b8ba64d..ece9f9a2967 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -135,6 +135,15 @@ std::unique_ptr load_merge_pairs_file(std::string const& filena return std::make_unique(std::move(input_column), stream, mr); } +std::unique_ptr load_merge_pairs(cudf::strings_column_view const& merge_pairs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!merge_pairs.is_empty(), "Merge pairs must not be empty"); + CUDF_EXPECTS(!merge_pairs.has_nulls(), "Merge pairs may not contain nulls"); + return std::make_unique(merge_pairs, stream, mr); +} + } // namespace detail std::unique_ptr load_merge_pairs_file(std::string const& filename_merges, @@ -144,6 +153,14 @@ std::unique_ptr load_merge_pairs_file(std::string const& filena return detail::load_merge_pairs_file(filename_merges, cudf::get_default_stream(), mr); } +std::unique_ptr load_merge_pairs(cudf::strings_column_view const& merge_pairs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::load_merge_pairs(merge_pairs, stream, mr); +} + bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl( std::unique_ptr&& merge_pairs, std::unique_ptr>&& diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp index 234d8c4fecc..1d9ad12225e 100644 --- a/cpp/tests/text/bpe_tests.cpp +++ b/cpp/tests/text/bpe_tests.cpp @@ -44,7 +44,7 @@ TEST_F(TextBPETokenize, BytePairEncoding) "s ent", // 33830 }); - nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)}; + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); auto validity = cudf::test::iterators::null_at(4); cudf::test::strings_column_wrapper input({" This\tis it\n", @@ -56,7 +56,7 @@ TEST_F(TextBPETokenize, BytePairEncoding) validity); auto sv = cudf::strings_column_view(input); - auto results = nvtext::byte_pair_encoding(sv, merge_pairs); + auto results = nvtext::byte_pair_encoding(sv, *merge_pairs); auto expected = cudf::test::strings_column_wrapper({" This is it", "This is test - sent ence - 1", @@ -66,11 +66,16 @@ TEST_F(TextBPETokenize, BytePairEncoding) ""}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + results = nvtext::byte_pair_encoding(sv, cudf::strings_column_view(mpt)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); auto sliced = cudf::slice(input, {1, 4}).front(); auto sliced_expected = cudf::slice(expected, {1, 4}).front(); + sv = cudf::strings_column_view(sliced); - results = nvtext::byte_pair_encoding(cudf::strings_column_view(sliced), merge_pairs); + results = nvtext::byte_pair_encoding(sv, *merge_pairs); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); + results = nvtext::byte_pair_encoding(sv, cudf::strings_column_view(mpt)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); } @@ -78,13 +83,13 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator) { auto mpt = cudf::test::strings_column_wrapper( {"e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"}); - nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)}; + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); cudf::test::strings_column_wrapper input( {"test-sentence-1", "test sentence-2", "test sentence 3", " test sentence 4 "}); auto sv = cudf::strings_column_view(input); - auto results = nvtext::byte_pair_encoding(sv, merge_pairs, std::string(" Ġ")); + auto results = nvtext::byte_pair_encoding(sv, *merge_pairs, std::string(" Ġ")); auto expected = cudf::test::strings_column_wrapper( {"test - sent ence - 1", "test Ġsent ence - 2", "test Ġsent ence Ġ3", " Ġtest Ġsent ence Ġ4"}); @@ -93,18 +98,22 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator) TEST_F(TextBPETokenize, BPE_Empty) { - auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"}); - nvtext::bpe_merge_pairs merge_pairs{mpt.release()}; - auto empty = cudf::make_empty_column(cudf::type_id::STRING); - auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(empty->view()), merge_pairs); + auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"}); + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); + auto empty = cudf::make_empty_column(cudf::type_id::STRING); + auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(empty->view()), *merge_pairs); EXPECT_EQ(0, results->size()); } TEST_F(TextBPETokenize, BPE_Error) { auto empty = cudf::make_empty_column(cudf::type_id::STRING); - nvtext::bpe_merge_pairs merge_pairs{std::move(empty)}; - cudf::test::strings_column_wrapper input({"isit"}); - EXPECT_THROW(nvtext::byte_pair_encoding(cudf::strings_column_view(input), merge_pairs), + EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(*empty)), cudf::logic_error); + auto null_pairs = cudf::test::strings_column_wrapper({"", ""}, {1, 0}); + EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(null_pairs)), cudf::logic_error); + auto input = cudf::test::strings_column_wrapper({"isit"}); + auto separator = cudf::string_scalar("", false); + EXPECT_THROW(nvtext::byte_pair_encoding( + cudf::strings_column_view(input), cudf::strings_column_view(input), separator), cudf::logic_error); } diff --git a/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd b/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd index add6d62e85b..45a8574d805 100644 --- a/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd +++ b/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd @@ -13,8 +13,8 @@ cdef extern from "nvtext/bpe_tokenize.hpp" namespace "nvtext" nogil: cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs": pass - cdef unique_ptr[bpe_merge_pairs] load_merge_pairs_file( - const string &filename_merges + cdef unique_ptr[bpe_merge_pairs] load_merge_pairs( + const column_view &merge_pairs ) except + cdef unique_ptr[column] byte_pair_encoding( diff --git a/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx index 9ea7fa2c51e..cad33060eaf 100644 --- a/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx @@ -4,7 +4,6 @@ from cudf.core.buffer import acquire_spill_lock from libcpp.memory cimport unique_ptr -from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.column cimport Column @@ -13,7 +12,7 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.nvtext.bpe_tokenize cimport ( bpe_merge_pairs as cpp_bpe_merge_pairs, byte_pair_encoding as cpp_byte_pair_encoding, - load_merge_pairs_file as cpp_load_merge_pairs_file, + load_merge_pairs as cpp_load_merge_pairs, ) from cudf._lib.cpp.scalar.scalar cimport string_scalar from cudf._lib.scalar cimport DeviceScalar @@ -22,10 +21,10 @@ from cudf._lib.scalar cimport DeviceScalar cdef class BPE_Merge_Pairs: cdef unique_ptr[cpp_bpe_merge_pairs] c_obj - def __cinit__(self, merges_file): - cdef string c_merges_file = str(merges_file).encode() + def __cinit__(self, Column merge_pairs): + cdef column_view c_pairs = merge_pairs.view() with nogil: - self.c_obj = move(cpp_load_merge_pairs_file(c_merges_file)) + self.c_obj = move(cpp_load_merge_pairs(c_pairs)) @acquire_spill_lock() diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index db220ea6da4..a1674a099c5 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -14,16 +14,16 @@ class BytePairEncoder: Parameters ---------- - merges_file : str - Path to file containing merge pairs. + merges_pairs : str + Strings column of merge pairs Returns ------- BytePairEncoder """ - def __init__(self, merges_file: str): - self.merge_pairs = cpp_merge_pairs(merges_file) + def __init__(self, merges_pair: "cudf.Series"): + self.merge_pairs = cpp_merge_pairs(merges_pair._column) def __call__(self, text, separator: str = " "): """ @@ -41,7 +41,10 @@ def __call__(self, text, separator: str = " "): -------- >>> import cudf >>> from cudf.core.byte_pair_encoding import BytePairEncoder - >>> bpe = BytePairEncoder('merges.txt') + >>> mps = cudf.Series(["e n", "i t", "i s", "e s", "en t", + ... "c e", "es t", "en ce", "T h", "Th is", + ... "t est", "s ent", "t h", "th is"]) + >>> bpe = BytePairEncoder(mps) >>> str_series = cudf.Series(['This is the sentence', 'thisisit']) >>> bpe(str_series) 0 This is a sent ence From e6a08483e938204dde9969ec74b68d8fd8b6c847 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 24 Aug 2023 12:26:16 -0400 Subject: [PATCH 05/28] use u_char for is-whitespace fn --- cpp/src/text/subword/bpe_tokenizer.cu | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 6407bfdf7c5..1afad89571a 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -50,11 +50,7 @@ namespace detail { namespace { -template -constexpr bool is_whitespace(CharType ch) -{ - return ch <= ' '; -} +constexpr bool is_whitespace(u_char ch) { return ch <= ' '; } /** * @brief Resolve a substring up to the first whitespace character. @@ -69,9 +65,10 @@ constexpr bool is_whitespace(CharType ch) __device__ cudf::string_view get_first_token(cudf::string_view const& d_str) { auto const begin = d_str.data(); - auto const end = thrust::find_if( - thrust::seq, begin, begin + d_str.size_bytes(), [](auto ch) { return is_whitespace(ch); }); - auto const size = static_cast(thrust::distance(begin, end)); + auto const end = thrust::find_if(thrust::seq, begin, begin + d_str.size_bytes(), [](auto ch) { + return is_whitespace(static_cast(ch)); + }); + auto const size = static_cast(thrust::distance(begin, end)); return cudf::string_view(begin, size); } @@ -402,7 +399,7 @@ struct edge_of_space_fn { __device__ bool operator()(cudf::size_type offset) { auto const d_chars = - d_strings.child(cudf::strings_column_view::chars_column_index).data(); + d_strings.child(cudf::strings_column_view::chars_column_index).data(); if (is_whitespace(d_chars[offset]) || !is_whitespace(d_chars[offset - 1])) { return false; } auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); @@ -484,10 +481,10 @@ struct list_offsets_fn { auto const end = thrust::make_counting_iterator(d_str.size_bytes()); // this counts the number of non-adjacent delimiters - auto const result = - thrust::count_if(thrust::seq, begin, end, [data = d_str.data()](auto chidx) { - return !is_whitespace(data[chidx]) && is_whitespace(data[chidx - 1]); - }); + auto const data = reinterpret_cast(d_str.data()); + auto const result = thrust::count_if(thrust::seq, begin, end, [data](auto chidx) { + return !is_whitespace(data[chidx]) && is_whitespace(data[chidx - 1]); + }); return static_cast(result) + 1; } }; From a1735ac3f0e584d0f9580f1618ec459b6a0a2b00 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 25 Aug 2023 17:05:31 -0400 Subject: [PATCH 06/28] use separator in final encoding step --- cpp/include/nvtext/bpe_tokenize.hpp | 40 ------------------- cpp/src/text/subword/bpe_tokenizer.cu | 56 ++++++++++++++------------- cpp/tests/text/bpe_tests.cpp | 15 ++----- 3 files changed, 33 insertions(+), 78 deletions(-) diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp index 258ee15ff87..7a362847a7c 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/bpe_tokenize.hpp @@ -166,45 +166,5 @@ std::unique_ptr byte_pair_encoding( cudf::string_scalar const& separator = cudf::string_scalar(" "), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Byte pair encode the input strings - * - * This will split each string on whitespace, perform the encoding, - * and then build the output column using the given `separator`. - * - * The encoding algorithm rebuilds each string by matching substrings - * in the `merge_pairs` column and iteratively removing the minimum ranked pair - * until no pairs are left. Then, a space is inserted between the remaining - * pairs before the result is joined to make the output string. - * - * @code{.pseudo} - * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"] - * input = ["test sentence", "thisis test"] - * result = byte_pair_encoding(input, merge_pairs) - * result is now ["test sent ence", "this is test"] - * @endcode - * - * The `merges_pairs` column should contain a unique pair of strings per line separated by - * a single space. An incorrect format or non-unique entries will result in - * undefined behavior. - * - * @throw cudf::logic_error if `merge_pairs` is empty or contains nulls - * @throw cudf::logic_error if `separator` is invalid - * - * @param input Strings to encode. - * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file. - * @param separator String used to build the output after encoding. - * Default is a space. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Memory resource to allocate any returned objects. - * @return An encoded column of strings. - */ -std::unique_ptr byte_pair_encoding( - cudf::strings_column_view const& input, - cudf::strings_column_view const& merges_pairs, - cudf::string_scalar const& separator = cudf::string_scalar(" "), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** @} */ // end of group } // namespace nvtext diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 1afad89571a..bbf07f96b00 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -81,6 +82,7 @@ template struct byte_pair_encoding_fn { cudf::column_device_view const d_merges; cudf::column_device_view const d_strings; + cudf::string_view const d_separator; MapRefType const d_map; cudf::size_type* d_sizes; // output size of encoded string string_hasher_type const hasher; @@ -273,13 +275,11 @@ struct byte_pair_encoding_fn { } // compute and store the output size for this string's encoding - auto const encoded_size = d_str.size_bytes() + // number of original bytes + - thrust::count_if( // number of non-zero byte indices - thrust::seq, - d_indices, - d_indices + d_str.size_bytes(), - [](auto v) { return v != 0; }); - d_sizes[idx] = static_cast(encoded_size); + auto separators_size = + thrust::count_if( + thrust::seq, d_indices, d_indices + d_str.size_bytes(), [](auto v) { return v != 0; }) * + d_separator.size_bytes(); + d_sizes[idx] = static_cast(d_str.size_bytes() + separators_size); } }; @@ -296,6 +296,7 @@ struct byte_pair_encoding_fn { */ struct build_encoding_fn { cudf::column_device_view const d_strings; + cudf::string_view const d_separator; cudf::size_type const* d_byte_indices; cudf::size_type const* d_offsets; char* d_chars{}; @@ -319,7 +320,7 @@ struct build_encoding_fn { *d_output++ = *d_input++; auto itr = begin + 1; while (itr < end) { - if (*itr++) *d_output++ = ' '; + if (*itr++) { d_output = cudf::strings::detail::copy_string(d_output, d_separator); } *d_output++ = *d_input++; } // https://github.com/rapidsai/cudf/pull/10270/files#r826319405 @@ -345,6 +346,7 @@ struct build_encoding_fn { std::unique_ptr byte_pair_encoding( cudf::strings_column_view const& input, bpe_merge_pairs::bpe_merge_pairs_impl const& merge_pairs, + cudf::string_view d_separator, rmm::cuda_stream_view stream) { auto const d_merges = merge_pairs.get_merge_pairs(); @@ -363,8 +365,13 @@ std::unique_ptr byte_pair_encoding( auto d_offsets = offsets->mutable_view().data(); auto map_ref = merge_pairs.get_merge_pairs_ref(); - byte_pair_encoding_fn fn{ - d_merges, *d_strings, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()}; + byte_pair_encoding_fn fn{d_merges, + *d_strings, + d_separator, + map_ref, + d_offsets, + string_hasher_type{}, + d_byte_indices.data()}; thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), fn); @@ -378,10 +385,11 @@ std::unique_ptr byte_pair_encoding( bytes, stream, rmm::mr::get_current_device_resource()); auto d_chars = chars->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - input.size(), - build_encoding_fn{*d_strings, d_byte_indices.data(), d_offsets, d_chars}); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + input.size(), + build_encoding_fn{*d_strings, d_separator, d_byte_indices.data(), d_offsets, d_chars}); return make_strings_column( input.size(), std::move(offsets), std::move(chars), 0, rmm::device_buffer{}); @@ -497,8 +505,12 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (input.is_empty() || input.chars_size() == 0) + if (input.is_empty() || input.chars_size() == 0) { return cudf::make_empty_column(cudf::type_id::STRING); + } + + CUDF_EXPECTS(separator.is_valid(stream), "separator parameter must be valid"); + auto const d_separator = separator.value(stream); auto const d_strings = cudf::column_device_view::create(input.parent(), stream); auto const offsets = space_offsets(input, *d_strings, stream); @@ -513,8 +525,8 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const {offsets->view(), input.chars()}); // run BPE on this view - auto const bpe_column = - byte_pair_encoding(cudf::strings_column_view(input_view), *(merge_pairs.impl), stream); + auto const bpe_column = byte_pair_encoding( + cudf::strings_column_view(input_view), *(merge_pairs.impl), d_separator, stream); // recombine the result: // compute the offsets needed to build a list view @@ -558,14 +570,4 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const return detail::byte_pair_encoding(input, merges_table, separator, cudf::get_default_stream(), mr); } -std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, - cudf::strings_column_view const& merges_table, - cudf::string_scalar const& separator, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::byte_pair_encoding(input, merges_table, separator, stream, mr); -} - } // namespace nvtext diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp index 1d9ad12225e..fbb2335f343 100644 --- a/cpp/tests/text/bpe_tests.cpp +++ b/cpp/tests/text/bpe_tests.cpp @@ -66,8 +66,6 @@ TEST_F(TextBPETokenize, BytePairEncoding) ""}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); - results = nvtext::byte_pair_encoding(sv, cudf::strings_column_view(mpt)); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); auto sliced = cudf::slice(input, {1, 4}).front(); auto sliced_expected = cudf::slice(expected, {1, 4}).front(); @@ -75,8 +73,6 @@ TEST_F(TextBPETokenize, BytePairEncoding) results = nvtext::byte_pair_encoding(sv, *merge_pairs); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); - results = nvtext::byte_pair_encoding(sv, cudf::strings_column_view(mpt)); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); } TEST_F(TextBPETokenize, BytePairEncodingSeparator) @@ -91,8 +87,10 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator) auto results = nvtext::byte_pair_encoding(sv, *merge_pairs, std::string(" Ġ")); - auto expected = cudf::test::strings_column_wrapper( - {"test - sent ence - 1", "test Ġsent ence - 2", "test Ġsent ence Ġ3", " Ġtest Ġsent ence Ġ4"}); + auto expected = cudf::test::strings_column_wrapper({"test Ġ- Ġsent Ġence Ġ- Ġ1", + "test Ġsent Ġence Ġ- Ġ2", + "test Ġsent Ġence Ġ3", + " Ġtest Ġsent Ġence Ġ4"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } @@ -111,9 +109,4 @@ TEST_F(TextBPETokenize, BPE_Error) EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(*empty)), cudf::logic_error); auto null_pairs = cudf::test::strings_column_wrapper({"", ""}, {1, 0}); EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(null_pairs)), cudf::logic_error); - auto input = cudf::test::strings_column_wrapper({"isit"}); - auto separator = cudf::string_scalar("", false); - EXPECT_THROW(nvtext::byte_pair_encoding( - cudf::strings_column_view(input), cudf::strings_column_view(input), separator), - cudf::logic_error); } From f3926176ea1751745ddae54ffda6445f212c43b4 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 28 Aug 2023 15:26:16 -0400 Subject: [PATCH 07/28] remove whitespace checks --- cpp/src/text/subword/bpe_tokenizer.cu | 306 +++---------------------- cpp/src/text/subword/bpe_tokenizer.cuh | 40 +++- cpp/tests/text/bpe_tests.cpp | 46 ++-- 3 files changed, 85 insertions(+), 307 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index bbf07f96b00..ce5b6523854 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -21,8 +21,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -31,18 +31,11 @@ #include #include -#include #include #include -#include -#include -#include #include -#include #include #include -#include -#include #include #include @@ -51,28 +44,6 @@ namespace detail { namespace { -constexpr bool is_whitespace(u_char ch) { return ch <= ' '; } - -/** - * @brief Resolve a substring up to the first whitespace character. - * - * This will return a substring of the input starting with the first byte - * up to the first whitespace character found or the end of the string. - * Any whitespace is expected only at the end of the string. - * - * @param d_str Input string to resolve. - * @return Substring of the input excluding any trailing whitespace. - */ -__device__ cudf::string_view get_first_token(cudf::string_view const& d_str) -{ - auto const begin = d_str.data(); - auto const end = thrust::find_if(thrust::seq, begin, begin + d_str.size_bytes(), [](auto ch) { - return is_whitespace(static_cast(ch)); - }); - auto const size = static_cast(thrust::distance(begin, end)); - return cudf::string_view(begin, size); -} - /** * @brief Main byte pair encoding algorithm function for each string. * @@ -88,29 +59,6 @@ struct byte_pair_encoding_fn { string_hasher_type const hasher; cudf::size_type* d_byte_indices; - /** - * @brief Parse the merge pair into components. - * - * The two substrings are separated by a single space. - * - * @param idx Index of merge pair to dissect. - * @return The left and right halves of the merge pair. - */ - __device__ thrust::pair dissect_merge_pair( - cudf::size_type idx) - { - auto const d_pair = d_merges.element(idx); - auto const lhs = d_pair.data(); - auto const end_str = d_pair.data() + d_pair.size_bytes(); - auto const rhs = thrust::find(thrust::seq, lhs, end_str, ' '); // space always expected - // check for malformed pair entry to prevent segfault - if (rhs == end_str) { return thrust::make_pair(cudf::string_view{}, cudf::string_view{}); } - auto const lhs_size = static_cast(thrust::distance(lhs, rhs)); - auto const rhs_size = static_cast(thrust::distance(rhs + 1, end_str)); - return thrust::make_pair(cudf::string_view(lhs, lhs_size), - cudf::string_view(rhs + 1, rhs_size)); - } - /** * @brief Get the next substring of the given string. * @@ -144,25 +92,8 @@ struct byte_pair_encoding_fn { */ __device__ auto get_merge_pair(cudf::string_view const& lhs, cudf::string_view const& rhs) { - __shared__ char shmem[48 * 1024]; // max for Pascal - auto const total_size = lhs.size_bytes() + rhs.size_bytes() + 1; - auto const thread_memory_size = static_cast(sizeof(shmem) / blockDim.x); - - // Edge case check. - // Empirically found only two merge pair strings that were greater than 70 bytes - // and they both looked like ignorable errors. - if (thread_memory_size < total_size) { return d_map.end(); } - - // build the target string in shared memory - char* ptr = &shmem[threadIdx.x * thread_memory_size]; - - // build a temp string like: temp = lhs + ' ' + rhs - memcpy(ptr, lhs.data(), lhs.size_bytes()); - memcpy(ptr + lhs.size_bytes(), " ", 1); - memcpy(ptr + lhs.size_bytes() + 1, rhs.data(), rhs.size_bytes()); - - auto const d_str = cudf::string_view(ptr, total_size); - return d_map.find(d_str); + auto const mp = merge_pair_type{lhs, rhs}; + return d_map.find(mp); } /** @@ -188,14 +119,14 @@ struct byte_pair_encoding_fn { d_sizes[idx] = 0; return; } - auto const d_str = get_first_token(d_strings.element(idx)); + auto const d_str = d_strings.element(idx); if (d_str.empty()) { d_sizes[idx] = 0; return; } auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); + .element(idx + d_strings.offset()); auto const d_indices = d_byte_indices + offset; // initialize the byte indices for this string; @@ -227,7 +158,7 @@ struct byte_pair_encoding_fn { // check each adjacent pair against the d_map while (itr < end) { auto const rhs = next_substr(itr, end, d_str); - if (rhs.empty()) break; // no more adjacent pairs + if (rhs.empty()) { break; } // no more adjacent pairs auto const map_itr = get_merge_pair(lhs, rhs); if (map_itr != d_map.end()) { @@ -253,11 +184,12 @@ struct byte_pair_encoding_fn { // continue scanning for other occurrences in the remainder of the string itr += min_size; if (itr < end) { - auto const d_pair = dissect_merge_pair(min_rank); + // auto const d_pair = dissect_merge_pair(min_rank); + auto const d_pair = dissect_merge_pair(d_merges.element(min_rank)); lhs = next_substr(itr, end, d_str); itr += lhs.size_bytes(); - while (itr < end) { + while (itr < end && !lhs.empty()) { auto rhs = next_substr(itr, end, d_str); if (d_pair.first == lhs && d_pair.second == rhs) { *itr = 0; // removes the pair from this string @@ -304,13 +236,13 @@ struct build_encoding_fn { __device__ void operator()(cudf::size_type idx) { if (d_strings.is_null(idx)) { return; } - auto const d_str = get_first_token(d_strings.element(idx)); + auto const d_str = d_strings.element(idx); if (d_str.empty()) { return; } auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); + .element(idx + d_strings.offset()); auto const d_indices = d_byte_indices + offset; - auto d_output = d_chars ? d_chars + d_offsets[idx] : nullptr; + auto d_output = d_chars + d_offsets[idx]; // copy chars while indices[i]==0, // insert space each time indices[i]!=0 @@ -327,36 +259,26 @@ struct build_encoding_fn { } }; -/** - * @brief Perform byte pair encoding on each string in the input column. - * - * The result is a strings column of the same size where each string has been encoded. - * - * The encoding is performed iteratively. Each pass determines the string's lowest - * ranked merge pair as determined by the strings in `merges_table`. This pair - * is removed (virtually) from each string before starting the next iteration. - * - * Once all pairs have exhausted for all strings, the output is constructed from - * the results by adding spaces between each remaining pair in each string. - * - * @param input Strings to encode. - * @param merge_pairs Merge pairs data and map used for encoding. - * @param stream CUDA stream used for device memory operations and kernel launches - */ -std::unique_ptr byte_pair_encoding( - cudf::strings_column_view const& input, - bpe_merge_pairs::bpe_merge_pairs_impl const& merge_pairs, - cudf::string_view d_separator, - rmm::cuda_stream_view stream) +} // namespace + +std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, + bpe_merge_pairs const& merge_pairs, + cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - auto const d_merges = merge_pairs.get_merge_pairs(); - CUDF_EXPECTS(d_merges.size() > 0, "Merge pairs table must not be empty"); + if (input.is_empty() || input.chars_size() == 0) { + return cudf::make_empty_column(cudf::type_id::STRING); + } - // build working vector to hold index values per byte - rmm::device_uvector d_byte_indices(input.chars().size(), stream); + CUDF_EXPECTS(separator.is_valid(stream), "separator parameter must be valid"); + auto const d_separator = separator.value(stream); auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + // build working vector to hold index values per byte + rmm::device_uvector d_byte_indices(input.chars().size(), stream); + auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, static_cast(input.size() + 1), cudf::mask_state::UNALLOCATED, @@ -364,7 +286,8 @@ std::unique_ptr byte_pair_encoding( rmm::mr::get_current_device_resource()); auto d_offsets = offsets->mutable_view().data(); - auto map_ref = merge_pairs.get_merge_pairs_ref(); + auto const d_merges = merge_pairs.impl->get_merge_pairs(); + auto const map_ref = merge_pairs.impl->get_merge_pairs_ref(); byte_pair_encoding_fn fn{d_merges, *d_strings, d_separator, @@ -391,172 +314,11 @@ std::unique_ptr byte_pair_encoding( input.size(), build_encoding_fn{*d_strings, d_separator, d_byte_indices.data(), d_offsets, d_chars}); - return make_strings_column( - input.size(), std::move(offsets), std::move(chars), 0, rmm::device_buffer{}); -} - -/** - * @brief Detect space to not-space transitions inside each string. - * - * This handles sliced input and null strings as well. - * It is parallelized over bytes and returns true only for valid left edges - * -- non-space preceded by a space. - */ -struct edge_of_space_fn { - cudf::column_device_view const d_strings; - __device__ bool operator()(cudf::size_type offset) - { - auto const d_chars = - d_strings.child(cudf::strings_column_view::chars_column_index).data(); - if (is_whitespace(d_chars[offset]) || !is_whitespace(d_chars[offset - 1])) { return false; } - - auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); - auto const d_offsets = offsets.data() + d_strings.offset(); - // ignore offsets outside sliced range - if (offset < d_offsets[0] || offset >= d_offsets[d_strings.size()]) { return false; } - - auto itr = - thrust::lower_bound(thrust::seq, d_offsets, d_offsets + d_strings.size() + 1, offset); - // ignore offsets at existing string boundaries - if (*itr == offset) { return false; } - - // count only edges for valid strings - auto const index = static_cast(thrust::distance(d_offsets, itr)) - 1; - return d_strings.is_valid(index); - } -}; - -/** - * @brief Create new offsets by identifying substrings by whitespace. - * - * This is similar to cudf::strings::split_record but does not fully split - * and only returns new offsets. The behavior is more like a view-only slice - * of the chars child with the result still including trailing delimiters. - * - * The encoding algorithm ignores the trailing whitespace of each string. - * - * @param input Strings to tokenize. - * @param stream CUDA stream used for device memory operations and kernel launches - * @return New offsets including those at the edge of each space. - */ -std::unique_ptr space_offsets(cudf::strings_column_view const& input, - cudf::column_device_view const& d_strings, - rmm::cuda_stream_view stream) -{ - // count space offsets - auto const begin = thrust::make_counting_iterator(1); - auto const end = thrust::make_counting_iterator(input.chars().size()); - edge_of_space_fn edge_of_space{d_strings}; - auto const space_count = thrust::count_if(rmm::exec_policy(stream), begin, end, edge_of_space); - - // copy space offsets - rmm::device_uvector space_offsets(space_count, stream); - thrust::copy_if(rmm::exec_policy(stream), begin, end, space_offsets.data(), edge_of_space); - - // create output offsets - auto result = - cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - static_cast(space_count + input.size() + 1), - cudf::mask_state::UNALLOCATED, - stream, - rmm::mr::get_current_device_resource()); - - // combine current offsets with space offsets - thrust::merge(rmm::exec_policy(stream), - input.offsets_begin(), - input.offsets_end(), - space_offsets.begin(), - space_offsets.end(), - result->mutable_view().begin()); - - return result; -} - -/** - * @brief Build new offsets that can be used to build a list column for calling join. - * - * This essentially returns the number of tokens for each string. - */ -struct list_offsets_fn { - cudf::column_device_view const d_strings; - __device__ cudf::size_type operator()(cudf::size_type idx) - { - if (d_strings.is_null(idx)) return 0; - auto const d_str = d_strings.element(idx); - if (d_str.empty()) return 1; // empty is a single valid result - - auto const begin = thrust::make_counting_iterator(1); - auto const end = thrust::make_counting_iterator(d_str.size_bytes()); - - // this counts the number of non-adjacent delimiters - auto const data = reinterpret_cast(d_str.data()); - auto const result = thrust::count_if(thrust::seq, begin, end, [data](auto chidx) { - return !is_whitespace(data[chidx]) && is_whitespace(data[chidx - 1]); - }); - return static_cast(result) + 1; - } -}; - -} // namespace - -std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, - bpe_merge_pairs const& merge_pairs, - cudf::string_scalar const& separator, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (input.is_empty() || input.chars_size() == 0) { - return cudf::make_empty_column(cudf::type_id::STRING); - } - - CUDF_EXPECTS(separator.is_valid(stream), "separator parameter must be valid"); - auto const d_separator = separator.value(stream); - - auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - auto const offsets = space_offsets(input, *d_strings, stream); - - // build a view using the new offsets and the current input chars column - auto const input_view = cudf::column_view(cudf::data_type{cudf::type_id::STRING}, - offsets->size() - 1, - nullptr, // no parent data - nullptr, // null-mask - 0, // null-count - 0, // offset - {offsets->view(), input.chars()}); - - // run BPE on this view - auto const bpe_column = byte_pair_encoding( - cudf::strings_column_view(input_view), *(merge_pairs.impl), d_separator, stream); - - // recombine the result: - // compute the offsets needed to build a list view - auto const list_offsets = [d_strings = *d_strings, stream] { - auto offsets_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), list_offsets_fn{d_strings}); - auto offsets_column = std::get<0>(cudf::detail::make_offsets_child_column( - offsets_itr, offsets_itr + d_strings.size(), stream, rmm::mr::get_current_device_resource())); - return offsets_column; - }(); - - // build a list column_view using the BPE output and the list_offsets - auto const list_join = cudf::column_view(cudf::data_type{cudf::type_id::LIST}, - input.size(), - nullptr, // no parent data in list column - input.null_mask(), - input.null_count(), - 0, - {list_offsets->view(), bpe_column->view()}); - - // build the output strings column - auto result = - cudf::strings::detail::join_list_elements(cudf::lists_column_view(list_join), - separator, - cudf::string_scalar(""), - cudf::strings::separator_on_nulls::NO, - cudf::strings::output_if_empty_list::EMPTY_STRING, - stream, - mr); - return result; + return cudf::make_strings_column(input.size(), + std::move(offsets), + std::move(chars), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } } // namespace detail diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 83aa22aaae9..c24fb8a745e 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -31,6 +32,11 @@ #include +#include +#include +#include +#include + #include #include @@ -39,6 +45,27 @@ namespace detail { using hash_value_type = uint32_t; using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; +using merge_pair_type = thrust::pair; + +/** + * @brief Parse the merge pair into components. + * + * The two substrings are separated by a single space. + * + * @param idx Index of merge pair to dissect. + * @return The left and right halves of the merge pair. + */ +__device__ __inline__ merge_pair_type dissect_merge_pair(cudf::string_view d_pair) +{ + auto const lhs = d_pair.data(); + auto const end_str = d_pair.data() + d_pair.size_bytes(); + auto const rhs = thrust::find(thrust::seq, lhs, end_str, ' '); // space always expected + // check for malformed pair entry to prevent segfault + if (rhs == end_str) { return merge_pair_type{cudf::string_view{}, cudf::string_view{}}; } + auto const lhs_size = static_cast(thrust::distance(lhs, rhs)); + auto const rhs_size = static_cast(thrust::distance(rhs + 1, end_str)); + return merge_pair_type{cudf::string_view(lhs, lhs_size), cudf::string_view(rhs + 1, rhs_size)}; +} /** * @brief Hasher function used for building and using the cuco static-map @@ -52,10 +79,14 @@ struct bpe_hasher { // used by insert __device__ hash_value_type operator()(cudf::size_type index) const { - return hasher(d_strings.element(index)); + auto const [lhs, rhs] = dissect_merge_pair(d_strings.element(index)); + return cudf::hashing::detail::hash_combine(hasher(lhs), hasher(rhs)); } // used by find - __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); } + __device__ hash_value_type operator()(merge_pair_type const& mp) const + { + return cudf::hashing::detail::hash_combine(hasher(mp.first), hasher(mp.second)); + } }; /** @@ -72,9 +103,10 @@ struct bpe_equal { return d_strings.element(lhs) == d_strings.element(rhs); } // used by find - __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept + __device__ bool operator()(cudf::size_type lhs, merge_pair_type const& rhs) const noexcept { - return d_strings.element(lhs) == rhs; + auto const d_pair = dissect_merge_pair(d_strings.element(lhs)); + return d_pair.first == rhs.first && d_pair.second == rhs.second; } }; diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp index fbb2335f343..f2fd487e7f9 100644 --- a/cpp/tests/text/bpe_tests.cpp +++ b/cpp/tests/text/bpe_tests.cpp @@ -29,39 +29,23 @@ struct TextBPETokenize : public cudf::test::BaseFixture {}; TEST_F(TextBPETokenize, BytePairEncoding) { // partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt - auto mpt = cudf::test::strings_column_wrapper({ - "e n", // 12 - "i t", // 14 - "i s", // 15 - "e s", // 18 - "en t", // 42 - "c e", // 88 - "es t", // 139 - "en ce", // 338 - "T h", // 561 - "Th is", // 956 - "t est", // 9032 - "s ent", // 33830 - }); + auto mpt = cudf::test::strings_column_wrapper( + {"e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "T h", "Th is", "t est", "s ent"}); auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); auto validity = cudf::test::iterators::null_at(4); - cudf::test::strings_column_wrapper input({" This\tis it\n", - "This is test-sentence-1", - "This is test sentence-2", - "This-is test sentence 3", - "", - ""}, - validity); + cudf::test::strings_column_wrapper input( + {"Thisisit", "Thisis test-sentence-1", "Thisistestsentence-2", "This-istestsentence 3", "", ""}, + validity); auto sv = cudf::strings_column_view(input); auto results = nvtext::byte_pair_encoding(sv, *merge_pairs); - auto expected = cudf::test::strings_column_wrapper({" This is it", - "This is test - sent ence - 1", + auto expected = cudf::test::strings_column_wrapper({"This is it", + "This is test - sent ence - 1", "This is test sent ence - 2", - "This - is test sent ence 3", + "This - is test sent ence 3", "", ""}, validity); @@ -78,19 +62,19 @@ TEST_F(TextBPETokenize, BytePairEncoding) TEST_F(TextBPETokenize, BytePairEncodingSeparator) { auto mpt = cudf::test::strings_column_wrapper( - {"e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"}); + {"Ġ t", "Ġt he", "h e", "e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"}); auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); cudf::test::strings_column_wrapper input( - {"test-sentence-1", "test sentence-2", "test sentence 3", " test sentence 4 "}); + {"Ġthe test sentence", "test Ġthe sentence", "Ġthetest sentence", "testĠthesentence"}); auto sv = cudf::strings_column_view(input); - auto results = nvtext::byte_pair_encoding(sv, *merge_pairs, std::string(" Ġ")); + auto results = nvtext::byte_pair_encoding(sv, *merge_pairs, std::string("$")); - auto expected = cudf::test::strings_column_wrapper({"test Ġ- Ġsent Ġence Ġ- Ġ1", - "test Ġsent Ġence Ġ- Ġ2", - "test Ġsent Ġence Ġ3", - " Ġtest Ġsent Ġence Ġ4"}); + auto expected = cudf::test::strings_column_wrapper({"Ġthe$ $test$ $sent$ence", + "test$ $Ġthe$ $sent$ence", + "Ġthe$test$ $sent$ence", + "test$Ġthe$sent$ence"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } From cb02a25f582f0982fb920fe5de0b0cae0ba29b3d Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 29 Aug 2023 17:22:25 -0400 Subject: [PATCH 08/28] more efficient pair lookup --- cpp/include/nvtext/bpe_tokenize.hpp | 7 +-- cpp/src/text/subword/bpe_tokenizer.cu | 59 +++++++++--------------- cpp/src/text/subword/bpe_tokenizer.cuh | 42 ++++++----------- cpp/src/text/subword/load_merges_file.cu | 14 +++--- 4 files changed, 48 insertions(+), 74 deletions(-) diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp index 7a362847a7c..4e361e3c130 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/bpe_tokenize.hpp @@ -32,7 +32,7 @@ namespace nvtext { /** * @brief The table of merge pairs for the BPE encoder. * - * To create an instance, call @ref nvtext::load_merge_pairs_file + * To create an instance, call @ref nvtext::load_merge_pairs */ struct bpe_merge_pairs { struct bpe_merge_pairs_impl; @@ -144,7 +144,8 @@ std::unique_ptr load_merge_pairs( * pairs before the result is joined to make the output string. * * @code{.pseudo} - * mps = load_merges_file("merges.txt") // see doxygen for example contents + * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"] + * mps = load_merge_pairs(merge_pairs) * input = ["test sentence", "thisis test"] * result = byte_pair_encoding(input, mps) * result is now ["test sent ence", "this is test"] @@ -154,7 +155,7 @@ std::unique_ptr load_merge_pairs( * @throw cudf::logic_error if `separator` is invalid * * @param input Strings to encode. - * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file. + * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs. * @param separator String used to build the output after encoding. * Default is a space. * @param mr Memory resource to allocate any returned objects. diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index ce5b6523854..e30dfa7aed6 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -55,9 +55,9 @@ struct byte_pair_encoding_fn { cudf::column_device_view const d_strings; cudf::string_view const d_separator; MapRefType const d_map; - cudf::size_type* d_sizes; // output size of encoded string + cudf::size_type* d_sizes; // output size of encoded string string_hasher_type const hasher; - cudf::size_type* d_byte_indices; + cudf::size_type* d_byte_indices; // output indices per string /** * @brief Get the next substring of the given string. @@ -76,26 +76,13 @@ struct byte_pair_encoding_fn { template __device__ cudf::string_view next_substr(Iterator begin, Iterator end, - cudf::string_view const& d_str) + cudf::string_view const& d_str) const { auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); auto const size = static_cast(thrust::distance(begin, next)); return cudf::string_view(d_str.data() + *begin, size); } - /** - * @brief Look up the pair of strings in the d_map/d_merges - * - * @param lhs Left half of the string - * @param rhs Right half of the string - * @return Position of merge pair within d_map - */ - __device__ auto get_merge_pair(cudf::string_view const& lhs, cudf::string_view const& rhs) - { - auto const mp = merge_pair_type{lhs, rhs}; - return d_map.find(mp); - } - /** * @brief Byte encode each string. * @@ -113,7 +100,7 @@ struct byte_pair_encoding_fn { * * @param idx The index of the string in `d_strings` to encode */ - __device__ void operator()(cudf::size_type idx) + __device__ void operator()(cudf::size_type idx) const { if (d_strings.is_null(idx)) { d_sizes[idx] = 0; @@ -154,13 +141,15 @@ struct byte_pair_encoding_fn { auto min_itr = itr; // these are set along with auto min_size = lhs.size_bytes(); // the min_rank variable + auto min_mp = merge_pair_type{}; // check each adjacent pair against the d_map while (itr < end) { auto const rhs = next_substr(itr, end, d_str); if (rhs.empty()) { break; } // no more adjacent pairs - auto const map_itr = get_merge_pair(lhs, rhs); + auto const mp = merge_pair_type{lhs, rhs}; + auto const map_itr = d_map.find(mp); if (map_itr != d_map.end()) { // found a match; record the rank (and other min_ vars) auto const rank = map_itr->second; @@ -168,6 +157,7 @@ struct byte_pair_encoding_fn { min_rank = rank; min_itr = itr; min_size = rhs.size_bytes(); + min_mp = mp; } } // next substring @@ -184,14 +174,11 @@ struct byte_pair_encoding_fn { // continue scanning for other occurrences in the remainder of the string itr += min_size; if (itr < end) { - // auto const d_pair = dissect_merge_pair(min_rank); - auto const d_pair = dissect_merge_pair(d_merges.element(min_rank)); - lhs = next_substr(itr, end, d_str); itr += lhs.size_bytes(); while (itr < end && !lhs.empty()) { auto rhs = next_substr(itr, end, d_str); - if (d_pair.first == lhs && d_pair.second == rhs) { + if ((min_mp.first == lhs) && (min_mp.second == rhs)) { *itr = 0; // removes the pair from this string itr += rhs.size_bytes(); if (itr >= end) { break; } // done checking for pairs @@ -233,7 +220,7 @@ struct build_encoding_fn { cudf::size_type const* d_offsets; char* d_chars{}; - __device__ void operator()(cudf::size_type idx) + __device__ void operator()(cudf::size_type idx) const { if (d_strings.is_null(idx)) { return; } auto const d_str = d_strings.element(idx); @@ -288,15 +275,15 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const auto const d_merges = merge_pairs.impl->get_merge_pairs(); auto const map_ref = merge_pairs.impl->get_merge_pairs_ref(); - byte_pair_encoding_fn fn{d_merges, - *d_strings, - d_separator, - map_ref, - d_offsets, - string_hasher_type{}, - d_byte_indices.data()}; - thrust::for_each_n( - rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), fn); + auto const bpe_fn = byte_pair_encoding_fn{d_merges, + *d_strings, + d_separator, + map_ref, + d_offsets, + string_hasher_type{}, + d_byte_indices.data()}; + auto const zero_itr = thrust::counting_iterator(0); + thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), bpe_fn); // build the output: add spaces between the remaining pairs in each string thrust::exclusive_scan( @@ -308,11 +295,9 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const bytes, stream, rmm::mr::get_current_device_resource()); auto d_chars = chars->mutable_view().data(); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - input.size(), - build_encoding_fn{*d_strings, d_separator, d_byte_indices.data(), d_offsets, d_chars}); + auto const result_fn = + build_encoding_fn{*d_strings, d_separator, d_byte_indices.data(), d_offsets, d_chars}; + thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), result_fn); return cudf::make_strings_column(input.size(), std::move(offsets), diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index c24fb8a745e..7632d879b67 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -47,26 +47,6 @@ using hash_value_type = uint32_t; using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; using merge_pair_type = thrust::pair; -/** - * @brief Parse the merge pair into components. - * - * The two substrings are separated by a single space. - * - * @param idx Index of merge pair to dissect. - * @return The left and right halves of the merge pair. - */ -__device__ __inline__ merge_pair_type dissect_merge_pair(cudf::string_view d_pair) -{ - auto const lhs = d_pair.data(); - auto const end_str = d_pair.data() + d_pair.size_bytes(); - auto const rhs = thrust::find(thrust::seq, lhs, end_str, ' '); // space always expected - // check for malformed pair entry to prevent segfault - if (rhs == end_str) { return merge_pair_type{cudf::string_view{}, cudf::string_view{}}; } - auto const lhs_size = static_cast(thrust::distance(lhs, rhs)); - auto const rhs_size = static_cast(thrust::distance(rhs + 1, end_str)); - return merge_pair_type{cudf::string_view(lhs, lhs_size), cudf::string_view(rhs + 1, rhs_size)}; -} - /** * @brief Hasher function used for building and using the cuco static-map * @@ -79,7 +59,9 @@ struct bpe_hasher { // used by insert __device__ hash_value_type operator()(cudf::size_type index) const { - auto const [lhs, rhs] = dissect_merge_pair(d_strings.element(index)); + index *= 2; + auto const lhs = d_strings.element(index); + auto const rhs = d_strings.element(index + 1); return cudf::hashing::detail::hash_combine(hasher(lhs), hasher(rhs)); } // used by find @@ -100,13 +82,20 @@ struct bpe_equal { // used by insert __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept { - return d_strings.element(lhs) == d_strings.element(rhs); + lhs *= 2; + rhs *= 2; + return (d_strings.element(lhs) == + d_strings.element(rhs)) && + (d_strings.element(lhs + 1) == + d_strings.element(rhs + 1)); } // used by find __device__ bool operator()(cudf::size_type lhs, merge_pair_type const& rhs) const noexcept { - auto const d_pair = dissect_merge_pair(d_strings.element(lhs)); - return d_pair.first == rhs.first && d_pair.second == rhs.second; + lhs *= 2; + auto const left = d_strings.element(lhs); + auto const right = d_strings.element(lhs + 1); + return (left == rhs.first) && (right == rhs.second); } }; @@ -124,9 +113,8 @@ using merge_pairs_map_type = cuco::experimental::static_map this helper simplifies the return type in a more maintainable -// way +// since column_device_view::create() returns is a little more than +// std::unique_ptr this helper simplifies the return type for us using col_device_view = std::invoke_result_t; diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index ece9f9a2967..b2c552e49bc 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -88,12 +89,10 @@ std::unique_ptr load_file_to_column(std::string const& filename_me std::unique_ptr initialize_merge_pairs_map( cudf::column_device_view const& input, rmm::cuda_stream_view stream) { - // Ensure capacity is at least (size/0.7) as documented here: - // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182 auto merge_pairs_map = std::make_unique( - static_cast(input.size() * 2), // capacity is 2x; + static_cast(input.size()), cuco::empty_key{-1}, - cuco::empty_value{-1}, // empty value is not used + cuco::empty_value{-1}, bpe_equal{input}, probe_scheme{bpe_hasher{input}}, hash_table_allocator_type{default_allocator{}, stream}, @@ -102,7 +101,7 @@ std::unique_ptr initialize_merge_pairs_map( auto iter = cudf::detail::make_counting_transform_iterator( 0, [] __device__(cudf::size_type idx) { return cuco::make_pair(idx, idx); }); - merge_pairs_map->insert_async(iter, iter + input.size(), stream.value()); + merge_pairs_map->insert_async(iter, iter + (input.size() / 2), stream.value()); return merge_pairs_map; } @@ -121,8 +120,9 @@ std::unique_ptr create_bpe_merge_pairs_im rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return create_bpe_merge_pairs_impl(std::make_unique(input.parent(), stream, mr), - stream); + auto pairs = cudf::strings::split_record(input); // Fix once 13997 is merged + auto content = pairs->release(); + return create_bpe_merge_pairs_impl(std::move(content.children.back()), stream); } } // namespace From fb111166bb0baacc39cede35e72c266ec076576d Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 31 Aug 2023 11:30:48 -0400 Subject: [PATCH 09/28] try adding zlib to dependencies.yaml --- dependencies.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dependencies.yaml b/dependencies.yaml index a1d928797b0..fc024a67946 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -179,6 +179,7 @@ dependencies: - c-compiler - cxx-compiler - dlpack>=0.5,<0.6.0a0 + - zlib specific: - output_types: conda matrices: From ea4150d640b4a9e5015c9b314b58185bf0c32d56 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 31 Aug 2023 11:38:13 -0400 Subject: [PATCH 10/28] add zlib to conda env yamls too --- conda/environments/all_cuda-118_arch-x86_64.yaml | 1 + conda/environments/all_cuda-120_arch-x86_64.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index e4a9b2f1d29..e2648090147 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -95,6 +95,7 @@ dependencies: - tokenizers==0.13.1 - transformers==4.24.0 - typing_extensions>=4.0.0 +- zlib - pip: - git+https://github.com/python-streamz/streamz.git@master name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index d03c4364435..e6ee24651fb 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -92,6 +92,7 @@ dependencies: - tokenizers==0.13.1 - transformers==4.24.0 - typing_extensions>=4.0.0 +- zlib - pip: - git+https://github.com/python-streamz/streamz.git@master name: all_cuda-120_arch-x86_64 From 346b0cbee2325ff6f6e10baab5fb5c70fe4a9e7b Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 31 Aug 2023 13:11:52 -0400 Subject: [PATCH 11/28] undo temp changes --- conda/environments/all_cuda-118_arch-x86_64.yaml | 1 - conda/environments/all_cuda-120_arch-x86_64.yaml | 1 - dependencies.yaml | 1 - 3 files changed, 3 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index e2648090147..e4a9b2f1d29 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -95,7 +95,6 @@ dependencies: - tokenizers==0.13.1 - transformers==4.24.0 - typing_extensions>=4.0.0 -- zlib - pip: - git+https://github.com/python-streamz/streamz.git@master name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index e6ee24651fb..d03c4364435 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -92,7 +92,6 @@ dependencies: - tokenizers==0.13.1 - transformers==4.24.0 - typing_extensions>=4.0.0 -- zlib - pip: - git+https://github.com/python-streamz/streamz.git@master name: all_cuda-120_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index fc024a67946..a1d928797b0 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -179,7 +179,6 @@ dependencies: - c-compiler - cxx-compiler - dlpack>=0.5,<0.6.0a0 - - zlib specific: - output_types: conda matrices: From 6ad41876cb707c28ae5ebb781af6460b13e31626 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 31 Aug 2023 19:15:35 -0400 Subject: [PATCH 12/28] use segmented reduce --- cpp/src/text/subword/bpe_tokenizer.cu | 67 ++++++++++++------------ cpp/src/text/subword/bpe_tokenizer.cuh | 7 +-- cpp/src/text/subword/load_merges_file.cu | 6 +-- 3 files changed, 38 insertions(+), 42 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index e30dfa7aed6..298907ccb27 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -51,7 +52,6 @@ namespace { */ template struct byte_pair_encoding_fn { - cudf::column_device_view const d_merges; cudf::column_device_view const d_strings; cudf::string_view const d_separator; MapRefType const d_map; @@ -74,13 +74,16 @@ struct byte_pair_encoding_fn { * @return The substring found. */ template - __device__ cudf::string_view next_substr(Iterator begin, - Iterator end, - cudf::string_view const& d_str) const + __device__ __inline__ cudf::string_view next_substr(Iterator begin, + Iterator end, + cudf::string_view const& d_str) const { auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); auto const size = static_cast(thrust::distance(begin, next)); - return cudf::string_view(d_str.data() + *begin, size); + return cudf::string_view( + d_strings.child(cudf::strings_column_view::chars_column_index).data() + + thrust::distance(d_byte_indices, begin), + size); } /** @@ -116,20 +119,11 @@ struct byte_pair_encoding_fn { .element(idx + d_strings.offset()); auto const d_indices = d_byte_indices + offset; - // initialize the byte indices for this string; - // set the index value to 0 for any intermediate UTF-8 bytes - thrust::transform(thrust::seq, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(d_str.size_bytes()), - d_indices, - [data = d_str.data()](auto idx) { - auto const byte = static_cast(data[idx]); - return cudf::strings::detail::is_begin_utf8_char(byte) ? idx : 0; - }); - auto const begin = d_indices; auto const end = d_indices + d_str.size_bytes(); + *begin = d_str.size_bytes(); // init first char + // keep processing the string until there are no more adjacent pairs found in d_map cudf::size_type min_rank = 0; while (min_rank < cuda::std::numeric_limits::max()) { @@ -192,13 +186,6 @@ struct byte_pair_encoding_fn { } } } - - // compute and store the output size for this string's encoding - auto separators_size = - thrust::count_if( - thrust::seq, d_indices, d_indices + d_str.size_bytes(), [](auto v) { return v != 0; }) * - d_separator.size_bytes(); - d_sizes[idx] = static_cast(d_str.size_bytes() + separators_size); } }; @@ -265,6 +252,17 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const // build working vector to hold index values per byte rmm::device_uvector d_byte_indices(input.chars().size(), stream); + // initialize the byte indices for all strings; + // set the index value to 0 for any intermediate UTF-8 bytes + auto const zero_itr = thrust::counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), + zero_itr, + thrust::counting_iterator(input.chars().size()), + d_byte_indices.begin(), + [data = input.chars().data(), d_separator] __device__(auto idx) { + return cudf::strings::detail::is_begin_utf8_char(data[idx]) ? d_separator.size_bytes() : 0; + }); auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, static_cast(input.size() + 1), @@ -273,18 +271,21 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const rmm::mr::get_current_device_resource()); auto d_offsets = offsets->mutable_view().data(); - auto const d_merges = merge_pairs.impl->get_merge_pairs(); - auto const map_ref = merge_pairs.impl->get_merge_pairs_ref(); - auto const bpe_fn = byte_pair_encoding_fn{d_merges, - *d_strings, - d_separator, - map_ref, - d_offsets, - string_hasher_type{}, - d_byte_indices.data()}; - auto const zero_itr = thrust::counting_iterator(0); + // auto const d_merges = merge_pairs.impl->get_merge_pairs(); + auto const map_ref = merge_pairs.impl->get_merge_pairs_ref(); + auto const bpe_fn = byte_pair_encoding_fn{ + *d_strings, d_separator, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()}; thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), bpe_fn); + // compute and store the output size for this string's encoding + cudf::reduction::detail::segmented_reduce(d_byte_indices.begin(), + input.offsets_begin(), + input.offsets_end(), + d_offsets, + thrust::plus{}, + 0, + stream); + // build the output: add spaces between the remaining pairs in each string thrust::exclusive_scan( rmm::exec_policy(stream), d_offsets, d_offsets + input.size() + 1, d_offsets); diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 7632d879b67..55b192c27d4 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -82,12 +82,7 @@ struct bpe_equal { // used by insert __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept { - lhs *= 2; - rhs *= 2; - return (d_strings.element(lhs) == - d_strings.element(rhs)) && - (d_strings.element(lhs + 1) == - d_strings.element(rhs + 1)); + return lhs == rhs; // all rows are unique } // used by find __device__ bool operator()(cudf::size_type lhs, merge_pair_type const& rhs) const noexcept diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index b2c552e49bc..4271f3bd3ad 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -29,8 +29,6 @@ #include #include -#include - #include #include #include @@ -120,7 +118,9 @@ std::unique_ptr create_bpe_merge_pairs_im rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto pairs = cudf::strings::split_record(input); // Fix once 13997 is merged + auto pairs = + cudf::strings::split_record(input, cudf::string_scalar(" ")); // Fix once 13997 is merged + // perhaps check the pairs are valid? auto content = pairs->release(); return create_bpe_merge_pairs_impl(std::move(content.children.back()), stream); } From 511a076adc2f2ed773d50ff72b6507a2777f68cc Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 7 Sep 2023 12:00:48 -0400 Subject: [PATCH 13/28] block per string --- cpp/src/text/subword/bpe_tokenizer.cu | 395 ++++++++++------------- cpp/src/text/subword/load_merges_file.cu | 4 +- cpp/tests/text/bpe_tests.cpp | 24 ++ 3 files changed, 202 insertions(+), 221 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 298907ccb27..338f7164a9b 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include #include @@ -33,206 +35,134 @@ #include #include -#include -#include +#include +#include +#include #include +#include #include -#include +#include #include namespace nvtext { namespace detail { - namespace { -/** - * @brief Main byte pair encoding algorithm function for each string. - * - * @see The byte_pair_encoding_fn::operator() function below for details. - */ +constexpr int block_size = 512; + +__device__ __inline__ cudf::string_view next_substr(cudf::column_device_view const& d_strings, + int* d_spaces, + int* begin, + int* end, + cudf::string_view const& d_str) +{ + auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); + auto const size = static_cast(thrust::distance(begin, next)); + return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size); +} + template -struct byte_pair_encoding_fn { - cudf::column_device_view const d_strings; - cudf::string_view const d_separator; - MapRefType const d_map; - cudf::size_type* d_sizes; // output size of encoded string - string_hasher_type const hasher; - cudf::size_type* d_byte_indices; // output indices per string - - /** - * @brief Get the next substring of the given string. - * - * This will find the next sequence of characters identified by the - * given byte indices iterator values. The beginning of the sequence - * starts at `begin` and the end of the sequence is the first non-zero - * index found between (begin,end) exclusive. - * - * @tparam Iterator The byte indices iterator type - * @param begin Start of indices to check - * @param end End of indices to check - * @param d_str String to substring - * @return The substring found. - */ - template - __device__ __inline__ cudf::string_view next_substr(Iterator begin, - Iterator end, - cudf::string_view const& d_str) const - { - auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); - auto const size = static_cast(thrust::distance(begin, next)); - return cudf::string_view( - d_strings.child(cudf::strings_column_view::chars_column_index).data() + - thrust::distance(d_byte_indices, begin), - size); +__global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, + MapRefType const d_map, + cudf::size_type* d_sizes, // output size of encoded string + cudf::size_type* d_spaces_in, // output per string + cudf::size_type* d_working // working memory +) +{ + // string per block + auto const str_idx = + static_cast(cudf::detail::grid_1d::global_thread_id() / block_size); + auto const lane_idx = static_cast(threadIdx.x); + + if (d_strings.is_null(str_idx)) { + d_sizes[str_idx] = 0; + return; + } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { + d_sizes[str_idx] = 0; + return; } - /** - * @brief Byte encode each string. - * - * Each string is iteratively scanned for the minimum rank of adjacent substring pairs - * as found within the `d_map` table. Once the minimum pair is located, that pair - * is removed -- virtually by zero-ing the index value between any matching adjacent pairs. - * - * The iteration ends once there are no more adjacent pairs or there are no more - * matches found in `d_map`. At the end, the indices for each string reflect the - * encoding pattern and can be used to build the output. - * - * This function also computes the size of the encoded output of each string - * by simply counting the number of non-zero indices values remaining. This saves - * an extra kernel launch normally required to compute the offsets of the output column. - * - * @param idx The index of the string in `d_strings` to encode - */ - __device__ void operator()(cudf::size_type idx) const - { - if (d_strings.is_null(idx)) { - d_sizes[idx] = 0; - return; - } - auto const d_str = d_strings.element(idx); - if (d_str.empty()) { - d_sizes[idx] = 0; - return; + auto const offsets = + d_strings.child(cudf::strings_column_view::offsets_column_index).data(); + auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; + + auto const d_spaces = d_spaces_in + offset; + auto const end_spaces = d_spaces + d_str.size_bytes(); + auto const d_min_ranks = d_working + offset; + auto const end_ranks = d_min_ranks + d_str.size_bytes(); + auto const max_rank = cuda::std::numeric_limits::max(); + + __shared__ cudf::size_type block_min_rank; + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage temp_storage; + + if (lane_idx == 0) { + // the first character is free so we store the string's size here + // to help compute the encoded output size later + *d_spaces = d_str.size_bytes(); + block_min_rank = 0; + } + __syncthreads(); + + // each thread processes their part of the string and records its min_rank + while (block_min_rank < max_rank) { + auto min_rank = max_rank; + // initialize min ranks + // future optimization: only invalidate ranks where mins were + // found in the previous run + for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { + *itr = max_rank; } - - auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx + d_strings.offset()); - auto const d_indices = d_byte_indices + offset; - - auto const begin = d_indices; - auto const end = d_indices + d_str.size_bytes(); - - *begin = d_str.size_bytes(); // init first char - - // keep processing the string until there are no more adjacent pairs found in d_map - cudf::size_type min_rank = 0; - while (min_rank < cuda::std::numeric_limits::max()) { - // initialize working variables - min_rank = cuda::std::numeric_limits::max(); - - auto lhs = next_substr(begin, end, d_str); - auto itr = begin + lhs.size_bytes(); - - auto min_itr = itr; // these are set along with - auto min_size = lhs.size_bytes(); // the min_rank variable - auto min_mp = merge_pair_type{}; - - // check each adjacent pair against the d_map - while (itr < end) { - auto const rhs = next_substr(itr, end, d_str); - if (rhs.empty()) { break; } // no more adjacent pairs - - auto const mp = merge_pair_type{lhs, rhs}; - auto const map_itr = d_map.find(mp); - if (map_itr != d_map.end()) { - // found a match; record the rank (and other min_ vars) - auto const rank = map_itr->second; - if (rank < min_rank) { - min_rank = rank; - min_itr = itr; - min_size = rhs.size_bytes(); - min_mp = mp; - } - } - // next substring - lhs = rhs; - itr += rhs.size_bytes(); - } - - // if any pair matched, remove every occurrence from the string - if (min_rank < cuda::std::numeric_limits::max()) { - // remove the first pair we found - itr = min_itr; - *itr = 0; - - // continue scanning for other occurrences in the remainder of the string - itr += min_size; - if (itr < end) { - lhs = next_substr(itr, end, d_str); - itr += lhs.size_bytes(); - while (itr < end && !lhs.empty()) { - auto rhs = next_substr(itr, end, d_str); - if ((min_mp.first == lhs) && (min_mp.second == rhs)) { - *itr = 0; // removes the pair from this string - itr += rhs.size_bytes(); - if (itr >= end) { break; } // done checking for pairs - // skip to the next adjacent pair - rhs = next_substr(itr, end, d_str); + __syncthreads(); + + for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { + if (*itr == 0) { continue; } // start on valid bytes only + + // get left half of the pair + auto const lhs = next_substr(d_strings, d_spaces, itr, end_spaces, d_str); + auto const next_itr = itr + lhs.size_bytes(); + if (next_itr < end_spaces) { + // get the right half of the pair + auto const rhs = next_substr(d_strings, d_spaces, next_itr, end_spaces, d_str); + if (!rhs.empty()) { + auto const index = static_cast(thrust::distance(d_spaces, next_itr)); + // this is setup for future optimization mentioned above; + // we only want to hash/lookup if the rank is new for this pair + auto rank = d_min_ranks[index]; + if (rank == max_rank) { + // lookup pair in merge-pairs table + auto const mp = merge_pair_type{lhs, rhs}; + auto const map_itr = d_map.find(mp); + if (map_itr != d_map.end()) { // found a match + rank = map_itr->second; } - // next substring - lhs = rhs; - itr += rhs.size_bytes(); } + if (rank < min_rank) { min_rank = rank; } + d_min_ranks[index] = rank; // store the rank } } } - } -}; - -/** - * @brief Build the output string encoding. - * - * This copies each string to the output inserting a space at each non-zero byte index. - * - * @code{.txt} - * d_strings = ["helloworld", "testthis"] - * d_byte_indices = [ 0000050000 00004000] - * result is ["hello world", "test this"] - * @endcode - */ -struct build_encoding_fn { - cudf::column_device_view const d_strings; - cudf::string_view const d_separator; - cudf::size_type const* d_byte_indices; - cudf::size_type const* d_offsets; - char* d_chars{}; - - __device__ void operator()(cudf::size_type idx) const - { - if (d_strings.is_null(idx)) { return; } - auto const d_str = d_strings.element(idx); - if (d_str.empty()) { return; } - - auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx + d_strings.offset()); - auto const d_indices = d_byte_indices + offset; - auto d_output = d_chars + d_offsets[idx]; - - // copy chars while indices[i]==0, - // insert space each time indices[i]!=0 - auto const begin = d_indices; - auto const end = d_indices + d_str.size_bytes(); - auto d_input = d_str.data(); - *d_output++ = *d_input++; - auto itr = begin + 1; - while (itr < end) { - if (*itr++) { d_output = cudf::strings::detail::copy_string(d_output, d_separator); } - *d_output++ = *d_input++; + __syncthreads(); + + // once all threads are completed, find the min-rank across the block + block_min_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min()); + __syncthreads(); + + if (block_min_rank < max_rank) { + // search the d_min_ranks for all the places where the rank matches block_min_rank + for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { + auto const index = static_cast(thrust::distance(d_min_ranks, itr)); + if (*itr == block_min_rank) { + // set the output value to 0 at this position + if (index > 0 && *(itr - 1) != block_min_rank) { d_spaces[index] = 0; } + } + } + __syncthreads(); } - // https://github.com/rapidsai/cudf/pull/10270/files#r826319405 - } -}; - + } // if no mins were found we are done, otherwise start again +} } // namespace std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, @@ -247,58 +177,87 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const CUDF_EXPECTS(separator.is_valid(stream), "separator parameter must be valid"); auto const d_separator = separator.value(stream); + CUDF_EXPECTS(d_separator.size_bytes() == 1, "for now, separator must be a single-byte character"); auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - // build working vector to hold index values per byte - rmm::device_uvector d_byte_indices(input.chars().size(), stream); - // initialize the byte indices for all strings; - // set the index value to 0 for any intermediate UTF-8 bytes - auto const zero_itr = thrust::counting_iterator(0); - thrust::transform( - rmm::exec_policy(stream), - zero_itr, - thrust::counting_iterator(input.chars().size()), - d_byte_indices.begin(), - [data = input.chars().data(), d_separator] __device__(auto idx) { - return cudf::strings::detail::is_begin_utf8_char(data[idx]) ? d_separator.size_bytes() : 0; - }); - - auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - static_cast(input.size() + 1), - cudf::mask_state::UNALLOCATED, - stream, - rmm::mr::get_current_device_resource()); + auto const first_offset = (input.offset() == 0) ? 0 + : cudf::detail::get_value( + input.offsets(), input.offset(), stream); + auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1) + ? input.chars().size() + : cudf::detail::get_value( + input.offsets(), input.size() + input.offset(), stream); + auto const chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars().data() + first_offset; + + auto const offset_data_type = cudf::data_type{cudf::type_to_id()}; + auto offsets = cudf::make_numeric_column( + offset_data_type, input.size() + 1, cudf::mask_state::UNALLOCATED, stream, mr); auto d_offsets = offsets->mutable_view().data(); - // auto const d_merges = merge_pairs.impl->get_merge_pairs(); + // initialize the spaces vector which will hold encoding information + rmm::device_uvector d_spaces(chars_size, stream); + auto const zero_itr = thrust::counting_iterator(0); + auto const chars_end = thrust::counting_iterator(chars_size); + thrust::transform(rmm::exec_policy(stream), + zero_itr, + chars_end, + d_spaces.begin(), + [d_input_chars] __device__(auto idx) { + return static_cast( + cudf::strings::detail::is_begin_utf8_char(d_input_chars[idx])); + }); + + rmm::device_uvector d_working(chars_size, stream); auto const map_ref = merge_pairs.impl->get_merge_pairs_ref(); - auto const bpe_fn = byte_pair_encoding_fn{ - *d_strings, d_separator, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()}; - thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), bpe_fn); + // encoding step produces values in d_spaces that indicate where the separator is inserted + // and can also be reduced to compute the output size of each row + bpe_parallel_fn<<>>( + *d_strings, map_ref, d_offsets, d_spaces.data(), d_working.data()); // compute and store the output size for this string's encoding - cudf::reduction::detail::segmented_reduce(d_byte_indices.begin(), - input.offsets_begin(), - input.offsets_end(), + auto const input_offsets = thrust::make_transform_iterator( + input.offsets_begin(), + [first_offset] __device__(auto offset) { return offset - first_offset; }); + cudf::reduction::detail::segmented_reduce(d_spaces.begin(), + input_offsets, + input_offsets + input.size() + 1, d_offsets, thrust::plus{}, 0, stream); + // convert sizes to offsets + auto const bytes = + cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream); + CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), + "Size of output exceeds the column size limit", + std::overflow_error); // build the output: add spaces between the remaining pairs in each string - thrust::exclusive_scan( - rmm::exec_policy(stream), d_offsets, d_offsets + input.size() + 1, d_offsets); - - auto const bytes = - cudf::detail::get_value(offsets->view(), input.size(), stream); - auto chars = cudf::strings::detail::create_chars_child_column( - bytes, stream, rmm::mr::get_current_device_resource()); + auto chars = cudf::strings::detail::create_chars_child_column(bytes, stream, mr); auto d_chars = chars->mutable_view().data(); - auto const result_fn = - build_encoding_fn{*d_strings, d_separator, d_byte_indices.data(), d_offsets, d_chars}; - thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), result_fn); + // we can reuse the d_working memory to store some temporary offsets now + auto const d_inserts = d_working.data(); + // create offsets where separators will be inserted + auto offsets_at_one = [d_spaces = d_spaces.data()] __device__(auto idx) { + return d_spaces[idx] == 1; // this fails if any input string is a single byte + }; + auto const copy_end = + thrust::copy_if(rmm::exec_policy(stream), zero_itr + 1, chars_end, d_inserts, offsets_at_one); + + // this will insert the single-byte separator in positions specified in d_inserts + auto const sep_char = thrust::constant_iterator(separator.to_string(stream)[0]); + thrust::merge_by_key(rmm::exec_policy(stream), + d_inserts, // where separator is inserted + copy_end, + zero_itr, // all positions + chars_end, + sep_char, // byte to insert + d_input_chars, + thrust::make_discard_iterator(), + d_chars); // result return cudf::make_strings_column(input.size(), std::move(offsets), diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index 4271f3bd3ad..c402ff7cc80 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -118,9 +118,7 @@ std::unique_ptr create_bpe_merge_pairs_im rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto pairs = - cudf::strings::split_record(input, cudf::string_scalar(" ")); // Fix once 13997 is merged - // perhaps check the pairs are valid? + auto pairs = cudf::strings::split_record(input, cudf::string_scalar(" "), 1, stream, mr); auto content = pairs->release(); return create_bpe_merge_pairs_impl(std::move(content.children.back()), stream); } diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp index f2fd487e7f9..869b0cc9dbf 100644 --- a/cpp/tests/text/bpe_tests.cpp +++ b/cpp/tests/text/bpe_tests.cpp @@ -78,6 +78,30 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } +TEST_F(TextBPETokenize, DISABLED_BPEAdjacentPairs) +{ + auto mpt = cudf::test::strings_column_wrapper({ + "▁ H", // 157 + "m m", // 10742 + "? !", // 50675 + "▁H mm", // 174381 + "mm m", // 262776 + "?! !", // 352313 + "? !?", // 352314 + "mm mm", // 387733 + "▁H m", // 471269 + "?! ?!", // 506981 + "?!? !", // 506982 + }); + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); + + cudf::test::strings_column_wrapper input({"▁Hmmmmm", "?!?!?!"}); + + auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(input), *merge_pairs); + auto expected = cudf::test::strings_column_wrapper({"▁Hmm mmm", "?!?! ?!"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); +} + TEST_F(TextBPETokenize, BPE_Empty) { auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"}); From 56c967fe6e330617645eb306ce3b73f86ac6750f Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 12 Sep 2023 16:23:31 -0400 Subject: [PATCH 14/28] add rerank working memory --- cpp/benchmarks/text/bpe_tokenize.cpp | 20 +-- cpp/src/text/subword/bpe_tokenizer.cu | 227 +++++++++++++++----------- cpp/tests/text/bpe_tests.cpp | 31 +++- 3 files changed, 169 insertions(+), 109 deletions(-) diff --git a/cpp/benchmarks/text/bpe_tokenize.cpp b/cpp/benchmarks/text/bpe_tokenize.cpp index 17f4bda81b3..10b384ca47b 100644 --- a/cpp/benchmarks/text/bpe_tokenize.cpp +++ b/cpp/benchmarks/text/bpe_tokenize.cpp @@ -40,16 +40,16 @@ static void bench_bpe(nvbench::state& state) state.skip("Skip benchmarks greater than size_type limit"); } - auto raw_data = cudf::test::strings_column_wrapper({"test sentence ", - "thisis test ", - "this is sentence ", - "this istest ", - "thisistest ", - "sentence is test ", - "this sentence is test ", - "test test test ", - "this this test this ", - "sentence "}) + auto raw_data = cudf::test::strings_column_wrapper({"test sentence", + "thisis it", + "this is sentence", + "this isit", + "thisisit", + "sentenceisit", + "this sentence is test", + "isitthis", + "this this it this", + "sentence"}) .release(); if (row_width / 20 > 1) { diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 338f7164a9b..2ce763fbf94 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -50,23 +49,13 @@ namespace { constexpr int block_size = 512; -__device__ __inline__ cudf::string_view next_substr(cudf::column_device_view const& d_strings, - int* d_spaces, - int* begin, - int* end, - cudf::string_view const& d_str) -{ - auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); - auto const size = static_cast(thrust::distance(begin, next)); - return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size); -} - template __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, MapRefType const d_map, - cudf::size_type* d_sizes, // output size of encoded string - cudf::size_type* d_spaces_in, // output per string - cudf::size_type* d_working // working memory + cudf::size_type* d_sizes, // output size of encoded string + int8_t* d_spaces_in, // working memory + cudf::size_type* d_ranks, // more working memory + int8_t* d_rerank_in // and one more working memory ) { // string per block @@ -90,78 +79,142 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, auto const d_spaces = d_spaces_in + offset; auto const end_spaces = d_spaces + d_str.size_bytes(); - auto const d_min_ranks = d_working + offset; + auto const d_min_ranks = d_ranks + offset; auto const end_ranks = d_min_ranks + d_str.size_bytes(); + auto const d_rerank = d_rerank_in + offset; + auto const end_rerank = d_rerank + d_str.size_bytes(); auto const max_rank = cuda::std::numeric_limits::max(); + auto const num_valid = block_size < d_str.size_bytes() ? block_size : d_str.size_bytes(); __shared__ cudf::size_type block_min_rank; using block_reduce = cub::BlockReduce; __shared__ typename block_reduce::TempStorage temp_storage; - if (lane_idx == 0) { - // the first character is free so we store the string's size here - // to help compute the encoded output size later - *d_spaces = d_str.size_bytes(); - block_min_rank = 0; + if (lane_idx == 0) { block_min_rank = 0; } + // init all ranks to max + for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { + *itr = max_rank; + } + __syncthreads(); + + auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) { + auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); + auto const size = static_cast(thrust::distance(begin, next)); + return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size); + }; + + // store all the initial ranks for each pair in the string for this block + for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { + if (*itr == 0) { continue; } // start on valid bytes only + // resolve pair and lookup its rank + auto const lhs = next_substr(itr); // retrieve lhs of the pair + auto const next_itr = itr + lhs.size_bytes(); + if (next_itr < end_spaces) { + auto const rhs = next_substr(next_itr); // retrieve rhs of the pair + if (!rhs.empty()) { + auto rank = max_rank; + auto const mp = merge_pair_type{lhs, rhs}; + auto const map_itr = d_map.find(mp); // lookup pair in merges table; + if (map_itr != d_map.end()) { rank = map_itr->second; } // found a match; + d_min_ranks[thrust::distance(d_spaces, next_itr)] = rank; // store the rank + } + } } __syncthreads(); - // each thread processes their part of the string and records its min_rank + // loop through the ranks finding the current minimum until there are no more while (block_min_rank < max_rank) { + // find new minimum rank auto min_rank = max_rank; - // initialize min ranks - // future optimization: only invalidate ranks where mins were - // found in the previous run for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { - *itr = max_rank; + auto const rank = *itr; + if (rank < min_rank) { min_rank = rank; } } __syncthreads(); - for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { - if (*itr == 0) { continue; } // start on valid bytes only - - // get left half of the pair - auto const lhs = next_substr(d_strings, d_spaces, itr, end_spaces, d_str); - auto const next_itr = itr + lhs.size_bytes(); - if (next_itr < end_spaces) { - // get the right half of the pair - auto const rhs = next_substr(d_strings, d_spaces, next_itr, end_spaces, d_str); - if (!rhs.empty()) { - auto const index = static_cast(thrust::distance(d_spaces, next_itr)); - // this is setup for future optimization mentioned above; - // we only want to hash/lookup if the rank is new for this pair - auto rank = d_min_ranks[index]; - if (rank == max_rank) { - // lookup pair in merge-pairs table - auto const mp = merge_pair_type{lhs, rhs}; - auto const map_itr = d_map.find(mp); - if (map_itr != d_map.end()) { // found a match - rank = map_itr->second; - } - } - if (rank < min_rank) { min_rank = rank; } - d_min_ranks[index] = rank; // store the rank - } - } - } - __syncthreads(); - - // once all threads are completed, find the min-rank across the block - block_min_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min()); + // compute the min rank across the block + auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); + if (lane_idx == 0) { block_min_rank = reduce_rank; } __syncthreads(); if (block_min_rank < max_rank) { + // (re)initialize all the re-rank identifiers to zero + for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { + *itr = 0; + } + // search the d_min_ranks for all the places where the rank matches block_min_rank for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { - auto const index = static_cast(thrust::distance(d_min_ranks, itr)); if (*itr == block_min_rank) { - // set the output value to 0 at this position - if (index > 0 && *(itr - 1) != block_min_rank) { d_spaces[index] = 0; } + auto ptr = itr - 1; // check for adjacent min-rank edge-case + while (ptr > d_min_ranks && *ptr == max_rank) { + --ptr; + } + // set the output value to 0 at this position (erases separator) + if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_min_ranks, itr)] = 0; } + } + } + __syncthreads(); + + auto find_prev = [begin = d_spaces](int8_t* ptr) { + while (ptr > begin && *ptr == 0) { + --ptr; + } + return ptr; + }; + auto find_next = [end = end_spaces](int8_t* ptr) { + while (ptr < end && *ptr == 0) { + ++ptr; + } + return ptr; + }; + + // identify the re-rank locations + for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { + auto const index = thrust::distance(d_min_ranks, itr); + if (*itr == block_min_rank && d_spaces[index] == 0) { + auto ptr = find_prev(d_spaces + index - 1); // find previous pair mid-point + if (ptr > d_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; } // atomicExch + ptr = find_next(d_spaces + index + 1); // find next pair mid-point + if (ptr < end_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; } // atomicExch + *itr = max_rank; // reset this rank + } + } + __syncthreads(); + + // compute the ranks for the newly created pairs + for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { + if (*itr) { + auto const index = thrust::distance(d_rerank, itr); + // build lhs of pair + auto const ptr = find_prev(d_spaces + index - 1); + auto const size = static_cast(thrust::distance(ptr, d_spaces + index)); + auto const lhs = cudf::string_view(d_str.data() + thrust::distance(d_spaces, ptr), size); + // retrieve rhs of pair + auto const rhs = next_substr(d_spaces + index); + auto rank = max_rank; + if (!rhs.empty()) { + auto const mp = merge_pair_type{lhs, rhs}; + auto const map_itr = d_map.find(mp); // lookup in merges; + if (map_itr != d_map.end()) { rank = map_itr->second; } // found a match + } + d_min_ranks[index] = rank; } } __syncthreads(); } } // if no mins were found we are done, otherwise start again + + // reset the first position -- no separator to be added here + if (lane_idx == 0) { *d_spaces = 0; } + + // compute the output size for this string by counting the resulting separator positions + auto bytes = 0; + for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { + bytes += *itr; + } + auto const size = block_reduce(temp_storage).Sum(bytes, num_valid); + if (lane_idx == 0) { d_sizes[str_idx] = size + d_str.size_bytes(); } } } // namespace @@ -196,53 +249,45 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const offset_data_type, input.size() + 1, cudf::mask_state::UNALLOCATED, stream, mr); auto d_offsets = offsets->mutable_view().data(); - // initialize the spaces vector which will hold encoding information - rmm::device_uvector d_spaces(chars_size, stream); + // initialize the spaces vector which will hold the separators locations + rmm::device_uvector d_spaces(chars_size, stream); auto const zero_itr = thrust::counting_iterator(0); auto const chars_end = thrust::counting_iterator(chars_size); - thrust::transform(rmm::exec_policy(stream), - zero_itr, - chars_end, - d_spaces.begin(), - [d_input_chars] __device__(auto idx) { - return static_cast( - cudf::strings::detail::is_begin_utf8_char(d_input_chars[idx])); - }); - - rmm::device_uvector d_working(chars_size, stream); + thrust::transform( + rmm::exec_policy(stream), + zero_itr, + chars_end, + d_spaces.begin(), + [d_input_chars] __device__(auto idx) { + return static_cast(cudf::strings::detail::is_begin_utf8_char(d_input_chars[idx])); + }); + + rmm::device_uvector d_ranks(chars_size, stream); // rank per string pair; + rmm::device_uvector d_rerank(chars_size, stream); // re-ranking identifiers auto const map_ref = merge_pairs.impl->get_merge_pairs_ref(); - // encoding step produces values in d_spaces that indicate where the separator is inserted - // and can also be reduced to compute the output size of each row + // the main kernel here produces the sizes of the encoded output string + // as well as the d_spaces values which identify where to insert the separators bpe_parallel_fn<<>>( - *d_strings, map_ref, d_offsets, d_spaces.data(), d_working.data()); - // compute and store the output size for this string's encoding - auto const input_offsets = thrust::make_transform_iterator( - input.offsets_begin(), - [first_offset] __device__(auto offset) { return offset - first_offset; }); - cudf::reduction::detail::segmented_reduce(d_spaces.begin(), - input_offsets, - input_offsets + input.size() + 1, - d_offsets, - thrust::plus{}, - 0, - stream); - // convert sizes to offsets + *d_strings, map_ref, d_offsets, d_spaces.data(), d_ranks.data(), d_rerank.data()); + + // convert sizes to offsets in-place auto const bytes = cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream); CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), "Size of output exceeds the column size limit", std::overflow_error); - // build the output: add spaces between the remaining pairs in each string + // build the output: adding separators between the remaining pairs in each string auto chars = cudf::strings::detail::create_chars_child_column(bytes, stream, mr); auto d_chars = chars->mutable_view().data(); - // we can reuse the d_working memory to store some temporary offsets now - auto const d_inserts = d_working.data(); + // we can reuse the ranks working memory to store some temporary offsets now; + // the offsets are produced by the index of the d_spaces values + auto const d_inserts = d_ranks.data(); // create offsets where separators will be inserted auto offsets_at_one = [d_spaces = d_spaces.data()] __device__(auto idx) { - return d_spaces[idx] == 1; // this fails if any input string is a single byte + return d_spaces[idx] > 0; // separator to be inserted here }; auto const copy_end = thrust::copy_if(rmm::exec_policy(stream), zero_itr + 1, chars_end, d_inserts, offsets_at_one); diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp index 869b0cc9dbf..d670492fb01 100644 --- a/cpp/tests/text/bpe_tests.cpp +++ b/cpp/tests/text/bpe_tests.cpp @@ -29,23 +29,37 @@ struct TextBPETokenize : public cudf::test::BaseFixture {}; TEST_F(TextBPETokenize, BytePairEncoding) { // partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt - auto mpt = cudf::test::strings_column_wrapper( - {"e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "T h", "Th is", "t est", "s ent"}); + auto mpt = cudf::test::strings_column_wrapper({ + "e n", // 14 + "i t", // 16 + "i s", // 17 + "e s", // 20 + "en t", // 44 + "c e", // 90 + "es t", // 141 + "en ce", // 340 + "t h", // 146 + "h i", // 5049 + "th is", // 5407 + "t est", // 9034 + "s i", // 13142 + "s ent" // 33832 + }); auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); auto validity = cudf::test::iterators::null_at(4); cudf::test::strings_column_wrapper input( - {"Thisisit", "Thisis test-sentence-1", "Thisistestsentence-2", "This-istestsentence 3", "", ""}, + {"thisisit", "thisis test-sentence-1", "thisistestsentence-2", "this-istestsentence 3", "", ""}, validity); auto sv = cudf::strings_column_view(input); auto results = nvtext::byte_pair_encoding(sv, *merge_pairs); - auto expected = cudf::test::strings_column_wrapper({"This is it", - "This is test - sent ence - 1", - "This is test sent ence - 2", - "This - is test sent ence 3", + auto expected = cudf::test::strings_column_wrapper({"this is it", + "this is test - sent ence - 1", + "this is test sent ence - 2", + "this - is test sent ence 3", "", ""}, validity); @@ -63,6 +77,7 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator) { auto mpt = cudf::test::strings_column_wrapper( {"Ġ t", "Ġt he", "h e", "e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"}); + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); cudf::test::strings_column_wrapper input( @@ -78,7 +93,7 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } -TEST_F(TextBPETokenize, DISABLED_BPEAdjacentPairs) +TEST_F(TextBPETokenize, BPEAdjacentPairs) { auto mpt = cudf::test::strings_column_wrapper({ "▁ H", // 157 From 038809f0374075fa88dbcf47ae402a477927436a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 18 Sep 2023 17:55:15 -0400 Subject: [PATCH 15/28] fusing reduce and unfusing output size calc --- cpp/src/text/subword/bpe_tokenizer.cu | 201 +++++++++++++++----------- 1 file changed, 114 insertions(+), 87 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 2ce763fbf94..1058aa0521b 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -52,7 +52,6 @@ constexpr int block_size = 512; template __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, MapRefType const d_map, - cudf::size_type* d_sizes, // output size of encoded string int8_t* d_spaces_in, // working memory cudf::size_type* d_ranks, // more working memory int8_t* d_rerank_in // and one more working memory @@ -63,15 +62,9 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, static_cast(cudf::detail::grid_1d::global_thread_id() / block_size); auto const lane_idx = static_cast(threadIdx.x); - if (d_strings.is_null(str_idx)) { - d_sizes[str_idx] = 0; - return; - } + if (d_strings.is_null(str_idx)) { return; } auto const d_str = d_strings.element(str_idx); - if (d_str.empty()) { - d_sizes[str_idx] = 0; - return; - } + if (d_str.empty()) { return; } auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index).data(); @@ -90,20 +83,21 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, using block_reduce = cub::BlockReduce; __shared__ typename block_reduce::TempStorage temp_storage; - if (lane_idx == 0) { block_min_rank = 0; } + auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) { + auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); + auto const size = static_cast(thrust::distance(begin, next)); + return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size); + }; + // init all ranks to max for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { *itr = max_rank; } __syncthreads(); - auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) { - auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); - auto const size = static_cast(thrust::distance(begin, next)); - return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size); - }; + auto min_rank = max_rank; - // store all the initial ranks for each pair in the string for this block + // store all the initial ranks for each pair for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { if (*itr == 0) { continue; } // start on valid bytes only // resolve pair and lookup its rank @@ -117,93 +111,123 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, auto const map_itr = d_map.find(mp); // lookup pair in merges table; if (map_itr != d_map.end()) { rank = map_itr->second; } // found a match; d_min_ranks[thrust::distance(d_spaces, next_itr)] = rank; // store the rank + if (rank < min_rank) min_rank = rank; } } } __syncthreads(); + // compute the min rank across the block + auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); + if (lane_idx == 0) { block_min_rank = reduce_rank; } + __syncthreads(); // loop through the ranks finding the current minimum until there are no more while (block_min_rank < max_rank) { - // find new minimum rank - auto min_rank = max_rank; - for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { - auto const rank = *itr; - if (rank < min_rank) { min_rank = rank; } + // (re)initialize all the re-rank identifiers to zero + for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { + *itr = 0; } - __syncthreads(); - // compute the min rank across the block - auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); - if (lane_idx == 0) { block_min_rank = reduce_rank; } + // search the d_min_ranks for all the places where the rank matches block_min_rank + for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { + if (*itr == block_min_rank) { + auto ptr = itr - 1; // check for adjacent min-rank edge-case + while (ptr > d_min_ranks && *ptr == max_rank) { + --ptr; + } + // set the output value to 0 at this position (erases separator) + if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_min_ranks, itr)] = 0; } + } + } __syncthreads(); - if (block_min_rank < max_rank) { - // (re)initialize all the re-rank identifiers to zero - for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { - *itr = 0; + auto find_prev = [begin = d_spaces](int8_t* ptr) { + while (ptr > begin && *ptr == 0) { + --ptr; } - - // search the d_min_ranks for all the places where the rank matches block_min_rank - for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { - if (*itr == block_min_rank) { - auto ptr = itr - 1; // check for adjacent min-rank edge-case - while (ptr > d_min_ranks && *ptr == max_rank) { - --ptr; - } - // set the output value to 0 at this position (erases separator) - if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_min_ranks, itr)] = 0; } - } + return ptr; + }; + auto find_next = [end = end_spaces](int8_t* ptr) { + while (ptr < end && *ptr == 0) { + ++ptr; } - __syncthreads(); + return ptr; + }; - auto find_prev = [begin = d_spaces](int8_t* ptr) { - while (ptr > begin && *ptr == 0) { - --ptr; - } - return ptr; - }; - auto find_next = [end = end_spaces](int8_t* ptr) { - while (ptr < end && *ptr == 0) { - ++ptr; - } - return ptr; - }; - - // identify the re-rank locations - for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { - auto const index = thrust::distance(d_min_ranks, itr); - if (*itr == block_min_rank && d_spaces[index] == 0) { - auto ptr = find_prev(d_spaces + index - 1); // find previous pair mid-point - if (ptr > d_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; } // atomicExch - ptr = find_next(d_spaces + index + 1); // find next pair mid-point - if (ptr < end_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; } // atomicExch - *itr = max_rank; // reset this rank - } + // identify the re-rank locations + for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { + auto const index = thrust::distance(d_min_ranks, itr); + if (*itr == block_min_rank && d_spaces[index] == 0) { + auto ptr = find_prev(d_spaces + index - 1); // find previous pair mid-point + if (ptr > d_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; } + ptr = find_next(d_spaces + index + 1); // find next pair mid-point + if (ptr < end_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; } + *itr = max_rank; // reset this rank } - __syncthreads(); - - // compute the ranks for the newly created pairs - for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { - if (*itr) { - auto const index = thrust::distance(d_rerank, itr); - // build lhs of pair - auto const ptr = find_prev(d_spaces + index - 1); - auto const size = static_cast(thrust::distance(ptr, d_spaces + index)); - auto const lhs = cudf::string_view(d_str.data() + thrust::distance(d_spaces, ptr), size); - // retrieve rhs of pair - auto const rhs = next_substr(d_spaces + index); - auto rank = max_rank; - if (!rhs.empty()) { - auto const mp = merge_pair_type{lhs, rhs}; - auto const map_itr = d_map.find(mp); // lookup in merges; - if (map_itr != d_map.end()) { rank = map_itr->second; } // found a match - } - d_min_ranks[index] = rank; + } + __syncthreads(); + + // compute the ranks for the newly created pairs + min_rank = max_rank; // and record new minimum + for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { + auto const index = thrust::distance(d_rerank, itr); + auto rank = d_min_ranks[index]; + if (*itr) { + // build lhs of pair + auto const ptr = find_prev(d_spaces + index - 1); + auto const size = static_cast(thrust::distance(ptr, d_spaces + index)); + auto const lhs = cudf::string_view(d_str.data() + thrust::distance(d_spaces, ptr), size); + // retrieve rhs of pair + auto const rhs = next_substr(d_spaces + index); + rank = max_rank; + if (!rhs.empty()) { + auto const mp = merge_pair_type{lhs, rhs}; + auto const map_itr = d_map.find(mp); // lookup in merges; + if (map_itr != d_map.end()) { rank = map_itr->second; } // found a match } + d_min_ranks[index] = rank; } - __syncthreads(); + if (rank < min_rank) min_rank = rank; } + __syncthreads(); + + // compute the min rank across the block + auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); + if (lane_idx == 0) { block_min_rank = reduce_rank; } + __syncthreads(); } // if no mins were found we are done, otherwise start again +} + +__global__ void bpe_finalize(cudf::column_device_view const d_strings, + int8_t* d_spaces_in, // where separators are inserted + cudf::size_type* d_sizes // output sizes of encoded strings +) +{ + // string per block + auto const str_idx = + static_cast(cudf::detail::grid_1d::global_thread_id() / block_size); + auto const lane_idx = static_cast(threadIdx.x); + + if (d_strings.is_null(str_idx)) { + d_sizes[str_idx] = 0; + return; + } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { + d_sizes[str_idx] = 0; + return; + } + + auto const offsets = + d_strings.child(cudf::strings_column_view::offsets_column_index).data(); + auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; + + auto const d_spaces = d_spaces_in + offset; + auto const end_spaces = d_spaces + d_str.size_bytes(); + auto const num_valid = block_size < d_str.size_bytes() ? block_size : d_str.size_bytes(); + + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage temp_storage; // reset the first position -- no separator to be added here if (lane_idx == 0) { *d_spaces = 0; } @@ -216,6 +240,7 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, auto const size = block_reduce(temp_storage).Sum(bytes, num_valid); if (lane_idx == 0) { d_sizes[str_idx] = size + d_str.size_bytes(); } } + } // namespace std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, @@ -250,6 +275,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const auto d_offsets = offsets->mutable_view().data(); // initialize the spaces vector which will hold the separators locations + // (this is about 20% of the run: look into vector loading) rmm::device_uvector d_spaces(chars_size, stream); auto const zero_itr = thrust::counting_iterator(0); auto const chars_end = thrust::counting_iterator(chars_size); @@ -266,10 +292,11 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const rmm::device_uvector d_rerank(chars_size, stream); // re-ranking identifiers auto const map_ref = merge_pairs.impl->get_merge_pairs_ref(); - // the main kernel here produces the sizes of the encoded output string - // as well as the d_spaces values which identify where to insert the separators bpe_parallel_fn<<>>( - *d_strings, map_ref, d_offsets, d_spaces.data(), d_ranks.data(), d_rerank.data()); + *d_strings, map_ref, d_spaces.data(), d_ranks.data(), d_rerank.data()); + // this could probably be re-fused into the above kernel + bpe_finalize<<>>( + *d_strings, d_spaces.data(), d_offsets); // convert sizes to offsets in-place auto const bytes = From 4528e6f2ecc4eeb84c2f2022badf1042fe7cec73 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 19 Sep 2023 18:00:11 -0400 Subject: [PATCH 16/28] move spaces init to main kernel --- cpp/src/text/subword/bpe_tokenizer.cu | 34 +++++++++++---------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index c15355b64c4..854e0e7407f 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -76,9 +77,10 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, auto const end_ranks = d_min_ranks + d_str.size_bytes(); auto const d_rerank = d_rerank_in + offset; auto const end_rerank = d_rerank + d_str.size_bytes(); - auto const max_rank = cuda::std::numeric_limits::max(); auto const num_valid = block_size < d_str.size_bytes() ? block_size : d_str.size_bytes(); + auto constexpr max_rank = cuda::std::numeric_limits::max(); + __shared__ cudf::size_type block_min_rank; using block_reduce = cub::BlockReduce; __shared__ typename block_reduce::TempStorage temp_storage; @@ -93,6 +95,11 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { *itr = max_rank; } + // init all spaces to 1 as appropriate + for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { + auto const index = thrust::distance(d_spaces, itr); + *itr = static_cast(cudf::strings::detail::is_begin_utf8_char(d_str.data()[index])); + } __syncthreads(); auto min_rank = max_rank; @@ -115,7 +122,6 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, } } } - __syncthreads(); // compute the min rank across the block auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); if (lane_idx == 0) { block_min_rank = reduce_rank; } @@ -189,7 +195,6 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, } if (rank < min_rank) min_rank = rank; } - __syncthreads(); // compute the min rank across the block auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); @@ -235,7 +240,7 @@ __global__ void bpe_finalize(cudf::column_device_view const d_strings, // compute the output size for this string by counting the resulting separator positions auto bytes = 0; for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { - bytes += *itr; + bytes += (*itr > 0); } auto const size = block_reduce(temp_storage).Sum(bytes, num_valid); if (lane_idx == 0) { d_sizes[str_idx] = size + d_str.size_bytes(); } @@ -274,20 +279,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const offset_data_type, input.size() + 1, cudf::mask_state::UNALLOCATED, stream, mr); auto d_offsets = offsets->mutable_view().data(); - // initialize the spaces vector which will hold the separators locations - // (this is about 20% of the run: look into vector loading) rmm::device_uvector d_spaces(chars_size, stream); - auto const zero_itr = thrust::counting_iterator(0); - auto const chars_end = thrust::counting_iterator(chars_size); - thrust::transform( - rmm::exec_policy(stream), - zero_itr, - chars_end, - d_spaces.begin(), - [d_input_chars] __device__(auto idx) { - return static_cast(cudf::strings::detail::is_begin_utf8_char(d_input_chars[idx])); - }); - rmm::device_uvector d_ranks(chars_size, stream); // rank per string pair; rmm::device_uvector d_rerank(chars_size, stream); // re-ranking identifiers auto const map_ref = merge_pairs.impl->get_merge_pairs_ref(); @@ -313,11 +305,13 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const // the offsets are produced by the index of the d_spaces values auto const d_inserts = d_ranks.data(); // create offsets where separators will be inserted - auto offsets_at_one = [d_spaces = d_spaces.data()] __device__(auto idx) { + auto offsets_at_non_zero = [d_spaces = d_spaces.data()] __device__(auto idx) { return d_spaces[idx] > 0; // separator to be inserted here }; - auto const copy_end = - thrust::copy_if(rmm::exec_policy(stream), zero_itr + 1, chars_end, d_inserts, offsets_at_one); + auto const zero_itr = thrust::counting_iterator(0); + auto const chars_end = thrust::counting_iterator(chars_size); + auto const copy_end = thrust::copy_if( + rmm::exec_policy(stream), zero_itr + 1, chars_end, d_inserts, offsets_at_non_zero); // this will insert the single-byte separator in positions specified in d_inserts auto const sep_char = thrust::constant_iterator(separator.to_string(stream)[0]); From 5e27d5af7e15f9b696e0938ac1c3f776e674903c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 20 Sep 2023 16:45:36 -0400 Subject: [PATCH 17/28] limit adjacent pair search --- cpp/src/text/subword/bpe_tokenizer.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 854e0e7407f..6118c2aa453 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -137,9 +137,11 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, // search the d_min_ranks for all the places where the rank matches block_min_rank for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { if (*itr == block_min_rank) { - auto ptr = itr - 1; // check for adjacent min-rank edge-case - while (ptr > d_min_ranks && *ptr == max_rank) { + auto ptr = itr - 1; // check for adjacent min-rank edge-case + auto count = 8; + while (ptr > d_min_ranks && *ptr == max_rank && count > 0) { --ptr; + --count; } // set the output value to 0 at this position (erases separator) if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_min_ranks, itr)] = 0; } From 75432d223df3905c81d09de1b551eb963b82f2d9 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 26 Sep 2023 19:44:04 -0400 Subject: [PATCH 18/28] fix replace bug --- cpp/src/strings/replace/replace.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index a622d1a742d..acc1502f4d6 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -97,7 +97,7 @@ struct replace_row_parallel_fn { } else { bytes += d_repl.size_bytes() - d_target.size_bytes(); } - position = d_str.find(d_target, position + d_target.size_bytes()); + position = d_str.find(d_target, position + d_target.length()); --max_n; } if (out_ptr) // copy whats left (or right depending on your point of view) From 521cd1b6820b528e6b316ac91828d64e3eff902a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 4 Oct 2023 17:17:50 -0400 Subject: [PATCH 19/28] exploit unpairable boundaries --- cpp/src/text/subword/bpe_tokenizer.cu | 90 ++++++++++++++++++++++-- cpp/src/text/subword/bpe_tokenizer.cuh | 61 +++++++++++++++- cpp/src/text/subword/load_merges_file.cu | 33 +++++++-- cpp/tests/text/bpe_tests.cpp | 5 +- 4 files changed, 173 insertions(+), 16 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 6118c2aa453..e36d53e17f4 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -42,6 +42,7 @@ #include #include #include +#include #include namespace nvtext { @@ -50,6 +51,45 @@ namespace { constexpr int block_size = 512; +template +__global__ void bpe_up_offsets_fn(char const* d_chars, + cudf::size_type chars_size, + cudf::size_type offset, + MapRefType const d_map, + cudf::size_type* d_offsets) +{ + auto const idx = static_cast(cudf::detail::grid_1d::global_thread_id()); + if (idx >= chars_size) { return; } + if (!cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { + d_offsets[idx] = 0; + return; + } + + auto next_substr = [d_chars, end = d_chars + chars_size](char const* begin) { + auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { + return cudf::strings::detail::is_begin_utf8_char(v); + }); + auto const size = static_cast(thrust::distance(begin, next)); + return cudf::string_view(begin, size); + }; + + auto const itr = d_chars + idx; + auto const end = d_chars + chars_size; + auto const lhs = next_substr(itr); + auto const next_itr = itr + lhs.size_bytes(); + auto output = 0; + if (next_itr < end) { + auto const rhs = next_substr(next_itr); + if (!rhs.empty()) { + // see if both halves exist anywhere in the table + if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) { + output = idx + lhs.size_bytes() + offset; // candidate for artificial boundary + } + } + } + d_offsets[idx] = output; +} + template __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, MapRefType const d_map, @@ -77,13 +117,13 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, auto const end_ranks = d_min_ranks + d_str.size_bytes(); auto const d_rerank = d_rerank_in + offset; auto const end_rerank = d_rerank + d_str.size_bytes(); - auto const num_valid = block_size < d_str.size_bytes() ? block_size : d_str.size_bytes(); auto constexpr max_rank = cuda::std::numeric_limits::max(); __shared__ cudf::size_type block_min_rank; using block_reduce = cub::BlockReduce; __shared__ typename block_reduce::TempStorage temp_storage; + auto const num_valid = block_size < d_str.size_bytes() ? block_size : d_str.size_bytes(); auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) { auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); @@ -284,11 +324,51 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const rmm::device_uvector d_spaces(chars_size, stream); rmm::device_uvector d_ranks(chars_size, stream); // rank per string pair; rmm::device_uvector d_rerank(chars_size, stream); // re-ranking identifiers - auto const map_ref = merge_pairs.impl->get_merge_pairs_ref(); + auto const map_ref = merge_pairs.impl->get_merge_pairs_ref(); + auto const map_ref2 = merge_pairs.impl->get_mp_table_ref(); + + if ((input.offset() == 0) && (input.size() == input.offsets().size() - 1)) { + // TODO: this fails for sliced columns for some reason; + // we could get ride of the else{} if this was fixed + + // this path locates unpairable sections of code to create artificial string row boundaries; + // the boundary values are recorded as offsets and stored temporarily in the d_ranks vector + auto const block_count = (chars_size + block_size - 1) / block_size; + bpe_up_offsets_fn<<>>( + d_input_chars, chars_size, input.offset(), map_ref2, d_ranks.data()); + auto const end = thrust::remove(rmm::exec_policy(stream), d_ranks.begin(), d_ranks.end(), 0); + auto const total = thrust::distance(d_ranks.begin(), end); // number of unpairables + + // the new boundaries are combined with the existing offsets to build a temporary column + auto tmp_offsets = rmm::device_uvector(total + input.size() + 1, stream); + thrust::merge(rmm::exec_policy(stream), + input.offsets_begin(), + input.offsets_end(), + d_ranks.begin(), + end, + tmp_offsets.begin()); + + // the temp column is used for the encoding functions which is much faster + // on a larger number of smaller strings + auto const col_offsets = + cudf::column_view(cudf::device_span(tmp_offsets)); + auto const tmp_input = cudf::column_view(input.parent().type(), + static_cast(input.size() + total), + nullptr, + nullptr, + 0, + 0, + {col_offsets, input.chars()}); + auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); + + bpe_parallel_fn<<>>( + *d_tmp_strings, map_ref, d_spaces.data(), d_ranks.data(), d_rerank.data()); + } else { + bpe_parallel_fn<<>>( + *d_strings, map_ref, d_spaces.data(), d_ranks.data(), d_rerank.data()); + } - bpe_parallel_fn<<>>( - *d_strings, map_ref, d_spaces.data(), d_ranks.data(), d_rerank.data()); - // this could probably be re-fused into the above kernel + // compute the output sizes into the d_offsets vector bpe_finalize<<>>( *d_strings, d_spaces.data(), d_offsets); diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 8e70b2b8905..25132a57ae2 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -46,6 +46,8 @@ using hash_value_type = uint32_t; using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; using merge_pair_type = thrust::pair; +using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; + /** * @brief Hasher function used for building and using the cuco static-map * @@ -93,8 +95,6 @@ struct bpe_equal { } }; -using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; - using probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>; using merge_pairs_map_type = cuco::experimental::static_map; +struct table_hasher { + cudf::column_device_view const d_strings; + string_hasher_type hasher{}; + // used by insert + __device__ hash_value_type operator()(cudf::size_type index) const + { + auto const d_str = d_strings.element(index); + return hasher(d_str); + } + // used by find + __device__ hash_value_type operator()(cudf::string_view const& d_str) const + { + return hasher(d_str); + } +}; + +/** + * @brief Equal function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (string). + */ +struct table_equal { + cudf::column_device_view const d_strings; + // used by insert + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept + { + auto const left = d_strings.element(lhs); + auto const right = d_strings.element(rhs); + return left == right; + } + // used by find + __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept + { + auto const left = d_strings.element(lhs); + return left == rhs; + } +}; + +using probe_scheme2 = cuco::experimental::linear_probing<1, table_hasher>; + +using merge_pairs_map_type2 = + cuco::experimental::static_map, + cuda::thread_scope_device, + table_equal, + probe_scheme2, + hash_table_allocator_type>; + } // namespace detail // since column_device_view::create() returns is a little more than @@ -118,12 +168,17 @@ struct bpe_merge_pairs::bpe_merge_pairs_impl { col_device_view const d_merge_pairs; std::unique_ptr merge_pairs_map; + std::unique_ptr mp_table_map; + bpe_merge_pairs_impl(std::unique_ptr&& merge_pairs, col_device_view&& d_merge_pairs, - std::unique_ptr&& merge_pairs_map); + std::unique_ptr&& merge_pairs_map, + std::unique_ptr&& mp_table_map); auto const get_merge_pairs() const { return *d_merge_pairs; } auto get_merge_pairs_ref() const { return merge_pairs_map->ref(cuco::experimental::op::find); } + + auto get_mp_table_ref() const { return mp_table_map->ref(cuco::experimental::op::find); } }; } // namespace nvtext diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index c402ff7cc80..b5a49d80a31 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -104,13 +104,34 @@ std::unique_ptr initialize_merge_pairs_map( return merge_pairs_map; } +std::unique_ptr initialize_mp_table_map( + cudf::column_device_view const& input, rmm::cuda_stream_view stream) +{ + auto mp_table_map = std::make_unique( + static_cast(input.size()), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + table_equal{input}, + probe_scheme2{table_hasher{input}}, + hash_table_allocator_type{default_allocator{}, stream}, + stream.value()); + + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(cudf::size_type idx) { return cuco::make_pair(idx, idx); }); + + mp_table_map->insert_async(iter, iter + input.size(), stream.value()); + + return mp_table_map; +} + std::unique_ptr create_bpe_merge_pairs_impl( std::unique_ptr&& input, rmm::cuda_stream_view stream) { - auto d_input = cudf::column_device_view::create(input->view(), stream); - auto merge_pairs = initialize_merge_pairs_map(*d_input, stream); + auto d_input = cudf::column_device_view::create(input->view(), stream); + auto merge_pairs = initialize_merge_pairs_map(*d_input, stream); + auto mp_table_map = initialize_mp_table_map(*d_input, stream); return std::make_unique( - std::move(input), std::move(d_input), std::move(merge_pairs)); + std::move(input), std::move(d_input), std::move(merge_pairs), std::move(mp_table_map)); } std::unique_ptr create_bpe_merge_pairs_impl( @@ -163,10 +184,12 @@ bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl( std::unique_ptr&& merge_pairs, std::unique_ptr>&& d_merge_pairs, - std::unique_ptr&& merge_pairs_map) + std::unique_ptr&& merge_pairs_map, + std::unique_ptr&& mp_table_map) : merge_pairs(std::move(merge_pairs)), d_merge_pairs(std::move(d_merge_pairs)), - merge_pairs_map(std::move(merge_pairs_map)) + merge_pairs_map(std::move(merge_pairs_map)), + mp_table_map(std::move(mp_table_map)) { } diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp index d670492fb01..47466417553 100644 --- a/cpp/tests/text/bpe_tests.cpp +++ b/cpp/tests/text/bpe_tests.cpp @@ -54,8 +54,7 @@ TEST_F(TextBPETokenize, BytePairEncoding) validity); auto sv = cudf::strings_column_view(input); - auto results = nvtext::byte_pair_encoding(sv, *merge_pairs); - + auto results = nvtext::byte_pair_encoding(sv, *merge_pairs); auto expected = cudf::test::strings_column_wrapper({"this is it", "this is test - sent ence - 1", "this is test sent ence - 2", @@ -67,8 +66,8 @@ TEST_F(TextBPETokenize, BytePairEncoding) auto sliced = cudf::slice(input, {1, 4}).front(); auto sliced_expected = cudf::slice(expected, {1, 4}).front(); - sv = cudf::strings_column_view(sliced); + sv = cudf::strings_column_view(sliced); results = nvtext::byte_pair_encoding(sv, *merge_pairs); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); } From b978e0112c8d3b2a4edcdebb92d2e11ad4ff1fc7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 4 Oct 2023 23:27:00 -0400 Subject: [PATCH 20/28] fix sliced input offset parm --- cpp/src/text/subword/bpe_tokenizer.cu | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index e36d53e17f4..47da3ef0231 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -324,18 +324,14 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const rmm::device_uvector d_spaces(chars_size, stream); rmm::device_uvector d_ranks(chars_size, stream); // rank per string pair; rmm::device_uvector d_rerank(chars_size, stream); // re-ranking identifiers - auto const map_ref = merge_pairs.impl->get_merge_pairs_ref(); - auto const map_ref2 = merge_pairs.impl->get_mp_table_ref(); - - if ((input.offset() == 0) && (input.size() == input.offsets().size() - 1)) { - // TODO: this fails for sliced columns for some reason; - // we could get ride of the else{} if this was fixed + { + auto const map_ref2 = merge_pairs.impl->get_mp_table_ref(); // this path locates unpairable sections of code to create artificial string row boundaries; // the boundary values are recorded as offsets and stored temporarily in the d_ranks vector auto const block_count = (chars_size + block_size - 1) / block_size; bpe_up_offsets_fn<<>>( - d_input_chars, chars_size, input.offset(), map_ref2, d_ranks.data()); + d_input_chars, chars_size, first_offset, map_ref2, d_ranks.data()); auto const end = thrust::remove(rmm::exec_policy(stream), d_ranks.begin(), d_ranks.end(), 0); auto const total = thrust::distance(d_ranks.begin(), end); // number of unpairables @@ -361,11 +357,9 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const {col_offsets, input.chars()}); auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); + auto const map_ref = merge_pairs.impl->get_merge_pairs_ref(); bpe_parallel_fn<<>>( *d_tmp_strings, map_ref, d_spaces.data(), d_ranks.data(), d_rerank.data()); - } else { - bpe_parallel_fn<<>>( - *d_strings, map_ref, d_spaces.data(), d_ranks.data(), d_rerank.data()); } // compute the output sizes into the d_offsets vector From e158312652132d130139e083aa7bd649d35eb224 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 5 Oct 2023 15:44:19 -0400 Subject: [PATCH 21/28] remove re-rank re-init --- cpp/src/text/subword/bpe_tokenizer.cu | 125 +++++++++++++++-------- cpp/src/text/subword/bpe_tokenizer.cuh | 53 ++++++---- cpp/src/text/subword/load_merges_file.cu | 12 +-- 3 files changed, 123 insertions(+), 67 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 47da3ef0231..0d63e2b7507 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -51,12 +51,24 @@ namespace { constexpr int block_size = 512; +/** + * @brief Produces offsets to unpairable locations in the given chars array + * + * Launched as a thread per byte of the chars array. + * The output is non-zero offsets to locations of unpairable strings. + * + * @tparam MapRefType The type of the map finder object + * @param d_chars Input chars memory + * @param chars_size Number of bytes pointed to by `d_chars` + * @param d_map For looking up individual string candidates + * @param d_offsets Output is the offset location of unpairables + */ template -__global__ void bpe_up_offsets_fn(char const* d_chars, - cudf::size_type chars_size, - cudf::size_type offset, - MapRefType const d_map, - cudf::size_type* d_offsets) +__global__ void bpe_unpairable_offsets_fn(char const* d_chars, + cudf::size_type chars_size, // use device_span + cudf::size_type offset, + MapRefType const d_map, + cudf::size_type* d_offsets) { auto const idx = static_cast(cudf::detail::grid_1d::global_thread_id()); if (idx >= chars_size) { return; } @@ -65,21 +77,22 @@ __global__ void bpe_up_offsets_fn(char const* d_chars, return; } - auto next_substr = [d_chars, end = d_chars + chars_size](char const* begin) { - auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { - return cudf::strings::detail::is_begin_utf8_char(v); - }); - auto const size = static_cast(thrust::distance(begin, next)); - return cudf::string_view(begin, size); + auto const itr = d_chars + idx; + auto const end = d_chars + chars_size; + + auto next_str = [end](char const* begin) { + auto next = begin + (begin < end); + while (next < end && !cudf::strings::detail::is_begin_utf8_char(*next)) { + ++next; + } + return cudf::string_view(begin, static_cast(thrust::distance(begin, next))); }; - auto const itr = d_chars + idx; - auto const end = d_chars + chars_size; - auto const lhs = next_substr(itr); + auto const lhs = next_str(itr); auto const next_itr = itr + lhs.size_bytes(); auto output = 0; if (next_itr < end) { - auto const rhs = next_substr(next_itr); + auto const rhs = next_str(next_itr); if (!rhs.empty()) { // see if both halves exist anywhere in the table if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) { @@ -90,6 +103,29 @@ __global__ void bpe_up_offsets_fn(char const* d_chars, d_offsets[idx] = output; } +/** + * @brief Performs byte-pair-encoding + * + * Computes the locations where the separator will be inserted in `d_spaces_in`. + * This is launched as a string per block. + * + * The process first initializes all characters to 1 per position in `d_spaces_in`. + * All pairs are realized and their ranks stored in `d_ranks`. + * + * Iteratively, the minimum rank is located, the corresponding `d_spaces_in` location + * is set to 0 resulting in new potential pairs. The process repeats accounting for + * the rank of the newly formed pairs. + * + * Once there no more rankable pairs, the process finishes and the `d_spaces_in` + * values identify the location to insert the separator. + * + * @tparam MapRefType The type of the map finder object + * @param d_strings Input data + * @param d_map For looking up individual string candidates + * @param d_spaces_in Output the location where separator will be inserted + * @param d_ranks Working memory to hold pair ranks + * @param d_rerank_in Working memory to hold locations where reranking is required + */ template __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, MapRefType const d_map, @@ -103,7 +139,7 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, static_cast(cudf::detail::grid_1d::global_thread_id() / block_size); auto const lane_idx = static_cast(threadIdx.x); - if (d_strings.is_null(str_idx)) { return; } + // if (d_strings.is_null(str_idx)) { return; } // maybe no longer needed auto const d_str = d_strings.element(str_idx); if (d_str.empty()) { return; } @@ -131,6 +167,10 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size); }; + // init all the re-rank identifiers to zero + for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { + *itr = 0; + } // init all ranks to max for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { *itr = max_rank; @@ -167,21 +207,14 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, if (lane_idx == 0) { block_min_rank = reduce_rank; } __syncthreads(); - // loop through the ranks finding the current minimum until there are no more + // loop through the ranks processing the current minimum until there are no more while (block_min_rank < max_rank) { - // (re)initialize all the re-rank identifiers to zero - for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { - *itr = 0; - } - - // search the d_min_ranks for all the places where the rank matches block_min_rank + // search the d_min_ranks for matches to block_min_rank for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { if (*itr == block_min_rank) { - auto ptr = itr - 1; // check for adjacent min-rank edge-case - auto count = 8; - while (ptr > d_min_ranks && *ptr == max_rank && count > 0) { + auto ptr = itr - 1; // check for adjacent min-rank (edge-case) + while (ptr > d_min_ranks && *ptr == max_rank) { --ptr; - --count; } // set the output value to 0 at this position (erases separator) if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_min_ranks, itr)] = 0; } @@ -202,7 +235,7 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, return ptr; }; - // identify the re-rank locations + // identify all the re-rank locations (logic above created new pairs) for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { auto const index = thrust::distance(d_min_ranks, itr); if (*itr == block_min_rank && d_spaces[index] == 0) { @@ -216,11 +249,12 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, __syncthreads(); // compute the ranks for the newly created pairs - min_rank = max_rank; // and record new minimum + min_rank = max_rank; // and record the new minimum along the way for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { auto const index = thrust::distance(d_rerank, itr); auto rank = d_min_ranks[index]; if (*itr) { + *itr = 0; // reset re-rank // build lhs of pair auto const ptr = find_prev(d_spaces + index - 1); auto const size = static_cast(thrust::distance(ptr, d_spaces + index)); @@ -230,21 +264,32 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, rank = max_rank; if (!rhs.empty()) { auto const mp = merge_pair_type{lhs, rhs}; - auto const map_itr = d_map.find(mp); // lookup in merges; + auto const map_itr = d_map.find(mp); // lookup rank for this pair; if (map_itr != d_map.end()) { rank = map_itr->second; } // found a match } - d_min_ranks[index] = rank; + d_min_ranks[index] = rank; // store new rank } - if (rank < min_rank) min_rank = rank; + if (rank < min_rank) { min_rank = rank; } } - // compute the min rank across the block + // re-compute the minimum rank across the block (since new pairs are created above) auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); if (lane_idx == 0) { block_min_rank = reduce_rank; } __syncthreads(); } // if no mins were found we are done, otherwise start again } +/** + * @brief Computes the output size of each strings row + * + * This launches as a string per block. + * The non-zero values in `d_spaces_in` for each string is added to + * the current string size to produce the total output bytes. + * + * @param d_strings Input data + * @param d_spaces_in Output the location where separator will be inserted + * @param d_sizes Output sizes of each row + */ __global__ void bpe_finalize(cudf::column_device_view const d_strings, int8_t* d_spaces_in, // where separators are inserted cudf::size_type* d_sizes // output sizes of encoded strings @@ -326,12 +371,12 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const rmm::device_uvector d_rerank(chars_size, stream); // re-ranking identifiers { - auto const map_ref2 = merge_pairs.impl->get_mp_table_ref(); + auto const mp_map = merge_pairs.impl->get_mp_table_ref(); // this path locates unpairable sections of code to create artificial string row boundaries; // the boundary values are recorded as offsets and stored temporarily in the d_ranks vector - auto const block_count = (chars_size + block_size - 1) / block_size; - bpe_up_offsets_fn<<>>( - d_input_chars, chars_size, first_offset, map_ref2, d_ranks.data()); + cudf::detail::grid_1d grid(chars_size, block_size); + bpe_unpairable_offsets_fn<<>>( + d_input_chars, chars_size, first_offset, mp_map, d_ranks.data()); auto const end = thrust::remove(rmm::exec_policy(stream), d_ranks.begin(), d_ranks.end(), 0); auto const total = thrust::distance(d_ranks.begin(), end); // number of unpairables @@ -357,9 +402,9 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const {col_offsets, input.chars()}); auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); - auto const map_ref = merge_pairs.impl->get_merge_pairs_ref(); - bpe_parallel_fn<<>>( - *d_tmp_strings, map_ref, d_spaces.data(), d_ranks.data(), d_rerank.data()); + auto const pair_map = merge_pairs.impl->get_merge_pairs_ref(); + bpe_parallel_fn<<>>( + *d_tmp_strings, pair_map, d_spaces.data(), d_ranks.data(), d_rerank.data()); } // compute the output sizes into the d_offsets vector diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 25132a57ae2..be5a4e6e7dd 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -42,8 +42,8 @@ namespace nvtext { namespace detail { -using hash_value_type = uint32_t; using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; +using hash_value_type = string_hasher_type::result_type; using merge_pair_type = thrust::pair; using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; @@ -52,7 +52,10 @@ using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor; +using bpe_probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>; using merge_pairs_map_type = cuco::experimental::static_map, cuda::thread_scope_device, bpe_equal, - probe_scheme, + bpe_probe_scheme, hash_table_allocator_type>; -struct table_hasher { +/** + * @brief Hasher function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (merge_pair_type). + * + * Each component of the merge-pairs (left and right) are stored individually in the map. + */ +struct mp_hasher { cudf::column_device_view const d_strings; string_hasher_type hasher{}; // used by insert @@ -127,7 +141,7 @@ struct table_hasher { * This takes advantage of heterogeneous lookup feature in cuco static-map which * allows inserting with one type (index) and looking up with a different type (string). */ -struct table_equal { +struct mp_equal { cudf::column_device_view const d_strings; // used by insert __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept @@ -144,16 +158,15 @@ struct table_equal { } }; -using probe_scheme2 = cuco::experimental::linear_probing<1, table_hasher>; +using mp_probe_scheme = cuco::experimental::linear_probing<1, mp_hasher>; -using merge_pairs_map_type2 = - cuco::experimental::static_map, - cuda::thread_scope_device, - table_equal, - probe_scheme2, - hash_table_allocator_type>; +using mp_table_map_type = cuco::experimental::static_map, + cuda::thread_scope_device, + mp_equal, + mp_probe_scheme, + hash_table_allocator_type>; } // namespace detail @@ -166,18 +179,16 @@ using col_device_view = std::invoke_result_t const merge_pairs; col_device_view const d_merge_pairs; - std::unique_ptr merge_pairs_map; - - std::unique_ptr mp_table_map; + std::unique_ptr merge_pairs_map; // for BPE + std::unique_ptr mp_table_map; // for locating unpairables bpe_merge_pairs_impl(std::unique_ptr&& merge_pairs, col_device_view&& d_merge_pairs, std::unique_ptr&& merge_pairs_map, - std::unique_ptr&& mp_table_map); + std::unique_ptr&& mp_table_map); auto const get_merge_pairs() const { return *d_merge_pairs; } auto get_merge_pairs_ref() const { return merge_pairs_map->ref(cuco::experimental::op::find); } - auto get_mp_table_ref() const { return mp_table_map->ref(cuco::experimental::op::find); } }; diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index b5a49d80a31..ffe8f2156dc 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -92,7 +92,7 @@ std::unique_ptr initialize_merge_pairs_map( cuco::empty_key{-1}, cuco::empty_value{-1}, bpe_equal{input}, - probe_scheme{bpe_hasher{input}}, + bpe_probe_scheme{bpe_hasher{input}}, hash_table_allocator_type{default_allocator{}, stream}, stream.value()); @@ -104,15 +104,15 @@ std::unique_ptr initialize_merge_pairs_map( return merge_pairs_map; } -std::unique_ptr initialize_mp_table_map( +std::unique_ptr initialize_mp_table_map( cudf::column_device_view const& input, rmm::cuda_stream_view stream) { - auto mp_table_map = std::make_unique( + auto mp_table_map = std::make_unique( static_cast(input.size()), cuco::empty_key{-1}, cuco::empty_value{-1}, - table_equal{input}, - probe_scheme2{table_hasher{input}}, + mp_equal{input}, + mp_probe_scheme{mp_hasher{input}}, hash_table_allocator_type{default_allocator{}, stream}, stream.value()); @@ -185,7 +185,7 @@ bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl( std::unique_ptr>&& d_merge_pairs, std::unique_ptr&& merge_pairs_map, - std::unique_ptr&& mp_table_map) + std::unique_ptr&& mp_table_map) : merge_pairs(std::move(merge_pairs)), d_merge_pairs(std::move(d_merge_pairs)), merge_pairs_map(std::move(merge_pairs_map)), From 0f3bc94428ad520350b689d27b1a831558a8f272 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 6 Oct 2023 18:22:37 -0400 Subject: [PATCH 22/28] minor rework: variables and comments --- cpp/src/text/subword/bpe_tokenizer.cu | 80 +++++++++++++++------------ 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 0d63e2b7507..fa4533480ac 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -25,9 +25,7 @@ #include #include #include -#include #include -#include #include #include @@ -40,10 +38,8 @@ #include #include #include -#include #include #include -#include namespace nvtext { namespace detail { @@ -55,7 +51,10 @@ constexpr int block_size = 512; * @brief Produces offsets to unpairable locations in the given chars array * * Launched as a thread per byte of the chars array. - * The output is non-zero offsets to locations of unpairable strings. + * The output is non-zero offsets to locations of unpairable substring. + * An unpairable substring does not exist in the given map and so will + * never be paired :-(. Fortunately, this can be used as an artificial + * boundary providing better parallelism in the BPE kernel. * * @tparam MapRefType The type of the map finder object * @param d_chars Input chars memory @@ -96,42 +95,55 @@ __global__ void bpe_unpairable_offsets_fn(char const* d_chars, if (!rhs.empty()) { // see if both halves exist anywhere in the table if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) { - output = idx + lhs.size_bytes() + offset; // candidate for artificial boundary + output = idx + lhs.size_bytes() + offset; // offset for artificial boundary } } } d_offsets[idx] = output; + + // Alternate solution that only checks one substring. + // No noticeable performance improvement. + // auto const lhs = [begin = itr, end] { + // auto next = begin + (begin < end); + // while (next < end && !cudf::strings::detail::is_begin_utf8_char(*next)) { + // ++next; + // } + // return cudf::string_view(begin, static_cast(thrust::distance(begin, next))); + // }(); + // d_offsets[idx] = (((itr + lhs.size_bytes()) < end) && (d_map.find(lhs) == d_map.end())) + // ? idx + lhs.size_bytes() + offset // offset for artificial boundary + // : 0; } /** * @brief Performs byte-pair-encoding * - * Computes the locations where the separator will be inserted in `d_spaces_in`. + * Computes the locations where the separator will be inserted in `d_spaces_data`. * This is launched as a string per block. * - * The process first initializes all characters to 1 per position in `d_spaces_in`. + * The process first initializes all characters to 1 per position in `d_spaces_data`. * All pairs are realized and their ranks stored in `d_ranks`. * - * Iteratively, the minimum rank is located, the corresponding `d_spaces_in` location + * Iteratively, the minimum rank is located, the corresponding `d_spaces_data` location * is set to 0 resulting in new potential pairs. The process repeats accounting for * the rank of the newly formed pairs. * - * Once there no more rankable pairs, the process finishes and the `d_spaces_in` + * Once there are no more rankable pairs, the process finishes and the `d_spaces_data` * values identify the location to insert the separator. * * @tparam MapRefType The type of the map finder object * @param d_strings Input data * @param d_map For looking up individual string candidates - * @param d_spaces_in Output the location where separator will be inserted + * @param d_spaces_data Output the location where separator will be inserted * @param d_ranks Working memory to hold pair ranks - * @param d_rerank_in Working memory to hold locations where reranking is required + * @param d_rerank_data Working memory to hold locations where reranking is required */ template __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, MapRefType const d_map, - int8_t* d_spaces_in, // working memory + int8_t* d_spaces_data, // working memory cudf::size_type* d_ranks, // more working memory - int8_t* d_rerank_in // and one more working memory + int8_t* d_rerank_data // and one more working memory ) { // string per block @@ -139,7 +151,6 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, static_cast(cudf::detail::grid_1d::global_thread_id() / block_size); auto const lane_idx = static_cast(threadIdx.x); - // if (d_strings.is_null(str_idx)) { return; } // maybe no longer needed auto const d_str = d_strings.element(str_idx); if (d_str.empty()) { return; } @@ -147,11 +158,11 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, d_strings.child(cudf::strings_column_view::offsets_column_index).data(); auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; - auto const d_spaces = d_spaces_in + offset; + auto const d_spaces = d_spaces_data + offset; auto const end_spaces = d_spaces + d_str.size_bytes(); auto const d_min_ranks = d_ranks + offset; auto const end_ranks = d_min_ranks + d_str.size_bytes(); - auto const d_rerank = d_rerank_in + offset; + auto const d_rerank = d_rerank_data + offset; auto const end_rerank = d_rerank + d_str.size_bytes(); auto constexpr max_rank = cuda::std::numeric_limits::max(); @@ -283,15 +294,15 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, * @brief Computes the output size of each strings row * * This launches as a string per block. - * The non-zero values in `d_spaces_in` for each string is added to + * The non-zero values in `d_spaces_data` for each string is added to * the current string size to produce the total output bytes. * * @param d_strings Input data - * @param d_spaces_in Output the location where separator will be inserted + * @param d_spaces_data Output the location where separator will be inserted * @param d_sizes Output sizes of each row */ __global__ void bpe_finalize(cudf::column_device_view const d_strings, - int8_t* d_spaces_in, // where separators are inserted + int8_t* d_spaces_data, // where separators are inserted cudf::size_type* d_sizes // output sizes of encoded strings ) { @@ -314,7 +325,7 @@ __global__ void bpe_finalize(cudf::column_device_view const d_strings, d_strings.child(cudf::strings_column_view::offsets_column_index).data(); auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; - auto const d_spaces = d_spaces_in + offset; + auto const d_spaces = d_spaces_data + offset; auto const end_spaces = d_spaces + d_str.size_bytes(); auto const num_valid = block_size < d_str.size_bytes() ? block_size : d_str.size_bytes(); @@ -371,16 +382,16 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const rmm::device_uvector d_rerank(chars_size, stream); // re-ranking identifiers { - auto const mp_map = merge_pairs.impl->get_mp_table_ref(); - // this path locates unpairable sections of code to create artificial string row boundaries; + // this kernel locates unpairable sections of code to create artificial string row boundaries; // the boundary values are recorded as offsets and stored temporarily in the d_ranks vector + auto const mp_map = merge_pairs.impl->get_mp_table_ref(); cudf::detail::grid_1d grid(chars_size, block_size); bpe_unpairable_offsets_fn<<>>( d_input_chars, chars_size, first_offset, mp_map, d_ranks.data()); auto const end = thrust::remove(rmm::exec_policy(stream), d_ranks.begin(), d_ranks.end(), 0); auto const total = thrust::distance(d_ranks.begin(), end); // number of unpairables - // the new boundaries are combined with the existing offsets to build a temporary column + // the new boundaries are combined with the existing offsets auto tmp_offsets = rmm::device_uvector(total + input.size() + 1, stream); thrust::merge(rmm::exec_policy(stream), input.offsets_begin(), @@ -389,8 +400,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const end, tmp_offsets.begin()); - // the temp column is used for the encoding functions which is much faster - // on a larger number of smaller strings + // temp column created for the encoding which parallelizes between the unpairable boundaries auto const col_offsets = cudf::column_view(cudf::device_span(tmp_offsets)); auto const tmp_input = cudf::column_view(input.parent().type(), @@ -407,7 +417,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const *d_tmp_strings, pair_map, d_spaces.data(), d_ranks.data(), d_rerank.data()); } - // compute the output sizes into the d_offsets vector + // compute the output sizes into the output d_offsets vector bpe_finalize<<>>( *d_strings, d_spaces.data(), d_offsets); @@ -418,11 +428,11 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const "Size of output exceeds the column size limit", std::overflow_error); - // build the output: adding separators between the remaining pairs in each string + // build the output: adding separators to the input character data auto chars = cudf::strings::detail::create_chars_child_column(bytes, stream, mr); auto d_chars = chars->mutable_view().data(); - // we can reuse the ranks working memory to store some temporary offsets now; + // we can reuse the ranks working memory to store some temporary offsets; // the offsets are produced by the index of the d_spaces values auto const d_inserts = d_ranks.data(); // create offsets where separators will be inserted @@ -437,12 +447,12 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const // this will insert the single-byte separator in positions specified in d_inserts auto const sep_char = thrust::constant_iterator(separator.to_string(stream)[0]); thrust::merge_by_key(rmm::exec_policy(stream), - d_inserts, // where separator is inserted - copy_end, - zero_itr, // all positions - chars_end, - sep_char, // byte to insert - d_input_chars, + d_inserts, // where to insert separator byte + copy_end, // + zero_itr, // all positions + chars_end, // + sep_char, // byte to insert + d_input_chars, // original data thrust::make_discard_iterator(), d_chars); // result From 747e10fd47db44664998d8282dd508e5a0217f0b Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 11 Oct 2023 14:48:40 -0400 Subject: [PATCH 23/28] remove any duplicate tmp offsets --- cpp/src/text/subword/bpe_tokenizer.cu | 75 ++++++++++++++------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index fa4533480ac..51cef6507aa 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -40,6 +40,8 @@ #include #include #include +#include +#include namespace nvtext { namespace detail { @@ -152,8 +154,6 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, auto const lane_idx = static_cast(threadIdx.x); auto const d_str = d_strings.element(str_idx); - if (d_str.empty()) { return; } - auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index).data(); auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; @@ -172,12 +172,6 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, __shared__ typename block_reduce::TempStorage temp_storage; auto const num_valid = block_size < d_str.size_bytes() ? block_size : d_str.size_bytes(); - auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) { - auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); - auto const size = static_cast(thrust::distance(begin, next)); - return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size); - }; - // init all the re-rank identifiers to zero for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { *itr = 0; @@ -193,6 +187,12 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, } __syncthreads(); + auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) { + auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); + auto const size = static_cast(thrust::distance(begin, next)); + return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size); + }; + auto min_rank = max_rank; // store all the initial ranks for each pair @@ -218,6 +218,20 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, if (lane_idx == 0) { block_min_rank = reduce_rank; } __syncthreads(); + // these are used to locate adjacent pairs after merging a pair + auto find_prev = [begin = d_spaces](int8_t* ptr) { + while (ptr > begin && *ptr == 0) { + --ptr; + } + return ptr; + }; + auto find_next = [end = end_spaces](int8_t* ptr) { + while (ptr < end && *ptr == 0) { + ++ptr; + } + return ptr; + }; + // loop through the ranks processing the current minimum until there are no more while (block_min_rank < max_rank) { // search the d_min_ranks for matches to block_min_rank @@ -233,19 +247,6 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, } __syncthreads(); - auto find_prev = [begin = d_spaces](int8_t* ptr) { - while (ptr > begin && *ptr == 0) { - --ptr; - } - return ptr; - }; - auto find_next = [end = end_spaces](int8_t* ptr) { - while (ptr < end && *ptr == 0) { - ++ptr; - } - return ptr; - }; - // identify all the re-rank locations (logic above created new pairs) for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { auto const index = thrust::distance(d_min_ranks, itr); @@ -270,9 +271,8 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, auto const ptr = find_prev(d_spaces + index - 1); auto const size = static_cast(thrust::distance(ptr, d_spaces + index)); auto const lhs = cudf::string_view(d_str.data() + thrust::distance(d_spaces, ptr), size); - // retrieve rhs of pair - auto const rhs = next_substr(d_spaces + index); - rank = max_rank; + auto const rhs = next_substr(d_spaces + index); // retrieve rhs of pair + rank = max_rank; if (!rhs.empty()) { auto const mp = merge_pair_type{lhs, rhs}; auto const map_itr = d_map.find(mp); // lookup rank for this pair; @@ -388,32 +388,33 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const cudf::detail::grid_1d grid(chars_size, block_size); bpe_unpairable_offsets_fn<<>>( d_input_chars, chars_size, first_offset, mp_map, d_ranks.data()); - auto const end = thrust::remove(rmm::exec_policy(stream), d_ranks.begin(), d_ranks.end(), 0); - auto const total = thrust::distance(d_ranks.begin(), end); // number of unpairables + auto const up_end = thrust::remove(rmm::exec_policy(stream), d_ranks.begin(), d_ranks.end(), 0); + auto const unpairables = thrust::distance(d_ranks.begin(), up_end); // number of unpairables // the new boundaries are combined with the existing offsets - auto tmp_offsets = rmm::device_uvector(total + input.size() + 1, stream); + auto tmp_offsets = rmm::device_uvector(unpairables + input.size() + 1, stream); thrust::merge(rmm::exec_policy(stream), input.offsets_begin(), input.offsets_end(), d_ranks.begin(), - end, + up_end, tmp_offsets.begin()); + // remove adjacent duplicate offsets (empty or null rows) + auto const offsets_end = + thrust::unique(rmm::exec_policy(stream), tmp_offsets.begin(), tmp_offsets.end()); + auto const offsets_total = + static_cast(thrust::distance(tmp_offsets.begin(), offsets_end)); // temp column created for the encoding which parallelizes between the unpairable boundaries auto const col_offsets = cudf::column_view(cudf::device_span(tmp_offsets)); - auto const tmp_input = cudf::column_view(input.parent().type(), - static_cast(input.size() + total), - nullptr, - nullptr, - 0, - 0, - {col_offsets, input.chars()}); + auto const tmp_size = offsets_total - 1; + auto const tmp_input = cudf::column_view( + input.parent().type(), tmp_size, nullptr, nullptr, 0, 0, {col_offsets, input.chars()}); auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); - + // launch the byte-pair-encoding kernel auto const pair_map = merge_pairs.impl->get_merge_pairs_ref(); - bpe_parallel_fn<<>>( + bpe_parallel_fn<<>>( *d_tmp_strings, pair_map, d_spaces.data(), d_ranks.data(), d_rerank.data()); } From 3c2a866eed6dc678d29192dc20a14156c1a8acd6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 23 Oct 2023 12:59:44 -0400 Subject: [PATCH 24/28] fix tmp-offsets size --- cpp/src/text/subword/bpe_tokenizer.cu | 52 ++++++++++++++------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 51cef6507aa..6bca8544b57 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -377,35 +377,39 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const offset_data_type, input.size() + 1, cudf::mask_state::UNALLOCATED, stream, mr); auto d_offsets = offsets->mutable_view().data(); - rmm::device_uvector d_spaces(chars_size, stream); - rmm::device_uvector d_ranks(chars_size, stream); // rank per string pair; - rmm::device_uvector d_rerank(chars_size, stream); // re-ranking identifiers + rmm::device_uvector d_spaces(chars_size, stream); // identifies non-merged pairs + rmm::device_uvector d_rerank(chars_size, stream); // re-ranking identifiers + // used for various purposes below: unpairable-offsets, pair ranks, separator insert positions + rmm::device_uvector d_working(chars_size, stream); { - // this kernel locates unpairable sections of code to create artificial string row boundaries; - // the boundary values are recorded as offsets and stored temporarily in the d_ranks vector - auto const mp_map = merge_pairs.impl->get_mp_table_ref(); + // this kernel locates unpairable sections of strings to create artificial string row + // boundaries; the boundary values are recorded as offsets in d_up_offsets + auto const d_up_offsets = d_working.data(); // store unpairable offsets here + auto const mp_map = merge_pairs.impl->get_mp_table_ref(); // lookup table cudf::detail::grid_1d grid(chars_size, block_size); bpe_unpairable_offsets_fn<<>>( - d_input_chars, chars_size, first_offset, mp_map, d_ranks.data()); - auto const up_end = thrust::remove(rmm::exec_policy(stream), d_ranks.begin(), d_ranks.end(), 0); - auto const unpairables = thrust::distance(d_ranks.begin(), up_end); // number of unpairables + d_input_chars, chars_size, first_offset, mp_map, d_up_offsets); + auto const up_end = // remove all but the unpairable offsets + thrust::remove(rmm::exec_policy(stream), d_up_offsets, d_up_offsets + chars_size, 0); + auto const unpairables = thrust::distance(d_up_offsets, up_end); // number of unpairables - // the new boundaries are combined with the existing offsets + // the new boundaries created by combining unpairable offsets with the existing offsets auto tmp_offsets = rmm::device_uvector(unpairables + input.size() + 1, stream); thrust::merge(rmm::exec_policy(stream), input.offsets_begin(), input.offsets_end(), - d_ranks.begin(), + d_up_offsets, up_end, tmp_offsets.begin()); - // remove adjacent duplicate offsets (empty or null rows) + // remove any adjacent duplicate offsets (empty or null rows) auto const offsets_end = thrust::unique(rmm::exec_policy(stream), tmp_offsets.begin(), tmp_offsets.end()); auto const offsets_total = static_cast(thrust::distance(tmp_offsets.begin(), offsets_end)); + tmp_offsets.resize(offsets_total, stream); - // temp column created for the encoding which parallelizes between the unpairable boundaries + // temp column created with the merged offsets and the original chars data auto const col_offsets = cudf::column_view(cudf::device_span(tmp_offsets)); auto const tmp_size = offsets_total - 1; @@ -413,9 +417,10 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const input.parent().type(), tmp_size, nullptr, nullptr, 0, 0, {col_offsets, input.chars()}); auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); // launch the byte-pair-encoding kernel + auto const d_ranks = d_working.data(); // store pair ranks here auto const pair_map = merge_pairs.impl->get_merge_pairs_ref(); bpe_parallel_fn<<>>( - *d_tmp_strings, pair_map, d_spaces.data(), d_ranks.data(), d_rerank.data()); + *d_tmp_strings, pair_map, d_spaces.data(), d_ranks, d_rerank.data()); } // compute the output sizes into the output d_offsets vector @@ -429,28 +434,25 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const "Size of output exceeds the column size limit", std::overflow_error); - // build the output: adding separators to the input character data + // build the output: inserting separators to the input character data auto chars = cudf::strings::detail::create_chars_child_column(bytes, stream, mr); auto d_chars = chars->mutable_view().data(); - // we can reuse the ranks working memory to store some temporary offsets; - // the offsets are produced by the index of the d_spaces values - auto const d_inserts = d_ranks.data(); - // create offsets where separators will be inserted + auto const d_inserts = d_working.data(); // stores the insert positions auto offsets_at_non_zero = [d_spaces = d_spaces.data()] __device__(auto idx) { return d_spaces[idx] > 0; // separator to be inserted here }; - auto const zero_itr = thrust::counting_iterator(0); - auto const chars_end = thrust::counting_iterator(chars_size); - auto const copy_end = thrust::copy_if( - rmm::exec_policy(stream), zero_itr + 1, chars_end, d_inserts, offsets_at_non_zero); + auto const chars_begin = thrust::counting_iterator(0); + auto const chars_end = thrust::counting_iterator(chars_size); + auto const copy_end = thrust::copy_if( + rmm::exec_policy(stream), chars_begin + 1, chars_end, d_inserts, offsets_at_non_zero); - // this will insert the single-byte separator in positions specified in d_inserts + // this will insert the single-byte separator into positions specified in d_inserts auto const sep_char = thrust::constant_iterator(separator.to_string(stream)[0]); thrust::merge_by_key(rmm::exec_policy(stream), d_inserts, // where to insert separator byte copy_end, // - zero_itr, // all positions + chars_begin, // all positions chars_end, // sep_char, // byte to insert d_input_chars, // original data From 40bd2aee9fa1dfdf354a33c07e713e5482a3e27a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 25 Oct 2023 14:08:23 -0400 Subject: [PATCH 25/28] replaced some device utilities with thrust functions --- cpp/src/text/subword/bpe_tokenizer.cu | 89 +++++++++++---------------- 1 file changed, 37 insertions(+), 52 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 6bca8544b57..e7a030dbc0e 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -53,10 +53,10 @@ constexpr int block_size = 512; * @brief Produces offsets to unpairable locations in the given chars array * * Launched as a thread per byte of the chars array. - * The output is non-zero offsets to locations of unpairable substring. + * The output is non-zero offsets to locations of unpairable substrings. * An unpairable substring does not exist in the given map and so will * never be paired :-(. Fortunately, this can be used as an artificial - * boundary providing better parallelism in the BPE kernel. + * boundary providing increased parallelism in the BPE kernel. * * @tparam MapRefType The type of the map finder object * @param d_chars Input chars memory @@ -65,8 +65,8 @@ constexpr int block_size = 512; * @param d_offsets Output is the offset location of unpairables */ template -__global__ void bpe_unpairable_offsets_fn(char const* d_chars, - cudf::size_type chars_size, // use device_span +__global__ void bpe_unpairable_offsets_fn(char const* d_chars, // use device_span + cudf::size_type chars_size, cudf::size_type offset, MapRefType const d_map, cudf::size_type* d_offsets) @@ -78,27 +78,16 @@ __global__ void bpe_unpairable_offsets_fn(char const* d_chars, return; } - auto const itr = d_chars + idx; - auto const end = d_chars + chars_size; - - auto next_str = [end](char const* begin) { - auto next = begin + (begin < end); - while (next < end && !cudf::strings::detail::is_begin_utf8_char(*next)) { - ++next; - } - return cudf::string_view(begin, static_cast(thrust::distance(begin, next))); - }; - - auto const lhs = next_str(itr); - auto const next_itr = itr + lhs.size_bytes(); - auto output = 0; - if (next_itr < end) { - auto const rhs = next_str(next_itr); - if (!rhs.empty()) { - // see if both halves exist anywhere in the table - if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) { - output = idx + lhs.size_bytes() + offset; // offset for artificial boundary - } + auto const itr = d_chars + idx; + auto const end = d_chars + chars_size; + auto const lhs = cudf::string_view(itr, cudf::strings::detail::bytes_in_utf8_byte(*itr)); + auto const next = itr + lhs.size_bytes(); + auto output = 0; + if (next < end) { + auto const rhs = cudf::string_view(next, cudf::strings::detail::bytes_in_utf8_byte(*next)); + // see if both halves exist anywhere in the table, if not these are unpairable + if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) { + output = idx + lhs.size_bytes() + offset; // offset for artificial boundary } } d_offsets[idx] = output; @@ -187,17 +176,25 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, } __syncthreads(); + // for finding the next half of a pair auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) { - auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); + auto const next = thrust::find(thrust::seq, begin + 1, end, 1); auto const size = static_cast(thrust::distance(begin, next)); return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size); }; + // for locating adjacent pairs after merging a pair + auto find_prev = [begin = d_spaces](int8_t* ptr) { + while (ptr > begin && *ptr == 0) { + --ptr; + } + return ptr; + }; auto min_rank = max_rank; // store all the initial ranks for each pair for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { - if (*itr == 0) { continue; } // start on valid bytes only + if (*itr == 0) { continue; } // skips any UTF-8 continuation bytes // resolve pair and lookup its rank auto const lhs = next_substr(itr); // retrieve lhs of the pair auto const next_itr = itr + lhs.size_bytes(); @@ -218,20 +215,6 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, if (lane_idx == 0) { block_min_rank = reduce_rank; } __syncthreads(); - // these are used to locate adjacent pairs after merging a pair - auto find_prev = [begin = d_spaces](int8_t* ptr) { - while (ptr > begin && *ptr == 0) { - --ptr; - } - return ptr; - }; - auto find_next = [end = end_spaces](int8_t* ptr) { - while (ptr < end && *ptr == 0) { - ++ptr; - } - return ptr; - }; - // loop through the ranks processing the current minimum until there are no more while (block_min_rank < max_rank) { // search the d_min_ranks for matches to block_min_rank @@ -241,19 +224,21 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, while (ptr > d_min_ranks && *ptr == max_rank) { --ptr; } - // set the output value to 0 at this position (erases separator) + // set the output value to 0 at this position (erases separator, merges pair) if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_min_ranks, itr)] = 0; } } } __syncthreads(); - // identify all the re-rank locations (logic above created new pairs) + // identify all the re-rank locations (logic above invalidated adjacent pairs) for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { auto const index = thrust::distance(d_min_ranks, itr); if (*itr == block_min_rank && d_spaces[index] == 0) { - auto ptr = find_prev(d_spaces + index - 1); // find previous pair mid-point + // find previous pair mid-point + auto ptr = find_prev(d_spaces + index - 1); if (ptr > d_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; } - ptr = find_next(d_spaces + index + 1); // find next pair mid-point + // find next pair mid-point + ptr = thrust::find(thrust::seq, d_spaces + index + 1, end_spaces, 1); if (ptr < end_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; } *itr = max_rank; // reset this rank } @@ -287,7 +272,7 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); if (lane_idx == 0) { block_min_rank = reduce_rank; } __syncthreads(); - } // if no mins were found we are done, otherwise start again + } // if no min ranks are found we are done, otherwise start again } /** @@ -340,8 +325,8 @@ __global__ void bpe_finalize(cudf::column_device_view const d_strings, for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { bytes += (*itr > 0); } - auto const size = block_reduce(temp_storage).Sum(bytes, num_valid); - if (lane_idx == 0) { d_sizes[str_idx] = size + d_str.size_bytes(); } + auto const total_bytes = block_reduce(temp_storage).Sum(bytes, num_valid); + if (lane_idx == 0) { d_sizes[str_idx] = total_bytes + d_str.size_bytes(); } } } // namespace @@ -394,7 +379,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const thrust::remove(rmm::exec_policy(stream), d_up_offsets, d_up_offsets + chars_size, 0); auto const unpairables = thrust::distance(d_up_offsets, up_end); // number of unpairables - // the new boundaries created by combining unpairable offsets with the existing offsets + // new string boundaries created by combining unpairable offsets with the existing offsets auto tmp_offsets = rmm::device_uvector(unpairables + input.size() + 1, stream); thrust::merge(rmm::exec_policy(stream), input.offsets_begin(), @@ -402,7 +387,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const d_up_offsets, up_end, tmp_offsets.begin()); - // remove any adjacent duplicate offsets (empty or null rows) + // remove any adjacent duplicate offsets (i.e. empty or null rows) auto const offsets_end = thrust::unique(rmm::exec_policy(stream), tmp_offsets.begin(), tmp_offsets.end()); auto const offsets_total = @@ -416,7 +401,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const auto const tmp_input = cudf::column_view( input.parent().type(), tmp_size, nullptr, nullptr, 0, 0, {col_offsets, input.chars()}); auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); - // launch the byte-pair-encoding kernel + // launch the byte-pair-encoding kernel on the temp column auto const d_ranks = d_working.data(); // store pair ranks here auto const pair_map = merge_pairs.impl->get_merge_pairs_ref(); bpe_parallel_fn<<>>( @@ -452,7 +437,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const thrust::merge_by_key(rmm::exec_policy(stream), d_inserts, // where to insert separator byte copy_end, // - chars_begin, // all positions + chars_begin, // all indices chars_end, // sep_char, // byte to insert d_input_chars, // original data From 6738159e561fd77d5c9949dfb01c906e633c99b8 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 26 Oct 2023 09:24:37 -0400 Subject: [PATCH 26/28] change custom kernel to transform --- cpp/src/text/subword/bpe_tokenizer.cu | 143 +++++++++++++------------- 1 file changed, 69 insertions(+), 74 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index e7a030dbc0e..542fe69f156 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -59,52 +59,45 @@ constexpr int block_size = 512; * boundary providing increased parallelism in the BPE kernel. * * @tparam MapRefType The type of the map finder object - * @param d_chars Input chars memory - * @param chars_size Number of bytes pointed to by `d_chars` - * @param d_map For looking up individual string candidates - * @param d_offsets Output is the offset location of unpairables */ template -__global__ void bpe_unpairable_offsets_fn(char const* d_chars, // use device_span - cudf::size_type chars_size, - cudf::size_type offset, - MapRefType const d_map, - cudf::size_type* d_offsets) -{ - auto const idx = static_cast(cudf::detail::grid_1d::global_thread_id()); - if (idx >= chars_size) { return; } - if (!cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { - d_offsets[idx] = 0; - return; - } - - auto const itr = d_chars + idx; - auto const end = d_chars + chars_size; - auto const lhs = cudf::string_view(itr, cudf::strings::detail::bytes_in_utf8_byte(*itr)); - auto const next = itr + lhs.size_bytes(); - auto output = 0; - if (next < end) { - auto const rhs = cudf::string_view(next, cudf::strings::detail::bytes_in_utf8_byte(*next)); - // see if both halves exist anywhere in the table, if not these are unpairable - if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) { - output = idx + lhs.size_bytes() + offset; // offset for artificial boundary +struct bpe_unpairable_offsets_fn { + cudf::device_span d_chars; + cudf::size_type offset; + MapRefType const d_map; + __device__ cudf::size_type operator()(cudf::size_type idx) + { + if (!cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { return 0; } + + auto const itr = d_chars.data() + idx; + auto const end = d_chars.end(); // + chars_size; + auto const lhs = cudf::string_view(itr, cudf::strings::detail::bytes_in_utf8_byte(*itr)); + auto const next = itr + lhs.size_bytes(); + auto output = 0; + if (next < end) { + auto const rhs = cudf::string_view(next, cudf::strings::detail::bytes_in_utf8_byte(*next)); + // see if both halves exist anywhere in the table, if not these are unpairable + if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) { + output = idx + lhs.size_bytes() + offset; // offset for artificial boundary + } } + return output; + + // Alternate solution that only checks one substring. + // No noticeable performance improvement. + // auto const lhs = [begin = itr, end] { + // auto next = begin + (begin < end); + // while (next < end && !cudf::strings::detail::is_begin_utf8_char(*next)) { + // ++next; + // } + // return cudf::string_view(begin, static_cast(thrust::distance(begin, + // next))); + // }(); + // d_offsets[idx] = (((itr + lhs.size_bytes()) < end) && (d_map.find(lhs) == d_map.end())) + // ? idx + lhs.size_bytes() + offset // offset for artificial boundary + // : 0; } - d_offsets[idx] = output; - - // Alternate solution that only checks one substring. - // No noticeable performance improvement. - // auto const lhs = [begin = itr, end] { - // auto next = begin + (begin < end); - // while (next < end && !cudf::strings::detail::is_begin_utf8_char(*next)) { - // ++next; - // } - // return cudf::string_view(begin, static_cast(thrust::distance(begin, next))); - // }(); - // d_offsets[idx] = (((itr + lhs.size_bytes()) < end) && (d_map.find(lhs) == d_map.end())) - // ? idx + lhs.size_bytes() + offset // offset for artificial boundary - // : 0; -} +}; /** * @brief Performs byte-pair-encoding @@ -113,7 +106,7 @@ __global__ void bpe_unpairable_offsets_fn(char const* d_chars, // use device_sp * This is launched as a string per block. * * The process first initializes all characters to 1 per position in `d_spaces_data`. - * All pairs are realized and their ranks stored in `d_ranks`. + * All pairs are realized and their ranks stored in `d_ranks_data`. * * Iteratively, the minimum rank is located, the corresponding `d_spaces_data` location * is set to 0 resulting in new potential pairs. The process repeats accounting for @@ -126,15 +119,15 @@ __global__ void bpe_unpairable_offsets_fn(char const* d_chars, // use device_sp * @param d_strings Input data * @param d_map For looking up individual string candidates * @param d_spaces_data Output the location where separator will be inserted - * @param d_ranks Working memory to hold pair ranks + * @param d_ranks_data Working memory to hold pair ranks * @param d_rerank_data Working memory to hold locations where reranking is required */ template __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, MapRefType const d_map, - int8_t* d_spaces_data, // working memory - cudf::size_type* d_ranks, // more working memory - int8_t* d_rerank_data // and one more working memory + int8_t* d_spaces_data, // working memory + cudf::size_type* d_ranks_data, // more working memory + int8_t* d_rerank_data // and one more working memory ) { // string per block @@ -147,12 +140,12 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, d_strings.child(cudf::strings_column_view::offsets_column_index).data(); auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; - auto const d_spaces = d_spaces_data + offset; - auto const end_spaces = d_spaces + d_str.size_bytes(); - auto const d_min_ranks = d_ranks + offset; - auto const end_ranks = d_min_ranks + d_str.size_bytes(); - auto const d_rerank = d_rerank_data + offset; - auto const end_rerank = d_rerank + d_str.size_bytes(); + auto const d_spaces = d_spaces_data + offset; + auto const end_spaces = d_spaces + d_str.size_bytes(); + auto const d_ranks = d_ranks_data + offset; + auto const end_ranks = d_ranks + d_str.size_bytes(); + auto const d_rerank = d_rerank_data + offset; + auto const end_rerank = d_rerank + d_str.size_bytes(); auto constexpr max_rank = cuda::std::numeric_limits::max(); @@ -166,7 +159,7 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, *itr = 0; } // init all ranks to max - for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { + for (auto itr = d_ranks + lane_idx; itr < end_ranks; itr += block_size) { *itr = max_rank; } // init all spaces to 1 as appropriate @@ -203,9 +196,9 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, if (!rhs.empty()) { auto rank = max_rank; auto const mp = merge_pair_type{lhs, rhs}; - auto const map_itr = d_map.find(mp); // lookup pair in merges table; - if (map_itr != d_map.end()) { rank = map_itr->second; } // found a match; - d_min_ranks[thrust::distance(d_spaces, next_itr)] = rank; // store the rank + auto const map_itr = d_map.find(mp); // lookup pair in merges table; + if (map_itr != d_map.end()) { rank = map_itr->second; } // found a match; + d_ranks[thrust::distance(d_spaces, next_itr)] = rank; // store the rank if (rank < min_rank) min_rank = rank; } } @@ -217,22 +210,22 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, // loop through the ranks processing the current minimum until there are no more while (block_min_rank < max_rank) { - // search the d_min_ranks for matches to block_min_rank - for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { + // search the d_ranks for matches to block_min_rank + for (auto itr = d_ranks + lane_idx; itr < end_ranks; itr += block_size) { if (*itr == block_min_rank) { auto ptr = itr - 1; // check for adjacent min-rank (edge-case) - while (ptr > d_min_ranks && *ptr == max_rank) { + while (ptr > d_ranks && *ptr == max_rank) { --ptr; } // set the output value to 0 at this position (erases separator, merges pair) - if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_min_ranks, itr)] = 0; } + if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_ranks, itr)] = 0; } } } __syncthreads(); // identify all the re-rank locations (logic above invalidated adjacent pairs) - for (auto itr = d_min_ranks + lane_idx; itr < end_ranks; itr += block_size) { - auto const index = thrust::distance(d_min_ranks, itr); + for (auto itr = d_ranks + lane_idx; itr < end_ranks; itr += block_size) { + auto const index = thrust::distance(d_ranks, itr); if (*itr == block_min_rank && d_spaces[index] == 0) { // find previous pair mid-point auto ptr = find_prev(d_spaces + index - 1); @@ -249,7 +242,7 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, min_rank = max_rank; // and record the new minimum along the way for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { auto const index = thrust::distance(d_rerank, itr); - auto rank = d_min_ranks[index]; + auto rank = d_ranks[index]; if (*itr) { *itr = 0; // reset re-rank // build lhs of pair @@ -263,7 +256,7 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, auto const map_itr = d_map.find(mp); // lookup rank for this pair; if (map_itr != d_map.end()) { rank = map_itr->second; } // found a match } - d_min_ranks[index] = rank; // store new rank + d_ranks[index] = rank; // store new rank } if (rank < min_rank) { min_rank = rank; } } @@ -363,18 +356,20 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const auto d_offsets = offsets->mutable_view().data(); rmm::device_uvector d_spaces(chars_size, stream); // identifies non-merged pairs - rmm::device_uvector d_rerank(chars_size, stream); // re-ranking identifiers // used for various purposes below: unpairable-offsets, pair ranks, separator insert positions rmm::device_uvector d_working(chars_size, stream); + auto const chars_begin = thrust::counting_iterator(0); + auto const chars_end = thrust::counting_iterator(chars_size); + { // this kernel locates unpairable sections of strings to create artificial string row // boundaries; the boundary values are recorded as offsets in d_up_offsets auto const d_up_offsets = d_working.data(); // store unpairable offsets here auto const mp_map = merge_pairs.impl->get_mp_table_ref(); // lookup table - cudf::detail::grid_1d grid(chars_size, block_size); - bpe_unpairable_offsets_fn<<>>( - d_input_chars, chars_size, first_offset, mp_map, d_up_offsets); + auto const d_chars_span = cudf::device_span(d_input_chars, chars_size); + auto up_fn = bpe_unpairable_offsets_fn{d_chars_span, first_offset, mp_map}; + thrust::transform(rmm::exec_policy(stream), chars_begin, chars_end, d_up_offsets, up_fn); auto const up_end = // remove all but the unpairable offsets thrust::remove(rmm::exec_policy(stream), d_up_offsets, d_up_offsets + chars_size, 0); auto const unpairables = thrust::distance(d_up_offsets, up_end); // number of unpairables @@ -401,14 +396,16 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const auto const tmp_input = cudf::column_view( input.parent().type(), tmp_size, nullptr, nullptr, 0, 0, {col_offsets, input.chars()}); auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); + // launch the byte-pair-encoding kernel on the temp column - auto const d_ranks = d_working.data(); // store pair ranks here + rmm::device_uvector d_rerank(chars_size, stream); // more working memory; + auto const d_ranks = d_working.data(); // store pair ranks here auto const pair_map = merge_pairs.impl->get_merge_pairs_ref(); bpe_parallel_fn<<>>( *d_tmp_strings, pair_map, d_spaces.data(), d_ranks, d_rerank.data()); } - // compute the output sizes into the output d_offsets vector + // compute the output sizes and store them in the d_offsets vector bpe_finalize<<>>( *d_strings, d_spaces.data(), d_offsets); @@ -427,9 +424,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const auto offsets_at_non_zero = [d_spaces = d_spaces.data()] __device__(auto idx) { return d_spaces[idx] > 0; // separator to be inserted here }; - auto const chars_begin = thrust::counting_iterator(0); - auto const chars_end = thrust::counting_iterator(chars_size); - auto const copy_end = thrust::copy_if( + auto const copy_end = thrust::copy_if( rmm::exec_policy(stream), chars_begin + 1, chars_end, d_inserts, offsets_at_non_zero); // this will insert the single-byte separator into positions specified in d_inserts From f24ce0997e72a385d2821c30dba7966726127095 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 6 Nov 2023 11:04:51 -0500 Subject: [PATCH 27/28] add pytest for BytePairEncoder --- ...{bpe_tokenize.pxd => byte_pair_encode.pxd} | 2 +- python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 2 +- ...{bpe_tokenize.pyx => byte_pair_encode.pyx} | 2 +- python/cudf/cudf/core/byte_pair_encoding.py | 2 +- .../cudf/cudf/tests/text/test_text_methods.py | 41 +++++++++++++++++++ 5 files changed, 45 insertions(+), 4 deletions(-) rename python/cudf/cudf/_lib/cpp/nvtext/{bpe_tokenize.pxd => byte_pair_encode.pxd} (89%) rename python/cudf/cudf/_lib/nvtext/{bpe_tokenize.pyx => byte_pair_encode.pyx} (96%) diff --git a/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd similarity index 89% rename from python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd rename to python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd index 45a8574d805..e678e4e84db 100644 --- a/python/cudf/cudf/_lib/cpp/nvtext/bpe_tokenize.pxd +++ b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd @@ -8,7 +8,7 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport string_scalar -cdef extern from "nvtext/bpe_tokenize.hpp" namespace "nvtext" nogil: +cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil: cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs": pass diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt index 6d4a36b0428..d7cbdeb5bda 100644 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources - bpe_tokenize.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx + byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx similarity index 96% rename from python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx rename to python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx index cad33060eaf..63a653fce4e 100644 --- a/python/cudf/cudf/_lib/nvtext/bpe_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx @@ -9,7 +9,7 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.nvtext.bpe_tokenize cimport ( +from cudf._lib.cpp.nvtext.byte_pair_encode cimport ( bpe_merge_pairs as cpp_bpe_merge_pairs, byte_pair_encoding as cpp_byte_pair_encoding, load_merge_pairs as cpp_load_merge_pairs, diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index a1674a099c5..737a31b965e 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -3,7 +3,7 @@ from __future__ import annotations import cudf -from cudf._lib.nvtext.bpe_tokenize import ( +from cudf._lib.nvtext.byte_pair_encode import ( BPE_Merge_Pairs as cpp_merge_pairs, byte_pair_encoding as cpp_byte_pair_encoding, ) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 2241390a531..ba2a7db6578 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -7,6 +7,7 @@ import pytest import cudf +from cudf.core.byte_pair_encoding import BytePairEncoder from cudf.core.tokenize_vocabulary import TokenizeVocabulary from cudf.testing._utils import assert_eq @@ -1030,3 +1031,43 @@ def test_jaccard_index_random_strings(): actual = str1.str.jaccard_index(str2, jaccard_width) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "separator, input, results", + [ + (" ", "thetestsentence", "the test sent ence"), + ("_", "sentenceistest", "sent_ence_is_test"), + ("$", "istestsentencehere", "is$test$sent$ence$he$r$e"), + ], +) +def test_byte_pair_encoding(separator, input, results): + pairs_table = cudf.Series( + [ + "t he", + "h e", + "e n", + "i t", + "i s", + "e s", + "en t", + "c e", + "es t", + "en ce", + "t h", + "h i", + "th is", + "t est", + "s i", + "s ent", + ] + ) + encoder = BytePairEncoder(pairs_table) + + strings = cudf.Series([input, None, "", input]) + + expected = cudf.Series([results, None, "", results]) + + actual = encoder(strings, separator) + assert type(expected) == type(actual) + assert_eq(expected, actual) From 335beb1bd1801ed2eb1d5abdcce73bb3e6f4fe36 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 13 Nov 2023 14:26:22 -0500 Subject: [PATCH 28/28] change BPE_Merge_Pairs to BPEMergePairs --- python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx | 4 ++-- python/cudf/cudf/core/byte_pair_encoding.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx index 63a653fce4e..cfc76afa8a5 100644 --- a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx +++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx @@ -18,7 +18,7 @@ from cudf._lib.cpp.scalar.scalar cimport string_scalar from cudf._lib.scalar cimport DeviceScalar -cdef class BPE_Merge_Pairs: +cdef class BPEMergePairs: cdef unique_ptr[cpp_bpe_merge_pairs] c_obj def __cinit__(self, Column merge_pairs): @@ -30,7 +30,7 @@ cdef class BPE_Merge_Pairs: @acquire_spill_lock() def byte_pair_encoding( Column strings, - BPE_Merge_Pairs merge_pairs, + BPEMergePairs merge_pairs, object separator ): cdef column_view c_strings = strings.view() diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index 737a31b965e..4c881022ecf 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -4,13 +4,15 @@ import cudf from cudf._lib.nvtext.byte_pair_encode import ( - BPE_Merge_Pairs as cpp_merge_pairs, + BPEMergePairs as cpp_merge_pairs, byte_pair_encoding as cpp_byte_pair_encoding, ) class BytePairEncoder: """ + Given a merge pairs strings series, performs byte pair encoding on + a strings series using the provided separator. Parameters ----------