From 56758d88ad5665dfd0d4c5542ec25c240db67ece Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 10 Feb 2022 14:20:57 -0500 Subject: [PATCH 01/10] Add nvtext::byte_pair_encoding API --- conda/recipes/libcudf/meta.yaml | 3 +- cpp/CMakeLists.txt | 2 + cpp/include/cudf/strings/detail/combine.hpp | 16 +- cpp/include/cudf/strings/detail/split.hpp | 38 ++ cpp/include/nvtext/bpe_tokenize.hpp | 99 +++++ cpp/src/strings/split/split_record.cu | 31 +- cpp/src/text/subword/bpe_tokenizer.cu | 439 ++++++++++++++++++++ cpp/src/text/subword/load_merges_file.cu | 93 +++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/text/bpe_tests.cpp | 87 ++++ 10 files changed, 800 insertions(+), 9 deletions(-) create mode 100644 cpp/include/cudf/strings/detail/split.hpp create mode 100644 cpp/include/nvtext/bpe_tokenize.hpp create mode 100644 cpp/src/text/subword/bpe_tokenizer.cu create mode 100644 cpp/src/text/subword/load_merges_file.cu create mode 100644 cpp/tests/text/bpe_tests.cpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 70c020d4abd..8388ad4afbf 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} @@ -203,6 +203,7 @@ test: - test -f $PREFIX/include/cudf/strings/detail/fill.hpp - test -f $PREFIX/include/cudf/strings/detail/json.hpp - test -f $PREFIX/include/cudf/strings/detail/replace.hpp + - test -f $PREFIX/include/cudf/strings/detail/split.hpp - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp - test -f $PREFIX/include/cudf/strings/extract.hpp - test -f $PREFIX/include/cudf/strings/findall.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 90e94ffcc7b..4d165b470b2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -468,8 +468,10 @@ add_library( src/text/normalize.cu src/text/replace.cu src/text/stemmer.cu + src/text/subword/bpe_tokenizer.cu src/text/subword/data_normalizer.cu src/text/subword/load_hash_file.cu + src/text/subword/load_merges_file.cu src/text/subword/subword_tokenize.cu src/text/subword/wordpiece_tokenizer.cu src/text/tokenize.cu diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index d6bdf398886..50f9a70e21c 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,6 +54,20 @@ std::unique_ptr join_strings( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc join_list_elements(table_view const&,string_scalar const&,string_scalar + * const&,separator_on_nulls,output_if_empty_list,rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls, + output_if_empty_list empty_list_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/split.hpp b/cpp/include/cudf/strings/detail/split.hpp new file mode 100644 index 00000000000..2f00a82678b --- /dev/null +++ b/cpp/include/cudf/strings/detail/split.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * @copydoc split_record(strings_column_view const&,string_scalar const&,size_type, + * rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr split_record(strings_column_view const& input, + string_scalar const& delimiter, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp new file mode 100644 index 00000000000..a8f971d97b2 --- /dev/null +++ b/cpp/include/nvtext/bpe_tokenize.hpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +namespace nvtext { + +/** + * @addtogroup nvtext_tokenize + * @{ + * @file + */ + +/** + * @brief The table of merge pairs for the BPE encoder. + * + * To create an instance, call nvtext::load_merges_table + */ +struct bpe_merge_pairs { + std::unique_ptr merge_pairs; // strings +}; + +/** + * @brief Create a nvtext::bpe_merge_pairs from an input file. + * + * The file should contain a pair of strings per line separated by + * a single space. + * + * Example: + * @code{.txt} + * e n + * i t + * i s + * ... + * @endcode + * + * The pairs are expected to be ordered in the file by their rank + * relative to each other. A pair takes priority over + * any pairs below it. + * + * @param filename_merges Local file path of pairs encoded in UTF-8 + * @param mr Memory resource to allocate any returned objects. 
+ */ +std::unique_ptr load_merge_pairs_file( + std::string const& filename_merges, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Byte pair encode the input strings. + * + * This will split each string on whitespace, perform the encoding, + * and then build the output using the given `separator`. + * + * The encoding algorithm rebuilds each string by matching substrings + * in the `merge_pairs` table and iteratively removing the minimum ranked pair + * until no pairs are left. Then, a space is inserted between the remaining + * pairs before the result is joined to make the output string. + * + * @code{.pseudo} + * mps = load_merges_file("merges.txt") + * input = ["test sentence", "this is test"] + * result = byte_pair_encoding(input, mps) + * result is now ["test Ġsent tence", "this Ġis Ġtest"] + * @endcode + * + * @throw cudf::logic_error if `merge_pairs` is empty + * @throw cudf::logic_error if `separator` is invalid + * + * @param input Strings to encode. + * @param merge_pairs Created by a call to nvtext::load_merges_file. + * @param separator String used to build the output after encoding. + * Default is a space followed by `Ġ`. + * @param mr Memory resource to allocate any returned objects. + */ +std::unique_ptr byte_pair_encoding( + cudf::strings_column_view const& input, + bpe_merge_pairs const& merges_pairs, + cudf::string_scalar const& separator = cudf::string_scalar(" Ġ"), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group +} // namespace nvtext diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 929d21a024c..832fd00725a 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -217,8 +217,6 @@ struct whitespace_token_reader_fn { } }; -} // namespace - // The output is one list item per string template std::unique_ptr split_record_fn(strings_column_view const& strings, @@ -289,6 +287,27 @@ std::unique_ptr split_record( mr); } } +} // namespace + +std::unique_ptr split_record(strings_column_view const& strings, + string_scalar const& delimiter, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_record( + strings, delimiter, maxsplit, stream, mr); +} + +std::unique_ptr rsplit_record(strings_column_view const& strings, + string_scalar const& delimiter, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_record( + strings, delimiter, maxsplit, stream, mr); +} } // namespace detail @@ -300,8 +319,7 @@ std::unique_ptr split_record(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_record( - strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); + return detail::split_record(strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); } std::unique_ptr rsplit_record(strings_column_view const& strings, @@ -310,8 +328,7 @@ std::unique_ptr rsplit_record(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_record( - strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); + return detail::split_record(strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu new file mode 100644 index 00000000000..a47be57126f --- /dev/null +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -0,0 +1,439 @@ +/* + * Copyright (c) 
2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace nvtext { +namespace detail { + +namespace { + +/** + * @brief Initialize the byte indices and the pair rank for each string. + */ +struct initialize_indices_fn { + cudf::column_device_view const d_merges; + cudf::column_device_view const d_strings; + cudf::size_type* d_byte_indices; + cudf::size_type* d_min_ranks; + + __device__ void operator()(cudf::size_type idx) + { + d_min_ranks[idx] = cuda::std::numeric_limits::max(); + + if (d_strings.is_null(idx)) { return; } + + auto const d_str = d_strings.element(idx); + if (d_str.empty()) { return; } + + auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) + .element(idx); + auto d_indices = d_byte_indices + offset; + + // set the index value for each byte + for (auto i = 0; i < d_str.size_bytes(); ++i) { + auto const byte = static_cast(d_str.data()[i]); + // for intermediate UTF-8 bytes set the index value to 0 + d_indices[i] = cudf::strings::detail::is_begin_utf8_char(byte) ? i : 0; + } + } +}; + +/** + * @brief Parse the merge pair into components. + * + * The two substrings are separated by a single space. + * + * @param d_pair String to dissect + * @return The left and right halves of the input pair. 
+ */ +__device__ thrust::pair dissect_merge_pair( + cudf::string_view const& d_pair) +{ + auto const lhs = d_pair.data(); + auto const end_str = d_pair.data() + d_pair.size_bytes(); + auto const rhs = thrust::find(thrust::seq, lhs, end_str, ' ') + 1; + auto const lhs_size = static_cast(thrust::distance(lhs, rhs - 1)); + auto const rhs_size = static_cast(thrust::distance(rhs, end_str)); + return thrust::make_pair(cudf::string_view(lhs, lhs_size), cudf::string_view(rhs, rhs_size)); +} + +/** + * @brief Get the next substring of the given string. + * + * This will find the next sequence of characters identified by the + * given byte indices iterator values. The beginning of the sequence + * starts at `begin` and the end of the sequence is the first non-zero + * index found between (begin,end) exclusive. + * + * @tparam Iterator The byte indices iterator type + * @param begin Start of indices to check + * @param end End of indices to check + * @param d_str String to substring + * @return The substring found. + */ +template +__device__ cudf::string_view next_substr(Iterator begin, + Iterator end, + cudf::string_view const& d_str) +{ + auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); + auto const size = static_cast(thrust::distance(begin, next)); + return cudf::string_view(d_str.data() + *begin, size); +} + +/** + * @brief Iterate over the merge pairs and to find the minimum rank in each string. + * + * As a merge pair is located in each string, the minimum rank is accumulated in + * the output `d_min_ranks`. The rank is simply the position of the merge pair + * in the `d_merges` column. 
+ */ +struct find_minimum_pair_fn { + cudf::column_device_view const d_merges; + cudf::column_device_view const d_strings; + cudf::size_type* d_byte_indices; + cudf::size_type* d_min_ranks; + + // index is over the merges table + __device__ void operator()(cudf::size_type index) + { + auto const d_pair = dissect_merge_pair(d_merges.element(index)); + + // locate this pair in each string + for (auto idx = 0; idx < d_strings.size(); ++idx) { + if (d_strings.is_null(idx)) continue; + auto const d_str = d_strings.element(idx); + if (d_str.empty()) continue; + + auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) + .element(idx); + auto d_indices = d_byte_indices + offset; + + auto const begin = d_indices; + auto const end = d_indices + d_str.size_bytes(); + + // check for the merge-pair in this string + auto lhs = next_substr(begin, end, d_str); + auto itr = begin + lhs.size_bytes(); + while (itr < end) { + auto rhs = next_substr(itr, end, d_str); + if (rhs.empty()) break; + + if (d_pair.first == lhs && d_pair.second == rhs) { + // found a match, record the rank + atomicMin(d_min_ranks + idx, index); + break; // done with this string + } + + // next substring + lhs = rhs; + itr += rhs.size_bytes(); + } + } + } +}; + +/** + * @brief Remove merge pair from each string. + * + * The minimum rank found for each string used to identify the pair(s) + * to be removed. The pairs are removed by just zeroing the byte index + * found between the adjacent substrings. 
+ * + * @code{.txt} + * d_strings = ["helloworld", "testisthis"] + * d_byte_indices = [ 0123456789 01234567] + * d_merges[d_min_ranks] = [ "ll o", "i s" ] + * + * d_bytes_indices -> [ 0123056789 01234060 ] + * d_min_ranks is reset to [ max, max ] + * @endcode + * + */ +struct remove_pair_fn { + cudf::column_device_view const d_merges; + cudf::column_device_view const d_strings; + cudf::size_type* d_byte_indices; + cudf::size_type* d_min_ranks; + + __device__ void operator()(cudf::size_type idx) + { + if (d_strings.is_null(idx)) return; + auto const d_str = d_strings.element(idx); + if (d_str.empty()) return; + + auto rank = d_min_ranks[idx]; + if (rank == cuda::std::numeric_limits::max()) return; + + auto const d_pair = dissect_merge_pair(d_merges.element(rank)); + + // resolve byte indices for this string + auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) + .element(idx); + auto d_indices = d_byte_indices + offset; + + auto const begin = d_indices; + auto const end = d_indices + d_str.size_bytes(); + + // locate d_pair and remove it from this string + auto lhs = next_substr(begin, end, d_str); + auto itr = begin + lhs.size_bytes(); + while (itr < end) { + auto rhs = next_substr(itr, end, d_str); + if (d_pair.first == lhs && d_pair.second == rhs) { + *itr = 0; // removes the pair from this string + itr += rhs.size_bytes(); + if (itr < end) { + rhs = next_substr(itr, end, d_str); // skip to the next pair + } else { + break; // done with this string + } + } + // next substring + lhs = rhs; + itr += rhs.size_bytes(); + } + + // reset for next iteration + d_min_ranks[idx] = cuda::std::numeric_limits::max(); + } +}; + +/** + * @brief Computes the output size of each string. + * + * The output size is the size of the current string plus the + * number of spaces to be added between adjacent substrings. + * The number of spaces will equal the number of non-zero byte indices + * for the string. 
+ */ +struct compute_sizes_fn { + cudf::column_device_view const d_strings; + cudf::size_type* d_byte_indices; + + __device__ cudf::size_type operator()(cudf::size_type idx) + { + if (d_strings.is_null(idx)) return 0; + auto const d_str = d_strings.element(idx); + auto offset = d_strings.child(cudf::strings_column_view::offsets_column_index) + .element(idx); + auto d_indices = d_byte_indices + offset; + return d_str.size_bytes() + thrust::count_if( // number of non-zero byte indices + thrust::seq, + d_indices, + d_indices + d_str.size_bytes(), + [](auto v) { return v != 0; }); + } +}; + +/** + * @brief Build the output string encoding. + * + * This copies each string to the output inserting a space at each non-zero byte index. + * + * @code{.txt} + * d_strings = ["helloworld", "testthis"] + * d_byte_indices = [ 0000050000 00004000] + * result is ["hello world", "test this"] + * @endcode + */ +struct build_encoding_fn { + cudf::column_device_view const d_strings; + cudf::size_type* d_byte_indices; + cudf::offset_type const* d_offsets; + char* d_chars{}; + + __device__ void operator()(cudf::size_type idx) + { + if (d_strings.is_null(idx)) return; + auto const d_str = d_strings.element(idx); + if (d_str.empty()) return; + + auto offset = d_strings.child(cudf::strings_column_view::offsets_column_index) + .element(idx); + auto d_indices = d_byte_indices + offset; + auto d_output = d_chars ? d_chars + d_offsets[idx] : nullptr; + + // copy chars while indices==0, add space each time indices!=0 + auto begin = d_indices; + auto end = d_indices + d_str.size_bytes(); + auto d_input = d_str.data(); + *d_output++ = *d_input++; + auto itr = begin + 1; + while (itr < end) { + if (*itr++) *d_output++ = ' '; + *d_output++ = *d_input++; + } + } +}; + +/** + * @brief Perform byte pair encoding on each string in the input column. + * + * The result is a strings column of the same size where each string has been encoded. + * + * The encoding is performed iteratively. 
Each pass determines the string's lowest + * ranked merge pair as determined by the strings in `merges_table`. This pair + * is the removed (virtually) from each string before starting the next iteration. + * + * Once all pairs have exhausted for all strings, the output is constructed from + * the results by adding spaces between each remaining pair in each string. + */ +std::unique_ptr byte_pair_encoding( + cudf::strings_column_view const& input, + bpe_merge_pairs const& merges_table, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + // build working vectors + rmm::device_uvector d_byte_indices(input.chars().size(), stream); + rmm::device_uvector d_min_ranks(input.size(), stream); + + auto d_merges = cudf::column_device_view::create(merges_table.merge_pairs->view(), stream); + auto d_strings = cudf::column_device_view::create(input.parent(), stream); + + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + input.size(), + initialize_indices_fn{*d_merges, *d_strings, d_byte_indices.data(), d_min_ranks.data()}); + + cudf::size_type min_rank = 0; + while (min_rank < std::numeric_limits::max()) { + // find minimum merge pair for each string + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + d_merges->size(), + find_minimum_pair_fn{*d_merges, *d_strings, d_byte_indices.data(), d_min_ranks.data()}); + + // get the minimum rank over all strings; + // this is only used to see if we are finished + min_rank = thrust::reduce(rmm::exec_policy(stream), + d_min_ranks.begin(), + d_min_ranks.end(), + std::numeric_limits::max(), + thrust::minimum{}); + + // check if any pairs have been found; + // if so, remove that pair from each string + if (min_rank < std::numeric_limits::max()) { + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + input.size(), + remove_pair_fn{*d_merges, *d_strings, 
d_byte_indices.data(), d_min_ranks.data()}); + } + } + + // build the output: + // add spaces between the remaining pairs in each string + auto offsets_itr = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + compute_sizes_fn{*d_strings, d_byte_indices.data()}); + auto offsets = cudf::strings::detail::make_offsets_child_column( + offsets_itr, offsets_itr + input.size(), stream, mr); + auto d_offsets = offsets->view().data(); + + auto const bytes = cudf::detail::get_value(offsets->view(), input.size(), stream); + auto chars = cudf::strings::detail::create_chars_child_column(bytes, stream, mr); + auto d_chars = chars->mutable_view().data(); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + input.size(), + build_encoding_fn{*d_strings, d_byte_indices.data(), d_offsets, d_chars}); + + return make_strings_column(input.size(), + std::move(offsets), + std::move(chars), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); +} + +} // namespace + +std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, + bpe_merge_pairs const& merges_table, + cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const strings_count = input.size(); + if (strings_count == 0 || input.chars_size() == 0) + return cudf::make_empty_column(cudf::type_id::STRING); + CUDF_EXPECTS(!merges_table.merge_pairs->view().is_empty(), "Merge pairs table must not be empty"); + + // split input on whitespace + auto split_result = cudf::strings::detail::split_record( + input, cudf::string_scalar(""), -1, stream, rmm::mr::get_current_device_resource()); + auto split_view = cudf::lists_column_view(split_result->view()); + + // run BPE on the strings child column + auto bpe_column = byte_pair_encoding(split_view.child(), merges_table, stream); + + // recombine the result: + // use the offsets from split_record and the strings from byte_pair_encoding + 
// to build a lists column_view + auto list_join = cudf::column_view(cudf::data_type{cudf::type_id::LIST}, + strings_count, + nullptr, // no parent data in list column + split_view.null_mask(), + split_view.null_count(), + 0, + {split_view.offsets(), bpe_column->view()}); + + // use join_list_elements to build the output strings column + return cudf::strings::detail::join_list_elements( + cudf::lists_column_view(list_join), + separator, + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO, + cudf::strings::output_if_empty_list::EMPTY_STRING, + stream, + mr); +} + +} // namespace detail + +std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, + bpe_merge_pairs const& merges_table, + cudf::string_scalar const& separator, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::byte_pair_encoding(input, merges_table, separator, rmm::cuda_stream_default, mr); +} + +} // namespace nvtext diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu new file mode 100644 index 00000000000..eaeaec03a96 --- /dev/null +++ b/cpp/src/text/subword/load_merges_file.cu @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace nvtext { +namespace detail { + +/** + * @brief Loads a text file of merge-pairs into a strings column. + * + * The line position in the file indicates the pair's rank. + * + * @code{.pseudo} + * Format of the file: + * #version .. + * a1 a2 + * b1 b2 + * c1 c3 + * ... + * @endcode + * + * @param filename_merges Path to text file containing merge-pairs. + * @return object containing table elements for the BPE function + */ +std::unique_ptr load_merge_pairs_file(std::string const& filename_merges, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + bpe_merge_pairs result; + std::ifstream merges_file(filename_merges); + CUDF_EXPECTS(merges_file.good(), "Could not open " + filename_merges); + + std::vector chars{}; + std::vector offsets(1, 0); + + std::string line; + std::getline(merges_file, line); + std::string version = "#version"; + if (line.substr(0, version.size()).compare(version) == 0) { std::getline(merges_file, line); } + + // This is a text file delimited only by CR/LF. + // Look into using the CSV reader to load the strings column instead. 
+ while (!line.empty()) { + chars.insert(chars.end(), std::cbegin(line), std::cend(line)); + offsets.push_back(offsets.back() + line.length()); + if (!std::getline(merges_file, line)) { break; } // a failed read leaves `line` unchanged + } + + CUDF_EXPECTS(!chars.empty(), "No data found in " + filename_merges); + + auto d_chars = cudf::detail::make_device_uvector_async(chars, stream, mr); + auto d_offsets = cudf::detail::make_device_uvector_async(offsets, stream, mr); + result.merge_pairs = cudf::make_strings_column(d_chars, d_offsets); + + return std::make_unique(std::move(result)); +} + +} // namespace detail + +std::unique_ptr load_merge_pairs_file(std::string const& filename_merges, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::load_merge_pairs_file(filename_merges, rmm::cuda_stream_default, mr); +} + +} // namespace nvtext diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 6b5670630ec..e5313c1d061 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -390,6 +390,7 @@ ConfigureTest(STRUCTS_TEST structs/structs_column_tests.cpp structs/utilities_te # * nvtext test ----------------------------------------------------------------------------------- ConfigureTest( TEXT_TEST + text/bpe_tests.cpp text/edit_distance_tests.cpp text/ngrams_tests.cpp text/ngrams_tokenize_tests.cpp diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp new file mode 100644 index 00000000000..a009919fa21 --- /dev/null +++ b/cpp/tests/text/bpe_tests.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include + +struct TextBPETokenize : public cudf::test::BaseFixture { +}; + +TEST_F(TextBPETokenize, BytePairEncoding) +{ + // fake table based on values from https://huggingface.co/gpt2/raw/main/merges.txt + auto mpt = cudf::test::strings_column_wrapper({ + "e n", // 12 + "i t", // 14 + "i s", // 15 + "e s", // 18 + "en t", // 42 + "c e", // 88 + "es t", // 139 + "en ce", // 338 + "T h", // 561 + "Th is", // 956 + "t est", // 9032 + "s ent", // 33830 + }); + nvtext::bpe_merge_pairs merge_pairs{mpt.release()}; + + auto validity = cudf::test::iterators::null_at(4); + cudf::test::strings_column_wrapper input({"This is it", + "This is test-sentence-1", + "This is test sentence-2", + "This-is test sentence 3", + "", + ""}, + validity); + auto sv = cudf::strings_column_view(input); + + auto results = nvtext::byte_pair_encoding(sv, merge_pairs); + + auto expected = cudf::test::strings_column_wrapper({"This Ġis Ġit", + "This Ġis Ġtest - sent ence - 1", + "This Ġis Ġtest Ġsent ence - 2", + "This - is Ġtest Ġsent ence Ġ3", + "", + ""}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + +TEST_F(TextBPETokenize, BPE_Empty) +{ + auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"}); + nvtext::bpe_merge_pairs merge_pairs{mpt.release()}; + auto empty = cudf::make_empty_column(cudf::type_id::STRING); + auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(empty->view()), merge_pairs); + EXPECT_EQ(0, results->size()); +} + 
+TEST_F(TextBPETokenize, BPE_Error) +{ + auto empty = cudf::make_empty_column(cudf::type_id::STRING); + nvtext::bpe_merge_pairs merge_pairs{std::move(empty)}; + cudf::test::strings_column_wrapper input({"isit"}); + EXPECT_THROW(nvtext::byte_pair_encoding(cudf::strings_column_view(input), merge_pairs), + cudf::logic_error); +} From ae2baa04d790722a5d81e0992b20b99501296682 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 11 Feb 2022 09:24:32 -0500 Subject: [PATCH 02/10] fix call to detail::rsplit_record --- cpp/src/strings/split/split_record.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 832fd00725a..8834ece8734 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -328,7 +328,7 @@ std::unique_ptr rsplit_record(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_record(strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); + return detail::rsplit_record(strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); } } // namespace strings From aa6f8e896019cf61445e2c57856a9407e46a790c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 17 Feb 2022 16:47:37 -0500 Subject: [PATCH 03/10] change algorithm to use cuco::static-map --- conda/recipes/libcudf/meta.yaml | 1 - cpp/include/cudf/strings/detail/split.hpp | 38 -- cpp/include/nvtext/bpe_tokenize.hpp | 24 +- cpp/src/text/subword/bpe_tokenizer.cu | 617 ++++++++++++---------- cpp/src/text/subword/bpe_tokenizer.cuh | 59 +++ cpp/src/text/subword/load_merges_file.cu | 111 +++- cpp/tests/text/bpe_tests.cpp | 13 +- 7 files changed, 530 insertions(+), 333 deletions(-) delete mode 100644 cpp/include/cudf/strings/detail/split.hpp create mode 100644 cpp/src/text/subword/bpe_tokenizer.cuh diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 4dfacb76a95..4e20c979f6c 100644 --- 
a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -203,7 +203,6 @@ test: - test -f $PREFIX/include/cudf/strings/detail/fill.hpp - test -f $PREFIX/include/cudf/strings/detail/json.hpp - test -f $PREFIX/include/cudf/strings/detail/replace.hpp - - test -f $PREFIX/include/cudf/strings/detail/split.hpp - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp - test -f $PREFIX/include/cudf/strings/extract.hpp - test -f $PREFIX/include/cudf/strings/findall.hpp diff --git a/cpp/include/cudf/strings/detail/split.hpp b/cpp/include/cudf/strings/detail/split.hpp deleted file mode 100644 index 2f00a82678b..00000000000 --- a/cpp/include/cudf/strings/detail/split.hpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -namespace cudf { -namespace strings { -namespace detail { - -/** - * @copydoc split_record(strings_column_view const&,string_scalar const&,size_type, - * rmm::mr::device_memory_resource*) - * - * @param stream CUDA stream used for device memory operations and kernel launches. 
- */ -std::unique_ptr split_record(strings_column_view const& input, - string_scalar const& delimiter, - size_type maxsplit, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - -} // namespace detail -} // namespace strings -} // namespace cudf diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp index a8f971d97b2..53eda236d0b 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/bpe_tokenize.hpp @@ -34,7 +34,21 @@ namespace nvtext { * To create an instance, call nvtext::load_merges_table */ struct bpe_merge_pairs { - std::unique_ptr merge_pairs; // strings + struct bpe_merge_pairs_impl; + std::unique_ptr impl{}; + + bpe_merge_pairs(std::unique_ptr&& input, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + + bpe_merge_pairs(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + + ~bpe_merge_pairs(); + + cudf::size_type get_size(); + std::size_t get_map_size(); }; /** @@ -75,9 +89,9 @@ std::unique_ptr load_merge_pairs_file( * * @code{.pseudo} * mps = load_merges_file("merges.txt") - * input = ["test sentence", "this is test"] + * input = ["test sentence", "thisis test"] * result = byte_pair_encoding(input, mps) - * result is now ["test Ġsent tence", "this Ġis Ġtest"] + * result is now ["test sent tence", "this is test"] * @endcode * * @throw cudf::logic_error if `merge_pairs` is empty @@ -86,13 +100,13 @@ std::unique_ptr load_merge_pairs_file( * @param input Strings to encode. * @param merge_pairs Created by a call to nvtext::load_merges_file. * @param separator String used to build the output after encoding. - * Default is a space followed by `Ġ`. + * Default is a space. * @param mr Memory resource to allocate any returned objects. 
*/ std::unique_ptr byte_pair_encoding( cudf::strings_column_view const& input, bpe_merge_pairs const& merges_pairs, - cudf::string_scalar const& separator = cudf::string_scalar(" Ġ"), + cudf::string_scalar const& separator = cudf::string_scalar(" "), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index a47be57126f..20837716845 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -14,16 +14,15 @@ * limitations under the License. */ +#include + #include #include #include #include -#include #include -#include #include -#include #include #include #include @@ -31,230 +30,245 @@ #include #include +#include +#include #include -#include +#include +#include namespace nvtext { namespace detail { namespace { -/** - * @brief Initialize the byte indices and the pair rank for each string. - */ -struct initialize_indices_fn { - cudf::column_device_view const d_merges; - cudf::column_device_view const d_strings; - cudf::size_type* d_byte_indices; - cudf::size_type* d_min_ranks; - - __device__ void operator()(cudf::size_type idx) - { - d_min_ranks[idx] = cuda::std::numeric_limits::max(); - - if (d_strings.is_null(idx)) { return; } - - auto const d_str = d_strings.element(idx); - if (d_str.empty()) { return; } - - auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); - auto d_indices = d_byte_indices + offset; - - // set the index value for each byte - for (auto i = 0; i < d_str.size_bytes(); ++i) { - auto const byte = static_cast(d_str.data()[i]); - // for intermediate UTF-8 bytes set the index value to 0 - d_indices[i] = cudf::strings::detail::is_begin_utf8_char(byte) ? i : 0; - } - } -}; - -/** - * @brief Parse the merge pair into components. - * - * The two substrings are separated by a single space. 
- * - * @param d_pair String to dissect - * @return The left and right halves of the input pair. - */ -__device__ thrust::pair dissect_merge_pair( - cudf::string_view const& d_pair) +template +constexpr bool is_whitespace(CharType ch) { - auto const lhs = d_pair.data(); - auto const end_str = d_pair.data() + d_pair.size_bytes(); - auto const rhs = thrust::find(thrust::seq, lhs, end_str, ' ') + 1; - auto const lhs_size = static_cast(thrust::distance(lhs, rhs - 1)); - auto const rhs_size = static_cast(thrust::distance(rhs, end_str)); - return thrust::make_pair(cudf::string_view(lhs, lhs_size), cudf::string_view(rhs, rhs_size)); + return ch <= ' '; } /** - * @brief Get the next substring of the given string. + * @brief Resolve a truncated string from a full string. * - * This will find the next sequence of characters identified by the - * given byte indices iterator values. The beginning of the sequence - * starts at `begin` and the end of the sequence is the first non-zero - * index found between (begin,end) exclusive. + * This will return a substring of the input starting with the first byte + * up to the first whitespace character or the end of the string. + * Any whitespace is expected only at the end of the string. * - * @tparam Iterator The byte indices iterator type - * @param begin Start of indices to check - * @param end End of indices to check - * @param d_str String to substring - * @return The substring found. + * @param d_str Input string to resolve. + * @return Substring of the input excluding trailing whitespace.
*/ -template -__device__ cudf::string_view next_substr(Iterator begin, - Iterator end, - cudf::string_view const& d_str) +__device__ cudf::string_view resolve_string(cudf::string_view const& d_str) { - auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); - auto const size = static_cast(thrust::distance(begin, next)); - return cudf::string_view(d_str.data() + *begin, size); + auto const begin = d_str.data(); + auto end = thrust::find_if( + thrust::seq, begin, begin + d_str.size_bytes(), [](auto ch) { return is_whitespace(ch); }); + auto size = static_cast(thrust::distance(begin, end)); + return cudf::string_view(begin, size); } /** - * @brief Iterate over the merge pairs and to find the minimum rank in each string. + * @brief Main byte pair encoding algorithm function for each string. * - * As a merge pair is located in each string, the minimum rank is accumulated in - * the output `d_min_ranks`. The rank is simply the position of the merge pair - * in the `d_merges` column. + * @see The byte_pair_encoding_fn::operator() function below for details. */ -struct find_minimum_pair_fn { +struct byte_pair_encoding_fn { cudf::column_device_view const d_merges; cudf::column_device_view const d_strings; + merge_pairs_map_type::device_view const d_map; + cudf::size_type* d_sizes; // output size of encoded string + string_hasher_type const hasher; cudf::size_type* d_byte_indices; - cudf::size_type* d_min_ranks; - // index is over the merges table - __device__ void operator()(cudf::size_type index) + /** + * @brief Parse the merge pair into components. + * + * The two substrings are separated by a single space. + * + * @param idx Index of merge pair to dissect. + * @return The left and right halves of the merge pair. 
+ */ + __device__ thrust::pair dissect_merge_pair( + cudf::size_type idx) { - auto const d_pair = dissect_merge_pair(d_merges.element(index)); + auto const d_pair = d_merges.element(idx); + auto const lhs = d_pair.data(); + auto const end_str = d_pair.data() + d_pair.size_bytes(); + auto const rhs = thrust::find(thrust::seq, lhs, end_str, ' ') + 1; + auto const lhs_size = static_cast(thrust::distance(lhs, rhs - 1)); + auto const rhs_size = static_cast(thrust::distance(rhs, end_str)); + return thrust::make_pair(cudf::string_view(lhs, lhs_size), cudf::string_view(rhs, rhs_size)); + } - // locate this pair in each string - for (auto idx = 0; idx < d_strings.size(); ++idx) { - if (d_strings.is_null(idx)) continue; - auto const d_str = d_strings.element(idx); - if (d_str.empty()) continue; + /** + * @brief Get the next substring of the given string. + * + * This will find the next sequence of characters identified by the + * given byte indices iterator values. The beginning of the sequence + * starts at `begin` and the end of the sequence is the first non-zero + * index found between (begin,end) exclusive. + * + * @tparam Iterator The byte indices iterator type + * @param begin Start of indices to check + * @param end End of indices to check + * @param d_str String to substring + * @return The substring found. + */ + template + __device__ cudf::string_view next_substr(Iterator begin, + Iterator end, + cudf::string_view const& d_str) + { + auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); + auto const size = static_cast(thrust::distance(begin, next)); + return cudf::string_view(d_str.data() + *begin, size); + } - auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); - auto d_indices = d_byte_indices + offset; + /** + * @brief Compute the hash over the input strings. 
+ * + * The input strings are combined with a space to produce hash for matching + * a merge pair within the `d_map`. + * + * @param lhs First string. + * @param rhs Second string. + * @return The hash value to match with `d_map`. + */ + __device__ hash_value_type compute_hash(cudf::string_view const& lhs, + cudf::string_view const& rhs) + { + __shared__ char shmem[48 * 1024]; // max for Pascal + auto const total_size = lhs.size_bytes() + rhs.size_bytes() + 1; + auto const thread_memory_size = static_cast(sizeof(shmem) / blockDim.x); - auto const begin = d_indices; - auto const end = d_indices + d_str.size_bytes(); + // Edge case check. + // Empirically found only two merge pair strings that were greater than 70 bytes + // and they both looked like ignorable errors. Double check this analysis with Vibhu. + if (thread_memory_size < total_size) { return 0; } - // check for the merge-pair in this string - auto lhs = next_substr(begin, end, d_str); - auto itr = begin + lhs.size_bytes(); - while (itr < end) { - auto rhs = next_substr(itr, end, d_str); - if (rhs.empty()) break; + // build the target string in shared memory + char* ptr = &shmem[threadIdx.x * thread_memory_size]; - if (d_pair.first == lhs && d_pair.second == rhs) { - // found a match, record the rank - atomicMin(d_min_ranks + idx, index); - break; // done with this string - } + // build a temp string like: temp = lhs + ' ' + rhs + memcpy(ptr, lhs.data(), lhs.size_bytes()); + memcpy(ptr + lhs.size_bytes(), " ", 1); + memcpy(ptr + lhs.size_bytes() + 1, rhs.data(), rhs.size_bytes()); - // next substring - lhs = rhs; - itr += rhs.size_bytes(); - } - } + auto const d_hash_str = cudf::string_view(ptr, total_size); + return hasher(d_hash_str); // return the hash for the temp string } -}; - -/** - * @brief Remove merge pair from each string. - * - * The minimum rank found for each string used to identify the pair(s) - * to be removed. 
The pairs are removed by just zeroing the byte index - * found between the adjacent substrings. - * - * @code{.txt} - * d_strings = ["helloworld", "testisthis"] - * d_byte_indices = [ 0123456789 01234567] - * d_merges[d_min_ranks] = [ "ll o", "i s" ] - * - * d_bytes_indices -> [ 0123056789 01234060 ] - * d_min_ranks is reset to [ max, max ] - * @endcode - * - */ -struct remove_pair_fn { - cudf::column_device_view const d_merges; - cudf::column_device_view const d_strings; - cudf::size_type* d_byte_indices; - cudf::size_type* d_min_ranks; + /** + * @brief Byte encode each string. + * + * Each string is iteratively scanned for the minimum rank of adjacent substring pairs + * as found within the `d_map` table. Once the minimum pair is located, that pair + * is removed -- virtually by zero-ing the index value between any matching adjacent pairs. + * + * The iteration ends once there are no more adjacent pairs or there are no more + * matches found in `d_map`. At the end, the indices for each string reflect the + * encoding pattern and can be used to build the output. + * + * This function also computes the size of the encoded output of each string + * by simply counting the number of non-zero indices values remaining. This saves + * an extra kernel launch normally required to compute the offsets of the output column. 
+ * + * @param idx The index of the string in `d_strings` to encode + */ __device__ void operator()(cudf::size_type idx) { - if (d_strings.is_null(idx)) return; - auto const d_str = d_strings.element(idx); - if (d_str.empty()) return; - - auto rank = d_min_ranks[idx]; - if (rank == cuda::std::numeric_limits::max()) return; - - auto const d_pair = dissect_merge_pair(d_merges.element(rank)); + if (d_strings.is_null(idx)) { return; } + auto const d_str = resolve_string(d_strings.element(idx)); + if (d_str.empty()) { return; } - // resolve byte indices for this string auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) .element(idx); - auto d_indices = d_byte_indices + offset; + auto const d_indices = d_byte_indices + offset; + + // initialize the byte indices for this string; + // set the index value to 0 for any intermediate UTF-8 bytes + thrust::transform(thrust::seq, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(d_str.size_bytes()), + d_indices, + [data = d_str.data()](auto idx) { + auto const byte = static_cast(data[idx]); + return cudf::strings::detail::is_begin_utf8_char(byte) ? 
idx : 0; + }); auto const begin = d_indices; auto const end = d_indices + d_str.size_bytes(); - // locate d_pair and remove it from this string - auto lhs = next_substr(begin, end, d_str); - auto itr = begin + lhs.size_bytes(); - while (itr < end) { - auto rhs = next_substr(itr, end, d_str); - if (d_pair.first == lhs && d_pair.second == rhs) { - *itr = 0; // removes the pair from this string + // keep processing the string until there are no more adjacent pairs found in d_map + cudf::size_type min_rank = 0; + while (min_rank < cuda::std::numeric_limits::max()) { + // initialize working variables + min_rank = cuda::std::numeric_limits::max(); + + auto lhs = next_substr(begin, end, d_str); + auto itr = begin + lhs.size_bytes(); + + auto min_itr = itr; // these are set along with + auto min_size = lhs.size_bytes(); // the min_rank variable + + // check each adjacent pair against the d_map + while (itr < end) { + auto const rhs = next_substr(itr, end, d_str); + if (rhs.empty()) break; // no more adjacent pairs + + auto const hash = compute_hash(lhs, rhs); + auto const map_itr = d_map.find(hash); + if (map_itr != d_map.end()) { + // found a match; record the rank (and other min_ vars) + auto const rank = static_cast(map_itr->second); + if (rank < min_rank) { + min_rank = rank; + min_itr = itr; + min_size = rhs.size_bytes(); + } + } + // next substring + lhs = rhs; itr += rhs.size_bytes(); + } + + // if any pair matched, remove every occurrence from the string + if (min_rank < cuda::std::numeric_limits::max()) { + // remove the first pair we found + itr = min_itr; + *itr = 0; + + // continue scanning for other occurrences in the remainder of the string + itr += min_size; if (itr < end) { - rhs = next_substr(itr, end, d_str); // skip to the next pair - } else { - break; // done with this string + auto const d_pair = dissect_merge_pair(min_rank); + + lhs = next_substr(itr, end, d_str); + itr += lhs.size_bytes(); + while (itr < end) { + auto rhs = next_substr(itr, end, 
d_str); + if (d_pair.first == lhs && d_pair.second == rhs) { + *itr = 0; // removes the pair from this string + itr += rhs.size_bytes(); + if (itr >= end) { break; } // done checking for pairs + // skip to the next adjacent pair + rhs = next_substr(itr, end, d_str); + } + // next substring + lhs = rhs; + itr += rhs.size_bytes(); + } } } - // next substring - lhs = rhs; - itr += rhs.size_bytes(); } - // reset for next iteration - d_min_ranks[idx] = cuda::std::numeric_limits::max(); - } -}; - -/** - * @brief Computes the output size of each string. - * - * The output size is the size of the current string plus the - * number of spaces to be added between adjacent substrings. - * The number of spaces will equal the number of non-zero byte indices - * for the string. - */ -struct compute_sizes_fn { - cudf::column_device_view const d_strings; - cudf::size_type* d_byte_indices; - - __device__ cudf::size_type operator()(cudf::size_type idx) - { - if (d_strings.is_null(idx)) return 0; - auto const d_str = d_strings.element(idx); - auto offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); - auto d_indices = d_byte_indices + offset; - return d_str.size_bytes() + thrust::count_if( // number of non-zero byte indices - thrust::seq, - d_indices, - d_indices + d_str.size_bytes(), - [](auto v) { return v != 0; }); + // compute and store the output size for this string's encoding + auto const encoded_size = d_str.size_bytes() + // number of original bytes + + thrust::count_if( // number of non-zero byte indices + thrust::seq, + d_indices, + d_indices + d_str.size_bytes(), + [](auto v) { return v != 0; }); + d_sizes[idx] = static_cast(encoded_size); } }; @@ -271,27 +285,28 @@ struct compute_sizes_fn { */ struct build_encoding_fn { cudf::column_device_view const d_strings; - cudf::size_type* d_byte_indices; + cudf::size_type const* d_byte_indices; cudf::offset_type const* d_offsets; char* d_chars{}; __device__ void operator()(cudf::size_type idx) 
{ - if (d_strings.is_null(idx)) return; - auto const d_str = d_strings.element(idx); - if (d_str.empty()) return; - - auto offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); - auto d_indices = d_byte_indices + offset; - auto d_output = d_chars ? d_chars + d_offsets[idx] : nullptr; - - // copy chars while indices==0, add space each time indices!=0 - auto begin = d_indices; - auto end = d_indices + d_str.size_bytes(); - auto d_input = d_str.data(); - *d_output++ = *d_input++; - auto itr = begin + 1; + if (d_strings.is_null(idx)) { return; } + auto const d_str = resolve_string(d_strings.element(idx)); + if (d_str.empty()) { return; } + + auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) + .element(idx); + auto const d_indices = d_byte_indices + offset; + auto d_output = d_chars ? d_chars + d_offsets[idx] : nullptr; + + // copy chars while indices[i]==0, + // insert space each time indices[i]!=0 + auto const begin = d_indices; + auto const end = d_indices + d_str.size_bytes(); + auto d_input = d_str.data(); + *d_output++ = *d_input++; + auto itr = begin + 1; while (itr < end) { if (*itr++) *d_output++ = ' '; *d_output++ = *d_input++; @@ -310,119 +325,173 @@ struct build_encoding_fn { * * Once all pairs have exhausted for all strings, the output is constructed from * the results by adding spaces between each remaining pair in each string. + * + * @param input Strings to encode. + * @param merge_pairs Merge pairs data and map used for encoding. 
+ * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr byte_pair_encoding( cudf::strings_column_view const& input, - bpe_merge_pairs const& merges_table, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + bpe_merge_pairs::bpe_merge_pairs_impl const& merge_pairs, + rmm::cuda_stream_view stream) { - // build working vectors - rmm::device_uvector d_byte_indices(input.chars().size(), stream); - rmm::device_uvector d_min_ranks(input.size(), stream); + CUDF_EXPECTS(!merge_pairs.get_merge_pairs().is_empty(), "Merge pairs table must not be empty"); - auto d_merges = cudf::column_device_view::create(merges_table.merge_pairs->view(), stream); - auto d_strings = cudf::column_device_view::create(input.parent(), stream); + // build working vector to hold index values per byte + rmm::device_uvector d_byte_indices(input.chars().size(), stream); + auto const d_merges = cudf::column_device_view::create(merge_pairs.get_merge_pairs(), stream); + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, + static_cast(input.size() + 1), + cudf::mask_state::UNALLOCATED, + stream, + rmm::mr::get_current_device_resource()); + auto d_offsets = offsets->mutable_view().data(); + + byte_pair_encoding_fn fn{*d_merges, + *d_strings, + merge_pairs.get_merge_pairs_map(), + d_offsets, + string_hasher_type{}, + d_byte_indices.data()}; thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - input.size(), - initialize_indices_fn{*d_merges, *d_strings, d_byte_indices.data(), d_min_ranks.data()}); - - cudf::size_type min_rank = 0; - while (min_rank < std::numeric_limits::max()) { - // find minimum merge pair for each string - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - d_merges->size(), - 
find_minimum_pair_fn{*d_merges, *d_strings, d_byte_indices.data(), d_min_ranks.data()}); - - // get the minimum rank over all strings; - // this is only used to see if we are finished - min_rank = thrust::reduce(rmm::exec_policy(stream), - d_min_ranks.begin(), - d_min_ranks.end(), - std::numeric_limits::max(), - thrust::minimum{}); - - // check if any pairs have been found; - // if so, remove that pair from each string - if (min_rank < std::numeric_limits::max()) { - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - input.size(), - remove_pair_fn{*d_merges, *d_strings, d_byte_indices.data(), d_min_ranks.data()}); - } - } + rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), fn); - // build the output: - // add spaces between the remaining pairs in each string - auto offsets_itr = - thrust::make_transform_iterator(thrust::make_counting_iterator(0), - compute_sizes_fn{*d_strings, d_byte_indices.data()}); - auto offsets = cudf::strings::detail::make_offsets_child_column( - offsets_itr, offsets_itr + input.size(), stream, mr); - auto d_offsets = offsets->view().data(); + // build the output: add spaces between the remaining pairs in each string + thrust::exclusive_scan( + rmm::exec_policy(stream), d_offsets, d_offsets + input.size() + 1, d_offsets); auto const bytes = cudf::detail::get_value(offsets->view(), input.size(), stream); - auto chars = cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - auto d_chars = chars->mutable_view().data(); + auto chars = cudf::strings::detail::create_chars_child_column( + bytes, stream, rmm::mr::get_current_device_resource()); + auto d_chars = chars->mutable_view().data(); + thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), build_encoding_fn{*d_strings, d_byte_indices.data(), d_offsets, d_chars}); - return make_strings_column(input.size(), - std::move(offsets), - std::move(chars), - input.null_count(), - 
cudf::detail::copy_bitmask(input.parent(), stream, mr)); + return make_strings_column( + input.size(), std::move(offsets), std::move(chars), 0, rmm::device_buffer{}); +} + +/** + * @brief Create new offsets by identifying substrings by whitespace. + * + * This is similar to cudf::strings::split_record but does not fully split + * and only returns new offsets. The behavior is more like a view-only slice + * of the chars child with the result still including trailing delimiters. + * + * The encoding algorithm knows to ignore the trailing whitespace of each string. + * + * @param input Strings to tokenize. + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr space_offsets(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream) +{ + // count space offsets + // TODO: does not yet account for sliced input column + auto const begin = thrust::make_counting_iterator(1); + auto const end = thrust::make_counting_iterator(input.chars().size()); + auto edge_of_space = [d_chars = input.chars().data()] __device__(auto idx) -> bool { + return !is_whitespace(d_chars[idx]) && is_whitespace(d_chars[idx - 1]); + }; + auto space_count = thrust::count_if(rmm::exec_policy(stream), begin, end, edge_of_space); + + // copy space offsets + rmm::device_uvector space_offsets(space_count, stream); + thrust::copy_if(rmm::exec_policy(stream), begin, end, space_offsets.data(), edge_of_space); + + // create output offsets + auto split_offsets = + cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, + static_cast(space_count + input.size() + 1), + cudf::mask_state::UNALLOCATED, + stream, + rmm::mr::get_current_device_resource()); + + // combine current offsets with space offsets + thrust::merge(rmm::exec_policy(stream), + input.offsets_begin(), + input.offsets_end(), + space_offsets.begin(), + space_offsets.end(), + split_offsets->mutable_view().begin()); + + return split_offsets; } +/** + * @brief Build new offsets 
that can be used to build a list column for calling join. + * + * This essentially returns the number of tokens for each string. + */ +struct split_offsets_fn { + cudf::column_device_view const d_strings; + __device__ cudf::size_type operator()(cudf::size_type idx) + { + if (d_strings.is_null(idx)) return 0; + auto const d_str = d_strings.element(idx); + // TODO: does not correctly account for adjacent whitespace + auto const result = thrust::count_if( + thrust::seq, d_str.begin(), d_str.end(), [](auto ch) { return is_whitespace(ch); }); + return static_cast(result) + 1; + } +}; + } // namespace std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, - bpe_merge_pairs const& merges_table, + bpe_merge_pairs const& merge_pairs, cudf::string_scalar const& separator, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const strings_count = input.size(); - if (strings_count == 0 || input.chars_size() == 0) + if (input.is_empty() || input.chars_size() == 0) return cudf::make_empty_column(cudf::type_id::STRING); - CUDF_EXPECTS(!merges_table.merge_pairs->view().is_empty(), "Merge pairs table must not be empty"); - // split input on whitespace - auto split_result = cudf::strings::detail::split_record( - input, cudf::string_scalar(""), -1, stream, rmm::mr::get_current_device_resource()); - auto split_view = cudf::lists_column_view(split_result->view()); + auto d_strings = cudf::column_device_view::create(input.parent(), stream); + auto offsets_itr = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), split_offsets_fn{*d_strings}); + auto split_offsets = cudf::strings::detail::make_offsets_child_column( + offsets_itr, offsets_itr + input.size(), stream, rmm::mr::get_current_device_resource()); + + auto offsets = space_offsets(input, stream); + + auto const split_view = cudf::column_view(cudf::data_type{cudf::type_id::STRING}, + offsets->size() - 1, + nullptr, // no parent data + nullptr, // null-mask + 0, // 
null-count + 0, // offset + {offsets->view(), input.chars()}); // run BPE on the strings child column - auto bpe_column = byte_pair_encoding(split_view.child(), merges_table, stream); + auto bpe_column = + byte_pair_encoding(cudf::strings_column_view(split_view), *(merge_pairs.impl), stream); - // recombine the result: - // use the offsets from split_record and the strings from byte_pair_encoding - // to build a lists column_view + // recombine the result auto list_join = cudf::column_view(cudf::data_type{cudf::type_id::LIST}, - strings_count, + input.size(), nullptr, // no parent data in list column - split_view.null_mask(), - split_view.null_count(), + input.null_mask(), + input.null_count(), 0, - {split_view.offsets(), bpe_column->view()}); + {split_offsets->view(), bpe_column->view()}); // use join_list_elements to build the output strings column - return cudf::strings::detail::join_list_elements( - cudf::lists_column_view(list_join), - separator, - cudf::string_scalar(""), - cudf::strings::separator_on_nulls::NO, - cudf::strings::output_if_empty_list::EMPTY_STRING, - stream, - mr); + auto result = + cudf::strings::detail::join_list_elements(cudf::lists_column_view(list_join), + separator, + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO, + cudf::strings::output_if_empty_list::EMPTY_STRING, + stream, + mr); + return result; } } // namespace detail diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh new file mode 100644 index 00000000000..ac21eea0261 --- /dev/null +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include +#include + +#include + +#include +#include +#include + +#include + +namespace nvtext { +namespace detail { + +using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; + +using merge_pairs_map_type = cuco::static_map; + +using string_hasher_type = MurmurHash3_32; + +} // namespace detail + +struct bpe_merge_pairs::bpe_merge_pairs_impl { + std::unique_ptr const merge_pairs; + std::unique_ptr merge_pairs_map; + + bpe_merge_pairs_impl(std::unique_ptr&& merge_pairs, + std::unique_ptr&& merge_pairs_map); + + auto get_merge_pairs() const { return merge_pairs->view(); } + auto get_merge_pairs_map() const { return merge_pairs_map->get_device_view(); } +}; + +} // namespace nvtext diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index eaeaec03a96..1ea2bdcaba5 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -14,9 +14,12 @@ * limitations under the License. */ +#include + #include #include +#include #include #include #include @@ -31,6 +34,22 @@ namespace nvtext { namespace detail { +namespace { + +struct make_pair_function { + /** + * @brief Hash the merge pair entry + */ + __device__ cuco::pair_type operator()(cudf::size_type idx) + { + auto const result = _hasher(d_strings.element(idx)); + return cuco::make_pair(result, idx); + } + + string_hasher_type const _hasher; + cudf::column_device_view const d_strings; +}; + /** * @brief Loads a text file of merge-pairs into a strings column. 
* @@ -41,18 +60,17 @@ namespace detail { * #version .. * a1 a2 * b1 b2 - * c1 c3 + * c1 c2 * ... * @endcode * * @param filename_merges Path to text file containing merge-pairs. * @return object containing table elements for the BPE function */ -std::unique_ptr load_merge_pairs_file(std::string const& filename_merges, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr load_file_to_column(std::string const& filename_merges, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - bpe_merge_pairs result; std::ifstream merges_file(filename_merges); CUDF_EXPECTS(merges_file.good(), "Could not open " + filename_merges); @@ -74,11 +92,60 @@ std::unique_ptr load_merge_pairs_file(std::string const& filena CUDF_EXPECTS(!chars.empty(), "No data found in " + filename_merges); - auto d_chars = cudf::detail::make_device_uvector_async(chars, stream, mr); - auto d_offsets = cudf::detail::make_device_uvector_async(offsets, stream, mr); - result.merge_pairs = cudf::make_strings_column(d_chars, d_offsets); + auto d_chars = cudf::detail::make_device_uvector_async(chars, stream, mr); + auto d_offsets = cudf::detail::make_device_uvector_async(offsets, stream, mr); + return cudf::make_strings_column(d_chars, d_offsets); +} - return std::make_unique(std::move(result)); +std::unique_ptr initialize_merge_pairs_map( + cudf::strings_column_view const& input, rmm::cuda_stream_view stream) +{ + auto merge_pairs_map = std::make_unique( + static_cast(input.size() * 2), // ensure capacity is at least (size*10/7) + std::numeric_limits::max(), + -1, // empty-value is not used + hash_table_allocator_type{default_allocator{}, stream}, + stream.value()); + + auto d_strings = cudf::column_device_view::create(input.parent(), stream); + make_pair_function pair_func{string_hasher_type{}, *d_strings}; + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); + + merge_pairs_map->insert(iter, + iter + input.size(), + 
cuco::detail::MurmurHash3_32{}, + thrust::equal_to{}, + stream.value()); + + return merge_pairs_map; +} + +std::unique_ptr create_bpe_merge_pairs_impl( + std::unique_ptr&& input, rmm::cuda_stream_view stream) +{ + auto merge_pairs = initialize_merge_pairs_map(cudf::strings_column_view(input->view()), stream); + auto result = std::make_unique( + std::move(input), std::move(merge_pairs)); + return result; +} + +std::unique_ptr create_bpe_merge_pairs_impl( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return create_bpe_merge_pairs_impl(std::make_unique(input.parent(), stream, mr), + stream); +} + +} // namespace + +std::unique_ptr load_merge_pairs_file(std::string const& filename_merges, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto input_column = load_file_to_column(filename_merges, stream, mr); + return std::make_unique(std::move(input_column), stream, mr); } } // namespace detail @@ -90,4 +157,30 @@ std::unique_ptr load_merge_pairs_file(std::string const& filena return detail::load_merge_pairs_file(filename_merges, rmm::cuda_stream_default, mr); } +bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl( + std::unique_ptr&& merge_pairs, + std::unique_ptr&& merge_pairs_map) + : merge_pairs(std::move(merge_pairs)), merge_pairs_map(std::move(merge_pairs_map)) +{ +} + +bpe_merge_pairs::bpe_merge_pairs(std::unique_ptr&& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource*) + : impl(detail::create_bpe_merge_pairs_impl(std::move(input), stream)) +{ +} + +bpe_merge_pairs::bpe_merge_pairs(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : impl(detail::create_bpe_merge_pairs_impl(input, stream, mr)) +{ +} + +bpe_merge_pairs::~bpe_merge_pairs() = default; + +cudf::size_type bpe_merge_pairs::get_size() { return impl->merge_pairs->size(); } +std::size_t bpe_merge_pairs::get_map_size() 
{ return impl->merge_pairs_map->get_size(); } + } // namespace nvtext diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp index a009919fa21..a0bb926ee9d 100644 --- a/cpp/tests/text/bpe_tests.cpp +++ b/cpp/tests/text/bpe_tests.cpp @@ -29,7 +29,7 @@ struct TextBPETokenize : public cudf::test::BaseFixture { TEST_F(TextBPETokenize, BytePairEncoding) { - // fake table based on values from https://huggingface.co/gpt2/raw/main/merges.txt + // partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt auto mpt = cudf::test::strings_column_wrapper({ "e n", // 12 "i t", // 14 @@ -44,7 +44,8 @@ TEST_F(TextBPETokenize, BytePairEncoding) "t est", // 9032 "s ent", // 33830 }); - nvtext::bpe_merge_pairs merge_pairs{mpt.release()}; + + nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)}; auto validity = cudf::test::iterators::null_at(4); cudf::test::strings_column_wrapper input({"This is it", @@ -58,10 +59,10 @@ TEST_F(TextBPETokenize, BytePairEncoding) auto results = nvtext::byte_pair_encoding(sv, merge_pairs); - auto expected = cudf::test::strings_column_wrapper({"This Ġis Ġit", - "This Ġis Ġtest - sent ence - 1", - "This Ġis Ġtest Ġsent ence - 2", - "This - is Ġtest Ġsent ence Ġ3", + auto expected = cudf::test::strings_column_wrapper({"This is it", + "This is test - sent ence - 1", + "This is test sent ence - 2", + "This - is test sent ence 3", "", ""}, validity); From 3df89a0dd8df7bd603887d0fb448048705dfd2cd Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 18 Feb 2022 17:10:14 -0500 Subject: [PATCH 04/10] handle sliced input column --- cpp/src/text/subword/bpe_tokenizer.cu | 124 +++++++++++++++++--------- cpp/tests/text/bpe_tests.cpp | 8 +- 2 files changed, 91 insertions(+), 41 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 20837716845..49c53b7547c 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -30,6 
+30,7 @@ #include #include +#include #include #include #include @@ -60,9 +61,9 @@ constexpr bool is_whitespace(CharType ch) __device__ cudf::string_view resolve_string(cudf::string_view const& d_str) { auto const begin = d_str.data(); - auto end = thrust::find_if( + auto const end = thrust::find_if( thrust::seq, begin, begin + d_str.size_bytes(), [](auto ch) { return is_whitespace(ch); }); - auto size = static_cast(thrust::distance(begin, end)); + auto const size = static_cast(thrust::distance(begin, end)); return cudf::string_view(begin, size); } @@ -363,8 +364,9 @@ std::unique_ptr byte_pair_encoding( thrust::exclusive_scan( rmm::exec_policy(stream), d_offsets, d_offsets + input.size() + 1, d_offsets); - auto const bytes = cudf::detail::get_value(offsets->view(), input.size(), stream); - auto chars = cudf::strings::detail::create_chars_child_column( + auto const bytes = + cudf::detail::get_value(offsets->view(), input.size(), stream); + auto chars = cudf::strings::detail::create_chars_child_column( bytes, stream, rmm::mr::get_current_device_resource()); auto d_chars = chars->mutable_view().data(); @@ -377,6 +379,37 @@ std::unique_ptr byte_pair_encoding( input.size(), std::move(offsets), std::move(chars), 0, rmm::device_buffer{}); } +/** + * @brief Detect space to not-space transitions inside each string. + * + * This handles sliced input and null strings as well. + * It is parallelized over bytes and returns true only for valid left edges + * -- non-space proceeded by a space. 
+ */ +struct edge_of_space_fn { + cudf::column_device_view const d_strings; + __device__ bool operator()(cudf::offset_type offset) + { + auto const d_chars = + d_strings.child(cudf::strings_column_view::chars_column_index).data(); + if (is_whitespace(d_chars[offset]) || !is_whitespace(d_chars[offset - 1])) { return false; } + + auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); + auto const d_offsets = offsets.data() + d_strings.offset(); + // ignore offsets outside sliced range + if (offset < d_offsets[0] || offset >= d_offsets[d_strings.size()]) { return false; } + + auto itr = + thrust::lower_bound(thrust::seq, d_offsets, d_offsets + d_strings.size() + 1, offset); + // ignore offsets at existing string boundaries + if (*itr == offset) { return false; } + + // count only edges for valid strings + auto const index = static_cast(thrust::distance(d_offsets, itr)) - 1; + return d_strings.is_valid(index); + } +}; + /** * @brief Create new offsets by identifying substrings by whitespace. * @@ -388,25 +421,24 @@ std::unique_ptr byte_pair_encoding( * * @param input Strings to tokenize. * @param stream CUDA stream used for device memory operations and kernel launches + * @return New offsets including those at the edge of each space. 
*/ std::unique_ptr space_offsets(cudf::strings_column_view const& input, + cudf::column_device_view const& d_strings, rmm::cuda_stream_view stream) { // count space offsets - // TODO: does not yet account for sliced input column - auto const begin = thrust::make_counting_iterator(1); - auto const end = thrust::make_counting_iterator(input.chars().size()); - auto edge_of_space = [d_chars = input.chars().data()] __device__(auto idx) -> bool { - return !is_whitespace(d_chars[idx]) && is_whitespace(d_chars[idx - 1]); - }; - auto space_count = thrust::count_if(rmm::exec_policy(stream), begin, end, edge_of_space); + auto const begin = thrust::make_counting_iterator(1); + auto const end = thrust::make_counting_iterator(input.chars().size()); + edge_of_space_fn edge_of_space{d_strings}; + auto const space_count = thrust::count_if(rmm::exec_policy(stream), begin, end, edge_of_space); // copy space offsets rmm::device_uvector space_offsets(space_count, stream); thrust::copy_if(rmm::exec_policy(stream), begin, end, space_offsets.data(), edge_of_space); // create output offsets - auto split_offsets = + auto result = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, static_cast(space_count + input.size() + 1), cudf::mask_state::UNALLOCATED, @@ -419,9 +451,9 @@ std::unique_ptr space_offsets(cudf::strings_column_view const& inp input.offsets_end(), space_offsets.begin(), space_offsets.end(), - split_offsets->mutable_view().begin()); + result->mutable_view().begin()); - return split_offsets; + return result; } /** @@ -429,15 +461,22 @@ std::unique_ptr space_offsets(cudf::strings_column_view const& inp * * This essentially returns the number of tokens for each string. 
*/ -struct split_offsets_fn { +struct list_offsets_fn { cudf::column_device_view const d_strings; __device__ cudf::size_type operator()(cudf::size_type idx) { if (d_strings.is_null(idx)) return 0; auto const d_str = d_strings.element(idx); - // TODO: does not correctly account for adjacent whitespace - auto const result = thrust::count_if( - thrust::seq, d_str.begin(), d_str.end(), [](auto ch) { return is_whitespace(ch); }); + if (d_str.empty()) return 1; // empty is a single valid result + + auto const begin = thrust::make_counting_iterator(1); + auto const end = thrust::make_counting_iterator(d_str.size_bytes()); + + // this counts the number of non-adjacent delimiters + auto const result = + thrust::count_if(thrust::seq, begin, end, [data = d_str.data()](auto chidx) { + return !is_whitespace(data[chidx]) && is_whitespace(data[chidx - 1]); + }); return static_cast(result) + 1; } }; @@ -453,15 +492,11 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const if (input.is_empty() || input.chars_size() == 0) return cudf::make_empty_column(cudf::type_id::STRING); - auto d_strings = cudf::column_device_view::create(input.parent(), stream); - auto offsets_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), split_offsets_fn{*d_strings}); - auto split_offsets = cudf::strings::detail::make_offsets_child_column( - offsets_itr, offsets_itr + input.size(), stream, rmm::mr::get_current_device_resource()); - - auto offsets = space_offsets(input, stream); + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + auto const offsets = space_offsets(input, *d_strings, stream); - auto const split_view = cudf::column_view(cudf::data_type{cudf::type_id::STRING}, + // build a view using the new offsets and the current input chars column + auto const input_view = cudf::column_view(cudf::data_type{cudf::type_id::STRING}, offsets->size() - 1, nullptr, // no parent data nullptr, // null-mask @@ -469,20 +504,29 @@ 
std::unique_ptr byte_pair_encoding(cudf::strings_column_view const 0, // offset {offsets->view(), input.chars()}); - // run BPE on the strings child column - auto bpe_column = - byte_pair_encoding(cudf::strings_column_view(split_view), *(merge_pairs.impl), stream); - - // recombine the result - auto list_join = cudf::column_view(cudf::data_type{cudf::type_id::LIST}, - input.size(), - nullptr, // no parent data in list column - input.null_mask(), - input.null_count(), - 0, - {split_offsets->view(), bpe_column->view()}); - - // use join_list_elements to build the output strings column + // run BPE on this view + auto const bpe_column = + byte_pair_encoding(cudf::strings_column_view(input_view), *(merge_pairs.impl), stream); + + // recombine the result: + // compute the offsets needed to build a list view + auto const list_offsets = [d_strings = *d_strings, stream] { + auto offsets_itr = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), list_offsets_fn{d_strings}); + return cudf::strings::detail::make_offsets_child_column( + offsets_itr, offsets_itr + d_strings.size(), stream, rmm::mr::get_current_device_resource()); + }(); + + // build a list column_view using the BPE output and the list_offsets + auto const list_join = cudf::column_view(cudf::data_type{cudf::type_id::LIST}, + input.size(), + nullptr, // no parent data in list column + input.null_mask(), + input.null_count(), + 0, + {list_offsets->view(), bpe_column->view()}); + + // build the output strings column auto result = cudf::strings::detail::join_list_elements(cudf::lists_column_view(list_join), separator, diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp index a0bb926ee9d..1edf94c6c61 100644 --- a/cpp/tests/text/bpe_tests.cpp +++ b/cpp/tests/text/bpe_tests.cpp @@ -48,7 +48,7 @@ TEST_F(TextBPETokenize, BytePairEncoding) nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)}; auto validity = cudf::test::iterators::null_at(4); - 
cudf::test::strings_column_wrapper input({"This is it", + cudf::test::strings_column_wrapper input({"This\tis it\n", "This is test-sentence-1", "This is test sentence-2", "This-is test sentence 3", @@ -67,6 +67,12 @@ TEST_F(TextBPETokenize, BytePairEncoding) ""}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto sliced = cudf::slice(input, {1, 4}).front(); + auto sliced_expected = cudf::slice(expected, {1, 4}).front(); + + results = nvtext::byte_pair_encoding(cudf::strings_column_view(sliced), merge_pairs); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), sliced_expected); } TEST_F(TextBPETokenize, BPE_Empty) From 6eb61713a9887093a051661e45ac7bca1c36a310 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 22 Feb 2022 19:38:24 -0500 Subject: [PATCH 05/10] add leading space to test --- cpp/tests/text/bpe_tests.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp index 1edf94c6c61..c9f61a16964 100644 --- a/cpp/tests/text/bpe_tests.cpp +++ b/cpp/tests/text/bpe_tests.cpp @@ -48,7 +48,7 @@ TEST_F(TextBPETokenize, BytePairEncoding) nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)}; auto validity = cudf::test::iterators::null_at(4); - cudf::test::strings_column_wrapper input({"This\tis it\n", + cudf::test::strings_column_wrapper input({" This\tis it\n", "This is test-sentence-1", "This is test sentence-2", "This-is test sentence 3", @@ -59,7 +59,7 @@ TEST_F(TextBPETokenize, BytePairEncoding) auto results = nvtext::byte_pair_encoding(sv, merge_pairs); - auto expected = cudf::test::strings_column_wrapper({"This is it", + auto expected = cudf::test::strings_column_wrapper({" This is it", "This is test - sent ence - 1", "This is test sent ence - 2", "This - is test sent ence 3", From 84a2cbece465d455fe458cdea7eb4a6120cb28ee Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 23 Feb 2022 09:39:20 -0500 Subject: [PATCH 06/10] add separator test --- 
cpp/tests/text/bpe_tests.cpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp index c9f61a16964..07f3a41f0e2 100644 --- a/cpp/tests/text/bpe_tests.cpp +++ b/cpp/tests/text/bpe_tests.cpp @@ -66,13 +66,30 @@ TEST_F(TextBPETokenize, BytePairEncoding) "", ""}, validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); auto sliced = cudf::slice(input, {1, 4}).front(); auto sliced_expected = cudf::slice(expected, {1, 4}).front(); results = nvtext::byte_pair_encoding(cudf::strings_column_view(sliced), merge_pairs); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), sliced_expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); +} + +TEST_F(TextBPETokenize, BytePairEncodingSeparator) +{ + auto mpt = cudf::test::strings_column_wrapper( + {"e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"}); + nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)}; + + cudf::test::strings_column_wrapper input( + {"test-sentence-1", "test sentence-2", "test sentence 3", " test sentence 4 "}); + auto sv = cudf::strings_column_view(input); + + auto results = nvtext::byte_pair_encoding(sv, merge_pairs, std::string(" Ġ")); + + auto expected = cudf::test::strings_column_wrapper( + {"test - sent ence - 1", "test Ġsent ence - 2", "test Ġsent ence Ġ3", " Ġtest Ġsent ence Ġ4"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } TEST_F(TextBPETokenize, BPE_Empty) From d282330b885aa21bbb2106e1cb55ef67ae8c273e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 7 Mar 2022 11:00:37 -0500 Subject: [PATCH 07/10] fix typos in and clarify comments --- cpp/include/nvtext/bpe_tokenize.hpp | 8 +++--- cpp/src/strings/split/split_record.cu | 31 ++++++------------------ cpp/src/text/subword/bpe_tokenizer.cuh | 4 +-- cpp/src/text/subword/load_merges_file.cu | 10 
+++++--- 4 files changed, 19 insertions(+), 34 deletions(-) diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp index 53eda236d0b..23af9731268 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/bpe_tokenize.hpp @@ -66,10 +66,10 @@ struct bpe_merge_pairs { * @endcode * * The pairs are expected to be ordered in the file by their rank - * relative to each other. A pair will be is in priority over + * relative to each other. A pair earlier in the file has priority over * any pairs below it. * - * @param filename_merges Local file path of pairs encoded in UTF-8 + * @param filename_merges Local file path of pairs encoded in UTF-8. * @param mr Memory resource to allocate any returned objects. */ std::unique_ptr load_merge_pairs_file( @@ -80,7 +80,7 @@ std::unique_ptr load_merge_pairs_file( * @brief Byte pair encode the input strings. * * This will split each string on whitespace, perform the encoding, - * and then build the output using the given `separator`. + * and then build the output column using the given `separator`. * * The encoding algorithm rebuilds each string by matching substrings * in the `merge_pairs` table and iteratively removing the minimum ranked pair @@ -91,7 +91,7 @@ std::unique_ptr load_merge_pairs_file( * mps = load_merges_file("merges.txt") * input = ["test sentence", "thisis test"] * result = byte_pair_encoding(input, mps) - * result is now ["test sent tence", "this is test"] + * result is now ["test sent ence", "this is test"] * @endcode * * @throw cudf::logic_error if `merge_pairs` is empty diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 8834ece8734..929d21a024c 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -217,6 +217,8 @@ struct whitespace_token_reader_fn { } }; +} // namespace + // The output is one list item per string template std::unique_ptr split_record_fn(strings_column_view const& strings, @@ -287,27 +289,6 @@ std::unique_ptr split_record( mr); } } -} // namespace - -std::unique_ptr split_record(strings_column_view const& strings, - string_scalar const& delimiter, - size_type maxsplit, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return split_record( - strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); -} - -std::unique_ptr rsplit_record(strings_column_view const& strings, - string_scalar const& delimiter, - size_type maxsplit, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return split_record( - strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); -} } // namespace detail @@ -319,7 +300,8 @@ std::unique_ptr split_record(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_record(strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); + return detail::split_record( + strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); } std::unique_ptr rsplit_record(strings_column_view const& strings, @@ -328,7 +310,8 @@ std::unique_ptr rsplit_record(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit_record(strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); + return detail::split_record( + strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index ac21eea0261..31cc29a8d8a 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -16,10 +16,10 @@ 
#pragma once -#include - #include +#include + #include #include diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index 1ea2bdcaba5..bd206e30a27 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -83,7 +83,7 @@ std::unique_ptr load_file_to_column(std::string const& filename_me if (line.substr(0, version.size()).compare(version) == 0) { std::getline(merges_file, line); } // This is a text file delimited only by CR/LF. - // Look into using the CSV reader to load the strings column instead. + // TODO: Look into using the CSV reader to load the strings column instead. while (!line.empty()) { chars.insert(chars.end(), std::cbegin(line), std::cend(line)); offsets.push_back(offsets.back() + line.length()); @@ -100,10 +100,12 @@ std::unique_ptr load_file_to_column(std::string const& filename_me std::unique_ptr initialize_merge_pairs_map( cudf::strings_column_view const& input, rmm::cuda_stream_view stream) { + // Ensure capacity is at least (size*10/7) as documented here: + // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182 auto merge_pairs_map = std::make_unique( - static_cast(input.size() * 2), // ensure capacity is at least (size*10/7) - std::numeric_limits::max(), - -1, // empty-value is not used + static_cast(input.size() * 2), // capacity is 2x; + std::numeric_limits::max(), // empty key; + -1, // empty value is not used hash_table_allocator_type{default_allocator{}, stream}, stream.value()); From 93b0842134ded39bd5aae33617ffaf4c778ca7e2 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 15 Mar 2022 09:54:20 -0400 Subject: [PATCH 08/10] fix grammar and typos --- cpp/include/nvtext/bpe_tokenize.hpp | 4 ++-- cpp/src/text/subword/bpe_tokenizer.cu | 23 ++++++++++++----------- cpp/src/text/subword/load_merges_file.cu | 9 ++++----- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git 
a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp index 23af9731268..3af521a4fc5 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/bpe_tokenize.hpp @@ -31,7 +31,7 @@ namespace nvtext { /** * @brief The table of merge pairs for the BPE encoder. * - * To create an instance, call nvtext::load_merges_table + * To create an instance, call @ref nvtext::load_merge_pairs_file */ struct bpe_merge_pairs { struct bpe_merge_pairs_impl; @@ -98,7 +98,7 @@ std::unique_ptr load_merge_pairs_file( * @throw cudf::logic_error if `separator` is invalid * * @param input Strings to encode. - * @param merge_pairs Created by a call to nvtext::load_merges_file. + * @param merge_pairs Created by a call to @ref nvtext::load_merge_pairs_file. * @param separator String used to build the output after encoding. * Default is a space. * @param mr Memory resource to allocate any returned objects. diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 49c53b7547c..92f2e483bc6 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -49,16 +49,16 @@ constexpr bool is_whitespace(CharType ch) } /** - * @brief Resolve a truncated string from a full string. + * @brief Resolve a substring up to the first whitespace character. * * This will return a substring of the input starting with the first byte - * upto the first whitespace character is found or the end of the string. + * up to the first whitespace character found or the end of the string. * Any whitespace is expected only at the end of the string. * * @param d_str Input string to resolve. - * @return Substring of the input excluding trailing whitespace. + * @return Substring of the input excluding any trailing whitespace. 
*/ -__device__ cudf::string_view resolve_string(cudf::string_view const& d_str) +__device__ cudf::string_view get_first_token(cudf::string_view const& d_str) { auto const begin = d_str.data(); auto const end = thrust::find_if( @@ -178,7 +178,7 @@ struct byte_pair_encoding_fn { __device__ void operator()(cudf::size_type idx) { if (d_strings.is_null(idx)) { return; } - auto const d_str = resolve_string(d_strings.element(idx)); + auto const d_str = get_first_token(d_strings.element(idx)); if (d_str.empty()) { return; } auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) @@ -293,7 +293,7 @@ struct build_encoding_fn { __device__ void operator()(cudf::size_type idx) { if (d_strings.is_null(idx)) { return; } - auto const d_str = resolve_string(d_strings.element(idx)); + auto const d_str = get_first_token(d_strings.element(idx)); if (d_str.empty()) { return; } auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) @@ -312,6 +312,7 @@ struct build_encoding_fn { if (*itr++) *d_output++ = ' '; *d_output++ = *d_input++; } + // https://github.com/rapidsai/cudf/pull/10270/files#r826319405 } }; @@ -322,7 +323,7 @@ struct build_encoding_fn { * * The encoding is performed iteratively. Each pass determines the string's lowest * ranked merge pair as determined by the strings in `merges_table`. This pair - * is the removed (virtually) from each string before starting the next iteration. + * is removed (virtually) from each string before starting the next iteration. * * Once all pairs have exhausted for all strings, the output is constructed from * the results by adding spaces between each remaining pair in each string. 
@@ -344,7 +345,7 @@ std::unique_ptr byte_pair_encoding( auto const d_merges = cudf::column_device_view::create(merge_pairs.get_merge_pairs(), stream); auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, + auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, static_cast(input.size() + 1), cudf::mask_state::UNALLOCATED, stream, @@ -384,7 +385,7 @@ std::unique_ptr byte_pair_encoding( * * This handles sliced input and null strings as well. * It is parallelized over bytes and returns true only for valid left edges - * -- non-space proceeded by a space. + * -- non-space preceded by a space. */ struct edge_of_space_fn { cudf::column_device_view const d_strings; @@ -417,7 +418,7 @@ struct edge_of_space_fn { * and only returns new offsets. The behavior is more like a view-only slice * of the chars child with the result still including trailing delimiters. * - * The encoding algorithm knows to ignore the trailing whitespace of each string. + * The encoding algorithm ignores the trailing whitespace of each string. * * @param input Strings to tokenize. * @param stream CUDA stream used for device memory operations and kernel launches @@ -439,7 +440,7 @@ std::unique_ptr space_offsets(cudf::strings_column_view const& inp // create output offsets auto result = - cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, + cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, static_cast(space_count + input.size() + 1), cudf::mask_state::UNALLOCATED, stream, diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu index bd206e30a27..bdcbe45df64 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/subword/load_merges_file.cu @@ -64,7 +64,7 @@ struct make_pair_function { * ... * @endcode * - * @param filename_merges Path to text file containing merge-pairs. 
+ * @param filename_merges Path to text file containing merge-pairs * @return object containing table elements for the BPE function */ std::unique_ptr load_file_to_column(std::string const& filename_merges, @@ -100,7 +100,7 @@ std::unique_ptr load_file_to_column(std::string const& filename_me std::unique_ptr initialize_merge_pairs_map( cudf::strings_column_view const& input, rmm::cuda_stream_view stream) { - // Ensure capacity is at least (size*10/7) as documented here: + // Ensure capacity is at least (size/0.7) as documented here: // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182 auto merge_pairs_map = std::make_unique( static_cast(input.size() * 2), // capacity is 2x; @@ -126,9 +126,8 @@ std::unique_ptr create_bpe_merge_pairs_im std::unique_ptr&& input, rmm::cuda_stream_view stream) { auto merge_pairs = initialize_merge_pairs_map(cudf::strings_column_view(input->view()), stream); - auto result = std::make_unique( - std::move(input), std::move(merge_pairs)); - return result; + return std::make_unique(std::move(input), + std::move(merge_pairs)); } std::unique_ptr create_bpe_merge_pairs_impl( From 845a414b705f362ca186051e4f92633afe412ff2 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 15 Mar 2022 17:18:53 -0400 Subject: [PATCH 09/10] add more entries in load_merge_pairs_file doxygen example --- cpp/include/nvtext/bpe_tokenize.hpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp index 3af521a4fc5..23fcd3acd03 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/bpe_tokenize.hpp @@ -62,6 +62,15 @@ struct bpe_merge_pairs { * e n * i t * i s + * e s + * en t + * c e + * es t + * en ce + * T h + * Th is + * t est + * s ent * ... * @endcode * @@ -88,7 +97,7 @@ std::unique_ptr load_merge_pairs_file( * pairs before the result is joined to make the output string. 
* * @code{.pseudo} - * mps = load_merges_file("merges.txt") + * mps = load_merges_file("merges.txt") // see doxygen for example contents * input = ["test sentence", "thisis test"] * result = byte_pair_encoding(input, mps) * result is now ["test sent ence", "this is test"] From 060077bdb9ed0e3ce64d878e1f1681684ba738ee Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 15 Mar 2022 18:32:26 -0400 Subject: [PATCH 10/10] add check for unexpected data format --- cpp/src/text/subword/bpe_tokenizer.cu | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu index 92f2e483bc6..c9a1d685f2e 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/subword/bpe_tokenizer.cu @@ -91,13 +91,16 @@ struct byte_pair_encoding_fn { __device__ thrust::pair dissect_merge_pair( cudf::size_type idx) { - auto const d_pair = d_merges.element(idx); - auto const lhs = d_pair.data(); - auto const end_str = d_pair.data() + d_pair.size_bytes(); - auto const rhs = thrust::find(thrust::seq, lhs, end_str, ' ') + 1; - auto const lhs_size = static_cast(thrust::distance(lhs, rhs - 1)); - auto const rhs_size = static_cast(thrust::distance(rhs, end_str)); - return thrust::make_pair(cudf::string_view(lhs, lhs_size), cudf::string_view(rhs, rhs_size)); + auto const d_pair = d_merges.element(idx); + auto const lhs = d_pair.data(); + auto const end_str = d_pair.data() + d_pair.size_bytes(); + auto const rhs = thrust::find(thrust::seq, lhs, end_str, ' '); // space always expected + // check for malformed pair entry to prevent segfault + if (rhs == end_str) { return thrust::make_pair(cudf::string_view{}, cudf::string_view{}); } + auto const lhs_size = static_cast(thrust::distance(lhs, rhs)); + auto const rhs_size = static_cast(thrust::distance(rhs + 1, end_str)); + return thrust::make_pair(cudf::string_view(lhs, lhs_size), + cudf::string_view(rhs + 1, rhs_size)); } /**