diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp index 43cc059eddd..9d75295cd39 100644 --- a/cpp/include/nvtext/subword_tokenize.hpp +++ b/cpp/include/nvtext/subword_tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,9 +19,6 @@ #include #include -#include -#include - namespace nvtext { /** @@ -43,6 +40,8 @@ struct hashed_vocabulary { std::unique_ptr table; // uint64 std::unique_ptr bin_coefficients; // uint64 std::unique_ptr bin_offsets; // uint16 + std::unique_ptr cp_metadata; // uint32 + std::unique_ptr aux_cp_table; // uint64 }; /** diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index c6dd11c1d82..62fd98d2027 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -199,12 +200,14 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); // create the normalizer and call it - data_normalizer normalizer(stream, do_lower_case); - auto result = [&strings, &normalizer, stream] { - auto const offsets = strings.offsets(); - auto const d_offsets = offsets.data() + strings.offset(); - auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); - auto const d_chars = strings.chars().data() + offset; + auto result = [&] { + auto const cp_metadata = get_codepoint_metadata(stream); + auto const aux_table = get_aux_codepoint_data(stream); + auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case); + auto const offsets = strings.offsets(); + auto const d_offsets = offsets.data() + strings.offset(); + auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); + auto const d_chars = strings.chars().data() + offset; return normalizer.normalize(d_chars, d_offsets, strings.size(), stream); }(); diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index f3b642132e3..5af87f4de0e 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -261,17 +261,17 @@ __global__ void kernel_data_normalizer(unsigned char const* strings, } // namespace -data_normalizer::data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case) - : do_lower_case(do_lower_case) +data_normalizer::data_normalizer(codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case) + : d_cp_metadata{cp_metadata}, d_aux_table{aux_table}, do_lower_case{do_lower_case} { - d_cp_metadata = detail::get_codepoint_metadata(stream); - d_aux_table = detail::get_aux_codepoint_data(stream); } uvector_pair data_normalizer::normalize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream) const { if (num_strings == 0) return std::make_pair(std::make_unique>(0, stream), diff --git a/cpp/src/text/subword/detail/data_normalizer.hpp b/cpp/src/text/subword/detail/data_normalizer.hpp index 1a9eb5ba997..927de5a74f9 100644 --- a/cpp/src/text/subword/detail/data_normalizer.hpp +++ b/cpp/src/text/subword/detail/data_normalizer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,14 +48,17 @@ namespace detail { class data_normalizer { public: /** - * @brief Transfer to the GPU the metadata needed to normalize characters. + * @brief Create instance of the normalizer. * - * @param stream CUDA stream used for device memory operations and kernel launches. + * @param cp_metadata The code point metadata table to use for normalization. + * @param aux_table The auxiliary code point table. * @param do_lower_case If true, the normalizer will convert uppercase characters in the * input stream to lower case and strip accents from those characters. * If false, accented and uppercase characters are not transformed. */ - data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case = true); + data_normalizer(codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case = true); /** * @brief Normalize a vector of strings. @@ -84,7 +87,7 @@ class data_normalizer { uvector_pair normalize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - rmm::cuda_stream_view stream); + rmm::cuda_stream_view stream) const; private: bool const do_lower_case; diff --git a/cpp/src/text/subword/detail/tokenizer_utils.cuh b/cpp/src/text/subword/detail/tokenizer_utils.cuh index dcd241fc045..5e8de1ba244 100644 --- a/cpp/src/text/subword/detail/tokenizer_utils.cuh +++ b/cpp/src/text/subword/detail/tokenizer_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include #include @@ -57,22 +58,16 @@ struct update_strings_lengths_fn { /** * @brief Retrieve the code point metadata table. * - * This is a singleton instance that copies a large table of integers into - * device memory on the very first call. - * * @param stream CUDA stream used for device memory operations and kernel launches. */ -codepoint_metadata_type const* get_codepoint_metadata(rmm::cuda_stream_view stream); +rmm::device_uvector get_codepoint_metadata(rmm::cuda_stream_view stream); /** - * @brief Retrieve the aux code point metadata table. - * - * This is a singleton instance that copies a large table of integers into - * device memory on the very first call. + * @brief Retrieve the auxiliary code point metadata table. * * @param stream CUDA stream used for device memory operations and kernel launches. */ -aux_codepoint_data_type const* get_aux_codepoint_data(rmm::cuda_stream_view stream); +rmm::device_uvector get_aux_codepoint_data(rmm::cuda_stream_view stream); } // namespace detail } // namespace nvtext diff --git a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp index 0259e8ce4f4..b5ad9724d72 100644 --- a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp +++ b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,7 +61,6 @@ class wordpiece_tokenizer { * @param do_lower_case If true, the tokenizer will convert uppercase characters in the * input stream to lowercase and strip accents from those characters. * If false, accented and uppercase characters are not transformed. - * @param stream CUDA stream used for device memory operations and kernel launches. * @param max_word_length The length of the longest word that will be tokenized. Words * longer than this will simply be replaced by the unknown token * specified in the `vocab_file`. @@ -72,7 +71,6 @@ class wordpiece_tokenizer { uint32_t stride, bool do_truncate, bool do_lower_case, - rmm::cuda_stream_view stream, uint32_t max_word_length = 200); /** diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu index 75c79381032..7cfdb4dea96 100644 --- a/cpp/src/text/subword/load_hash_file.cu +++ b/cpp/src/text/subword/load_hash_file.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ #include -#include #include #include @@ -37,87 +36,32 @@ namespace nvtext { namespace detail { -namespace { -struct get_codepoint_metadata_init { - rmm::cuda_stream_view stream; - - rmm::device_uvector* operator()() const - { - auto table_vector = - new rmm::device_uvector(codepoint_metadata_size, stream); - auto table = table_vector->data(); - thrust::fill(rmm::exec_policy(stream), - table + cp_section1_end, - table + codepoint_metadata_size, - codepoint_metadata_default_value); - CUDA_TRY(cudaMemcpyAsync(table, - codepoint_metadata, - cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + cp_section2_begin, - cp_metadata_917505_917999, - (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section - cudaMemcpyHostToDevice, - stream.value())); - return table_vector; - }; -}; - -struct get_aux_codepoint_data_init { - rmm::cuda_stream_view stream; - - rmm::device_uvector* operator()() const - { - auto table_vector = - new rmm::device_uvector(aux_codepoint_data_size, stream); - auto table = table_vector->data(); - thrust::fill(rmm::exec_policy(stream), - table + aux_section1_end, - table + aux_codepoint_data_size, - aux_codepoint_default_value); - CUDA_TRY(cudaMemcpyAsync(table, - aux_codepoint_data, - aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + aux_section2_begin, - aux_cp_data_44032_55203, - (aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + aux_section3_begin, - aux_cp_data_70475_71099, - (aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section - cudaMemcpyHostToDevice, - stream.value())); - CUDA_TRY(cudaMemcpyAsync( - table + aux_section4_begin, - aux_cp_data_119134_119232, - (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section - cudaMemcpyHostToDevice, - stream.value())); - return table_vector; - } -}; -} // namespace - /** * @brief Retrieve the code point metadata table. * * Build the code point metadata table in device memory * using the vector pieces from codepoint_metadata.ah */ -const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stream) +rmm::device_uvector get_codepoint_metadata(rmm::cuda_stream_view stream) { - static cudf::strings::detail::thread_safe_per_context_cache< - rmm::device_uvector> - g_codepoint_metadata; - - return g_codepoint_metadata.find_or_initialize(get_codepoint_metadata_init{stream})->data(); + auto table_vector = rmm::device_uvector(codepoint_metadata_size, stream); + auto table = table_vector.data(); + thrust::fill(rmm::exec_policy(stream), + table + cp_section1_end, + table + codepoint_metadata_size, + codepoint_metadata_default_value); + CUDA_TRY(cudaMemcpyAsync(table, + codepoint_metadata, + cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + cp_section2_begin, + cp_metadata_917505_917999, + (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section + cudaMemcpyHostToDevice, + stream.value())); + return table_vector; } /** @@ -126,13 +70,38 @@ const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stre * Build the aux code point data table in device memory * using the vector pieces from codepoint_metadata.ah */ -const aux_codepoint_data_type* get_aux_codepoint_data(rmm::cuda_stream_view stream) +rmm::device_uvector get_aux_codepoint_data(rmm::cuda_stream_view stream) { - static cudf::strings::detail::thread_safe_per_context_cache< - rmm::device_uvector> - g_aux_codepoint_data; - - return g_aux_codepoint_data.find_or_initialize(get_aux_codepoint_data_init{stream})->data(); + auto table_vector = rmm::device_uvector(aux_codepoint_data_size, stream); + auto table = table_vector.data(); + thrust::fill(rmm::exec_policy(stream), + table + aux_section1_end, + table + aux_codepoint_data_size, + aux_codepoint_default_value); + CUDA_TRY(cudaMemcpyAsync(table, + aux_codepoint_data, + aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + aux_section2_begin, + aux_cp_data_44032_55203, + (aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + aux_section3_begin, + aux_cp_data_70475_71099, + (aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section + cudaMemcpyHostToDevice, + stream.value())); + CUDA_TRY(cudaMemcpyAsync( + table + aux_section4_begin, + aux_cp_data_119134_119232, + (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section + cudaMemcpyHostToDevice, + stream.value())); + return table_vector; } namespace { @@ -293,10 +262,15 @@ std::unique_ptr load_vocabulary_file( cudaMemcpyHostToDevice, stream.value())); - // this just initializes some constant tables into device memory - // to help speed up the runtime - detail::get_codepoint_metadata(stream); - detail::get_aux_codepoint_data(stream); + auto cp_metadata = detail::get_codepoint_metadata(stream); + auto const cp_metadata_size = static_cast(cp_metadata.size()); + result.cp_metadata = std::make_unique( + cudf::data_type{cudf::type_id::UINT32}, cp_metadata_size, cp_metadata.release()); + + auto aux_cp_table = detail::get_aux_codepoint_data(stream); + auto const aux_cp_table_size = static_cast(aux_cp_table.size()); + result.aux_cp_table = std::make_unique( + cudf::data_type{cudf::type_id::UINT64}, aux_cp_table_size, aux_cp_table.release()); return std::make_unique(std::move(result)); } diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 193cd80d9a6..1ac7dd0d8a1 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -153,7 +153,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, // Create tokenizer wordpiece_tokenizer tokenizer( - vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case, stream); + vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case); // Run tokenizer auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream); // assign output components diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index 00798e7e4e2..afd82f0bb5d 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -394,10 +394,11 @@ wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table, uint32_t stride, bool do_truncate, bool do_lower_case, - rmm::cuda_stream_view stream, uint32_t max_word_length) : vocab_table(vocab_table), - normalizer(stream, do_lower_case), + normalizer(vocab_table.cp_metadata->view().data(), + vocab_table.aux_cp_table->view().data(), + do_lower_case), max_sequence_length{max_sequence_length}, stride(stride), do_truncate(do_truncate),