Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Remove metadata singleton from nvtext normalizer #10090

Merged
merged 9 commits into from
Jan 26, 2022
4 changes: 3 additions & 1 deletion cpp/include/nvtext/subword_tokenize.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -43,6 +43,8 @@ struct hashed_vocabulary {
std::unique_ptr<cudf::column> table; // uint64
std::unique_ptr<cudf::column> bin_coefficients; // uint64
std::unique_ptr<cudf::column> bin_offsets; // uint16
std::unique_ptr<cudf::column> cp_metadata; // uint32
std::unique_ptr<cudf::column> aux_cp_table; // uint64
};

/**
Expand Down
9 changes: 6 additions & 3 deletions cpp/src/text/normalize.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,6 +15,7 @@
*/

#include <text/subword/detail/data_normalizer.hpp>
#include <text/subword/detail/tokenizer_utils.cuh>
#include <text/utilities/tokenize_ops.cuh>

#include <nvtext/normalize.hpp>
Expand Down Expand Up @@ -199,8 +200,10 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});

// create the normalizer and call it
data_normalizer normalizer(stream, do_lower_case);
auto result = [&strings, &normalizer, stream] {
auto result = [&] {
auto const cp_metadata = get_codepoint_metadata(stream);
auto const aux_table = get_aux_codepoint_data(stream);
data_normalizer normalizer(cp_metadata.data(), aux_table.data(), do_lower_case);
ttnghia marked this conversation as resolved.
Show resolved Hide resolved
auto const offsets = strings.offsets();
auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
auto const offset = cudf::detail::get_value<int32_t>(offsets, strings.offset(), stream);
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/text/subword/data_normalizer.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -262,11 +262,11 @@ __global__ void kernel_data_normalizer(unsigned char const* strings,

} // namespace

// Construct the normalizer from pre-built device tables.
//
// The code point metadata and auxiliary tables are now owned by the caller
// (e.g. the hashed_vocabulary) instead of a process-wide singleton, so this
// constructor only stores the non-owning pointers and the lower-case flag.
// The stale singleton lookups (get_codepoint_metadata/get_aux_codepoint_data
// on a stream parameter that no longer exists) have been removed.
data_normalizer::data_normalizer(codepoint_metadata_type const* cp_metadata,
                                 aux_codepoint_data_type const* aux_table,
                                 bool do_lower_case)
  : d_cp_metadata{cp_metadata}, d_aux_table{aux_table}, do_lower_case{do_lower_case}
{
}

uvector_pair data_normalizer::normalize(char const* d_strings,
Expand Down
11 changes: 7 additions & 4 deletions cpp/src/text/subword/detail/data_normalizer.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -48,14 +48,17 @@ namespace detail {
class data_normalizer {
public:
/**
* @brief Transfer to the GPU the metadata needed to normalize characters.
* @brief Create instance of the normalizer.
*
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param cp_metadata The code point metadata table to use for normalization.
* @param aux_table The auxiliary code point table.
* @param do_lower_case If true, the normalizer will convert uppercase characters in the
* input stream to lower case and strip accents from those characters.
* If false, accented and uppercase characters are not transformed.
*/
data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case = true);
data_normalizer(codepoint_metadata_type const* cp_metadata,
aux_codepoint_data_type const* aux_table,
bool do_lower_case = true);

/**
* @brief Normalize a vector of strings.
Expand Down
15 changes: 5 additions & 10 deletions cpp/src/text/subword/detail/tokenizer_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,6 +19,7 @@
#include <text/subword/detail/cp_data.h>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <stdint.h>

Expand Down Expand Up @@ -57,22 +58,16 @@ struct update_strings_lengths_fn {
/**
* @brief Retrieve the code point metadata table.
*
* This is a singleton instance that copies a large table of integers into
* device memory on the very first call.
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
codepoint_metadata_type const* get_codepoint_metadata(rmm::cuda_stream_view stream);
rmm::device_uvector<codepoint_metadata_type> get_codepoint_metadata(rmm::cuda_stream_view stream);

/**
* @brief Retrieve the aux code point metadata table.
*
* This is a singleton instance that copies a large table of integers into
* device memory on the very first call.
* @brief Retrieve the auxiliary code point metadata table.
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
aux_codepoint_data_type const* get_aux_codepoint_data(rmm::cuda_stream_view stream);
rmm::device_uvector<aux_codepoint_data_type> get_aux_codepoint_data(rmm::cuda_stream_view stream);

} // namespace detail
} // namespace nvtext
4 changes: 1 addition & 3 deletions cpp/src/text/subword/detail/wordpiece_tokenizer.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -61,7 +61,6 @@ class wordpiece_tokenizer {
* @param do_lower_case If true, the tokenizer will convert uppercase characters in the
* input stream to lowercase and strip accents from those characters.
* If false, accented and uppercase characters are not transformed.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param max_word_length The length of the longest word that will be tokenized. Words
* longer than this will simply be replaced by the unknown token
* specified in the `vocab_file`.
Expand All @@ -72,7 +71,6 @@ class wordpiece_tokenizer {
uint32_t stride,
bool do_truncate,
bool do_lower_case,
rmm::cuda_stream_view stream,
uint32_t max_word_length = 200);

/**
Expand Down
148 changes: 62 additions & 86 deletions cpp/src/text/subword/load_hash_file.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,7 +15,6 @@
*/

#include <text/subword/detail/codepoint_metadata.ah>
#include <text/subword/detail/data_normalizer.hpp>
#include <text/subword/detail/tokenizer_utils.cuh>

#include <nvtext/detail/load_hash_file.hpp>
Expand All @@ -37,87 +36,32 @@
namespace nvtext {
namespace detail {

namespace {
struct get_codepoint_metadata_init {
rmm::cuda_stream_view stream;

rmm::device_uvector<codepoint_metadata_type>* operator()() const
{
auto table_vector =
new rmm::device_uvector<codepoint_metadata_type>(codepoint_metadata_size, stream);
auto table = table_vector->data();
thrust::fill(rmm::exec_policy(stream),
table + cp_section1_end,
table + codepoint_metadata_size,
codepoint_metadata_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
codepoint_metadata,
cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + cp_section2_begin,
cp_metadata_917505_917999,
(cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
return table_vector;
};
};

struct get_aux_codepoint_data_init {
rmm::cuda_stream_view stream;

rmm::device_uvector<aux_codepoint_data_type>* operator()() const
{
auto table_vector =
new rmm::device_uvector<aux_codepoint_data_type>(aux_codepoint_data_size, stream);
auto table = table_vector->data();
thrust::fill(rmm::exec_policy(stream),
table + aux_section1_end,
table + aux_codepoint_data_size,
aux_codepoint_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
aux_codepoint_data,
aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section2_begin,
aux_cp_data_44032_55203,
(aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section3_begin,
aux_cp_data_70475_71099,
(aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section4_begin,
aux_cp_data_119134_119232,
(aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section
cudaMemcpyHostToDevice,
stream.value()));
return table_vector;
}
};
} // namespace

/**
* @brief Retrieve the code point metadata table.
*
* Build the code point metadata table in device memory
* using the vector pieces from codepoint_metadata.ah
*/
const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stream)
rmm::device_uvector<codepoint_metadata_type> get_codepoint_metadata(rmm::cuda_stream_view stream)
{
static cudf::strings::detail::thread_safe_per_context_cache<
rmm::device_uvector<codepoint_metadata_type>>
g_codepoint_metadata;

return g_codepoint_metadata.find_or_initialize(get_codepoint_metadata_init{stream})->data();
auto table_vector = rmm::device_uvector<codepoint_metadata_type>(codepoint_metadata_size, stream);
auto table = table_vector.data();
thrust::fill(rmm::exec_policy(stream),
table + cp_section1_end,
table + codepoint_metadata_size,
codepoint_metadata_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
codepoint_metadata,
cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + cp_section2_begin,
cp_metadata_917505_917999,
(cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
return table_vector;
}

/**
Expand All @@ -126,13 +70,38 @@ const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stre
* Build the aux code point data table in device memory
* using the vector pieces from codepoint_metadata.ah
*/
const aux_codepoint_data_type* get_aux_codepoint_data(rmm::cuda_stream_view stream)
rmm::device_uvector<aux_codepoint_data_type> get_aux_codepoint_data(rmm::cuda_stream_view stream)
{
static cudf::strings::detail::thread_safe_per_context_cache<
rmm::device_uvector<aux_codepoint_data_type>>
g_aux_codepoint_data;

return g_aux_codepoint_data.find_or_initialize(get_aux_codepoint_data_init{stream})->data();
auto table_vector = rmm::device_uvector<aux_codepoint_data_type>(aux_codepoint_data_size, stream);
auto table = table_vector.data();
thrust::fill(rmm::exec_policy(stream),
table + aux_section1_end,
table + aux_codepoint_data_size,
aux_codepoint_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
aux_codepoint_data,
aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section2_begin,
aux_cp_data_44032_55203,
(aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section3_begin,
aux_cp_data_70475_71099,
(aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section4_begin,
aux_cp_data_119134_119232,
(aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section
cudaMemcpyHostToDevice,
stream.value()));
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
return table_vector;
}

namespace {
Expand Down Expand Up @@ -293,10 +262,17 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
cudaMemcpyHostToDevice,
stream.value()));

// this just initializes some constant tables into device memory
// to help speed up the runtime
detail::get_codepoint_metadata(stream);
detail::get_aux_codepoint_data(stream);
auto cp_metadata = detail::get_codepoint_metadata(stream);
result.cp_metadata =
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
std::make_unique<cudf::column>(cudf::data_type{cudf::type_id::UINT32},
static_cast<cudf::size_type>(cp_metadata.size()),
cp_metadata.release());

auto aux_cp_table = detail::get_aux_codepoint_data(stream);
result.aux_cp_table =
std::make_unique<cudf::column>(cudf::data_type{cudf::type_id::UINT64},
static_cast<cudf::size_type>(aux_cp_table.size()),
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
aux_cp_table.release());

return std::make_unique<hashed_vocabulary>(std::move(result));
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/text/subword/subword_tokenize.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -153,7 +153,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,

// Create tokenizer
wordpiece_tokenizer tokenizer(
vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case, stream);
vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case);
// Run tokenizer
auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream);
// assign output components
Expand Down
7 changes: 4 additions & 3 deletions cpp/src/text/subword/wordpiece_tokenizer.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -394,10 +394,11 @@ wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table,
uint32_t stride,
bool do_truncate,
bool do_lower_case,
rmm::cuda_stream_view stream,
uint32_t max_word_length)
: vocab_table(vocab_table),
normalizer(stream, do_lower_case),
normalizer(vocab_table.cp_metadata->view().data<codepoint_metadata_type>(),
vocab_table.aux_cp_table->view().data<aux_codepoint_data_type>(),
do_lower_case),
max_sequence_length{max_sequence_length},
stride(stride),
do_truncate(do_truncate),
Expand Down