Skip to content

Commit

Permalink
Remove metadata singleton from nvtext normalizer (#10090)
Browse files Browse the repository at this point in the history
Closes #10043 

A metadata singleton was allocated from the current rmm memory resource inside the nvtext normalizer functions. If the memory resource is later changed, the metadata pointer may be invalidated. This PR removes the singleton pattern. 

The normalizer is used by the subword-tokenizer, which is passed a vocabulary structure that is built only once and is maintained by the caller. The metadata has been added to this structure so its lifetime can share the same scope.

The normalizer can also be called directly through the `nvtext::normalize_characters` API. Here the metadata table (about 1MB in size) is now created on each call. This showed a significant performance impact only on benchmarks testing a small number (<50K) of shorter strings.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Robert Maynard (https://github.com/robertmaynard)

URL: #10090
  • Loading branch information
davidwendt authored Jan 26, 2022
1 parent eacaea0 commit 85109e6
Show file tree
Hide file tree
Showing 9 changed files with 99 additions and 126 deletions.
7 changes: 3 additions & 4 deletions cpp/include/nvtext/subword_tokenize.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,9 +19,6 @@
#include <cudf/column/column_view.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <cstdint>
#include <cstring>

namespace nvtext {

/**
Expand All @@ -43,6 +40,8 @@ struct hashed_vocabulary {
std::unique_ptr<cudf::column> table; // uint64
std::unique_ptr<cudf::column> bin_coefficients; // uint64
std::unique_ptr<cudf::column> bin_offsets; // uint16
std::unique_ptr<cudf::column> cp_metadata; // uint32
std::unique_ptr<cudf::column> aux_cp_table; // uint64
};

/**
Expand Down
17 changes: 10 additions & 7 deletions cpp/src/text/normalize.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,6 +15,7 @@
*/

#include <text/subword/detail/data_normalizer.hpp>
#include <text/subword/detail/tokenizer_utils.cuh>
#include <text/utilities/tokenize_ops.cuh>

#include <nvtext/normalize.hpp>
Expand Down Expand Up @@ -199,12 +200,14 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});

// create the normalizer and call it
data_normalizer normalizer(stream, do_lower_case);
auto result = [&strings, &normalizer, stream] {
auto const offsets = strings.offsets();
auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
auto const offset = cudf::detail::get_value<int32_t>(offsets, strings.offset(), stream);
auto const d_chars = strings.chars().data<char>() + offset;
auto result = [&] {
auto const cp_metadata = get_codepoint_metadata(stream);
auto const aux_table = get_aux_codepoint_data(stream);
auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case);
auto const offsets = strings.offsets();
auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
auto const offset = cudf::detail::get_value<int32_t>(offsets, strings.offset(), stream);
auto const d_chars = strings.chars().data<char>() + offset;
return normalizer.normalize(d_chars, d_offsets, strings.size(), stream);
}();

Expand Down
12 changes: 6 additions & 6 deletions cpp/src/text/subword/data_normalizer.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -261,17 +261,17 @@ __global__ void kernel_data_normalizer(unsigned char const* strings,

} // namespace

data_normalizer::data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case)
: do_lower_case(do_lower_case)
data_normalizer::data_normalizer(codepoint_metadata_type const* cp_metadata,
aux_codepoint_data_type const* aux_table,
bool do_lower_case)
: d_cp_metadata{cp_metadata}, d_aux_table{aux_table}, do_lower_case{do_lower_case}
{
d_cp_metadata = detail::get_codepoint_metadata(stream);
d_aux_table = detail::get_aux_codepoint_data(stream);
}

uvector_pair data_normalizer::normalize(char const* d_strings,
uint32_t const* d_offsets,
uint32_t num_strings,
rmm::cuda_stream_view stream)
rmm::cuda_stream_view stream) const
{
if (num_strings == 0)
return std::make_pair(std::make_unique<rmm::device_uvector<uint32_t>>(0, stream),
Expand Down
13 changes: 8 additions & 5 deletions cpp/src/text/subword/detail/data_normalizer.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -48,14 +48,17 @@ namespace detail {
class data_normalizer {
public:
/**
* @brief Transfer to the GPU the metadata needed to normalize characters.
* @brief Create instance of the normalizer.
*
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param cp_metadata The code point metadata table to use for normalization.
* @param aux_table The auxiliary code point table.
* @param do_lower_case If true, the normalizer will convert uppercase characters in the
* input stream to lower case and strip accents from those characters.
* If false, accented and uppercase characters are not transformed.
*/
data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case = true);
data_normalizer(codepoint_metadata_type const* cp_metadata,
aux_codepoint_data_type const* aux_table,
bool do_lower_case = true);

/**
* @brief Normalize a vector of strings.
Expand Down Expand Up @@ -84,7 +87,7 @@ class data_normalizer {
uvector_pair normalize(char const* d_strings,
uint32_t const* d_offsets,
uint32_t num_strings,
rmm::cuda_stream_view stream);
rmm::cuda_stream_view stream) const;

private:
bool const do_lower_case;
Expand Down
15 changes: 5 additions & 10 deletions cpp/src/text/subword/detail/tokenizer_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,6 +19,7 @@
#include <text/subword/detail/cp_data.h>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cstdint>

Expand Down Expand Up @@ -57,22 +58,16 @@ struct update_strings_lengths_fn {
/**
* @brief Retrieve the code point metadata table.
*
* This is a singleton instance that copies a large table of integers into
* device memory on the very first call.
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
codepoint_metadata_type const* get_codepoint_metadata(rmm::cuda_stream_view stream);
rmm::device_uvector<codepoint_metadata_type> get_codepoint_metadata(rmm::cuda_stream_view stream);

/**
* @brief Retrieve the aux code point metadata table.
*
* This is a singleton instance that copies a large table of integers into
* device memory on the very first call.
* @brief Retrieve the auxiliary code point metadata table.
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
aux_codepoint_data_type const* get_aux_codepoint_data(rmm::cuda_stream_view stream);
rmm::device_uvector<aux_codepoint_data_type> get_aux_codepoint_data(rmm::cuda_stream_view stream);

} // namespace detail
} // namespace nvtext
4 changes: 1 addition & 3 deletions cpp/src/text/subword/detail/wordpiece_tokenizer.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -61,7 +61,6 @@ class wordpiece_tokenizer {
* @param do_lower_case If true, the tokenizer will convert uppercase characters in the
* input stream to lowercase and strip accents from those characters.
* If false, accented and uppercase characters are not transformed.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param max_word_length The length of the longest word that will be tokenized. Words
* longer than this will simply be replaced by the unknown token
* specified in the `vocab_file`.
Expand All @@ -72,7 +71,6 @@ class wordpiece_tokenizer {
uint32_t stride,
bool do_truncate,
bool do_lower_case,
rmm::cuda_stream_view stream,
uint32_t max_word_length = 200);

/**
Expand Down
146 changes: 60 additions & 86 deletions cpp/src/text/subword/load_hash_file.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,7 +15,6 @@
*/

#include <text/subword/detail/codepoint_metadata.ah>
#include <text/subword/detail/data_normalizer.hpp>
#include <text/subword/detail/tokenizer_utils.cuh>

#include <nvtext/detail/load_hash_file.hpp>
Expand All @@ -37,87 +36,32 @@
namespace nvtext {
namespace detail {

namespace {
struct get_codepoint_metadata_init {
rmm::cuda_stream_view stream;

rmm::device_uvector<codepoint_metadata_type>* operator()() const
{
auto table_vector =
new rmm::device_uvector<codepoint_metadata_type>(codepoint_metadata_size, stream);
auto table = table_vector->data();
thrust::fill(rmm::exec_policy(stream),
table + cp_section1_end,
table + codepoint_metadata_size,
codepoint_metadata_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
codepoint_metadata,
cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + cp_section2_begin,
cp_metadata_917505_917999,
(cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
return table_vector;
};
};

struct get_aux_codepoint_data_init {
rmm::cuda_stream_view stream;

rmm::device_uvector<aux_codepoint_data_type>* operator()() const
{
auto table_vector =
new rmm::device_uvector<aux_codepoint_data_type>(aux_codepoint_data_size, stream);
auto table = table_vector->data();
thrust::fill(rmm::exec_policy(stream),
table + aux_section1_end,
table + aux_codepoint_data_size,
aux_codepoint_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
aux_codepoint_data,
aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section2_begin,
aux_cp_data_44032_55203,
(aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section3_begin,
aux_cp_data_70475_71099,
(aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section4_begin,
aux_cp_data_119134_119232,
(aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section
cudaMemcpyHostToDevice,
stream.value()));
return table_vector;
}
};
} // namespace

/**
* @brief Retrieve the code point metadata table.
*
* Build the code point metadata table in device memory
* using the vector pieces from codepoint_metadata.ah
*/
const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stream)
rmm::device_uvector<codepoint_metadata_type> get_codepoint_metadata(rmm::cuda_stream_view stream)
{
static cudf::strings::detail::thread_safe_per_context_cache<
rmm::device_uvector<codepoint_metadata_type>>
g_codepoint_metadata;

return g_codepoint_metadata.find_or_initialize(get_codepoint_metadata_init{stream})->data();
auto table_vector = rmm::device_uvector<codepoint_metadata_type>(codepoint_metadata_size, stream);
auto table = table_vector.data();
thrust::fill(rmm::exec_policy(stream),
table + cp_section1_end,
table + codepoint_metadata_size,
codepoint_metadata_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
codepoint_metadata,
cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + cp_section2_begin,
cp_metadata_917505_917999,
(cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
return table_vector;
}

/**
Expand All @@ -126,13 +70,38 @@ const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stre
* Build the aux code point data table in device memory
* using the vector pieces from codepoint_metadata.ah
*/
const aux_codepoint_data_type* get_aux_codepoint_data(rmm::cuda_stream_view stream)
rmm::device_uvector<aux_codepoint_data_type> get_aux_codepoint_data(rmm::cuda_stream_view stream)
{
static cudf::strings::detail::thread_safe_per_context_cache<
rmm::device_uvector<aux_codepoint_data_type>>
g_aux_codepoint_data;

return g_aux_codepoint_data.find_or_initialize(get_aux_codepoint_data_init{stream})->data();
auto table_vector = rmm::device_uvector<aux_codepoint_data_type>(aux_codepoint_data_size, stream);
auto table = table_vector.data();
thrust::fill(rmm::exec_policy(stream),
table + aux_section1_end,
table + aux_codepoint_data_size,
aux_codepoint_default_value);
CUDA_TRY(cudaMemcpyAsync(table,
aux_codepoint_data,
aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section2_begin,
aux_cp_data_44032_55203,
(aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section3_begin,
aux_cp_data_70475_71099,
(aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section
cudaMemcpyHostToDevice,
stream.value()));
CUDA_TRY(cudaMemcpyAsync(
table + aux_section4_begin,
aux_cp_data_119134_119232,
(aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section
cudaMemcpyHostToDevice,
stream.value()));
return table_vector;
}

namespace {
Expand Down Expand Up @@ -293,10 +262,15 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
cudaMemcpyHostToDevice,
stream.value()));

// this just initializes some constant tables into device memory
// to help speed up the runtime
detail::get_codepoint_metadata(stream);
detail::get_aux_codepoint_data(stream);
auto cp_metadata = detail::get_codepoint_metadata(stream);
auto const cp_metadata_size = static_cast<cudf::size_type>(cp_metadata.size());
result.cp_metadata = std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::UINT32}, cp_metadata_size, cp_metadata.release());

auto aux_cp_table = detail::get_aux_codepoint_data(stream);
auto const aux_cp_table_size = static_cast<cudf::size_type>(aux_cp_table.size());
result.aux_cp_table = std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::UINT64}, aux_cp_table_size, aux_cp_table.release());

return std::make_unique<hashed_vocabulary>(std::move(result));
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/text/subword/subword_tokenize.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -153,7 +153,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,

// Create tokenizer
wordpiece_tokenizer tokenizer(
vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case, stream);
vocab_table, max_rows_tensor, max_sequence_length, stride, do_truncate, do_lower_case);
// Run tokenizer
auto const tokens = tokenizer.tokenize(d_chars, d_offsets, strings_count, stream);
// assign output components
Expand Down
Loading

0 comments on commit 85109e6

Please sign in to comment.