
Use offsetalator in nvtext tokenize functions #14783

Merged: 9 commits, Feb 6, 2024
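
For context, the offsetalator is cudf's width-agnostic iterator over a strings column's offsets child, which may be stored as INT32 or INT64. Kernels that previously took a cudf::size_type const* now take a cudf::detail::input_offsetalator, which dereferences to int64_t for either storage width. A minimal sketch of the pattern, assuming only the factory in cudf/detail/offsets_iterator_factory.cuh; illustrative, not a drop-in:

#include <cudf/column/column_view.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>

// Illustrative sketch: read offsets of either width through one iterator type.
void read_offsets_example(cudf::column_view const& offsets)
{
  // Valid whether `offsets` is INT32 or INT64; values come back as int64_t.
  auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets);
  // Device code can then index d_offsets[i] without caring about the stored width.
}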
86 changes: 34 additions & 52 deletions cpp/src/text/ngrams_tokenize.cu
@@ -23,6 +23,7 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
@@ -37,12 +38,9 @@
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
#include <thrust/transform_scan.h>

#include <cuda/functional>

#include <stdexcept>

namespace nvtext {
namespace detail {
namespace {
@@ -60,10 +58,10 @@ namespace {
* member.
*/
struct string_tokens_positions_fn {
cudf::column_device_view const d_strings; // strings to tokenize
cudf::string_view const d_delimiter; // delimiter to tokenize around
cudf::size_type const* d_token_offsets; // offsets into the d_token_positions for each string
position_pair* d_token_positions; // token positions in each string
cudf::column_device_view const d_strings; // strings to tokenize
cudf::string_view const d_delimiter; // delimiter to tokenize around
cudf::detail::input_offsetalator d_token_offsets; // offsets of d_token_positions for each string
position_pair* d_token_positions; // token positions in each string

__device__ void operator()(cudf::size_type idx)
{
@@ -95,12 +93,12 @@ struct ngram_builder_fn {
cudf::column_device_view const d_strings; // strings to generate ngrams from
cudf::string_view const d_separator; // separator to place between them 'grams
cudf::size_type const ngrams; // ngram number to generate (2=bi-gram, 3=tri-gram)
cudf::size_type const* d_token_offsets; // offsets for token position for each string
position_pair const* d_token_positions; // token positions for each string
cudf::size_type const* d_chars_offsets{}; // offsets for each string's ngrams
char* d_chars{}; // write ngram strings to here
cudf::size_type const* d_ngram_offsets{}; // offsets for sizes of each string's ngrams
cudf::size_type* d_ngram_sizes{}; // write ngram sizes to here
cudf::detail::input_offsetalator d_token_offsets; // offsets for token position for each string
position_pair const* d_token_positions; // token positions for each string
cudf::detail::input_offsetalator d_chars_offsets{}; // offsets for each string's ngrams
char* d_chars{}; // write ngram strings to here
cudf::size_type const* d_ngram_offsets{}; // offsets for sizes of each string's ngrams
cudf::size_type* d_ngram_sizes{}; // write ngram sizes to here

__device__ cudf::size_type operator()(cudf::size_type idx)
{
@@ -165,16 +163,12 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s

// first, get the number of tokens per string to get the token-offsets
// Ex. token-counts = [3,2]; token-offsets = [0,3,5]
rmm::device_uvector<cudf::size_type> token_offsets(strings_count + 1, stream);
auto d_token_offsets = token_offsets.data();
thrust::transform_inclusive_scan(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
d_token_offsets + 1,
strings_tokenizer{d_strings, d_delimiter},
thrust::plus<cudf::size_type>());
token_offsets.set_element_to_zero_async(0, stream);
auto const total_tokens = token_offsets.back_element(stream); // Ex. 5 tokens
auto const count_itr =
cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{d_strings, d_delimiter});
auto [token_offsets, total_tokens] = cudf::strings::detail::make_offsets_child_column(
count_itr, count_itr + strings_count, stream, rmm::mr::get_current_device_resource());
auto d_token_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(token_offsets->view());
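
  // Illustrative host-side analogue (an assumption about the helper's effect,
  // not cudf code): make_offsets_child_column exclusive-scans per-row counts
  // into an offsets column of strings_count + 1 entries and returns the total,
  // replacing the manual transform_inclusive_scan + set_element_to_zero_async:
  //   std::vector<int64_t> offsets(counts.size() + 1, 0);
  //   for (std::size_t i = 0; i < counts.size(); ++i)
  //     offsets[i + 1] = offsets[i] + counts[i];
  //   // counts [3,2] -> offsets [0,3,5], total 5, matching the Ex. above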

// get the token positions (in bytes) per string
// Ex. start/end pairs: [(0,1),(2,4),(5,8), (0,2),(3,4)]
@@ -188,21 +182,17 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s

// compute the number of ngrams per string to get the total number of ngrams to generate
// Ex. ngram-counts = [2,1]; ngram-offsets = [0,2,3]; total = 3 bigrams
rmm::device_uvector<cudf::size_type> ngram_offsets(strings_count + 1, stream);
auto d_ngram_offsets = ngram_offsets.data();
thrust::transform_inclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
d_ngram_offsets + 1,
auto const ngram_counts = cudf::detail::make_counting_transform_iterator(
0,
cuda::proclaim_return_type<cudf::size_type>(
[d_token_offsets, ngrams] __device__(cudf::size_type idx) {
auto token_count = d_token_offsets[idx + 1] - d_token_offsets[idx];
auto token_count =
static_cast<cudf::size_type>(d_token_offsets[idx + 1] - d_token_offsets[idx]);
return (token_count >= ngrams) ? token_count - ngrams + 1 : 0;
}),
thrust::plus{});
ngram_offsets.set_element_to_zero_async(0, stream);
auto const total_ngrams = ngram_offsets.back_element(stream);
}));
auto [ngram_offsets, total_ngrams] = cudf::detail::make_offsets_child_column(
ngram_counts, ngram_counts + strings_count, stream, rmm::mr::get_current_device_resource());
auto d_ngram_offsets = ngram_offsets->view().begin<cudf::size_type>();
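
  // Worked example of the count lambda above: t tokens yield
  // max(t - n + 1, 0) n-grams. With ngrams = 2 and the Ex. data,
  // 3 tokens -> 3 - 2 + 1 = 2 bigrams and 2 tokens -> 1 bigram,
  // giving ngram-counts = [2,1] and ngram-offsets = [0,2,3].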

// Compute the total size of the ngrams for each string (not for each ngram)
// Ex. 2 bigrams in 1st string total to 10 bytes; 1 bigram in 2nd string is 4 bytes
@@ -212,21 +202,14 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
// ngrams for each string.
// Ex. bigram for first string produces 2 bigrams ("a_bb","bb_ccc") which
// is built in memory like this: "a_bbbb_ccc"
rmm::device_uvector<cudf::size_type> chars_offsets(strings_count + 1, stream);
// First compute the output sizes for each string (this is not the final output result)
thrust::transform(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
chars_offsets.begin(),
ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions});
// Convert the sizes to offsets
auto const output_chars_size = cudf::detail::sizes_to_offsets(
chars_offsets.begin(), chars_offsets.end(), chars_offsets.begin(), stream);
CUDF_EXPECTS(
output_chars_size <= static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
"Size of output exceeds the column size limit",
std::overflow_error);

// First compute the output sizes for each string (this is not the final output result)
auto const sizes_itr = cudf::detail::make_counting_transform_iterator(
0, ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions});
auto [chars_offsets, output_chars_size] = cudf::strings::detail::make_offsets_child_column(
sizes_itr, sizes_itr + strings_count, stream, rmm::mr::get_current_device_resource());
auto d_chars_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(chars_offsets->view());
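
  // Worked check of the Ex. above: the first string's bigrams are written
  // back-to-back, so its chars size is the sum of its ngram sizes:
  //   static_assert(sizeof("a_bb") - 1 + sizeof("bb_ccc") - 1 == 10);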

// This will contain the size in bytes of each ngram to generate
rmm::device_uvector<cudf::size_type> ngram_sizes(total_ngrams, stream);
@@ -245,14 +228,13 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
ngrams,
d_token_offsets,
d_token_positions,
chars_offsets.data(),
d_chars_offsets,
d_chars,
d_ngram_offsets,
ngram_sizes.data()});
// build the offsets column -- converting the ngram sizes into offsets
auto offsets_column = std::get<0>(
cudf::detail::make_offsets_child_column(ngram_sizes.begin(), ngram_sizes.end(), stream, mr));
offsets_column->set_null_count(0);
// create the output strings column
return make_strings_column(
total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});
47 changes: 21 additions & 26 deletions cpp/src/text/tokenize.cu
@@ -22,8 +22,8 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/algorithm.cuh>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
@@ -38,7 +38,6 @@
#include <thrust/count.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/scan.h>
#include <thrust/transform.h>

namespace nvtext {
@@ -80,18 +79,17 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
token_count_fn(strings_count, tokenizer, stream, rmm::mr::get_current_device_resource());
auto d_token_counts = token_counts->view();
// create token-index offsets from the counts
rmm::device_uvector<cudf::size_type> token_offsets(strings_count + 1, stream);
thrust::inclusive_scan(rmm::exec_policy(stream),
d_token_counts.template begin<cudf::size_type>(),
d_token_counts.template end<cudf::size_type>(),
token_offsets.begin() + 1);
token_offsets.set_element_to_zero_async(0, stream);
auto const total_tokens = token_offsets.back_element(stream);
// build a list of pointers to each token
auto [token_offsets, total_tokens] =
cudf::detail::make_offsets_child_column(d_token_counts.template begin<cudf::size_type>(),
d_token_counts.template end<cudf::size_type>(),
stream,
rmm::mr::get_current_device_resource());
// build a list of pointers to each token
rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
// now go get the tokens
tokenizer.d_offsets = token_offsets.data();
tokenizer.d_tokens = tokens.data();
tokenizer.d_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(token_offsets->view());
tokenizer.d_tokens = tokens.data();
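  // For context: string_index_pair is thrust::pair<char const*, cudf::size_type>,
  // i.e. a pointer into the existing chars buffer plus a byte length. Tokenizing
  // "a bb ccc" on whitespace would record (chars+0,1), (chars+2,2), (chars+5,3);
  // no bytes are copied until a strings column is built from the pairs.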
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
strings_count,
@@ -178,8 +176,8 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const
}

auto offsets = strings_column.offsets();
auto offset = cudf::detail::get_value<cudf::size_type>(offsets, strings_column.offset(), stream);
auto chars_bytes = cudf::detail::get_value<cudf::size_type>(
auto offset = cudf::strings::detail::get_offset_value(offsets, strings_column.offset(), stream);
auto chars_bytes = cudf::strings::detail::get_offset_value(
offsets, strings_column.offset() + strings_count, stream) -
offset;
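  // Assumed behavior of the helper used above: get_offset_value reads a single
  // element of the offsets column on the host, widening INT32 storage to
  // int64_t, so this is the usual sliced-range arithmetic:
  //   chars_bytes = offsets[offset + strings_count] - offsets[offset]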
auto d_chars =
@@ -202,22 +200,19 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const
// create output offsets column
// -- conditionally copy a counting iterator where
// the first byte of each character is located
auto offsets_column =
cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()},
num_characters + 1,
cudf::mask_state::UNALLOCATED,
stream,
mr);
auto d_new_offsets = offsets_column->mutable_view().begin<cudf::size_type>();
thrust::copy_if(
rmm::exec_policy(stream),
thrust::counting_iterator<cudf::size_type>(0),
thrust::counting_iterator<cudf::size_type>(chars_bytes + 1),
auto offsets_column = cudf::make_numeric_column(
offsets.type(), num_characters + 1, cudf::mask_state::UNALLOCATED, stream, mr);
auto d_new_offsets =
cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view());
cudf::detail::copy_if_safe(
thrust::counting_iterator<int64_t>(0),
thrust::counting_iterator<int64_t>(chars_bytes + 1),
d_new_offsets,
[d_chars, chars_bytes] __device__(auto idx) {
// this will also set the final value to the size chars_bytes
return idx < chars_bytes ? cudf::strings::detail::is_begin_utf8_char(d_chars[idx]) : true;
});
},
stream);
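
  // Sketch of the predicate above (matching cudf's definition): a byte begins a
  // UTF-8 character exactly when it is not a continuation byte (10xxxxxx):
  //   constexpr bool is_begin_utf8_char(unsigned char b) { return (b & 0xC0) != 0x80; }
  // The `true` branch at idx == chars_bytes writes the final sentinel offset.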

// create the output chars buffer -- just a copy of the input's chars
rmm::device_uvector<char> output_chars(chars_bytes, stream, mr);
20 changes: 10 additions & 10 deletions cpp/src/text/utilities/tokenize_ops.cuh
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -147,10 +147,10 @@ struct characters_tokenizer {
* positions into the d_tokens vector.
*/
struct strings_tokenizer {
cudf::column_device_view const d_strings; ///< strings to tokenize
cudf::string_view const d_delimiter; ///< delimiter characters to tokenize around
cudf::size_type* d_offsets{}; ///< offsets into the d_tokens vector for each string
string_index_pair* d_tokens{}; ///< token positions in device memory
cudf::column_device_view const d_strings; ///< strings to tokenize
cudf::string_view const d_delimiter; ///< delimiter characters to tokenize around
cudf::detail::input_offsetalator d_offsets; ///< offsets into the d_tokens vector for each string
string_index_pair* d_tokens{}; ///< token positions in device memory

/**
* @brief Identifies the token positions within each string.
@@ -191,11 +191,11 @@ using delimiterator = cudf::column_device_view::const_iterator<cudf::string_view
* each string of a given strings column.
*/
struct multi_delimiter_strings_tokenizer {
cudf::column_device_view const d_strings; ///< strings column to tokenize
delimiterator delimiters_begin; ///< first delimiter
delimiterator delimiters_end; ///< last delimiter
cudf::size_type* d_offsets{}; ///< offsets into the d_tokens output vector
string_index_pair* d_tokens{}; ///< token positions found for each string
cudf::column_device_view const d_strings; ///< strings column to tokenize
delimiterator delimiters_begin; ///< first delimiter
delimiterator delimiters_end; ///< last delimiter
cudf::detail::input_offsetalator d_offsets; ///< offsets into the d_tokens output vector
string_index_pair* d_tokens{}; ///< token positions found for each string

/**
* @brief Identifies the token positions within each string.
11 changes: 6 additions & 5 deletions cpp/src/text/vocabulary_tokenize.cu
@@ -22,13 +22,14 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/cuco_helpers.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>
@@ -297,7 +298,7 @@ struct vocabulary_tokenizer_fn {
cudf::string_view const d_delimiter;
MapRefType d_map;
cudf::size_type const default_id;
cudf::size_type const* d_offsets;
cudf::detail::input_offsetalator d_offsets;
cudf::size_type* d_results;

__device__ void operator()(cudf::size_type idx) const
@@ -378,7 +379,7 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view
auto tokens = cudf::make_numeric_column(
output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr);
auto d_tokens = tokens->mutable_view().data<cudf::size_type>();
auto d_offsets = token_offsets->view().data<cudf::size_type>();
auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(token_offsets->view());
vocabulary_tokenizer_fn<decltype(map_ref)> tokenizer{
*d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens};
thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), tokenizer);
@@ -394,11 +395,11 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view
// longer strings perform better with warp-parallel approach

auto const first_offset = (input.offset() == 0) ? 0
: cudf::detail::get_value<cudf::size_type>(
: cudf::strings::detail::get_offset_value(
input.offsets(), input.offset(), stream);
auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1)
? input.chars_size(stream)
: cudf::detail::get_value<cudf::size_type>(
: cudf::strings::detail::get_offset_value(
input.offsets(), input.size() + input.offset(), stream);
auto const chars_size = last_offset - first_offset;
auto const d_input_chars = input.chars_begin(stream) + first_offset;
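
  // Worked example of the range computed above (hypothetical data): for offsets
  // [0,2,5,9] sliced to offset = 1 and size = 2, first_offset = 2 and
  // last_offset = 9, so chars_size = 7; the (input.offset() == 0) fast paths
  // simply avoid the device reads when the column is not sliced.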