Skip to content

Commit

Permalink
Use offsetalator in nvtext tokenize functions (#14783)
Browse files Browse the repository at this point in the history
Adds offsetalator in place of hardcoded offset type arrays to the nvtext tokenize functions:
- `nvtext::tokenize()`
- `nvtext::count_tokens()`
- `nvtext::character_tokenize()`
- `nvtext::ngrams_tokenize()`
- `nvtext::tokenize_with_vocabulary()`

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #14783
  • Loading branch information
davidwendt authored Feb 6, 2024
1 parent cf32049 commit 72ecbe9
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 93 deletions.
86 changes: 34 additions & 52 deletions cpp/src/text/ngrams_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
Expand All @@ -37,12 +38,9 @@
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
#include <thrust/transform_scan.h>

#include <cuda/functional>

#include <stdexcept>

namespace nvtext {
namespace detail {
namespace {
Expand All @@ -60,10 +58,10 @@ namespace {
* member.
*/
struct string_tokens_positions_fn {
cudf::column_device_view const d_strings; // strings to tokenize
cudf::string_view const d_delimiter; // delimiter to tokenize around
cudf::size_type const* d_token_offsets; // offsets into the d_token_positions for each string
position_pair* d_token_positions; // token positions in each string
cudf::column_device_view const d_strings; // strings to tokenize
cudf::string_view const d_delimiter; // delimiter to tokenize around
cudf::detail::input_offsetalator d_token_offsets; // offsets of d_token_positions for each string
position_pair* d_token_positions; // token positions in each string

__device__ void operator()(cudf::size_type idx)
{
Expand Down Expand Up @@ -95,12 +93,12 @@ struct ngram_builder_fn {
cudf::column_device_view const d_strings; // strings to generate ngrams from
cudf::string_view const d_separator; // separator to place between them 'grams
cudf::size_type const ngrams; // ngram number to generate (2=bi-gram, 3=tri-gram)
cudf::size_type const* d_token_offsets; // offsets for token position for each string
position_pair const* d_token_positions; // token positions for each string
cudf::size_type const* d_chars_offsets{}; // offsets for each string's ngrams
char* d_chars{}; // write ngram strings to here
cudf::size_type const* d_ngram_offsets{}; // offsets for sizes of each string's ngrams
cudf::size_type* d_ngram_sizes{}; // write ngram sizes to here
cudf::detail::input_offsetalator d_token_offsets; // offsets for token position for each string
position_pair const* d_token_positions; // token positions for each string
cudf::detail::input_offsetalator d_chars_offsets{}; // offsets for each string's ngrams
char* d_chars{}; // write ngram strings to here
cudf::size_type const* d_ngram_offsets{}; // offsets for sizes of each string's ngrams
cudf::size_type* d_ngram_sizes{}; // write ngram sizes to here

__device__ cudf::size_type operator()(cudf::size_type idx)
{
Expand Down Expand Up @@ -165,16 +163,12 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s

// first, get the number of tokens per string to get the token-offsets
// Ex. token-counts = [3,2]; token-offsets = [0,3,5]
rmm::device_uvector<cudf::size_type> token_offsets(strings_count + 1, stream);
auto d_token_offsets = token_offsets.data();
thrust::transform_inclusive_scan(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
d_token_offsets + 1,
strings_tokenizer{d_strings, d_delimiter},
thrust::plus<cudf::size_type>());
token_offsets.set_element_to_zero_async(0, stream);
auto const total_tokens = token_offsets.back_element(stream); // Ex. 5 tokens
auto const count_itr =
cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{d_strings, d_delimiter});
auto [token_offsets, total_tokens] = cudf::strings::detail::make_offsets_child_column(
count_itr, count_itr + strings_count, stream, rmm::mr::get_current_device_resource());
auto d_token_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(token_offsets->view());

// get the token positions (in bytes) per string
// Ex. start/end pairs: [(0,1),(2,4),(5,8), (0,2),(3,4)]
Expand All @@ -188,21 +182,17 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s

// compute the number of ngrams per string to get the total number of ngrams to generate
// Ex. ngram-counts = [2,1]; ngram-offsets = [0,2,3]; total = 3 bigrams
rmm::device_uvector<cudf::size_type> ngram_offsets(strings_count + 1, stream);
auto d_ngram_offsets = ngram_offsets.data();
thrust::transform_inclusive_scan(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
d_ngram_offsets + 1,
auto const ngram_counts = cudf::detail::make_counting_transform_iterator(
0,
cuda::proclaim_return_type<cudf::size_type>(
[d_token_offsets, ngrams] __device__(cudf::size_type idx) {
auto token_count = d_token_offsets[idx + 1] - d_token_offsets[idx];
auto token_count =
static_cast<cudf::size_type>(d_token_offsets[idx + 1] - d_token_offsets[idx]);
return (token_count >= ngrams) ? token_count - ngrams + 1 : 0;
}),
thrust::plus{});
ngram_offsets.set_element_to_zero_async(0, stream);
auto const total_ngrams = ngram_offsets.back_element(stream);
}));
auto [ngram_offsets, total_ngrams] = cudf::detail::make_offsets_child_column(
ngram_counts, ngram_counts + strings_count, stream, rmm::mr::get_current_device_resource());
auto d_ngram_offsets = ngram_offsets->view().begin<cudf::size_type>();

// Compute the total size of the ngrams for each string (not for each ngram)
// Ex. 2 bigrams in 1st string total to 10 bytes; 1 bigram in 2nd string is 4 bytes
Expand All @@ -212,21 +202,14 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
// ngrams for each string.
// Ex. bigram for first string produces 2 bigrams ("a_bb","bb_ccc") which
// is built in memory like this: "a_bbbb_ccc"
rmm::device_uvector<cudf::size_type> chars_offsets(strings_count + 1, stream);
// First compute the output sizes for each string (this is not the final output result)
thrust::transform(
rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
thrust::make_counting_iterator<cudf::size_type>(strings_count),
chars_offsets.begin(),
ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions});
// Convert the sizes to offsets
auto const output_chars_size = cudf::detail::sizes_to_offsets(
chars_offsets.begin(), chars_offsets.end(), chars_offsets.begin(), stream);
CUDF_EXPECTS(
output_chars_size <= static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
"Size of output exceeds the column size limit",
std::overflow_error);

// First compute the output sizes for each string (this is not the final output result)
auto const sizes_itr = cudf::detail::make_counting_transform_iterator(
0, ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions});
auto [chars_offsets, output_chars_size] = cudf::strings::detail::make_offsets_child_column(
sizes_itr, sizes_itr + strings_count, stream, rmm::mr::get_current_device_resource());
auto d_chars_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(chars_offsets->view());

// This will contain the size in bytes of each ngram to generate
rmm::device_uvector<cudf::size_type> ngram_sizes(total_ngrams, stream);
Expand All @@ -245,14 +228,13 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
ngrams,
d_token_offsets,
d_token_positions,
chars_offsets.data(),
d_chars_offsets,
d_chars,
d_ngram_offsets,
ngram_sizes.data()});
// build the offsets column -- converting the ngram sizes into offsets
auto offsets_column = std::get<0>(
cudf::detail::make_offsets_child_column(ngram_sizes.begin(), ngram_sizes.end(), stream, mr));
offsets_column->set_null_count(0);
// create the output strings column
return make_strings_column(
total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});
Expand Down
47 changes: 21 additions & 26 deletions cpp/src/text/tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/algorithm.cuh>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand All @@ -38,7 +38,6 @@
#include <thrust/count.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/scan.h>
#include <thrust/transform.h>

namespace nvtext {
Expand Down Expand Up @@ -80,18 +79,17 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
token_count_fn(strings_count, tokenizer, stream, rmm::mr::get_current_device_resource());
auto d_token_counts = token_counts->view();
// create token-index offsets from the counts
rmm::device_uvector<cudf::size_type> token_offsets(strings_count + 1, stream);
thrust::inclusive_scan(rmm::exec_policy(stream),
d_token_counts.template begin<cudf::size_type>(),
d_token_counts.template end<cudf::size_type>(),
token_offsets.begin() + 1);
token_offsets.set_element_to_zero_async(0, stream);
auto const total_tokens = token_offsets.back_element(stream);
// build a list of pointers to each token
auto [token_offsets, total_tokens] =
cudf::detail::make_offsets_child_column(d_token_counts.template begin<cudf::size_type>(),
d_token_counts.template end<cudf::size_type>(),
stream,
rmm::mr::get_current_device_resource());
// build a list of pointers to each token
rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
// now go get the tokens
tokenizer.d_offsets = token_offsets.data();
tokenizer.d_tokens = tokens.data();
tokenizer.d_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(token_offsets->view());
tokenizer.d_tokens = tokens.data();
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<cudf::size_type>(0),
strings_count,
Expand Down Expand Up @@ -178,8 +176,8 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const
}

auto offsets = strings_column.offsets();
auto offset = cudf::detail::get_value<cudf::size_type>(offsets, strings_column.offset(), stream);
auto chars_bytes = cudf::detail::get_value<cudf::size_type>(
auto offset = cudf::strings::detail::get_offset_value(offsets, strings_column.offset(), stream);
auto chars_bytes = cudf::strings::detail::get_offset_value(
offsets, strings_column.offset() + strings_count, stream) -
offset;
auto d_chars =
Expand All @@ -202,22 +200,19 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const
// create output offsets column
// -- conditionally copy a counting iterator where
// the first byte of each character is located
auto offsets_column =
cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()},
num_characters + 1,
cudf::mask_state::UNALLOCATED,
stream,
mr);
auto d_new_offsets = offsets_column->mutable_view().begin<cudf::size_type>();
thrust::copy_if(
rmm::exec_policy(stream),
thrust::counting_iterator<cudf::size_type>(0),
thrust::counting_iterator<cudf::size_type>(chars_bytes + 1),
auto offsets_column = cudf::make_numeric_column(
offsets.type(), num_characters + 1, cudf::mask_state::UNALLOCATED, stream, mr);
auto d_new_offsets =
cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view());
cudf::detail::copy_if_safe(
thrust::counting_iterator<int64_t>(0),
thrust::counting_iterator<int64_t>(chars_bytes + 1),
d_new_offsets,
[d_chars, chars_bytes] __device__(auto idx) {
// this will also set the final value to the size chars_bytes
return idx < chars_bytes ? cudf::strings::detail::is_begin_utf8_char(d_chars[idx]) : true;
});
},
stream);

// create the output chars buffer -- just a copy of the input's chars
rmm::device_uvector<char> output_chars(chars_bytes, stream, mr);
Expand Down
20 changes: 10 additions & 10 deletions cpp/src/text/utilities/tokenize_ops.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -147,10 +147,10 @@ struct characters_tokenizer {
* positions into the d_tokens vector.
*/
struct strings_tokenizer {
cudf::column_device_view const d_strings; ///< strings to tokenize
cudf::string_view const d_delimiter; ///< delimiter characters to tokenize around
cudf::size_type* d_offsets{}; ///< offsets into the d_tokens vector for each string
string_index_pair* d_tokens{}; ///< token positions in device memory
cudf::column_device_view const d_strings; ///< strings to tokenize
cudf::string_view const d_delimiter; ///< delimiter characters to tokenize around
cudf::detail::input_offsetalator d_offsets; ///< offsets into the d_tokens vector for each string
string_index_pair* d_tokens{}; ///< token positions in device memory

/**
* @brief Identifies the token positions within each string.
Expand Down Expand Up @@ -191,11 +191,11 @@ using delimiterator = cudf::column_device_view::const_iterator<cudf::string_view
* each string of a given strings column.
*/
struct multi_delimiter_strings_tokenizer {
cudf::column_device_view const d_strings; ///< strings column to tokenize
delimiterator delimiters_begin; ///< first delimiter
delimiterator delimiters_end; ///< last delimiter
cudf::size_type* d_offsets{}; ///< offsets into the d_tokens output vector
string_index_pair* d_tokens{}; ///< token positions found for each string
cudf::column_device_view const d_strings; ///< strings column to tokenize
delimiterator delimiters_begin; ///< first delimiter
delimiterator delimiters_end; ///< last delimiter
cudf::detail::input_offsetalator d_offsets; ///< offsets into the d_tokens output vector
string_index_pair* d_tokens{}; ///< token positions found for each string

/**
* @brief Identifies the token positions within each string.
Expand Down
11 changes: 6 additions & 5 deletions cpp/src/text/vocabulary_tokenize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,14 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/cuco_helpers.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand Down Expand Up @@ -297,7 +298,7 @@ struct vocabulary_tokenizer_fn {
cudf::string_view const d_delimiter;
MapRefType d_map;
cudf::size_type const default_id;
cudf::size_type const* d_offsets;
cudf::detail::input_offsetalator d_offsets;
cudf::size_type* d_results;

__device__ void operator()(cudf::size_type idx) const
Expand Down Expand Up @@ -378,7 +379,7 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view
auto tokens = cudf::make_numeric_column(
output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr);
auto d_tokens = tokens->mutable_view().data<cudf::size_type>();
auto d_offsets = token_offsets->view().data<cudf::size_type>();
auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(token_offsets->view());
vocabulary_tokenizer_fn<decltype(map_ref)> tokenizer{
*d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens};
thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), tokenizer);
Expand All @@ -394,11 +395,11 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view
// longer strings perform better with warp-parallel approach

auto const first_offset = (input.offset() == 0) ? 0
: cudf::detail::get_value<cudf::size_type>(
: cudf::strings::detail::get_offset_value(
input.offsets(), input.offset(), stream);
auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1)
? input.chars_size(stream)
: cudf::detail::get_value<cudf::size_type>(
: cudf::strings::detail::get_offset_value(
input.offsets(), input.size() + input.offset(), stream);
auto const chars_size = last_offset - first_offset;
auto const d_input_chars = input.chars_begin(stream) + first_offset;
Expand Down

0 comments on commit 72ecbe9

Please sign in to comment.