Use offsetalator in cudf::strings::split functions #14757

Merged: 17 commits, Feb 8, 2024
cpp/src/strings/split/split.cu (18 changes: 8 additions & 10 deletions)

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,6 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/split_utils.cuh>
@@ -123,7 +122,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& input,

   // builds the offsets and the vector of all tokens
   auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr);
-  auto const d_offsets = offsets->view().template data<size_type>();
+  auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
   auto const d_tokens = tokens.data();

   // compute the maximum number of tokens for any string
@@ -132,7 +131,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& input,
     thrust::make_counting_iterator<size_type>(0),
     thrust::make_counting_iterator<size_type>(input.size()),
     cuda::proclaim_return_type<size_type>([d_offsets] __device__(auto idx) -> size_type {
-      return d_offsets[idx + 1] - d_offsets[idx];
+      return static_cast<size_type>(d_offsets[idx + 1] - d_offsets[idx]);
     }),
     0,
     thrust::maximum{});
@@ -144,7 +143,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& input,
       cuda::proclaim_return_type<string_index_pair>(
         [d_tokens, d_offsets, col] __device__(size_type idx) {
           auto const offset = d_offsets[idx];
-          auto const token_count = d_offsets[idx + 1] - offset;
+          auto const token_count = static_cast<size_type>(d_offsets[idx + 1] - offset);
           return (col < token_count) ? d_tokens[offset + col] : string_index_pair{nullptr, 0};
         }));
     results.emplace_back(make_strings_column(itr, itr + input.size(), stream, mr));
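
Note on the casts in the two hunks above: `make_input_iterator` returns a `cudf::detail::input_offsetalator`, which reads the offsets column as `int64_t` whether the underlying column stores 32-bit or 64-bit offsets. A minimal sketch of the read pattern (illustrative only; `row_width` is a hypothetical helper, not part of this PR):

```cpp
#include <cudf/detail/offsetalator.cuh>
#include <cudf/types.hpp>

// Hypothetical helper: offsetalator reads always come back as int64_t, so the
// subtraction is done in 64-bit; an individual row width is guaranteed to fit
// in size_type, hence the narrowing cast at the end.
__device__ cudf::size_type row_width(cudf::detail::input_offsetalator const d_offsets,
                                     cudf::size_type idx)
{
  return static_cast<cudf::size_type>(d_offsets[idx + 1] - d_offsets[idx]);
}
```
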
@@ -360,12 +359,11 @@ std::unique_ptr<table> whitespace_split_fn(size_type strings_count,
   }

   // get the positions for every token
-  rmm::device_uvector<string_index_pair> tokens(columns_count * strings_count, stream);
+  rmm::device_uvector<string_index_pair> tokens(
+    static_cast<int64_t>(columns_count) * static_cast<int64_t>(strings_count), stream);
   string_index_pair* d_tokens = tokens.data();
-  thrust::fill(rmm::exec_policy(stream),
-               d_tokens,
-               d_tokens + (columns_count * strings_count),
-               string_index_pair{nullptr, 0});
+  thrust::fill(
+    rmm::exec_policy(stream), tokens.begin(), tokens.end(), string_index_pair{nullptr, 0});
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::make_counting_iterator<size_type>(0),
                      strings_count,
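
The allocation change above is an overflow guard: with `size_type` (`int32_t`) operands the product is computed in 32-bit before any widening of the result. A standalone illustration (hypothetical function, not from this PR):

```cpp
#include <cstdint>

// columns_count * strings_count can exceed INT32_MAX for wide/large columns;
// widening each operand first keeps the multiply in 64-bit.
int64_t token_vector_size(int32_t columns_count, int32_t strings_count)
{
  // the unwidened form (columns_count * strings_count) multiplies in 32-bit
  // and can overflow before the result is widened
  return static_cast<int64_t>(columns_count) * static_cast<int64_t>(strings_count);
}
```
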
cpp/src/strings/split/split.cuh (102 changes: 52 additions & 50 deletions)

@@ -17,9 +17,9 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/get_value.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
+#include <cudf/detail/utilities/algorithm.cuh>
 #include <cudf/strings/detail/split_utils.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/string_view.cuh>
@@ -66,9 +66,9 @@ struct base_split_tokenizer {
    * @param chars_bytes Total number of characters to process
    * @return true if delimiter is found starting at position `idx`
    */
-  __device__ bool is_delimiter(size_type idx,
-                               size_type const* d_offsets,
-                               size_type chars_bytes) const
+  __device__ bool is_delimiter(int64_t idx,
+                               cudf::detail::input_offsetalator const d_offsets,
+                               int64_t chars_bytes) const
   {
     auto const d_chars = get_base_ptr() + d_offsets[0];
     if (idx + d_delimiter.size_bytes() > chars_bytes) { return false; }
@@ -87,21 +87,22 @@ struct base_split_tokenizer {
    * @param d_delimiter_offsets Offsets per string to delimiters in d_positions
    */
   __device__ size_type count_tokens(size_type idx,
-                                    size_type const* d_positions,
-                                    size_type const* d_delimiter_offsets) const
+                                    int64_t const* d_positions,
+                                    int64_t const* d_delimiter_offsets) const
   {
     if (!is_valid(idx)) { return 0; }

     auto const delim_size = d_delimiter.size_bytes();
     auto const d_str = get_string(idx);
     auto const d_str_end = d_str.data() + d_str.size_bytes();
     auto const base_ptr = get_base_ptr() + delim_size - 1;
+
     auto const delimiters =
-      cudf::device_span<size_type const>(d_positions + d_delimiter_offsets[idx],
-                                         d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);
+      cudf::device_span<int64_t const>(d_positions + d_delimiter_offsets[idx],
+                                       d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);

     size_type token_count = 1;  // all strings will have at least one token
-    size_type last_pos = delimiters[0] - delim_size;
+    auto last_pos = !delimiters.empty() ? (delimiters[0] - delim_size) : 0L;
     for (auto d_pos : delimiters) {
       // delimiter must fit in string && overlapping delimiters are ignored
       if (((base_ptr + d_pos) < d_str_end) && ((d_pos - last_pos) >= delim_size)) {
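
The `!delimiters.empty()` guard above avoids indexing an empty span. For reference, the counting rule restated as a host-side sketch (not the PR's code; the in-bounds check is dropped and the `last_pos` update in the loop body is assumed from the surrounding context):

```cpp
#include <cstdint>
#include <vector>

// One token per non-overlapping delimiter, plus the trailing token.
// Seeding last_pos at front() - delim_size lets the first delimiter pass the
// overlap test; the empty() check mirrors the guard added in this PR.
int32_t count_tokens(std::vector<int64_t> const& delimiters, int64_t delim_size)
{
  int32_t token_count = 1;  // all strings have at least one token
  int64_t last_pos = delimiters.empty() ? 0 : delimiters.front() - delim_size;
  for (auto d_pos : delimiters) {
    if (d_pos - last_pos >= delim_size) {  // overlapping delimiters are ignored
      ++token_count;
      last_pos = d_pos;
    }
  }
  return token_count;
}
```
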
@@ -129,9 +130,9 @@ struct base_split_tokenizer {
    * @param d_all_tokens All output tokens for the strings column
    */
   __device__ void get_tokens(size_type idx,
-                             size_type const* d_tokens_offsets,
-                             size_type const* d_positions,
-                             size_type const* d_delimiter_offsets,
+                             cudf::detail::input_offsetalator const d_tokens_offsets,
+                             int64_t const* d_positions,
+                             int64_t const* d_delimiter_offsets,
                              string_index_pair* d_all_tokens) const
   {
     auto const d_tokens =  // this string's tokens output
@@ -149,8 +150,8 @@ struct base_split_tokenizer {
     }

     auto const delimiters =
-      cudf::device_span<size_type const>(d_positions + d_delimiter_offsets[idx],
-                                         d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);
+      cudf::device_span<int64_t const>(d_positions + d_delimiter_offsets[idx],
+                                       d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);

     auto& derived = static_cast<Derived const&>(*this);
     derived.process_tokens(d_str, delimiters, d_tokens);
@@ -184,7 +185,7 @@ struct split_tokenizer_fn : base_split_tokenizer<split_tokenizer_fn> {
    * @param d_tokens Output vector to store tokens for this string
    */
   __device__ void process_tokens(string_view const d_str,
-                                 device_span<size_type const> d_delimiters,
+                                 device_span<int64_t const> d_delimiters,
                                  device_span<string_index_pair> d_tokens) const
   {
     auto const base_ptr = get_base_ptr();  // d_positions values based on this
@@ -239,7 +240,7 @@ struct rsplit_tokenizer_fn : base_split_tokenizer<rsplit_tokenizer_fn> {
    * @param d_tokens Output vector to store tokens for this string
    */
   __device__ void process_tokens(string_view const d_str,
-                                 device_span<size_type const> d_delimiters,
+                                 device_span<int64_t const> d_delimiters,
                                  device_span<string_index_pair> d_tokens) const
   {
     auto const base_ptr = get_base_ptr();  // d_positions values are based on this ptr
@@ -290,7 +291,8 @@ struct rsplit_tokenizer_fn : base_split_tokenizer<rsplit_tokenizer_fn> {
  * @param input The input column of strings to split
  * @param tokenizer Object used for counting and identifying delimiters and tokens
  * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned objects' device memory.
+ * @param mr Device memory resource used to allocate the returned objects' device memory
+ * @return Token offsets and a vector of string indices
  */
 template <typename Tokenizer>
 std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split_helper(
@@ -301,37 +303,38 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split_helper(
 {
   auto const strings_count = input.size();
   auto const chars_bytes =
-    cudf::detail::get_value<size_type>(input.offsets(), input.offset() + strings_count, stream) -
-    cudf::detail::get_value<size_type>(input.offsets(), input.offset(), stream);
-
-  auto d_offsets = input.offsets_begin();
+    get_offset_value(input.offsets(), input.offset() + strings_count, stream) -
+    get_offset_value(input.offsets(), input.offset(), stream);
+  auto const d_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
+
   // count the number of delimiters in the entire column
   auto const delimiter_count =
     thrust::count_if(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<size_type>(0),
-                     thrust::make_counting_iterator<size_type>(chars_bytes),
-                     [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) {
+                     thrust::counting_iterator<int64_t>(0),
+                     thrust::counting_iterator<int64_t>(chars_bytes),
+                     [tokenizer, d_offsets, chars_bytes] __device__(int64_t idx) {
                        return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
                      });

   // Create a vector of every delimiter position in the chars column.
   // These may include overlapping or otherwise out-of-bounds delimiters which
   // will be resolved during token processing.
-  auto delimiter_positions = rmm::device_uvector<size_type>(delimiter_count, stream);
+  auto delimiter_positions = rmm::device_uvector<int64_t>(delimiter_count, stream);
   auto d_positions = delimiter_positions.data();
-  auto const copy_end =
-    thrust::copy_if(rmm::exec_policy(stream),
-                    thrust::make_counting_iterator<size_type>(0),
-                    thrust::make_counting_iterator<size_type>(chars_bytes),
-                    delimiter_positions.begin(),
-                    [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) {
-                      return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
-                    });
+  auto const copy_end = cudf::detail::copy_if_safe(
+    thrust::counting_iterator<int64_t>(0),
+    thrust::counting_iterator<int64_t>(chars_bytes),
+    delimiter_positions.begin(),
+    [tokenizer, d_offsets, chars_bytes] __device__(int64_t idx) {
+      return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
+    },
+    stream);

   // create a vector of offsets to each string's delimiter set within delimiter_positions
   auto const delimiter_offsets = [&] {
     // first, create a vector of string indices for each delimiter
-    auto string_indices = rmm::device_uvector<size_type>(delimiter_count, stream);
+    auto string_indices = rmm::device_uvector<int64_t>(delimiter_count, stream);
     thrust::upper_bound(rmm::exec_policy(stream),
                         d_offsets,
                         d_offsets + strings_count,
@@ -340,24 +343,24 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split_helper(
                         string_indices.begin());

     // compute delimiter offsets per string
-    auto delimiter_offsets = rmm::device_uvector<size_type>(strings_count + 1, stream);
+    auto delimiter_offsets = rmm::device_uvector<int64_t>(strings_count + 1, stream);
     auto d_delimiter_offsets = delimiter_offsets.data();

     // memset to zero-out the delimiter counts for any null-entries or strings with no delimiters
     CUDF_CUDA_TRY(cudaMemsetAsync(
-      d_delimiter_offsets, 0, delimiter_offsets.size() * sizeof(size_type), stream.value()));
+      d_delimiter_offsets, 0, delimiter_offsets.size() * sizeof(int64_t), stream.value()));

     // next, count the number of delimiters per string
     auto d_string_indices = string_indices.data();  // identifies strings with delimiters only
-    thrust::for_each_n(rmm::exec_policy(stream),
-                       thrust::make_counting_iterator<size_type>(0),
-                       delimiter_count,
-                       [d_string_indices, d_delimiter_offsets] __device__(size_type idx) {
-                         auto const str_idx = d_string_indices[idx] - 1;
-                         cuda::atomic_ref<size_type, cuda::thread_scope_device> ref{
-                           *(d_delimiter_offsets + str_idx)};
-                         ref.fetch_add(1, cuda::std::memory_order_relaxed);
-                       });
+    thrust::for_each_n(
+      rmm::exec_policy(stream),
+      thrust::counting_iterator<int64_t>(0),
+      delimiter_count,
+      [d_string_indices, d_delimiter_offsets] __device__(int64_t idx) {
+        auto const str_idx = d_string_indices[idx] - 1;
+        cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{*(d_delimiter_offsets + str_idx)};
+        ref.fetch_add(1L, cuda::std::memory_order_relaxed);
+      });
     // finally, convert the delimiter counts into offsets
     thrust::exclusive_scan(rmm::exec_policy(stream),
                            delimiter_offsets.begin(),
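
The block above is a count-then-scan: each delimiter atomically increments its owning string's slot, and the exclusive scan then turns counts into offsets, with the final slot becoming the grand total. A simplified sketch of the pattern (`str_of` is a hypothetical stand-in for the `upper_bound`-derived string index; requires nvcc extended lambdas):

```cpp
#include <cuda/atomic>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/scan.h>

// d_offsets must hold num_strings + 1 zero-initialized entries on entry
template <typename StringIndexFn>
void counts_to_offsets(int64_t* d_offsets,
                       int64_t num_items,
                       int64_t num_strings,
                       StringIndexFn str_of,
                       rmm::cuda_stream_view stream)
{
  // histogram: one relaxed atomic add per item into its string's counter
  thrust::for_each_n(rmm::exec_policy(stream),
                     thrust::counting_iterator<int64_t>(0),
                     num_items,
                     [d_offsets, str_of] __device__(int64_t idx) {
                       cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{
                         d_offsets[str_of(idx)]};
                       ref.fetch_add(1L, cuda::std::memory_order_relaxed);
                     });
  // convert the counts into offsets; the last entry becomes the total
  thrust::exclusive_scan(
    rmm::exec_policy(stream), d_offsets, d_offsets + num_strings + 1, d_offsets);
}
```
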
@@ -379,11 +382,10 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split_helper(
                          });

   // create offsets from the counts for return to the caller
-  auto offsets = std::get<0>(
-    cudf::detail::make_offsets_child_column(token_counts.begin(), token_counts.end(), stream, mr));
-  auto const total_tokens =
-    cudf::detail::get_value<size_type>(offsets->view(), strings_count, stream);
-  auto const d_tokens_offsets = offsets->view().data<size_type>();
+  auto [offsets, total_tokens] = cudf::strings::detail::make_offsets_child_column(
+    token_counts.begin(), token_counts.end(), stream, mr);
+  auto const d_tokens_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());

   // build a vector of all the token positions for all the strings
   auto tokens = rmm::device_uvector<string_index_pair>(total_tokens, stream);
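
Overall, `split_helper` sizes its outputs with a counting pass and then materializes them with a copy pass; the PR swaps plain `thrust::copy_if` for `cudf::detail::copy_if_safe` because the byte range of a large strings column can exceed what `thrust::copy_if` handles safely, which is also why the counting iterators are `int64_t`. A self-contained sketch of the two-pass pattern using plain thrust (illustrative; `collect_positions` is a hypothetical name):

```cpp
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/iterator/counting_iterator.h>

// Pass 1 counts matches so the output is sized exactly; pass 2 gathers the
// matching byte positions. At cudf scale the second pass would be
// cudf::detail::copy_if_safe, as in the diff above.
template <typename Predicate>
rmm::device_uvector<int64_t> collect_positions(int64_t chars_bytes,
                                               Predicate is_delimiter,
                                               rmm::cuda_stream_view stream)
{
  auto const begin = thrust::counting_iterator<int64_t>(0);
  auto const end = thrust::counting_iterator<int64_t>(chars_bytes);
  auto const count = thrust::count_if(rmm::exec_policy(stream), begin, end, is_delimiter);
  auto positions = rmm::device_uvector<int64_t>(count, stream);
  thrust::copy_if(rmm::exec_policy(stream), begin, end, positions.begin(), is_delimiter);
  return positions;
}
```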