Skip to content

Commit

Permalink
Merge branch 'branch-24.04' into json-whitespace-fst
Browse files Browse the repository at this point in the history
  • Loading branch information
shrshi authored Feb 8, 2024
2 parents 183e66b + 49c7d2c commit 55d3bc5
Show file tree
Hide file tree
Showing 16 changed files with 208 additions and 150 deletions.
18 changes: 8 additions & 10 deletions cpp/src/strings/split/split.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,7 +19,6 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/detail/split_utils.cuh>
Expand Down Expand Up @@ -123,7 +122,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& input,

// builds the offsets and the vector of all tokens
auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr);
auto const d_offsets = offsets->view().template data<size_type>();
auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
auto const d_tokens = tokens.data();

// compute the maximum number of tokens for any string
Expand All @@ -132,7 +131,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& input,
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(input.size()),
cuda::proclaim_return_type<size_type>([d_offsets] __device__(auto idx) -> size_type {
return d_offsets[idx + 1] - d_offsets[idx];
return static_cast<size_type>(d_offsets[idx + 1] - d_offsets[idx]);
}),
0,
thrust::maximum{});
Expand All @@ -144,7 +143,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& input,
cuda::proclaim_return_type<string_index_pair>(
[d_tokens, d_offsets, col] __device__(size_type idx) {
auto const offset = d_offsets[idx];
auto const token_count = d_offsets[idx + 1] - offset;
auto const token_count = static_cast<size_type>(d_offsets[idx + 1] - offset);
return (col < token_count) ? d_tokens[offset + col] : string_index_pair{nullptr, 0};
}));
results.emplace_back(make_strings_column(itr, itr + input.size(), stream, mr));
Expand Down Expand Up @@ -360,12 +359,11 @@ std::unique_ptr<table> whitespace_split_fn(size_type strings_count,
}

// get the positions for every token
rmm::device_uvector<string_index_pair> tokens(columns_count * strings_count, stream);
rmm::device_uvector<string_index_pair> tokens(
static_cast<int64_t>(columns_count) * static_cast<int64_t>(strings_count), stream);
string_index_pair* d_tokens = tokens.data();
thrust::fill(rmm::exec_policy(stream),
d_tokens,
d_tokens + (columns_count * strings_count),
string_index_pair{nullptr, 0});
thrust::fill(
rmm::exec_policy(stream), tokens.begin(), tokens.end(), string_index_pair{nullptr, 0});
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
strings_count,
Expand Down
102 changes: 52 additions & 50 deletions cpp/src/strings/split/split.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/algorithm.cuh>
#include <cudf/strings/detail/split_utils.cuh>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/string_view.cuh>
Expand Down Expand Up @@ -66,9 +66,9 @@ struct base_split_tokenizer {
* @param chars_bytes Total number of characters to process
* @return true if delimiter is found starting at position `idx`
*/
__device__ bool is_delimiter(size_type idx,
size_type const* d_offsets,
size_type chars_bytes) const
__device__ bool is_delimiter(int64_t idx,
cudf::detail::input_offsetalator const d_offsets,
int64_t chars_bytes) const
{
auto const d_chars = get_base_ptr() + d_offsets[0];
if (idx + d_delimiter.size_bytes() > chars_bytes) { return false; }
Expand All @@ -87,21 +87,22 @@ struct base_split_tokenizer {
* @param d_delimiter_offsets Offsets per string to delimiters in d_positions
*/
__device__ size_type count_tokens(size_type idx,
size_type const* d_positions,
size_type const* d_delimiter_offsets) const
int64_t const* d_positions,
int64_t const* d_delimiter_offsets) const
{
if (!is_valid(idx)) { return 0; }

auto const delim_size = d_delimiter.size_bytes();
auto const d_str = get_string(idx);
auto const d_str_end = d_str.data() + d_str.size_bytes();
auto const base_ptr = get_base_ptr() + delim_size - 1;

auto const delimiters =
cudf::device_span<size_type const>(d_positions + d_delimiter_offsets[idx],
d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);
cudf::device_span<int64_t const>(d_positions + d_delimiter_offsets[idx],
d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);

size_type token_count = 1; // all strings will have at least one token
size_type last_pos = delimiters[0] - delim_size;
auto last_pos = !delimiters.empty() ? (delimiters[0] - delim_size) : 0L;
for (auto d_pos : delimiters) {
// delimiter must fit in string && overlapping delimiters are ignored
if (((base_ptr + d_pos) < d_str_end) && ((d_pos - last_pos) >= delim_size)) {
Expand Down Expand Up @@ -129,9 +130,9 @@ struct base_split_tokenizer {
* @param d_all_tokens All output tokens for the strings column
*/
__device__ void get_tokens(size_type idx,
size_type const* d_tokens_offsets,
size_type const* d_positions,
size_type const* d_delimiter_offsets,
cudf::detail::input_offsetalator const d_tokens_offsets,
int64_t const* d_positions,
int64_t const* d_delimiter_offsets,
string_index_pair* d_all_tokens) const
{
auto const d_tokens = // this string's tokens output
Expand All @@ -149,8 +150,8 @@ struct base_split_tokenizer {
}

auto const delimiters =
cudf::device_span<size_type const>(d_positions + d_delimiter_offsets[idx],
d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);
cudf::device_span<int64_t const>(d_positions + d_delimiter_offsets[idx],
d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);

auto& derived = static_cast<Derived const&>(*this);
derived.process_tokens(d_str, delimiters, d_tokens);
Expand Down Expand Up @@ -184,7 +185,7 @@ struct split_tokenizer_fn : base_split_tokenizer<split_tokenizer_fn> {
* @param d_tokens Output vector to store tokens for this string
*/
__device__ void process_tokens(string_view const d_str,
device_span<size_type const> d_delimiters,
device_span<int64_t const> d_delimiters,
device_span<string_index_pair> d_tokens) const
{
auto const base_ptr = get_base_ptr(); // d_positions values based on this
Expand Down Expand Up @@ -239,7 +240,7 @@ struct rsplit_tokenizer_fn : base_split_tokenizer<rsplit_tokenizer_fn> {
* @param d_tokens Output vector to store tokens for this string
*/
__device__ void process_tokens(string_view const d_str,
device_span<size_type const> d_delimiters,
device_span<int64_t const> d_delimiters,
device_span<string_index_pair> d_tokens) const
{
auto const base_ptr = get_base_ptr(); // d_positions values are based on this ptr
Expand Down Expand Up @@ -290,7 +291,8 @@ struct rsplit_tokenizer_fn : base_split_tokenizer<rsplit_tokenizer_fn> {
* @param input The input column of strings to split
* @param tokenizer Object used for counting and identifying delimiters and tokens
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned objects' device memory.
* @param mr Device memory resource used to allocate the returned objects' device memory
* @return Token offsets and a vector of string indices
*/
template <typename Tokenizer>
std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split_helper(
Expand All @@ -301,37 +303,38 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
{
auto const strings_count = input.size();
auto const chars_bytes =
cudf::detail::get_value<size_type>(input.offsets(), input.offset() + strings_count, stream) -
cudf::detail::get_value<size_type>(input.offsets(), input.offset(), stream);

auto d_offsets = input.offsets_begin();
get_offset_value(input.offsets(), input.offset() + strings_count, stream) -
get_offset_value(input.offsets(), input.offset(), stream);
auto const d_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());

// count the number of delimiters in the entire column
auto const delimiter_count =
thrust::count_if(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(chars_bytes),
[tokenizer, d_offsets, chars_bytes] __device__(size_type idx) {
thrust::counting_iterator<int64_t>(0),
thrust::counting_iterator<int64_t>(chars_bytes),
[tokenizer, d_offsets, chars_bytes] __device__(int64_t idx) {
return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
});

// Create a vector of every delimiter position in the chars column.
// These may include overlapping or otherwise out-of-bounds delimiters which
// will be resolved during token processing.
auto delimiter_positions = rmm::device_uvector<size_type>(delimiter_count, stream);
auto delimiter_positions = rmm::device_uvector<int64_t>(delimiter_count, stream);
auto d_positions = delimiter_positions.data();
auto const copy_end =
thrust::copy_if(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(chars_bytes),
delimiter_positions.begin(),
[tokenizer, d_offsets, chars_bytes] __device__(size_type idx) {
return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
});
auto const copy_end = cudf::detail::copy_if_safe(
thrust::counting_iterator<int64_t>(0),
thrust::counting_iterator<int64_t>(chars_bytes),
delimiter_positions.begin(),
[tokenizer, d_offsets, chars_bytes] __device__(int64_t idx) {
return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
},
stream);

// create a vector of offsets to each string's delimiter set within delimiter_positions
auto const delimiter_offsets = [&] {
// first, create a vector of string indices for each delimiter
auto string_indices = rmm::device_uvector<size_type>(delimiter_count, stream);
auto string_indices = rmm::device_uvector<int64_t>(delimiter_count, stream);
thrust::upper_bound(rmm::exec_policy(stream),
d_offsets,
d_offsets + strings_count,
Expand All @@ -340,24 +343,24 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
string_indices.begin());

// compute delimiter offsets per string
auto delimiter_offsets = rmm::device_uvector<size_type>(strings_count + 1, stream);
auto delimiter_offsets = rmm::device_uvector<int64_t>(strings_count + 1, stream);
auto d_delimiter_offsets = delimiter_offsets.data();

// memset to zero-out the delimiter counts for any null-entries or strings with no delimiters
CUDF_CUDA_TRY(cudaMemsetAsync(
d_delimiter_offsets, 0, delimiter_offsets.size() * sizeof(size_type), stream.value()));
d_delimiter_offsets, 0, delimiter_offsets.size() * sizeof(int64_t), stream.value()));

// next, count the number of delimiters per string
auto d_string_indices = string_indices.data(); // identifies strings with delimiters only
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
delimiter_count,
[d_string_indices, d_delimiter_offsets] __device__(size_type idx) {
auto const str_idx = d_string_indices[idx] - 1;
cuda::atomic_ref<size_type, cuda::thread_scope_device> ref{
*(d_delimiter_offsets + str_idx)};
ref.fetch_add(1, cuda::std::memory_order_relaxed);
});
thrust::for_each_n(
rmm::exec_policy(stream),
thrust::counting_iterator<int64_t>(0),
delimiter_count,
[d_string_indices, d_delimiter_offsets] __device__(int64_t idx) {
auto const str_idx = d_string_indices[idx] - 1;
cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{*(d_delimiter_offsets + str_idx)};
ref.fetch_add(1L, cuda::std::memory_order_relaxed);
});
// finally, convert the delimiter counts into offsets
thrust::exclusive_scan(rmm::exec_policy(stream),
delimiter_offsets.begin(),
Expand All @@ -379,11 +382,10 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
});

// create offsets from the counts for return to the caller
auto offsets = std::get<0>(
cudf::detail::make_offsets_child_column(token_counts.begin(), token_counts.end(), stream, mr));
auto const total_tokens =
cudf::detail::get_value<size_type>(offsets->view(), strings_count, stream);
auto const d_tokens_offsets = offsets->view().data<size_type>();
auto [offsets, total_tokens] = cudf::strings::detail::make_offsets_child_column(
token_counts.begin(), token_counts.end(), stream, mr);
auto const d_tokens_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());

// build a vector of all the token positions for all the strings
auto tokens = rmm::device_uvector<string_index_pair>(total_tokens, stream);
Expand Down
Loading

0 comments on commit 55d3bc5

Please sign in to comment.