Skip to content

Commit

Permalink
Use appropriate make_offsets_child_column for building lists columns (#15043)
Browse files Browse the repository at this point in the history

Fixes `cudf::strings::extract_all()` to use `cudf::detail::make_offsets_child_column` so that it properly computes the output size and checks for overflow when building offsets for a lists column.
Also undoes some changes from #14745 that incorrectly called `cudf::strings::detail::make_offsets_child_column` to create offsets for a lists column.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #15043
  • Loading branch information
davidwendt authored Feb 23, 2024
1 parent c84e1e8 commit ee3c769
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 19 deletions.
27 changes: 12 additions & 15 deletions cpp/src/strings/extract/extract_all.cu
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,12 @@ std::unique_ptr<column> extract_all_record(strings_column_view const& input,

// Get the match counts for each string.
// This column will become the output lists child offsets column.
auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr);
auto d_offsets = offsets->mutable_view().data<size_type>();
auto counts = count_matches(*d_strings, *d_prog, strings_count, stream, mr);
auto d_counts = counts->mutable_view().data<size_type>();

// Compute null output rows
auto [null_mask, null_count] = cudf::detail::valid_if(
d_offsets, d_offsets + strings_count, [] __device__(auto v) { return v > 0; }, stream, mr);
d_counts, d_counts + strings_count, [] __device__(auto v) { return v > 0; }, stream, mr);

// Return an empty lists column if there are no valid rows
if (strings_count == null_count) {
Expand All @@ -132,18 +132,15 @@ std::unique_ptr<column> extract_all_record(strings_column_view const& input,

// Convert counts into offsets.
// Multiply each count by the number of groups.
thrust::transform_exclusive_scan(
rmm::exec_policy(stream),
d_offsets,
d_offsets + strings_count + 1,
d_offsets,
[groups] __device__(auto v) { return v * groups; },
size_type{0},
thrust::plus{});
auto const total_groups =
cudf::detail::get_value<size_type>(offsets->view(), strings_count, stream);

rmm::device_uvector<string_index_pair> indices(total_groups, stream);
auto sizes_itr = cudf::detail::make_counting_transform_iterator(
0, cuda::proclaim_return_type<size_type>([d_counts, groups] __device__(auto idx) {
return d_counts[idx] * groups;
}));
auto [offsets, total_strings] =
cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + strings_count, stream, mr);
auto d_offsets = offsets->view().data<size_type>();

rmm::device_uvector<string_index_pair> indices(total_strings, stream);

launch_for_each_kernel(
extract_fn{*d_strings, d_offsets, indices.data()}, *d_prog, strings_count, stream);
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/strings/search/findall.cu
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ namespace {
*/
struct findall_fn {
column_device_view const d_strings;
cudf::detail::input_offsetalator const d_offsets;
size_type const* d_offsets;
string_index_pair* d_indices;

__device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx)
Expand Down Expand Up @@ -76,7 +76,7 @@ struct findall_fn {
std::unique_ptr<column> findall_util(column_device_view const& d_strings,
reprog_device& d_prog,
int64_t total_matches,
cudf::detail::input_offsetalator const d_offsets,
size_type const* d_offsets,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
Expand Down Expand Up @@ -104,9 +104,9 @@ std::unique_ptr<column> findall(strings_column_view const& input,

// Create lists offsets column
auto const sizes = count_matches(*d_strings, *d_prog, strings_count, stream, mr);
auto [offsets, total_matches] = cudf::strings::detail::make_offsets_child_column(
auto [offsets, total_matches] = cudf::detail::make_offsets_child_column(
sizes->view().begin<size_type>(), sizes->view().end<size_type>(), stream, mr);
auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
auto const d_offsets = offsets->view().data<size_type>();

// Build strings column of the matches
auto strings_output = findall_util(*d_strings, *d_prog, total_matches, d_offsets, stream, mr);
Expand Down

0 comments on commit ee3c769

Please sign in to comment.