Skip to content

Commit

Permalink
Use offsetalator in cudf::strings::findall (#14745)
Browse files Browse the repository at this point in the history
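Use `make_offsets_child_column` and an offsetalator input iterator (`cudf::detail::input_offsetalator`, created via `offsetalator_factory::make_input_iterator`) to build and access the offsets instead of hard-coding the `size_type` offset type.
This cleans up the code nicely: the helper automatically handles offset overflow and computes the total number of matches, replacing the manual `thrust::exclusive_scan` and `get_value` calls.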

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: #14745
  • Loading branch information
davidwendt authored Feb 1, 2024
1 parent 2b0d987 commit 9916395
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 18 deletions.
2 changes: 1 addition & 1 deletion cpp/include/cudf/strings/detail/strings_children.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
// Use the sizes-to-offsets iterator to compute the total number of elements
auto const total_elements =
sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream);
cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream);

// TODO: replace exception with if-statement when enabling creating INT64 offsets
CUDF_EXPECTS(total_elements <= size_type_max,
Expand Down
27 changes: 10 additions & 17 deletions cpp/src/strings/search/findall.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -20,11 +20,10 @@

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/findall.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand All @@ -34,7 +33,6 @@
#include <rmm/exec_policy.hpp>

#include <thrust/pair.h>
#include <thrust/scan.h>

namespace cudf {
namespace strings {
Expand All @@ -50,7 +48,7 @@ namespace {
*/
struct findall_fn {
column_device_view const d_strings;
size_type const* d_offsets;
cudf::detail::input_offsetalator const d_offsets;
string_index_pair* d_indices;

__device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx)
Expand All @@ -77,8 +75,8 @@ struct findall_fn {

std::unique_ptr<column> findall_util(column_device_view const& d_strings,
reprog_device& d_prog,
size_type total_matches,
size_type const* d_offsets,
int64_t total_matches,
cudf::detail::input_offsetalator const d_offsets,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
Expand All @@ -105,17 +103,12 @@ std::unique_ptr<column> findall(strings_column_view const& input,
auto d_prog = regex_device_builder::create_prog_device(prog, stream);

// Create lists offsets column
auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr);
auto d_offsets = offsets->mutable_view().data<size_type>();

// Convert counts into offsets
thrust::exclusive_scan(
rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets);

// Create indices vector with the total number of groups that will be extracted
auto const total_matches =
cudf::detail::get_value<size_type>(offsets->view(), strings_count, stream);
auto const sizes = count_matches(*d_strings, *d_prog, strings_count, stream, mr);
auto [offsets, total_matches] = cudf::strings::detail::make_offsets_child_column(
sizes->view().begin<size_type>(), sizes->view().end<size_type>(), stream, mr);
auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());

// Build strings column of the matches
auto strings_output = findall_util(*d_strings, *d_prog, total_matches, d_offsets, stream, mr);

// Build the lists column from the offsets and the strings
Expand Down

0 comments on commit 9916395

Please sign in to comment.