Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Use offsetalator in cudf::strings::findall #14745

Merged
merged 8 commits into from
Feb 1, 2024
2 changes: 1 addition & 1 deletion cpp/include/cudf/strings/detail/strings_children.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
// Use the sizes-to-offsets iterator to compute the total number of elements
auto const total_elements =
- sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream);
+ cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream);

// TODO: replace exception with if-statement when enabling creating INT64 offsets
CUDF_EXPECTS(total_elements <= size_type_max,
Expand Down
27 changes: 10 additions & 17 deletions cpp/src/strings/search/findall.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -20,11 +20,10 @@

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
- #include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
+ #include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/strings/detail/strings_column_factories.cuh>
- #include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/findall.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand All @@ -34,7 +33,6 @@
#include <rmm/exec_policy.hpp>

#include <thrust/pair.h>
- #include <thrust/scan.h>

namespace cudf {
namespace strings {
Expand All @@ -50,7 +48,7 @@ namespace {
*/
struct findall_fn {
column_device_view const d_strings;
- size_type const* d_offsets;
+ cudf::detail::input_offsetalator const d_offsets;
string_index_pair* d_indices;

__device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx)
Expand All @@ -77,8 +75,8 @@ struct findall_fn {

std::unique_ptr<column> findall_util(column_device_view const& d_strings,
reprog_device& d_prog,
- size_type total_matches,
- size_type const* d_offsets,
+ int64_t total_matches,
+ cudf::detail::input_offsetalator const d_offsets,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
Expand All @@ -105,17 +103,12 @@ std::unique_ptr<column> findall(strings_column_view const& input,
auto d_prog = regex_device_builder::create_prog_device(prog, stream);

// Create lists offsets column
- auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr);
- auto d_offsets = offsets->mutable_view().data<size_type>();
-
- // Convert counts into offsets
- thrust::exclusive_scan(
- rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets);
-
- // Create indices vector with the total number of groups that will be extracted
- auto const total_matches =
- cudf::detail::get_value<size_type>(offsets->view(), strings_count, stream);
+ auto const sizes = count_matches(*d_strings, *d_prog, strings_count, stream, mr);
+ auto [offsets, total_matches] = cudf::strings::detail::make_offsets_child_column(
+ sizes->view().begin<size_type>(), sizes->view().end<size_type>(), stream, mr);
+ auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());

+ // Build strings column of the matches
auto strings_output = findall_util(*d_strings, *d_prog, total_matches, d_offsets, stream, mr);

// Build the lists column from the offsets and the strings
Expand Down
Loading