Skip to content

Commit

Permalink
Change stack-based regex state data to use global memory (#10600)
Browse files Browse the repository at this point in the history
All libcudf strings regex calls will use global device memory for state data when evaluating regex on strings. Previously, separate templated kernels were used to store state data in fixed size stack memory depending on the number of instructions resolved from the provided regex pattern. This required the CUDA driver to allocate a large amount of device memory for when launching the kernel. This memory is managed by the launcher in the driver and so not under control of RMM.

This has been changed to use a memory-resource allocated global device memory to hold and manage the state data per string per instruction. This is an internal change only and results in no behavior changes. Overall, the performance based on the current benchmarks has not changed though much more memory may be required to execute any of the regex APIs depending on the number of instructions in the pattern and the total number of strings in the column.

Every effort has been made to not reduce performance from the stack-based approach. Additional optimizations here include copying the `reprog_device` class data to shared-memory (when it fits). Further optimizations are expected in later PRs as well.

Overall, the compile time of the files that use regex is also faster since only a single kernel is generated instead of 4 in the templated, stack-based implementation.

This PR is dependent on PR #10573.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: #10600
  • Loading branch information
davidwendt authored May 6, 2022
1 parent b12fd56 commit de0f7e0
Show file tree
Hide file tree
Showing 17 changed files with 578 additions and 595 deletions.
102 changes: 45 additions & 57 deletions cpp/src/strings/contains.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@
*/

#include <strings/count_matches.hpp>
#include <strings/regex/dispatcher.hpp>
#include <strings/regex/regex.cuh>
#include <strings/utilities.hpp>
#include <strings/regex/utilities.cuh>

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
Expand All @@ -27,65 +25,61 @@
#include <cudf/strings/contains.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>

namespace cudf {
namespace strings {
namespace detail {

namespace {
/**
* @brief This functor handles both contains_re and match_re to minimize the number
* of regex calls to find() to be inlined greatly reducing compile time.
* @brief This functor handles both contains_re and match_re to regex-match a pattern
* to each string in a column.
*/
template <int stack_size>
struct contains_fn {
reprog_device prog;
column_device_view const d_strings;
bool const beginning_only; // do not make this a template parameter to keep compile times down
bool const beginning_only;

__device__ bool operator()(size_type idx)
__device__ bool operator()(size_type const idx,
reprog_device const prog,
int32_t const thread_idx)
{
if (d_strings.is_null(idx)) return false;
auto const d_str = d_strings.element<string_view>(idx);
int32_t begin = 0;
int32_t end = beginning_only ? 1 // match only the beginning of the string;
: -1; // match anywhere in the string
return static_cast<bool>(prog.find<stack_size>(idx, d_str, begin, end));

size_type begin = 0;
size_type end = beginning_only ? 1 // match only the beginning of the string;
: -1; // match anywhere in the string
return static_cast<bool>(prog.find(thread_idx, d_str, begin, end));
}
};

struct contains_dispatch_fn {
reprog_device d_prog;
bool const beginning_only;
std::unique_ptr<column> contains_impl(strings_column_view const& input,
std::string const& pattern,
regex_flags const flags,
bool const beginning_only,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto results = make_numeric_column(data_type{type_id::BOOL8},
input.size(),
cudf::detail::copy_bitmask(input.parent(), stream, mr),
input.null_count(),
stream,
mr);
if (input.is_empty()) { return results; }

template <int stack_size>
std::unique_ptr<column> operator()(strings_column_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto results = make_numeric_column(data_type{type_id::BOOL8},
input.size(),
cudf::detail::copy_bitmask(input.parent(), stream, mr),
input.null_count(),
stream,
mr);

auto const d_strings = column_device_view::create(input.parent(), stream);
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(input.size()),
results->mutable_view().data<bool>(),
contains_fn<stack_size>{d_prog, *d_strings, beginning_only});
return results;
}
};
auto d_prog = reprog_device::create(pattern, flags, stream);

auto d_results = results->mutable_view().data<bool>();
auto const d_strings = column_device_view::create(input.parent(), stream);

launch_transform_kernel(
contains_fn{*d_strings, beginning_only}, *d_prog, d_results, input.size(), stream);

return results;
}

} // namespace

Expand All @@ -96,10 +90,7 @@ std::unique_ptr<column> contains_re(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto d_prog =
reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream);

return regex_dispatcher(*d_prog, contains_dispatch_fn{*d_prog, false}, input, stream, mr);
return contains_impl(input, pattern, flags, false, stream, mr);
}

std::unique_ptr<column> matches_re(
Expand All @@ -109,21 +100,18 @@ std::unique_ptr<column> matches_re(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
auto d_prog =
reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream);

return regex_dispatcher(*d_prog, contains_dispatch_fn{*d_prog, true}, input, stream, mr);
return contains_impl(input, pattern, flags, true, stream, mr);
}

std::unique_ptr<column> count_re(strings_column_view const& input,
std::string const& pattern,
regex_flags const flags,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
std::unique_ptr<column> count_re(
strings_column_view const& input,
std::string const& pattern,
regex_flags const flags,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
// compile regex into device object
auto d_prog =
reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream);
auto d_prog = reprog_device::create(pattern, flags, stream);

auto const d_strings = column_device_view::create(input.parent(), stream);

Expand Down
69 changes: 24 additions & 45 deletions cpp/src/strings/count_matches.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,41 +15,35 @@
*/

#include <strings/count_matches.hpp>
#include <strings/regex/dispatcher.hpp>
#include <strings/regex/regex.cuh>
#include <strings/regex/utilities.cuh>

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/strings/string_view.cuh>

#include <rmm/exec_policy.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>

namespace cudf {
namespace strings {
namespace detail {

namespace {
/**
* @brief Functor counts the total matches to the given regex in each string.
* @brief Kernel counts the total matches for the given regex in each string.
*/
template <int stack_size>
struct count_matches_fn {
struct count_fn {
column_device_view const d_strings;
reprog_device prog;

__device__ size_type operator()(size_type idx)
__device__ int32_t operator()(size_type const idx,
reprog_device const prog,
int32_t const thread_idx)
{
if (d_strings.is_null(idx)) { return 0; }
size_type count = 0;
if (d_strings.is_null(idx)) return 0;
auto const d_str = d_strings.element<string_view>(idx);
auto const nchars = d_str.length();
int32_t count = 0;

int32_t begin = 0;
int32_t end = nchars;
while ((begin < end) && (prog.find<stack_size>(idx, d_str, begin, end) > 0)) {
size_type begin = 0;
size_type end = nchars;
while ((begin < end) && (prog.find(thread_idx, d_str, begin, end) > 0)) {
++count;
begin = end + (begin == end);
end = nchars;
Expand All @@ -58,41 +52,26 @@ struct count_matches_fn {
}
};

struct count_dispatch_fn {
reprog_device d_prog;

template <int stack_size>
std::unique_ptr<column> operator()(column_device_view const& d_strings,
size_type output_size,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
assert(output_size >= d_strings.size() and "Unexpected output size");

auto results = make_numeric_column(
data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr);

thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(d_strings.size()),
results->mutable_view().data<int32_t>(),
count_matches_fn<stack_size>{d_strings, d_prog});
return results;
}
};

} // namespace

/**
* @copydoc cudf::strings::detail::count_matches
*/
std::unique_ptr<column> count_matches(column_device_view const& d_strings,
reprog_device const& d_prog,
reprog_device& d_prog,
size_type output_size,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
return regex_dispatcher(d_prog, count_dispatch_fn{d_prog}, d_strings, output_size, stream, mr);
assert(output_size >= d_strings.size() and "Unexpected output size");

auto results = make_numeric_column(
data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr);

if (d_strings.size() == 0) return results;

auto d_results = results->mutable_view().data<int32_t>();

launch_transform_kernel(count_fn{d_strings}, d_prog, d_results, d_strings.size(), stream);

return results;
}

} // namespace detail
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/strings/count_matches.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,11 @@ class reprog_device;
* @param output_size Number of rows for the output column.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return Integer column of match counts
*/
std::unique_ptr<column> count_matches(
column_device_view const& d_strings,
reprog_device const& d_prog,
reprog_device& d_prog,
size_type output_size,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
Expand Down
57 changes: 19 additions & 38 deletions cpp/src/strings/extract/extract.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,7 @@
* limitations under the License.
*/

#include <strings/regex/dispatcher.hpp>
#include <strings/regex/regex.cuh>
#include <strings/utilities.hpp>
#include <strings/regex/utilities.cuh>

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
Expand All @@ -31,7 +29,7 @@
#include <rmm/cuda_stream_view.hpp>

#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/fill.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/pair.h>
Expand All @@ -47,28 +45,26 @@ using string_index_pair = thrust::pair<const char*, size_type>;
/**
* @brief This functor handles extracting strings by applying the compiled regex pattern
* and creating string_index_pairs for all the substrings.
*
* @tparam stack_size Correlates to the regex instructions state to maintain for each string.
* Each instruction requires a fixed amount of overhead data.
*/
template <int stack_size>
struct extract_fn {
reprog_device prog;
column_device_view const d_strings;
cudf::detail::device_2dspan<string_index_pair> d_indices;

__device__ void operator()(size_type idx)
__device__ void operator()(size_type const idx,
reprog_device const d_prog,
int32_t const prog_idx)
{
auto const groups = prog.group_counts();
auto const groups = d_prog.group_counts();
auto d_output = d_indices[idx];

if (d_strings.is_valid(idx)) {
auto const d_str = d_strings.element<string_view>(idx);
int32_t begin = 0;
int32_t end = -1; // handles empty strings automatically
if (prog.find<stack_size>(idx, d_str, begin, end) > 0) {

size_type begin = 0;
size_type end = -1; // handles empty strings automatically
if (d_prog.find(prog_idx, d_str, begin, end) > 0) {
for (auto col_idx = 0; col_idx < groups; ++col_idx) {
auto const extracted = prog.extract<stack_size>(idx, d_str, begin, end, col_idx);
auto const extracted = d_prog.extract(prog_idx, d_str, begin, end, col_idx);
d_output[col_idx] = [&] {
if (!extracted) return string_index_pair{nullptr, 0};
auto const offset = d_str.byte_offset((*extracted).first);
Expand All @@ -85,33 +81,17 @@ struct extract_fn {
}
};

struct extract_dispatch_fn {
reprog_device d_prog;

template <int stack_size>
void operator()(column_device_view const& d_strings,
cudf::detail::device_2dspan<string_index_pair>& d_indices,
rmm::cuda_stream_view stream)
{
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
d_strings.size(),
extract_fn<stack_size>{d_prog, d_strings, d_indices});
}
};
} // namespace

//
std::unique_ptr<table> extract(
strings_column_view const& input,
std::string const& pattern,
regex_flags const flags,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
std::unique_ptr<table> extract(strings_column_view const& input,
std::string const& pattern,
regex_flags const flags,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
// compile regex into device object
auto d_prog =
reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream);
auto d_prog = reprog_device::create(pattern, flags, stream);

auto const groups = d_prog->group_counts();
CUDF_EXPECTS(groups > 0, "Group indicators not found in regex pattern");
Expand All @@ -121,7 +101,8 @@ std::unique_ptr<table> extract(
cudf::detail::device_2dspan<string_index_pair>(indices.data(), input.size(), groups);

auto const d_strings = column_device_view::create(input.parent(), stream);
regex_dispatcher(*d_prog, extract_dispatch_fn{*d_prog}, *d_strings, d_indices, stream);

launch_for_each_kernel(extract_fn{*d_strings, d_indices}, *d_prog, input.size(), stream);

// build a result column for each group
std::vector<std::unique_ptr<column>> results(groups);
Expand Down
Loading

0 comments on commit de0f7e0

Please sign in to comment.