From ad6901c87d7ca6daf50ee45080cb3066eefef9bf Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 16 Feb 2021 19:02:54 -0500
Subject: [PATCH] Refactor libcudf strings::replace to use
make_strings_children utility (#7384)
Reference #7370
This PR simplifies the current `cudf::strings::replace` (non-regex) functions by refactoring to use the more efficient `make_strings_children` utility. This refactoring improves performance by about 2x on these APIs as measured by the gbenchmark PR #7369.
Baseline gbenchmark for replace-scalar
```
---------------------------------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
---------------------------------------------------------------------------------------------------------------------
StringReplaceScalar/replace_scalar/4096/32/manual_time 0.308 ms 0.316 ms 2345 bytes_per_second=224.631M/s
StringReplaceScalar/replace_scalar/4096/128/manual_time 1.01 ms 1.03 ms 684 bytes_per_second=269.171M/s
StringReplaceScalar/replace_scalar/4096/512/manual_time 7.35 ms 7.38 ms 95 bytes_per_second=149.028M/s
StringReplaceScalar/replace_scalar/4096/2048/manual_time 74.1 ms 74.2 ms 9 bytes_per_second=58.9153M/s
StringReplaceScalar/replace_scalar/4096/8192/manual_time 1170 ms 1170 ms 1 bytes_per_second=14.8457M/s
StringReplaceScalar/replace_scalar/32768/32/manual_time 0.314 ms 0.333 ms 2225 bytes_per_second=1.7147G/s
StringReplaceScalar/replace_scalar/32768/128/manual_time 1.16 ms 1.18 ms 604 bytes_per_second=1.83688G/s
StringReplaceScalar/replace_scalar/32768/512/manual_time 7.56 ms 7.58 ms 92 bytes_per_second=1.12604G/s
StringReplaceScalar/replace_scalar/32768/2048/manual_time 80.8 ms 80.9 ms 9 bytes_per_second=432.314M/s
StringReplaceScalar/replace_scalar/32768/8192/manual_time 1526 ms 1521 ms 1 bytes_per_second=91.3563M/s
StringReplaceScalar/replace_scalar/262144/32/manual_time 0.430 ms 0.449 ms 1622 bytes_per_second=10.0357G/s
StringReplaceScalar/replace_scalar/262144/128/manual_time 1.94 ms 1.96 ms 361 bytes_per_second=8.80298G/s
StringReplaceScalar/replace_scalar/262144/512/manual_time 18.1 ms 18.0 ms 39 bytes_per_second=3.77253G/s
StringReplaceScalar/replace_scalar/262144/2048/manual_time 227 ms 227 ms 3 bytes_per_second=1.20334G/s
StringReplaceScalar/replace_scalar/2097152/32/manual_time 2.48 ms 2.50 ms 282 bytes_per_second=13.9373G/s
StringReplaceScalar/replace_scalar/2097152/128/manual_time 11.8 ms 11.9 ms 59 bytes_per_second=11.5245G/s
StringReplaceScalar/replace_scalar/2097152/512/manual_time 101 ms 101 ms 7 bytes_per_second=5.42976G/s
StringReplaceScalar/replace_scalar/16777216/32/manual_time 22.2 ms 22.2 ms 31 bytes_per_second=12.4258G/s
```
gbenchmark results for refactored replace-scalar
```
---------------------------------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
---------------------------------------------------------------------------------------------------------------------
StringReplaceScalar/replace_scalar/4096/32/manual_time 0.144 ms 0.162 ms 4871 bytes_per_second=481.559M/s
StringReplaceScalar/replace_scalar/4096/128/manual_time 0.428 ms 0.446 ms 1633 bytes_per_second=634.055M/s
StringReplaceScalar/replace_scalar/4096/512/manual_time 2.65 ms 2.67 ms 263 bytes_per_second=413.561M/s
StringReplaceScalar/replace_scalar/4096/2048/manual_time 28.8 ms 28.8 ms 24 bytes_per_second=151.733M/s
StringReplaceScalar/replace_scalar/4096/8192/manual_time 479 ms 479 ms 2 bytes_per_second=36.2387M/s
StringReplaceScalar/replace_scalar/32768/32/manual_time 0.161 ms 0.178 ms 4347 bytes_per_second=3.35237G/s
StringReplaceScalar/replace_scalar/32768/128/manual_time 0.466 ms 0.484 ms 1502 bytes_per_second=4.57268G/s
StringReplaceScalar/replace_scalar/32768/512/manual_time 2.94 ms 2.96 ms 238 bytes_per_second=2.89405G/s
StringReplaceScalar/replace_scalar/32768/2048/manual_time 37.4 ms 37.4 ms 19 bytes_per_second=933.899M/s
StringReplaceScalar/replace_scalar/32768/8192/manual_time 567 ms 565 ms 1 bytes_per_second=245.929M/s
StringReplaceScalar/replace_scalar/262144/32/manual_time 0.316 ms 0.334 ms 2198 bytes_per_second=13.6601G/s
StringReplaceScalar/replace_scalar/262144/128/manual_time 1.39 ms 1.41 ms 498 bytes_per_second=12.237G/s
StringReplaceScalar/replace_scalar/262144/512/manual_time 12.8 ms 12.9 ms 54 bytes_per_second=5.30963G/s
StringReplaceScalar/replace_scalar/262144/2048/manual_time 157 ms 157 ms 4 bytes_per_second=1.73861G/s
StringReplaceScalar/replace_scalar/2097152/32/manual_time 1.84 ms 1.86 ms 379 bytes_per_second=18.7409G/s
StringReplaceScalar/replace_scalar/2097152/128/manual_time 9.50 ms 9.52 ms 74 bytes_per_second=14.3717G/s
StringReplaceScalar/replace_scalar/2097152/512/manual_time 84.7 ms 84.7 ms 8 bytes_per_second=6.44185G/s
StringReplaceScalar/replace_scalar/16777216/32/manual_time 14.0 ms 14.0 ms 50 bytes_per_second=19.6828G/s
```
Improvements for #7370 should base off of these changes.
Authors:
- David (@davidwendt)
Approvers:
- Jason Lowe (@jlowe)
- @nvdbaranec
- Mark Harris (@harrism)
URL: https://github.com/rapidsai/cudf/pull/7384
---
cpp/src/strings/replace/replace.cu | 286 ++++++++++++-----------------
1 file changed, 117 insertions(+), 169 deletions(-)
diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu
index daed6008c28..20184af77f6 100644
--- a/cpp/src/strings/replace/replace.cu
+++ b/cpp/src/strings/replace/replace.cu
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -32,59 +32,51 @@ namespace cudf {
namespace strings {
namespace detail {
namespace {
-/**
- * @brief Used as template parameter to divide size calculation from
- * the actual string operation within a function.
- *
- * Useful when most of the logic is identical for both passes.
- */
-enum class two_pass {
- SIZE_ONLY = 0, ///< calculate the size only
- EXECUTE_OP ///< run the string operation
-};
/**
* @brief Function logic for the replace API.
*
* This will perform a replace operation on each string.
*/
-template
struct replace_fn {
column_device_view const d_strings;
string_view const d_target;
string_view const d_repl;
- int32_t max_repl;
- const int32_t* d_offsets{};
+ int32_t const max_repl;
+ int32_t* d_offsets{};
char* d_chars{};
- __device__ size_type operator()(size_type idx)
+ __device__ void operator()(size_type idx)
{
- if (d_strings.is_null(idx)) return 0; // null string
- string_view d_str = d_strings.element(idx);
- auto max_n = max_repl;
- if (max_n < 0) max_n = d_str.length(); // max possible replacements
- char* out_ptr = nullptr;
- if (Pass == two_pass::EXECUTE_OP) out_ptr = d_chars + d_offsets[idx];
+ if (d_strings.is_null(idx)) {
+ if (!d_chars) d_offsets[idx] = 0;
+ return;
+ }
+ auto const d_str = d_strings.element(idx);
const char* in_ptr = d_str.data();
- size_type bytes = d_str.size_bytes();
- auto position = d_str.find(d_target);
+
+ char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr;
+ auto max_n = (max_repl < 0) ? d_str.length() : max_repl;
+ auto bytes = d_str.size_bytes();
+ auto position = d_str.find(d_target);
+
size_type last_pos = 0;
while ((position >= 0) && (max_n > 0)) {
- if (Pass == two_pass::SIZE_ONLY)
- bytes += d_repl.size_bytes() - d_target.size_bytes();
- else // EXECUTE_OP
- {
- size_type curr_pos = d_str.byte_offset(position);
+ if (out_ptr) {
+ auto const curr_pos = d_str.byte_offset(position);
out_ptr = copy_and_increment(out_ptr, in_ptr + last_pos, curr_pos - last_pos); // copy left
out_ptr = copy_string(out_ptr, d_repl); // copy repl
last_pos = curr_pos + d_target.size_bytes();
+ } else {
+ bytes += d_repl.size_bytes() - d_target.size_bytes();
}
position = d_str.find(d_target, position + d_target.size_bytes());
--max_n;
}
- if (Pass == two_pass::EXECUTE_OP) // copy whats left (or right depending on your point of view)
+ if (out_ptr) // copy whats left (or right depending on your point of view)
memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos);
- return bytes;
+ else
+ d_offsets[idx] = bytes;
}
};
@@ -98,8 +90,7 @@ std::unique_ptr replace(strings_column_view const& strings,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
- size_type strings_count = strings.size();
- if (strings_count == 0) return make_empty_strings_column(stream, mr);
+ if (strings.is_empty()) return make_empty_strings_column(stream, mr);
CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid.");
CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid.");
CUDF_EXPECTS(target.size() > 0, "Parameter target must not be empty string.");
@@ -107,35 +98,21 @@ std::unique_ptr replace(strings_column_view const& strings,
string_view d_target(target.data(), target.size());
string_view d_repl(repl.data(), repl.size());
- auto strings_column = column_device_view::create(strings.parent(), stream);
- auto d_strings = *strings_column;
-
- // copy the null mask
- rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
- // build offsets column
- auto offsets_transformer_itr = thrust::make_transform_iterator(
- thrust::make_counting_iterator(0),
- replace_fn{d_strings, d_target, d_repl, maxrepl});
- auto offsets_column = make_offsets_child_column(
- offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
- auto d_offsets = offsets_column->view().data();
+ auto d_strings = column_device_view::create(strings.parent(), stream);
- // build chars column
- size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count];
- auto chars_column =
- create_chars_child_column(strings_count, strings.null_count(), bytes, stream, mr);
- auto d_chars = chars_column->mutable_view().data();
- thrust::for_each_n(
- rmm::exec_policy(stream),
- thrust::make_counting_iterator(0),
- strings_count,
- replace_fn{d_strings, d_target, d_repl, maxrepl, d_offsets, d_chars});
+ // this utility calls the given functor to build the offsets and chars columns
+ auto children =
+ cudf::strings::detail::make_strings_children(replace_fn{*d_strings, d_target, d_repl, maxrepl},
+ strings.size(),
+ strings.null_count(),
+ stream,
+ mr);
- return make_strings_column(strings_count,
- std::move(offsets_column),
- std::move(chars_column),
+ return make_strings_column(strings.size(),
+ std::move(children.first),
+ std::move(children.second),
strings.null_count(),
- std::move(null_mask),
+ cudf::detail::copy_bitmask(strings.parent(), stream, mr),
stream,
mr);
}
@@ -146,34 +123,37 @@ namespace {
*
* This will perform a replace_slice operation on each string.
*/
-template
struct replace_slice_fn {
column_device_view const d_strings;
string_view const d_repl;
- size_type start, stop;
- const int32_t* d_offsets{};
+ size_type const start;
+ size_type const stop;
+ int32_t* d_offsets{};
char* d_chars{};
- __device__ size_type operator()(size_type idx)
+ __device__ void operator()(size_type idx)
{
- if (d_strings.is_null(idx)) return 0; // null string
- string_view d_str = d_strings.element(idx);
- auto length = d_str.length();
- char* out_ptr = nullptr;
- if (Pass == two_pass::EXECUTE_OP) out_ptr = d_chars + d_offsets[idx];
- const char* in_ptr = d_str.data();
- size_type bytes = d_str.size_bytes();
- size_type begin = ((start < 0) || (start > length) ? length : start);
- size_type end = ((stop < 0) || (stop > length) ? length : stop);
- begin = d_str.byte_offset(begin);
- end = d_str.byte_offset(end);
- bytes += d_repl.size_bytes() - (end - begin);
- if (Pass == two_pass::EXECUTE_OP) {
- out_ptr = copy_and_increment(out_ptr, in_ptr, begin);
- out_ptr = copy_string(out_ptr, d_repl);
- out_ptr = copy_and_increment(out_ptr, in_ptr + end, d_str.size_bytes() - end);
+ if (d_strings.is_null(idx)) {
+ if (!d_chars) d_offsets[idx] = 0;
+ return;
+ }
+ auto const d_str = d_strings.element(idx);
+ auto const length = d_str.length();
+ char const* in_ptr = d_str.data();
+ auto const begin = d_str.byte_offset(((start < 0) || (start > length) ? length : start));
+ auto const end = d_str.byte_offset(((stop < 0) || (stop > length) ? length : stop));
+
+ if (d_chars) {
+ char* out_ptr = d_chars + d_offsets[idx];
+
+ out_ptr = copy_and_increment(out_ptr, in_ptr, begin); // copy beginning
+ out_ptr = copy_string(out_ptr, d_repl); // insert replacement
+ out_ptr = copy_and_increment(out_ptr, // copy end
+ in_ptr + end,
+ d_str.size_bytes() - end);
+ } else {
+ d_offsets[idx] = d_str.size_bytes() + d_repl.size_bytes() - (end - begin);
}
- return bytes;
}
};
@@ -186,44 +166,27 @@ std::unique_ptr replace_slice(strings_column_view const& strings,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
- size_type strings_count = strings.size();
- if (strings_count == 0) return make_empty_strings_column(stream, mr);
+ if (strings.is_empty()) return make_empty_strings_column(stream, mr);
CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid.");
if (stop > 0) CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop.");
string_view d_repl(repl.data(), repl.size());
- auto strings_column = column_device_view::create(strings.parent(), stream);
- auto d_strings = *strings_column;
+ auto d_strings = column_device_view::create(strings.parent(), stream);
- // copy the null mask
- rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
- // build offsets column
- auto offsets_transformer_itr = thrust::make_transform_iterator(
- thrust::make_counting_iterator(0),
- replace_slice_fn{d_strings, d_repl, start, stop});
- auto offsets_column = make_offsets_child_column(
- offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
- auto offsets_view = offsets_column->view();
- auto d_offsets = offsets_view.data();
+ // this utility calls the given functor to build the offsets and chars columns
+ auto children =
+ cudf::strings::detail::make_strings_children(replace_slice_fn{*d_strings, d_repl, start, stop},
+ strings.size(),
+ strings.null_count(),
+ stream,
+ mr);
- // build chars column
- size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count];
- auto chars_column =
- create_chars_child_column(strings_count, strings.null_count(), bytes, stream, mr);
- auto chars_view = chars_column->mutable_view();
- auto d_chars = chars_view.data();
- thrust::for_each_n(
- rmm::exec_policy(stream),
- thrust::make_counting_iterator(0),
- strings_count,
- replace_slice_fn{d_strings, d_repl, start, stop, d_offsets, d_chars});
-
- return make_strings_column(strings_count,
- std::move(offsets_column),
- std::move(chars_column),
+ return make_strings_column(strings.size(),
+ std::move(children.first),
+ std::move(children.second),
strings.null_count(),
- std::move(null_mask),
+ cudf::detail::copy_bitmask(strings.parent(), stream, mr),
stream,
mr);
}
@@ -234,37 +197,38 @@ namespace {
*
* This will perform the multi-replace operation on each string.
*/
-template
struct replace_multi_fn {
column_device_view const d_strings;
column_device_view const d_targets;
column_device_view const d_repls;
- const int32_t* d_offsets{};
+ int32_t* d_offsets{};
char* d_chars{};
- __device__ size_type operator()(size_type idx)
+ __device__ void operator()(size_type idx)
{
- if (d_strings.is_null(idx)) return 0;
- string_view d_str = d_strings.element(idx);
- char* out_ptr = nullptr;
- if (Pass == two_pass::EXECUTE_OP) out_ptr = d_chars + d_offsets[idx];
- const char* in_ptr = d_str.data();
- size_type size = d_str.size_bytes();
- size_type bytes = size, spos = 0, lpos = 0;
- while (spos < size) { // check each character against each target
+ if (d_strings.is_null(idx)) {
+ if (!d_chars) d_offsets[idx] = 0;
+ return;
+ }
+ auto const d_str = d_strings.element(idx);
+ char const* in_ptr = d_str.data();
+
+ size_type bytes = d_str.size_bytes();
+ size_type spos = 0;
+ size_type lpos = 0;
+ char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr;
+
+ // check each character against each target
+ while (spos < d_str.size_bytes()) {
for (int tgt_idx = 0; tgt_idx < d_targets.size(); ++tgt_idx) {
- string_view d_tgt = d_targets.element(tgt_idx);
- if ((d_tgt.size_bytes() <= (size - spos)) && // check fit
- (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // does it match
- { // found one
- string_view d_repl;
- if (d_repls.size() == 1)
- d_repl = d_repls.element(0);
- else
- d_repl = d_repls.element(tgt_idx);
- if (Pass == two_pass::SIZE_ONLY)
- bytes += d_repl.size_bytes() - d_tgt.size_bytes();
- else {
+ auto const d_tgt = d_targets.element(tgt_idx);
+ if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit
+ (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match
+ {
+ auto const d_repl = (d_repls.size() == 1) ? d_repls.element(0)
+ : d_repls.element(tgt_idx);
+ bytes += d_repl.size_bytes() - d_tgt.size_bytes();
+ if (out_ptr) {
out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos);
out_ptr = copy_string(out_ptr, d_repl);
lpos = spos + d_tgt.size_bytes();
@@ -275,9 +239,10 @@ struct replace_multi_fn {
}
++spos;
}
- if (Pass == two_pass::EXECUTE_OP) // copy remainder
- memcpy(out_ptr, in_ptr + lpos, size - lpos);
- return bytes;
+ if (out_ptr) // copy remainder
+ memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos);
+ else
+ d_offsets[idx] = bytes;
}
};
@@ -289,8 +254,7 @@ std::unique_ptr replace(strings_column_view const& strings,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
- auto strings_count = strings.size();
- if (strings_count == 0) return make_empty_strings_column(stream, mr);
+ if (strings.is_empty()) return make_empty_strings_column(stream, mr);
CUDF_EXPECTS(((targets.size() > 0) && (targets.null_count() == 0)),
"Parameters targets must not be empty and must not have nulls");
CUDF_EXPECTS(((repls.size() > 0) && (repls.null_count() == 0)),
@@ -298,39 +262,23 @@ std::unique_ptr replace(strings_column_view const& strings,
if (repls.size() > 1)
CUDF_EXPECTS(repls.size() == targets.size(), "Sizes for targets and repls must match");
- auto strings_column = column_device_view::create(strings.parent(), stream);
- auto d_strings = *strings_column;
- auto targets_column = column_device_view::create(targets.parent(), stream);
- auto d_targets = *targets_column;
- auto repls_column = column_device_view::create(repls.parent(), stream);
- auto d_repls = *repls_column;
-
- // copy the null mask
- rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
- // build offsets column
- auto offsets_transformer_itr = thrust::make_transform_iterator(
- thrust::make_counting_iterator(0),
- replace_multi_fn{d_strings, d_targets, d_repls});
- auto offsets_column = make_offsets_child_column(
- offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
- auto d_offsets = offsets_column->view().data();
-
- // build chars column
- size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count];
- auto chars_column =
- create_chars_child_column(strings_count, strings.null_count(), bytes, stream, mr);
- auto d_chars = chars_column->mutable_view().data();
- thrust::for_each_n(
- rmm::exec_policy(stream),
- thrust::make_counting_iterator(0),
- strings_count,
- replace_multi_fn{d_strings, d_targets, d_repls, d_offsets, d_chars});
-
- return make_strings_column(strings_count,
- std::move(offsets_column),
- std::move(chars_column),
+ auto d_strings = column_device_view::create(strings.parent(), stream);
+ auto d_targets = column_device_view::create(targets.parent(), stream);
+ auto d_repls = column_device_view::create(repls.parent(), stream);
+
+ // this utility calls the given functor to build the offsets and chars columns
+ auto children =
+ cudf::strings::detail::make_strings_children(replace_multi_fn{*d_strings, *d_targets, *d_repls},
+ strings.size(),
+ strings.null_count(),
+ stream,
+ mr);
+
+ return make_strings_column(strings.size(),
+ std::move(children.first),
+ std::move(children.second),
strings.null_count(),
- std::move(null_mask),
+ cudf::detail::copy_bitmask(strings.parent(), stream, mr),
stream,
mr);
}