-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Split out strings/replace.cu and rework its gtests #15054
Split out strings/replace.cu and rework its gtests #15054
Conversation
namespace cudf { | ||
namespace strings { | ||
namespace detail { | ||
namespace { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Code moved from
cudf/cpp/src/strings/replace/replace.cu
Lines 641 to 707 in 3ba63c3
namespace { | |
/** | |
* @brief Function logic for the replace_slice API. | |
* | |
* This will perform a replace_slice operation on each string. | |
*/ | |
struct replace_slice_fn { | |
column_device_view const d_strings; | |
string_view const d_repl; | |
size_type const start; | |
size_type const stop; | |
int32_t* d_offsets{}; | |
char* d_chars{}; | |
__device__ void operator()(size_type idx) | |
{ | |
if (d_strings.is_null(idx)) { | |
if (!d_chars) d_offsets[idx] = 0; | |
return; | |
} | |
auto const d_str = d_strings.element<string_view>(idx); | |
auto const length = d_str.length(); | |
char const* in_ptr = d_str.data(); | |
auto const begin = d_str.byte_offset(((start < 0) || (start > length) ? length : start)); | |
auto const end = d_str.byte_offset(((stop < 0) || (stop > length) ? length : stop)); | |
if (d_chars) { | |
char* out_ptr = d_chars + d_offsets[idx]; | |
out_ptr = copy_and_increment(out_ptr, in_ptr, begin); // copy beginning | |
out_ptr = copy_string(out_ptr, d_repl); // insert replacement | |
out_ptr = copy_and_increment(out_ptr, // copy end | |
in_ptr + end, | |
d_str.size_bytes() - end); | |
} else { | |
d_offsets[idx] = d_str.size_bytes() + d_repl.size_bytes() - (end - begin); | |
} | |
} | |
}; | |
} // namespace | |
std::unique_ptr<column> replace_slice(strings_column_view const& strings, | |
string_scalar const& repl, | |
size_type start, | |
size_type stop, | |
rmm::cuda_stream_view stream, | |
rmm::mr::device_memory_resource* mr) | |
{ | |
if (strings.is_empty()) return make_empty_column(type_id::STRING); | |
CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); | |
if (stop > 0) CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop."); | |
string_view d_repl(repl.data(), repl.size()); | |
auto d_strings = column_device_view::create(strings.parent(), stream); | |
// this utility calls the given functor to build the offsets and chars columns | |
auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( | |
replace_slice_fn{*d_strings, d_repl, start, stop}, strings.size(), stream, mr); | |
return make_strings_column(strings.size(), | |
std::move(offsets_column), | |
std::move(chars_column->release().data.release()[0]), | |
strings.null_count(), | |
cudf::detail::copy_bitmask(strings.parent(), stream, mr)); | |
} |
(it will call experimental make-strings-children in a future PR)
namespace strings { | ||
namespace detail { | ||
|
||
std::unique_ptr<column> replace_nulls(strings_column_view const& strings, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Code moved from
cudf/cpp/src/strings/replace/replace.cu
Lines 709 to 748 in 3ba63c3
std::unique_ptr<column> replace_nulls(strings_column_view const& strings, | |
string_scalar const& repl, | |
rmm::cuda_stream_view stream, | |
rmm::mr::device_memory_resource* mr) | |
{ | |
size_type strings_count = strings.size(); | |
if (strings_count == 0) return make_empty_column(type_id::STRING); | |
CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); | |
string_view d_repl(repl.data(), repl.size()); | |
auto strings_column = column_device_view::create(strings.parent(), stream); | |
auto d_strings = *strings_column; | |
// build offsets column | |
auto offsets_transformer_itr = thrust::make_transform_iterator( | |
thrust::make_counting_iterator<int32_t>(0), | |
cuda::proclaim_return_type<size_type>([d_strings, d_repl] __device__(size_type idx) { | |
return d_strings.is_null(idx) ? d_repl.size_bytes() | |
: d_strings.element<string_view>(idx).size_bytes(); | |
})); | |
auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column( | |
offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); | |
auto d_offsets = offsets_column->view().data<int32_t>(); | |
// build chars column | |
rmm::device_uvector<char> chars(bytes, stream, mr); | |
auto d_chars = chars.data(); | |
thrust::for_each_n(rmm::exec_policy(stream), | |
thrust::make_counting_iterator<size_type>(0), | |
strings_count, | |
[d_strings, d_repl, d_offsets, d_chars] __device__(size_type idx) { | |
string_view d_str = d_repl; | |
if (!d_strings.is_null(idx)) d_str = d_strings.element<string_view>(idx); | |
memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes()); | |
}); | |
return make_strings_column( | |
strings_count, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); | |
} |
(and modified to use offsetalator)
size_type stop, | ||
rmm::cuda_stream_view stream, | ||
rmm::mr::device_memory_resource* mr); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved below just for neatness (alphabetical)
Style check fails due to the latest changes of clang-format config. |
/merge |
Description
Splitting out changes in PR #14824 to make it easier to review. The changes here simply move
replace_slice()
andreplace_nulls()
fromreplace.cu
into their own source files.The detail functions have been simplified removing the template argument that was only needed for unit tests. The gtests were reworked to force calling either row-parallel or character-parallel based on the data input instead of being executed directly. This simplified the internal logic which had duplicate parameter checking.
The
cudf::strings::detail::replace_nulls()
is also fixed to use the appropriatemake_offsets_child_column
utitlity.The PR #14824 changes will add large strings support to
cudf::strings::replace()
.Checklist