Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split out strings/replace.cu and rework its gtests #15054

Merged
merged 6 commits into from
Feb 22, 2024

Conversation

davidwendt
Copy link
Contributor

@davidwendt davidwendt commented Feb 14, 2024

Description

Splitting out changes in PR #14824 to make it easier to review. The changes here simply move replace_slice() and replace_nulls() from replace.cu into their own source files.

The detail functions have been simplified removing the template argument that was only needed for unit tests. The gtests were reworked to force calling either row-parallel or character-parallel based on the data input instead of being executed directly. This simplified the internal logic which had duplicate parameter checking.

The cudf::strings::detail::replace_nulls() is also fixed to use the appropriate make_offsets_child_column utitlity.
The PR #14824 changes will add large strings support to cudf::strings::replace().

Checklist

  • I am familiar with the Contributing Guidelines.
  • New or existing tests cover these changes.
  • The documentation is up to date with these changes.

@davidwendt davidwendt added 2 - In Progress Currently a work in progress libcudf Affects libcudf (C++/CUDA) code. strings strings issues (C++ and Python) improvement Improvement / enhancement to an existing function non-breaking Non-breaking change labels Feb 14, 2024
@davidwendt davidwendt self-assigned this Feb 14, 2024
@github-actions github-actions bot added the CMake CMake build issue label Feb 14, 2024
namespace cudf {
namespace strings {
namespace detail {
namespace {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code moved from

namespace {
/**
* @brief Function logic for the replace_slice API.
*
* This will perform a replace_slice operation on each string.
*/
struct replace_slice_fn {
column_device_view const d_strings;
string_view const d_repl;
size_type const start;
size_type const stop;
int32_t* d_offsets{};
char* d_chars{};
__device__ void operator()(size_type idx)
{
if (d_strings.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
return;
}
auto const d_str = d_strings.element<string_view>(idx);
auto const length = d_str.length();
char const* in_ptr = d_str.data();
auto const begin = d_str.byte_offset(((start < 0) || (start > length) ? length : start));
auto const end = d_str.byte_offset(((stop < 0) || (stop > length) ? length : stop));
if (d_chars) {
char* out_ptr = d_chars + d_offsets[idx];
out_ptr = copy_and_increment(out_ptr, in_ptr, begin); // copy beginning
out_ptr = copy_string(out_ptr, d_repl); // insert replacement
out_ptr = copy_and_increment(out_ptr, // copy end
in_ptr + end,
d_str.size_bytes() - end);
} else {
d_offsets[idx] = d_str.size_bytes() + d_repl.size_bytes() - (end - begin);
}
}
};
} // namespace
std::unique_ptr<column> replace_slice(strings_column_view const& strings,
string_scalar const& repl,
size_type start,
size_type stop,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
if (strings.is_empty()) return make_empty_column(type_id::STRING);
CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid.");
if (stop > 0) CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop.");
string_view d_repl(repl.data(), repl.size());
auto d_strings = column_device_view::create(strings.parent(), stream);
// this utility calls the given functor to build the offsets and chars columns
auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children(
replace_slice_fn{*d_strings, d_repl, start, stop}, strings.size(), stream, mr);
return make_strings_column(strings.size(),
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
strings.null_count(),
cudf::detail::copy_bitmask(strings.parent(), stream, mr));
}

(it will call experimental make-strings-children in a future PR)

namespace strings {
namespace detail {

std::unique_ptr<column> replace_nulls(strings_column_view const& strings,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code moved from

std::unique_ptr<column> replace_nulls(strings_column_view const& strings,
string_scalar const& repl,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
size_type strings_count = strings.size();
if (strings_count == 0) return make_empty_column(type_id::STRING);
CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid.");
string_view d_repl(repl.data(), repl.size());
auto strings_column = column_device_view::create(strings.parent(), stream);
auto d_strings = *strings_column;
// build offsets column
auto offsets_transformer_itr = thrust::make_transform_iterator(
thrust::make_counting_iterator<int32_t>(0),
cuda::proclaim_return_type<size_type>([d_strings, d_repl] __device__(size_type idx) {
return d_strings.is_null(idx) ? d_repl.size_bytes()
: d_strings.element<string_view>(idx).size_bytes();
}));
auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
auto d_offsets = offsets_column->view().data<int32_t>();
// build chars column
rmm::device_uvector<char> chars(bytes, stream, mr);
auto d_chars = chars.data();
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
strings_count,
[d_strings, d_repl, d_offsets, d_chars] __device__(size_type idx) {
string_view d_str = d_repl;
if (!d_strings.is_null(idx)) d_str = d_strings.element<string_view>(idx);
memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes());
});
return make_strings_column(
strings_count, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});
}

(and modified to use offsetalator)

size_type stop,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved below just for neatness (alphabetical)

@davidwendt davidwendt marked this pull request as ready for review February 15, 2024 20:26
@davidwendt davidwendt requested review from a team as code owners February 15, 2024 20:26
@davidwendt davidwendt added 3 - Ready for Review Ready for review by team and removed 2 - In Progress Currently a work in progress labels Feb 16, 2024
cpp/include/cudf/strings/detail/replace.hpp Outdated Show resolved Hide resolved
cpp/src/strings/replace/replace_nulls.cu Outdated Show resolved Hide resolved
@ttnghia
Copy link
Contributor

ttnghia commented Feb 21, 2024

Style check fails due to the latest changes of clang-format config.

@davidwendt
Copy link
Contributor Author

/merge

@rapids-bot rapids-bot bot merged commit 6f6e521 into rapidsai:branch-24.04 Feb 22, 2024
69 checks passed
@davidwendt davidwendt deleted the refactor-strings-replace branch February 22, 2024 13:58
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
3 - Ready for Review Ready for review by team CMake CMake build issue improvement Improvement / enhancement to an existing function libcudf Affects libcudf (C++/CUDA) code. non-breaking Non-breaking change strings strings issues (C++ and Python)
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants