From 76b9f4b4e08c5acf9376c7cb8ce48bd6c59baf7c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 23 Nov 2022 15:27:46 -0500 Subject: [PATCH 1/4] Rework cudf::strings::pad and zfill to use make_strings_children --- cpp/src/strings/padding.cu | 195 ++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 102 deletions(-) diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 3f2012ceb15..d3d8a8a0a59 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -16,9 +16,6 @@ #include #include -#include -#include -#include #include #include #include @@ -29,106 +26,123 @@ #include #include -#include - -#include -#include -#include namespace cudf { namespace strings { namespace detail { namespace { -struct compute_pad_output_length_fn { - column_device_view d_strings; +/** + * @brief Base class for pad_fn and zfill_fn functors + * + * This handles the output size calculation while delegating the + * pad operation to Derived. + * + * @tparam Derived class uses the CRTP pattern to reuse code logic. + * The derived class must include a `pad(string_view,char*)` + * member function. + */ +template +struct base_fn { + column_device_view const d_column; size_type width; size_type fill_char_size; + offset_type* d_offsets{}; + char* d_chars{}; - __device__ size_type operator()(size_type idx) + base_fn(column_device_view const& d_column, size_type width, size_type fill_char_size) + : d_column(d_column), width(width), fill_char_size(fill_char_size) { - if (d_strings.is_null(idx)) return 0; - string_view d_str = d_strings.element(idx); - return compute_padded_size(d_str, width, fill_char_size); + } + + __device__ void operator()(size_type idx) + { + if (d_column.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + } + + auto const d_str = d_column.element(idx); + auto& derived = static_cast(*this); + if (d_chars) { + derived.pad(d_str, d_chars + d_offsets[idx]); + } else { + d_offsets[idx] = compute_padded_size(d_str, width, fill_char_size); + } + }; +}; + +/** + * @brief Pads each string to specified width + * + * @tparam side Side of the string to pad + */ +template +struct pad_fn : base_fn> { + using Base = base_fn>; + + cudf::char_utf8 d_fill_char; + + pad_fn(column_device_view const& d_column, + size_type width, + size_type fill_char_size, + char_utf8 fill_char) + : Base(d_column, width, fill_char_size), d_fill_char(fill_char) + { + } + + __device__ void pad(string_view d_str, char* output) + { + pad_impl(d_str, Base::width, d_fill_char, output); } }; } // namespace -std::unique_ptr pad(strings_column_view const& strings, +std::unique_ptr pad(strings_column_view const& input, size_type width, side_type side, std::string_view fill_char, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); + if (input.is_empty()) return make_empty_column(type_id::STRING); CUDF_EXPECTS(!fill_char.empty(), "fill_char parameter must not be empty"); char_utf8 d_fill_char = 0; size_type fill_char_size = to_char_utf8(fill_char.data(), d_fill_char); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; - - // create null_mask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - - // build offsets column - auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator( - 0, compute_pad_output_length_fn{d_strings, width, fill_char_size}); - auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_offsets = offsets_column->view().data(); - - // build chars column - auto const bytes = - cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); - - if (side == side_type::LEFT) { - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - [d_strings, width, d_fill_char, d_offsets, d_chars] __device__(size_type idx) { - if (d_strings.is_valid(idx)) { - pad_impl( - d_strings.element(idx), width, d_fill_char, d_chars + d_offsets[idx]); - } - }); - } else if (side == side_type::RIGHT) { - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - [d_strings, width, d_fill_char, d_offsets, d_chars] __device__(size_type idx) { - if (d_strings.is_valid(idx)) { - pad_impl( - d_strings.element(idx), width, d_fill_char, d_chars + d_offsets[idx]); - } - }); - } else if (side == side_type::BOTH) { - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - [d_strings, width, d_fill_char, d_offsets, d_chars] __device__(size_type idx) { - if (d_strings.is_valid(idx)) { - pad_impl( - d_strings.element(idx), width, d_fill_char, d_chars + d_offsets[idx]); - } - }); - } + auto d_strings = column_device_view::create(input.parent(), stream); - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), - strings.null_count(), - std::move(null_mask)); + auto children = [&] { + if (side == side_type::LEFT) { + auto fn = pad_fn{*d_strings, width, fill_char_size, d_fill_char}; + return make_strings_children(fn, input.size(), stream, mr); + } else if (side == side_type::RIGHT) { + auto fn = pad_fn{*d_strings, width, fill_char_size, d_fill_char}; + return make_strings_children(fn, input.size(), stream, mr); + } + auto fn = pad_fn{*d_strings, width, fill_char_size, d_fill_char}; + return make_strings_children(fn, input.size(), stream, mr); + }(); + + return make_strings_column(input.size(), + std::move(children.first), + std::move(children.second), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } +namespace { + +/** + * @brief Zero-fill each string to specified width + */ +struct zfill_fn : base_fn { + zfill_fn(column_device_view const& d_column, size_type width) : base_fn(d_column, width, 1) {} + + __device__ void pad(string_view d_str, char* output) { zfill_impl(d_str, width, output); } +}; +} // namespace + std::unique_ptr zfill(strings_column_view const& input, size_type width, rmm::cuda_stream_view stream, @@ -136,35 +150,12 @@ std::unique_ptr zfill(strings_column_view const& input, { if (input.is_empty()) return make_empty_column(type_id::STRING); - auto strings_column = column_device_view::create(input.parent(), stream); - auto d_strings = *strings_column; - - // build offsets column - auto offsets_transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - compute_pad_output_length_fn{d_strings, width, 1}); // fillchar is 1 byte - auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + input.size(), stream, mr); - auto const d_offsets = offsets_column->view().data(); - - // build chars column - auto const bytes = cudf::detail::get_value(offsets_column->view(), input.size(), stream); - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); - - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - input.size(), - [d_strings, width, d_offsets, d_chars] __device__(size_type idx) { - if (d_strings.is_valid(idx)) { - zfill_impl( - d_strings.element(idx), width, d_chars + d_offsets[idx]); - } - }); + auto d_strings = column_device_view::create(input.parent(), stream); + auto children = make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(offsets_column), - std::move(chars_column), + std::move(children.first), + std::move(children.second), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } From e36e71f1656fda27c605b22e2f07ca2b6e7af6c9 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 28 Nov 2022 10:04:15 -0500 Subject: [PATCH 2/4] add const decls --- cpp/src/strings/padding.cu | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index d3d8a8a0a59..8470dd9c23a 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -38,15 +38,14 @@ namespace { * This handles the output size calculation while delegating the * pad operation to Derived. * - * @tparam Derived class uses the CRTP pattern to reuse code logic. - * The derived class must include a `pad(string_view,char*)` - * member function. + * @tparam Derived class uses the CRTP pattern to reuse code logic + * and must include a `pad(string_view,char*)` member function. */ template struct base_fn { column_device_view const d_column; - size_type width; - size_type fill_char_size; + size_type const width; + size_type const fill_char_size; offset_type* d_offsets{}; char* d_chars{}; @@ -55,14 +54,14 @@ struct base_fn { { } - __device__ void operator()(size_type idx) + __device__ void operator()(size_type idx) const { if (d_column.is_null(idx)) { if (!d_chars) d_offsets[idx] = 0; } - auto const d_str = d_column.element(idx); - auto& derived = static_cast(*this); + auto const d_str = d_column.element(idx); + auto const& derived = static_cast(*this); if (d_chars) { derived.pad(d_str, d_chars + d_offsets[idx]); } else { @@ -80,7 +79,7 @@ template struct pad_fn : base_fn> { using Base = base_fn>; - cudf::char_utf8 d_fill_char; + cudf::char_utf8 const d_fill_char; pad_fn(column_device_view const& d_column, size_type width, @@ -90,7 +89,7 @@ struct pad_fn : base_fn> { { } - __device__ void pad(string_view d_str, char* output) + __device__ void pad(string_view d_str, char* output) const { pad_impl(d_str, Base::width, d_fill_char, output); } @@ -139,7 +138,7 @@ namespace { struct zfill_fn : base_fn { zfill_fn(column_device_view const& d_column, size_type width) : base_fn(d_column, width, 1) {} - __device__ void pad(string_view d_str, char* output) { zfill_impl(d_str, width, output); } + __device__ void pad(string_view d_str, char* output) const { zfill_impl(d_str, width, output); } }; } // namespace From 6baaf068ef596658222d69dfb4589e95615f1b33 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 29 Nov 2022 10:02:35 -0500 Subject: [PATCH 3/4] update fill-char-size decl --- cpp/src/strings/padding.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 8470dd9c23a..27afb399a6b 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -106,8 +106,8 @@ std::unique_ptr pad(strings_column_view const& input, { if (input.is_empty()) return make_empty_column(type_id::STRING); CUDF_EXPECTS(!fill_char.empty(), "fill_char parameter must not be empty"); - char_utf8 d_fill_char = 0; - size_type fill_char_size = to_char_utf8(fill_char.data(), d_fill_char); + auto d_fill_char = char_utf8{0}; + auto const fill_char_size = to_char_utf8(fill_char.data(), d_fill_char); auto d_strings = column_device_view::create(input.parent(), stream); From f908bb10fac9edc24c95c0660f1490eb9eb1f1a1 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 29 Nov 2022 13:39:19 -0500 Subject: [PATCH 4/4] add missing return; --- cpp/src/strings/padding.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 27afb399a6b..da6d01c92dc 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -58,6 +58,7 @@ struct base_fn { { if (d_column.is_null(idx)) { if (!d_chars) d_offsets[idx] = 0; + return; } auto const d_str = d_column.element(idx);