Skip to content

Commit

Permalink
Use experimental make_strings_children for capitalize/case/pad functions (#15587)
Browse files Browse the repository at this point in the history

Updates the strings capitalize, case conversion, and pad functions to use the new experimental `make_strings_children`, which supports building large strings.

Reference #15579

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Bradley Dice (https://github.com/bdice)

URL: #15587
  • Loading branch information
davidwendt authored Apr 30, 2024
1 parent 2439dee commit 4da6fda
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 21 deletions.
11 changes: 6 additions & 5 deletions cpp/src/strings/capitalize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/capitalize.hpp>
#include <cudf/strings/detail/char_tables.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/utf8.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -64,8 +64,9 @@ struct base_fn {
character_cases_table_type const* d_case_table;
special_case_mapping const* d_special_case_mapping;
column_device_view const d_column;
size_type* d_offsets{};
size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

base_fn(column_device_view const& d_column)
: d_flags(get_character_flags_table()),
Expand Down Expand Up @@ -108,7 +109,7 @@ struct base_fn {
__device__ void operator()(size_type idx)
{
if (d_column.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}

Expand Down Expand Up @@ -137,7 +138,7 @@ struct base_fn {
// capitalize the next char if this one is a delimiter
capitalize = derived.capitalize_next(chr, flag);
}
if (!d_chars) d_offsets[idx] = bytes;
if (!d_chars) { d_sizes[idx] = bytes; }
}
};

Expand Down Expand Up @@ -231,7 +232,7 @@ std::unique_ptr<column> capitalizer(CapitalFn cfn,
rmm::device_async_resource_ref mr)
{
auto [offsets_column, chars] =
cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr);
cudf::strings::detail::experimental::make_strings_children(cfn, input.size(), stream, mr);

return make_strings_column(input.size(),
std::move(offsets_column),
Expand Down
17 changes: 9 additions & 8 deletions cpp/src/strings/case.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/strings/case.hpp>
#include <cudf/strings/detail/char_tables.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/utf8.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -117,8 +117,9 @@ struct convert_char_fn {
*/
struct base_upper_lower_fn {
convert_char_fn converter;
size_type* d_offsets{};
size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

base_upper_lower_fn(convert_char_fn converter) : converter(converter) {}

Expand All @@ -137,7 +138,7 @@ struct base_upper_lower_fn {
bytes += size;
}
}
if (!d_buffer) { d_offsets[idx] = bytes; }
if (!d_buffer) { d_sizes[idx] = bytes; }
}
};

Expand All @@ -152,7 +153,7 @@ struct upper_lower_fn : public base_upper_lower_fn {
__device__ void operator()(size_type idx) const
{
if (d_strings.is_null(idx)) {
if (!d_chars) { d_offsets[idx] = 0; }
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
auto const d_str = d_strings.element<string_view>(idx);
Expand Down Expand Up @@ -295,8 +296,8 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,

// For smaller strings, use the regular string-parallel algorithm
if ((chars_size / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) {
auto [offsets, chars] =
cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr);
auto [offsets, chars] = cudf::strings::detail::experimental::make_strings_children(
converter, input.size(), stream, mr);
return make_strings_column(input.size(),
std::move(offsets),
chars.release(),
Expand Down Expand Up @@ -364,8 +365,8 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
// run case conversion over the new sub-strings
auto const tmp_size = static_cast<size_type>(tmp_offsets.size()) - 1;
upper_lower_ls_fn sub_conv{ccfn, input_chars, tmp_offsets.data()};
auto chars =
std::get<1>(cudf::strings::detail::make_strings_children(sub_conv, tmp_size, stream, mr));
auto chars = std::get<1>(
cudf::strings::detail::experimental::make_strings_children(sub_conv, tmp_size, stream, mr));

return make_strings_column(input.size(),
std::move(offsets),
Expand Down
17 changes: 9 additions & 8 deletions cpp/src/strings/padding.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/detail/pad_impl.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/padding.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -47,8 +47,9 @@ struct base_fn {
column_device_view const d_column;
size_type const width;
size_type const fill_char_size;
size_type* d_offsets{};
size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

base_fn(column_device_view const& d_column, size_type width, size_type fill_char_size)
: d_column(d_column), width(width), fill_char_size(fill_char_size)
Expand All @@ -58,7 +59,7 @@ struct base_fn {
__device__ void operator()(size_type idx) const
{
if (d_column.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}

Expand All @@ -67,7 +68,7 @@ struct base_fn {
if (d_chars) {
derived.pad(d_str, d_chars + d_offsets[idx]);
} else {
d_offsets[idx] = compute_padded_size(d_str, width, fill_char_size);
d_sizes[idx] = compute_padded_size(d_str, width, fill_char_size);
}
};
};
Expand Down Expand Up @@ -116,13 +117,13 @@ std::unique_ptr<column> pad(strings_column_view const& input,
auto [offsets_column, chars] = [&] {
if (side == side_type::LEFT) {
auto fn = pad_fn<side_type::LEFT>{*d_strings, width, fill_char_size, d_fill_char};
return make_strings_children(fn, input.size(), stream, mr);
return experimental::make_strings_children(fn, input.size(), stream, mr);
} else if (side == side_type::RIGHT) {
auto fn = pad_fn<side_type::RIGHT>{*d_strings, width, fill_char_size, d_fill_char};
return make_strings_children(fn, input.size(), stream, mr);
return experimental::make_strings_children(fn, input.size(), stream, mr);
}
auto fn = pad_fn<side_type::BOTH>{*d_strings, width, fill_char_size, d_fill_char};
return make_strings_children(fn, input.size(), stream, mr);
return experimental::make_strings_children(fn, input.size(), stream, mr);
}();

return make_strings_column(input.size(),
Expand Down Expand Up @@ -153,7 +154,7 @@ std::unique_ptr<column> zfill(strings_column_view const& input,

auto d_strings = column_device_view::create(input.parent(), stream);
auto [offsets_column, chars] =
make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr);
experimental::make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr);

return make_strings_column(input.size(),
std::move(offsets_column),
Expand Down

0 comments on commit 4da6fda

Please sign in to comment.