From ad6901c87d7ca6daf50ee45080cb3066eefef9bf Mon Sep 17 00:00:00 2001 From: David <45795991+davidwendt@users.noreply.github.com> Date: Tue, 16 Feb 2021 19:02:54 -0500 Subject: [PATCH] Refactor libcudf strings::replace to use make_strings_children utility (#7384) Reference #7370 This PR simplifies the current `cudf::strings::replace` (non-regex) functions by refactoring to use the more efficient `make_strings_children` utility. This refactoring improves performance by about 2x on these APIs as measured by the gbenchmark PR #7369.
Baseline gbenchmark for replace-scalar ``` --------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... --------------------------------------------------------------------------------------------------------------------- StringReplaceScalar/replace_scalar/4096/32/manual_time 0.308 ms 0.316 ms 2345 bytes_per_second=224.631M/s StringReplaceScalar/replace_scalar/4096/128/manual_time 1.01 ms 1.03 ms 684 bytes_per_second=269.171M/s StringReplaceScalar/replace_scalar/4096/512/manual_time 7.35 ms 7.38 ms 95 bytes_per_second=149.028M/s StringReplaceScalar/replace_scalar/4096/2048/manual_time 74.1 ms 74.2 ms 9 bytes_per_second=58.9153M/s StringReplaceScalar/replace_scalar/4096/8192/manual_time 1170 ms 1170 ms 1 bytes_per_second=14.8457M/s StringReplaceScalar/replace_scalar/32768/32/manual_time 0.314 ms 0.333 ms 2225 bytes_per_second=1.7147G/s StringReplaceScalar/replace_scalar/32768/128/manual_time 1.16 ms 1.18 ms 604 bytes_per_second=1.83688G/s StringReplaceScalar/replace_scalar/32768/512/manual_time 7.56 ms 7.58 ms 92 bytes_per_second=1.12604G/s StringReplaceScalar/replace_scalar/32768/2048/manual_time 80.8 ms 80.9 ms 9 bytes_per_second=432.314M/s StringReplaceScalar/replace_scalar/32768/8192/manual_time 1526 ms 1521 ms 1 bytes_per_second=91.3563M/s StringReplaceScalar/replace_scalar/262144/32/manual_time 0.430 ms 0.449 ms 1622 bytes_per_second=10.0357G/s StringReplaceScalar/replace_scalar/262144/128/manual_time 1.94 ms 1.96 ms 361 bytes_per_second=8.80298G/s StringReplaceScalar/replace_scalar/262144/512/manual_time 18.1 ms 18.0 ms 39 bytes_per_second=3.77253G/s StringReplaceScalar/replace_scalar/262144/2048/manual_time 227 ms 227 ms 3 bytes_per_second=1.20334G/s StringReplaceScalar/replace_scalar/2097152/32/manual_time 2.48 ms 2.50 ms 282 bytes_per_second=13.9373G/s StringReplaceScalar/replace_scalar/2097152/128/manual_time 11.8 ms 11.9 ms 59 bytes_per_second=11.5245G/s StringReplaceScalar/replace_scalar/2097152/512/manual_time 101 ms 101 ms 7 bytes_per_second=5.42976G/s StringReplaceScalar/replace_scalar/16777216/32/manual_time 22.2 ms 22.2 ms 31 bytes_per_second=12.4258G/s ```
gbenchmark results for refactored replace-scalar ``` --------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... --------------------------------------------------------------------------------------------------------------------- StringReplaceScalar/replace_scalar/4096/32/manual_time 0.144 ms 0.162 ms 4871 bytes_per_second=481.559M/s StringReplaceScalar/replace_scalar/4096/128/manual_time 0.428 ms 0.446 ms 1633 bytes_per_second=634.055M/s StringReplaceScalar/replace_scalar/4096/512/manual_time 2.65 ms 2.67 ms 263 bytes_per_second=413.561M/s StringReplaceScalar/replace_scalar/4096/2048/manual_time 28.8 ms 28.8 ms 24 bytes_per_second=151.733M/s StringReplaceScalar/replace_scalar/4096/8192/manual_time 479 ms 479 ms 2 bytes_per_second=36.2387M/s StringReplaceScalar/replace_scalar/32768/32/manual_time 0.161 ms 0.178 ms 4347 bytes_per_second=3.35237G/s StringReplaceScalar/replace_scalar/32768/128/manual_time 0.466 ms 0.484 ms 1502 bytes_per_second=4.57268G/s StringReplaceScalar/replace_scalar/32768/512/manual_time 2.94 ms 2.96 ms 238 bytes_per_second=2.89405G/s StringReplaceScalar/replace_scalar/32768/2048/manual_time 37.4 ms 37.4 ms 19 bytes_per_second=933.899M/s StringReplaceScalar/replace_scalar/32768/8192/manual_time 567 ms 565 ms 1 bytes_per_second=245.929M/s StringReplaceScalar/replace_scalar/262144/32/manual_time 0.316 ms 0.334 ms 2198 bytes_per_second=13.6601G/s StringReplaceScalar/replace_scalar/262144/128/manual_time 1.39 ms 1.41 ms 498 bytes_per_second=12.237G/s StringReplaceScalar/replace_scalar/262144/512/manual_time 12.8 ms 12.9 ms 54 bytes_per_second=5.30963G/s StringReplaceScalar/replace_scalar/262144/2048/manual_time 157 ms 157 ms 4 bytes_per_second=1.73861G/s StringReplaceScalar/replace_scalar/2097152/32/manual_time 1.84 ms 1.86 ms 379 bytes_per_second=18.7409G/s StringReplaceScalar/replace_scalar/2097152/128/manual_time 9.50 ms 9.52 ms 74 bytes_per_second=14.3717G/s StringReplaceScalar/replace_scalar/2097152/512/manual_time 84.7 ms 84.7 ms 8 bytes_per_second=6.44185G/s StringReplaceScalar/replace_scalar/16777216/32/manual_time 14.0 ms 14.0 ms 50 bytes_per_second=19.6828G/s ```
Improvements for #7370 should base off of these changes. Authors: - David (@davidwendt) Approvers: - Jason Lowe (@jlowe) - @nvdbaranec - Mark Harris (@harrism) URL: https://github.com/rapidsai/cudf/pull/7384 --- cpp/src/strings/replace/replace.cu | 286 ++++++++++++----------------- 1 file changed, 117 insertions(+), 169 deletions(-) diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index daed6008c28..20184af77f6 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,59 +32,51 @@ namespace cudf { namespace strings { namespace detail { namespace { -/** - * @brief Used as template parameter to divide size calculation from - * the actual string operation within a function. - * - * Useful when most of the logic is identical for both passes. - */ -enum class two_pass { - SIZE_ONLY = 0, ///< calculate the size only - EXECUTE_OP ///< run the string operation -}; /** * @brief Function logic for the replace API. * * This will perform a replace operation on each string. */ -template struct replace_fn { column_device_view const d_strings; string_view const d_target; string_view const d_repl; - int32_t max_repl; - const int32_t* d_offsets{}; + int32_t const max_repl; + int32_t* d_offsets{}; char* d_chars{}; - __device__ size_type operator()(size_type idx) + __device__ void operator()(size_type idx) { - if (d_strings.is_null(idx)) return 0; // null string - string_view d_str = d_strings.element(idx); - auto max_n = max_repl; - if (max_n < 0) max_n = d_str.length(); // max possible replacements - char* out_ptr = nullptr; - if (Pass == two_pass::EXECUTE_OP) out_ptr = d_chars + d_offsets[idx]; + if (d_strings.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + auto const d_str = d_strings.element(idx); const char* in_ptr = d_str.data(); - size_type bytes = d_str.size_bytes(); - auto position = d_str.find(d_target); + + char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; + auto max_n = (max_repl < 0) ? d_str.length() : max_repl; + auto bytes = d_str.size_bytes(); + auto position = d_str.find(d_target); + size_type last_pos = 0; while ((position >= 0) && (max_n > 0)) { - if (Pass == two_pass::SIZE_ONLY) - bytes += d_repl.size_bytes() - d_target.size_bytes(); - else // EXECUTE_OP - { - size_type curr_pos = d_str.byte_offset(position); + if (out_ptr) { + auto const curr_pos = d_str.byte_offset(position); out_ptr = copy_and_increment(out_ptr, in_ptr + last_pos, curr_pos - last_pos); // copy left out_ptr = copy_string(out_ptr, d_repl); // copy repl last_pos = curr_pos + d_target.size_bytes(); + } else { + bytes += d_repl.size_bytes() - d_target.size_bytes(); } position = d_str.find(d_target, position + d_target.size_bytes()); --max_n; } - if (Pass == two_pass::EXECUTE_OP) // copy whats left (or right depending on your point of view) + if (out_ptr) // copy whats left (or right depending on your point of view) memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos); - return bytes; + else + d_offsets[idx] = bytes; } }; @@ -98,8 +90,7 @@ std::unique_ptr replace(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(stream, mr); + if (strings.is_empty()) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid."); CUDF_EXPECTS(target.size() > 0, "Parameter target must not be empty string."); @@ -107,35 +98,21 @@ std::unique_ptr replace(strings_column_view const& strings, string_view d_target(target.data(), target.size()); string_view d_repl(repl.data(), repl.size()); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; - - // copy the null mask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - // build offsets column - auto offsets_transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - replace_fn{d_strings, d_target, d_repl, maxrepl}); - auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_offsets = offsets_column->view().data(); + auto d_strings = column_device_view::create(strings.parent(), stream); - // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = - create_chars_child_column(strings_count, strings.null_count(), bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - replace_fn{d_strings, d_target, d_repl, maxrepl, d_offsets, d_chars}); + // this utility calls the given functor to build the offsets and chars columns + auto children = + cudf::strings::detail::make_strings_children(replace_fn{*d_strings, d_target, d_repl, maxrepl}, + strings.size(), + strings.null_count(), + stream, + mr); - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), + return make_strings_column(strings.size(), + std::move(children.first), + std::move(children.second), strings.null_count(), - std::move(null_mask), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), stream, mr); } @@ -146,34 +123,37 @@ namespace { * * This will perform a replace_slice operation on each string. */ -template struct replace_slice_fn { column_device_view const d_strings; string_view const d_repl; - size_type start, stop; - const int32_t* d_offsets{}; + size_type const start; + size_type const stop; + int32_t* d_offsets{}; char* d_chars{}; - __device__ size_type operator()(size_type idx) + __device__ void operator()(size_type idx) { - if (d_strings.is_null(idx)) return 0; // null string - string_view d_str = d_strings.element(idx); - auto length = d_str.length(); - char* out_ptr = nullptr; - if (Pass == two_pass::EXECUTE_OP) out_ptr = d_chars + d_offsets[idx]; - const char* in_ptr = d_str.data(); - size_type bytes = d_str.size_bytes(); - size_type begin = ((start < 0) || (start > length) ? length : start); - size_type end = ((stop < 0) || (stop > length) ? length : stop); - begin = d_str.byte_offset(begin); - end = d_str.byte_offset(end); - bytes += d_repl.size_bytes() - (end - begin); - if (Pass == two_pass::EXECUTE_OP) { - out_ptr = copy_and_increment(out_ptr, in_ptr, begin); - out_ptr = copy_string(out_ptr, d_repl); - out_ptr = copy_and_increment(out_ptr, in_ptr + end, d_str.size_bytes() - end); + if (d_strings.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + auto const d_str = d_strings.element(idx); + auto const length = d_str.length(); + char const* in_ptr = d_str.data(); + auto const begin = d_str.byte_offset(((start < 0) || (start > length) ? length : start)); + auto const end = d_str.byte_offset(((stop < 0) || (stop > length) ? length : stop)); + + if (d_chars) { + char* out_ptr = d_chars + d_offsets[idx]; + + out_ptr = copy_and_increment(out_ptr, in_ptr, begin); // copy beginning + out_ptr = copy_string(out_ptr, d_repl); // insert replacement + out_ptr = copy_and_increment(out_ptr, // copy end + in_ptr + end, + d_str.size_bytes() - end); + } else { + d_offsets[idx] = d_str.size_bytes() + d_repl.size_bytes() - (end - begin); } - return bytes; } }; @@ -186,44 +166,27 @@ std::unique_ptr replace_slice(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(stream, mr); + if (strings.is_empty()) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); if (stop > 0) CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop."); string_view d_repl(repl.data(), repl.size()); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; + auto d_strings = column_device_view::create(strings.parent(), stream); - // copy the null mask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - // build offsets column - auto offsets_transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - replace_slice_fn{d_strings, d_repl, start, stop}); - auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto offsets_view = offsets_column->view(); - auto d_offsets = offsets_view.data(); + // this utility calls the given functor to build the offsets and chars columns + auto children = + cudf::strings::detail::make_strings_children(replace_slice_fn{*d_strings, d_repl, start, stop}, + strings.size(), + strings.null_count(), + stream, + mr); - // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = - create_chars_child_column(strings_count, strings.null_count(), bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - replace_slice_fn{d_strings, d_repl, start, stop, d_offsets, d_chars}); - - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), + return make_strings_column(strings.size(), + std::move(children.first), + std::move(children.second), strings.null_count(), - std::move(null_mask), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), stream, mr); } @@ -234,37 +197,38 @@ namespace { * * This will perform the multi-replace operation on each string. */ -template struct replace_multi_fn { column_device_view const d_strings; column_device_view const d_targets; column_device_view const d_repls; - const int32_t* d_offsets{}; + int32_t* d_offsets{}; char* d_chars{}; - __device__ size_type operator()(size_type idx) + __device__ void operator()(size_type idx) { - if (d_strings.is_null(idx)) return 0; - string_view d_str = d_strings.element(idx); - char* out_ptr = nullptr; - if (Pass == two_pass::EXECUTE_OP) out_ptr = d_chars + d_offsets[idx]; - const char* in_ptr = d_str.data(); - size_type size = d_str.size_bytes(); - size_type bytes = size, spos = 0, lpos = 0; - while (spos < size) { // check each character against each target + if (d_strings.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + auto const d_str = d_strings.element(idx); + char const* in_ptr = d_str.data(); + + size_type bytes = d_str.size_bytes(); + size_type spos = 0; + size_type lpos = 0; + char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; + + // check each character against each target + while (spos < d_str.size_bytes()) { for (int tgt_idx = 0; tgt_idx < d_targets.size(); ++tgt_idx) { - string_view d_tgt = d_targets.element(tgt_idx); - if ((d_tgt.size_bytes() <= (size - spos)) && // check fit - (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // does it match - { // found one - string_view d_repl; - if (d_repls.size() == 1) - d_repl = d_repls.element(0); - else - d_repl = d_repls.element(tgt_idx); - if (Pass == two_pass::SIZE_ONLY) - bytes += d_repl.size_bytes() - d_tgt.size_bytes(); - else { + auto const d_tgt = d_targets.element(tgt_idx); + if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit + (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match + { + auto const d_repl = (d_repls.size() == 1) ? d_repls.element(0) + : d_repls.element(tgt_idx); + bytes += d_repl.size_bytes() - d_tgt.size_bytes(); + if (out_ptr) { out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos); out_ptr = copy_string(out_ptr, d_repl); lpos = spos + d_tgt.size_bytes(); @@ -275,9 +239,10 @@ struct replace_multi_fn { } ++spos; } - if (Pass == two_pass::EXECUTE_OP) // copy remainder - memcpy(out_ptr, in_ptr + lpos, size - lpos); - return bytes; + if (out_ptr) // copy remainder + memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); + else + d_offsets[idx] = bytes; } }; @@ -289,8 +254,7 @@ std::unique_ptr replace(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(stream, mr); + if (strings.is_empty()) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(((targets.size() > 0) && (targets.null_count() == 0)), "Parameters targets must not be empty and must not have nulls"); CUDF_EXPECTS(((repls.size() > 0) && (repls.null_count() == 0)), @@ -298,39 +262,23 @@ std::unique_ptr replace(strings_column_view const& strings, if (repls.size() > 1) CUDF_EXPECTS(repls.size() == targets.size(), "Sizes for targets and repls must match"); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; - auto targets_column = column_device_view::create(targets.parent(), stream); - auto d_targets = *targets_column; - auto repls_column = column_device_view::create(repls.parent(), stream); - auto d_repls = *repls_column; - - // copy the null mask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - // build offsets column - auto offsets_transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - replace_multi_fn{d_strings, d_targets, d_repls}); - auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_offsets = offsets_column->view().data(); - - // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = - create_chars_child_column(strings_count, strings.null_count(), bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - replace_multi_fn{d_strings, d_targets, d_repls, d_offsets, d_chars}); - - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), + auto d_strings = column_device_view::create(strings.parent(), stream); + auto d_targets = column_device_view::create(targets.parent(), stream); + auto d_repls = column_device_view::create(repls.parent(), stream); + + // this utility calls the given functor to build the offsets and chars columns + auto children = + cudf::strings::detail::make_strings_children(replace_multi_fn{*d_strings, *d_targets, *d_repls}, + strings.size(), + strings.null_count(), + stream, + mr); + + return make_strings_column(strings.size(), + std::move(children.first), + std::move(children.second), strings.null_count(), - std::move(null_mask), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), stream, mr); }