From a4f6c6d8f2df5d0a78c846bb99913f195e91db4e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 12 Oct 2021 20:23:26 -0400 Subject: [PATCH] Use single kernel to extract all groups in cudf::strings::extract (#9358) This is a less ambitious version of #8460, which had to be reverted in #8575 because it did not work with greedy quantifiers. The change here involves calling the underlying `reprog_device::extract` to retrieve each group result within a single kernel rather than launching a kernel for each group. The output is placed contiguously in a 2d span (wrapped uvector) and a permutation iterator is used to build the output columns (one column per group). Like its predecessor, the performance improvement is mostly when specifying more than 1 group in the regex pattern. The benchmark results showed no change for single groups but were 2x faster for multiple groups over long (8K) strings and up to 4x faster for multiple groups over many (16M) strings. The benchmark test for extract was also updated to better report the number of groups being used when measuring results. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mark Harris (https://github.com/harrism) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/9358 --- cpp/benchmarks/string/extract_benchmark.cpp | 11 +- cpp/src/strings/extract.cu | 128 +++++++++++--------- 2 files changed, 78 insertions(+), 61 deletions(-) diff --git a/cpp/benchmarks/string/extract_benchmark.cpp b/cpp/benchmarks/string/extract_benchmark.cpp index 161e30c6f25..7ed083d9571 100644 --- a/cpp/benchmarks/string/extract_benchmark.cpp +++ b/cpp/benchmarks/string/extract_benchmark.cpp @@ -47,8 +47,8 @@ static void BM_extract(benchmark::State& state, int groups) return row; }); - std::string pattern; - while (static_cast(pattern.size()) < groups) { + std::string pattern{""}; + while (groups--) { pattern += "(\\d+) "; } @@ -86,6 +86,7 @@ static void generate_bench_args(benchmark::internal::Benchmark* b) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -STRINGS_BENCHMARK_DEFINE(small, 2) -STRINGS_BENCHMARK_DEFINE(medium, 10) -STRINGS_BENCHMARK_DEFINE(large, 30) +STRINGS_BENCHMARK_DEFINE(one, 1) +STRINGS_BENCHMARK_DEFINE(two, 2) +STRINGS_BENCHMARK_DEFINE(four, 4) +STRINGS_BENCHMARK_DEFINE(eight, 8) diff --git a/cpp/src/strings/extract.cu b/cpp/src/strings/extract.cu index d12f5c534a5..c4076dd61c1 100644 --- a/cpp/src/strings/extract.cu +++ b/cpp/src/strings/extract.cu @@ -19,13 +19,13 @@ #include #include -#include +#include #include -#include -#include +#include #include #include #include +#include #include @@ -47,29 +47,36 @@ using string_index_pair = thrust::pair; template struct extract_fn { reprog_device prog; - column_device_view d_strings; - size_type column_index; + column_device_view const d_strings; + cudf::detail::device_2dspan d_indices; - __device__ string_index_pair operator()(size_type idx) + __device__ void operator()(size_type idx) { - if (d_strings.is_null(idx)) return string_index_pair{nullptr, 0}; - string_view d_str = 
d_strings.element(idx); - string_index_pair result{nullptr, 0}; - int32_t begin = 0; - int32_t end = -1; // handles empty strings automatically - if (prog.find(idx, d_str, begin, end) > 0) { - auto extracted = prog.extract(idx, d_str, begin, end, column_index); - if (extracted) { - auto const offset = d_str.byte_offset(extracted.value().first); - // build index-pair - result = string_index_pair{d_str.data() + offset, - d_str.byte_offset(extracted.value().second) - offset}; + auto const groups = prog.group_counts(); + auto d_output = d_indices[idx]; + + if (d_strings.is_valid(idx)) { + auto const d_str = d_strings.element(idx); + int32_t begin = 0; + int32_t end = -1; // handles empty strings automatically + if (prog.find(idx, d_str, begin, end) > 0) { + for (auto col_idx = 0; col_idx < groups; ++col_idx) { + auto const extracted = prog.extract(idx, d_str, begin, end, col_idx); + d_output[col_idx] = [&] { + if (!extracted) return string_index_pair{nullptr, 0}; + auto const offset = d_str.byte_offset((*extracted).first); + return string_index_pair{d_str.data() + offset, + d_str.byte_offset((*extracted).second) - offset}; + }(); + } + return; } } - return result; + + // if null row or no match found, fill the output with null entries + thrust::fill(thrust::seq, d_output.begin(), d_output.end(), string_index_pair{nullptr, 0}); } }; - } // namespace // @@ -79,9 +86,9 @@ std::unique_ptr extract( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto strings_count = strings.size(); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; + auto const strings_count = strings.size(); + auto const strings_column = column_device_view::create(strings.parent(), stream); + auto const d_strings = *strings_column; // compile regex into device object auto prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); @@ -90,41 +97,50 @@ 
std::unique_ptr
extract( auto const groups = d_prog.group_counts(); CUDF_EXPECTS(groups > 0, "Group indicators not found in regex pattern"); + rmm::device_uvector indices(strings_count * groups, stream); + cudf::detail::device_2dspan d_indices(indices.data(), strings_count, groups); + + auto const regex_insts = d_prog.insts_counts(); + if (regex_insts <= RX_SMALL_INSTS) { + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + strings_count, + extract_fn{d_prog, d_strings, d_indices}); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + strings_count, + extract_fn{d_prog, d_strings, d_indices}); + } else if (regex_insts <= RX_LARGE_INSTS) { + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + strings_count, + extract_fn{d_prog, d_strings, d_indices}); + } else { + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + strings_count, + extract_fn{d_prog, d_strings, d_indices}); + } + // build a result column for each group - std::vector> results; - auto regex_insts = d_prog.insts_counts(); - - for (int32_t column_index = 0; column_index < groups; ++column_index) { - rmm::device_uvector indices(strings_count, stream); - - if (regex_insts <= RX_SMALL_INSTS) { - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - extract_fn{d_prog, d_strings, column_index}); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - extract_fn{d_prog, d_strings, column_index}); - } else if (regex_insts <= RX_LARGE_INSTS) { - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - extract_fn{d_prog, 
d_strings, column_index}); - } else { - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - extract_fn{d_prog, d_strings, column_index}); - } + std::vector> results(groups); + auto make_strings_lambda = [&](size_type column_index) { + // this iterator transposes the extract results into column order + auto indices_itr = + thrust::make_permutation_iterator(indices.begin(), + cudf::detail::make_counting_transform_iterator( + 0, [column_index, groups] __device__(size_type idx) { + return (idx * groups) + column_index; + })); + return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr); + }; + + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(groups), + results.begin(), + make_strings_lambda); - results.emplace_back(make_strings_column(indices, stream, mr)); - } return std::make_unique
(std::move(results)); }