From a0fc6a89a596ebae7df436be25aed70ec908f83e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 9 Dec 2024 09:33:08 -0500 Subject: [PATCH] Use cooperative-groups instead of cub warp-reduce for strings contains (#17540) Replaces the `cub::WarpReduce` usage in `cudf::strings::contains` with cooperative-groups `any()`. The change is only for the `contains_warp_parallel` kernel which is used for wider strings. Using cooperative-groups generates more efficient code for the same results and gives an additional 11-14% performance improvement. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/17540 --- cpp/src/strings/search/find.cu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 0f33fcb6fe1..94bc81ec933 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -347,13 +348,15 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, string_view const d_target, bool* d_results) { - auto const idx = cudf::detail::grid_1d::global_thread_id(); - using warp_reduce = cub::WarpReduce; - __shared__ typename warp_reduce::TempStorage temp_storage; + auto const idx = cudf::detail::grid_1d::global_thread_id(); auto const str_idx = idx / cudf::detail::warp_size; if (str_idx >= d_strings.size()) { return; } - auto const lane_idx = idx % cudf::detail::warp_size; + + namespace cg = cooperative_groups; + auto const warp = cg::tiled_partition(cg::this_thread_block()); + auto const lane_idx = warp.thread_rank(); + if (d_strings.is_null(str_idx)) { return; } // get the string for this warp auto const d_str = 
d_strings.element<string_view>(str_idx); @@ -373,7 +376,7 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, } } - auto const result = warp_reduce(temp_storage).Reduce(found, cub::Max()); + auto const result = warp.any(found); if (lane_idx == 0) { d_results[str_idx] = result; } }