diff --git a/cub/benchmarks/bench/reduce/base.cuh b/cub/benchmarks/bench/reduce/base.cuh index 9de575d0686..579d3757d3c 100644 --- a/cub/benchmarks/bench/reduce/base.cuh +++ b/cub/benchmarks/bench/reduce/base.cuh @@ -103,7 +103,7 @@ void reduce(nvbench::state& state, nvbench::type_list) }); } -NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types)) +NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(value_types, offset_types)) .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); diff --git a/cub/benchmarks/bench/reduce/max.cu b/cub/benchmarks/bench/reduce/custom.cu similarity index 81% rename from cub/benchmarks/bench/reduce/max.cu rename to cub/benchmarks/bench/reduce/custom.cu index 791d5bfe167..d6610c76ba7 100644 --- a/cub/benchmarks/bench/reduce/max.cu +++ b/cub/benchmarks/bench/reduce/custom.cu @@ -25,11 +25,18 @@ * ******************************************************************************/ +// This benchmark uses a custom reduction operation, max_t, which is not known to CUB, so no operator specific +// optimizations (e.g. using redux or DPX instructions) are performed. This benchmark covers the unoptimized code path. + +// Because CUB cannot detect this operator, we cannot add any tunings based on the results of this benchmark. It's main +// use is to detect regressions. + #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 -using op_t = max_t; +using value_types = all_types; +using op_t = max_t; #include "base.cuh" diff --git a/cub/benchmarks/bench/reduce/min.cu b/cub/benchmarks/bench/reduce/min.cu index 177d7628f6f..df77f799347 100644 --- a/cub/benchmarks/bench/reduce/min.cu +++ b/cub/benchmarks/bench/reduce/min.cu @@ -24,14 +24,22 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ -// NOTE: this benchmark is intended to cover DPX instructions on Hopper+ architectures. -// It specifically uses cuda::minimum<> instead of a user-defined operator. -#define TUNE_T int16_t + +// This benchmark is intended to cover DPX instructions on Hopper+ architectures. It specifically uses cuda::minimum<> +// instead of a user-defined operator, which CUB recognizes to select an optimized code path. + +// Tuning parameters found for ::cuda::minimum<> apply equally for ::cuda::maximum<> +// Tuning parameters found for signed integer types apply equally for unsigned integer types +// TODO(bgruber): do tuning parameters found for int16_t apply equally for __half or __nv_bfloat16 on SM90+? + +#include + #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 -using op_t = ::cuda::minimum<>; +using value_types = push_back_t; +using op_t = ::cuda::minimum<>; #include "base.cuh" diff --git a/cub/benchmarks/bench/reduce/sum.cu b/cub/benchmarks/bench/reduce/sum.cu index 4433724f090..76395c0087e 100644 --- a/cub/benchmarks/bench/reduce/sum.cu +++ b/cub/benchmarks/bench/reduce/sum.cu @@ -25,11 +25,17 @@ * ******************************************************************************/ +// This benchmark is intended to cover redux instructions on Ampere+ architectures. It specifically uses +// cuda::std::plus<> instead of a user-defined operator, which CUB recognizes to select an optimized code path. + +// Tuning parameters found for signed integer types apply equally for unsigned integer types + #include // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 -using op_t = ::cuda::std::plus<>; +using value_types = push_back_t; +using op_t = ::cuda::std::plus<>; #include "base.cuh" diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh index 451d289f6c2..8324650d044 100644 --- a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh +++ b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh @@ -52,20 +52,20 @@ struct nvbench::type_strings<::cuda::std::integral_constant> namespace detail { -template +template struct push_back {}; -template -struct push_back> +template +struct push_back, Ts...> { - using type = nvbench::type_list; + using type = nvbench::type_list; }; } // namespace detail -template -using push_back_t = typename detail::push_back::type; +template +using push_back_t = typename detail::push_back::type; #ifdef TUNE_OffsetT using offset_types = nvbench::type_list;