Extend CUB reduce benchmarks
bernhardmgruber committed Jan 15, 2025
1 parent 048b2bd commit cde1a7e
Showing 5 changed files with 34 additions and 13 deletions.
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/reduce/base.cuh
@@ -103,7 +103,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
});
}

-NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types))
+NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(value_types, offset_types))
.set_name("base")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
9 changes: 8 additions & 1 deletion cub/benchmarks/bench/reduce/max.cu
@@ -25,11 +25,18 @@
*
******************************************************************************/

+// This benchmark uses a custom reduction operation, max_t, which is not known to CUB, so no operator-specific
+// optimizations (e.g. using redux or DPX instructions) are performed. This benchmark covers the unoptimized code path.
+
+// Because CUB cannot detect this operator, we cannot add any tunings based on the results of this benchmark. Its main
+// use is to detect regressions.

#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

-using op_t = max_t;
+using value_types = all_types;
+using op_t = max_t;
#include "base.cuh"
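For context, a minimal sketch (not part of the commit) of the code path this benchmark exercises: a user-defined maximum functor that CUB cannot recognize, passed to cub::DeviceReduce::Reduce. The custom_max functor, sizes, and buffer handling are illustrative stand-ins, not nvbench_helper's actual max_t or the benchmark harness.

```cuda
// Hypothetical sketch: a custom operator forces CUB onto the generic
// (non-redux/DPX) reduction path, which is what this benchmark measures.
#include <cub/device/device_reduce.cuh>

#include <cuda/std/limits>

#include <cstddef>

struct custom_max // illustrative stand-in for nvbench_helper's max_t
{
  __host__ __device__ int operator()(int a, int b) const
  {
    return b > a ? b : a;
  }
};

int main()
{
  constexpr int num_items = 1 << 24;
  int *d_in{}, *d_out{};
  cudaMalloc(&d_in, num_items * sizeof(int));
  cudaMalloc(&d_out, sizeof(int));
  // ... fill d_in ...

  const int init = cuda::std::numeric_limits<int>::lowest();

  // First call queries the temporary storage size, second call runs the reduction.
  void* d_temp{};
  std::size_t temp_bytes = 0;
  cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, num_items, custom_max{}, init);
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, num_items, custom_max{}, init);
  cudaDeviceSynchronize();

  cudaFree(d_temp);
  cudaFree(d_out);
  cudaFree(d_in);
}
```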
16 changes: 12 additions & 4 deletions cub/benchmarks/bench/reduce/min.cu
@@ -24,14 +24,22 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
-// NOTE: this benchmark is intended to cover DPX instructions on Hopper+ architectures.
-// It specifically uses cuda::minimum<> instead of a user-defined operator.
-#define TUNE_T int16_t

+// This benchmark is intended to cover DPX instructions on Hopper+ architectures. It specifically uses cuda::minimum<>
+// instead of a user-defined operator, which CUB recognizes to select an optimized code path.
+
+// Tuning parameters found for ::cuda::minimum<> apply equally to ::cuda::maximum<>.
+// Tuning parameters found for signed integer types apply equally to unsigned integer types.
+// TODO(bgruber): do tuning parameters found for int16_t apply equally to __half or __nv_bfloat16 on SM90+?

+#include <cuda/functional>

#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

-using op_t = ::cuda::minimum<>;
+using value_types = push_back_t<fundamental_types /*, __half, __nv_bfloat16*/>;
+using op_t = ::cuda::minimum<>;
#include "base.cuh"
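As a counterpart, a minimal sketch (again not part of the commit) of the recognized-operator path this benchmark targets; it mirrors the sketch above with only the operator and element type swapped. ::cuda::minimum<> comes from <cuda/functional>, and CUB can detect it and, on Hopper-class GPUs, may lower the reduction to DPX min instructions; buffer setup and sizes are assumptions for illustration.

```cuda
// Hypothetical sketch: a libcu++ operator that CUB recognizes, enabling the
// optimized reduction path (e.g. DPX min for 16-bit integers on SM90+).
#include <cub/device/device_reduce.cuh>

#include <cuda/functional>
#include <cuda/std/limits>

#include <cstddef>
#include <cstdint>

int main()
{
  constexpr int num_items = 1 << 24;
  std::int16_t *d_in{}, *d_out{};
  cudaMalloc(&d_in, num_items * sizeof(std::int16_t));
  cudaMalloc(&d_out, sizeof(std::int16_t));
  // ... fill d_in ...

  const auto init = cuda::std::numeric_limits<std::int16_t>::max();

  void* d_temp{};
  std::size_t temp_bytes = 0;
  cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, num_items, ::cuda::minimum<>{}, init);
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, num_items, ::cuda::minimum<>{}, init);
  cudaDeviceSynchronize();

  cudaFree(d_temp);
  cudaFree(d_out);
  cudaFree(d_in);
}
```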
8 changes: 7 additions & 1 deletion cub/benchmarks/bench/reduce/sum.cu
@@ -25,11 +25,17 @@
*
******************************************************************************/

+// This benchmark is intended to cover redux instructions on Ampere+ architectures. It specifically uses
+// cuda::std::plus<> instead of a user-defined operator, which CUB recognizes to select an optimized code path.
+
+// Tuning parameters found for signed integer types apply equally to unsigned integer types.

#include <nvbench_helper.cuh>

// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1

-using op_t = ::cuda::std::plus<>;
+using value_types = push_back_t<all_types /*, __half, __nv_bfloat16*/>;
+using op_t = ::cuda::std::plus<>;
#include "base.cuh"
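A minimal sketch (not part of the commit) of the sum case: cub::DeviceReduce::Sum performs a plus-reduction much like the cuda::std::plus<> operator this benchmark configures, and for 32-bit integers CUB can use redux instructions on Ampere and newer GPUs. Sizes and buffer handling are illustrative assumptions.

```cuda
// Hypothetical sketch: a plus-reduction over 32-bit integers, the case where
// CUB can emit redux.sync-based reductions on SM80+.
#include <cub/device/device_reduce.cuh>

#include <cstddef>
#include <cstdint>

int main()
{
  constexpr int num_items = 1 << 24;
  std::int32_t *d_in{}, *d_out{};
  cudaMalloc(&d_in, num_items * sizeof(std::int32_t));
  cudaMalloc(&d_out, sizeof(std::int32_t));
  // ... fill d_in ...

  void* d_temp{};
  std::size_t temp_bytes = 0;
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, num_items);
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, num_items);
  cudaDeviceSynchronize();

  cudaFree(d_temp);
  cudaFree(d_out);
  cudaFree(d_in);
}
```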
12 changes: 6 additions & 6 deletions cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh
@@ -52,20 +52,20 @@ struct nvbench::type_strings<::cuda::std::integral_constant<T, I>>
namespace detail
{

-template <class T, class List>
+template <class List, class... Ts>
struct push_back
{};

-template <class T, class... As>
-struct push_back<T, nvbench::type_list<As...>>
+template <class... As, class... Ts>
+struct push_back<nvbench::type_list<As...>, Ts...>
{
-using type = nvbench::type_list<As..., T>;
+using type = nvbench::type_list<As..., Ts...>;
};

} // namespace detail

-template <class T, class List>
-using push_back_t = typename detail::push_back<T, List>::type;
+template <class List, class... Ts>
+using push_back_t = typename detail::push_back<List, Ts...>::type;

#ifdef TUNE_OffsetT
using offset_types = nvbench::type_list<TUNE_OffsetT>;
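To illustrate the nvbench_helper change above: push_back_t is reworked from appending exactly one type to appending an arbitrary pack of types to an nvbench::type_list, which is what lets min.cu and sum.cu write push_back_t<fundamental_types /*, __half, __nv_bfloat16*/>. A small compile-time sketch, assuming the benchmark's include paths so that <nvbench_helper.cuh> resolves:

```cuda
// Hypothetical compile-time check of the variadic push_back_t (not part of the commit).
#include <nvbench_helper.cuh>

#include <cstdint>
#include <type_traits>

using base_list = nvbench::type_list<std::int8_t, std::int16_t>;

// Before this commit: push_back_t<T, List> appended exactly one type T.
// After this commit:  push_back_t<List, Ts...> appends zero or more types.
using extended = push_back_t<base_list, float, double>;

static_assert(std::is_same_v<extended, nvbench::type_list<std::int8_t, std::int16_t, float, double>>);
```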
