Add cub::DeviceTransform benchmark based on BabelStream
bernhardmgruber committed Jul 29, 2024
1 parent d565f9c commit 6efb3d7
Showing 2 changed files with 179 additions and 1 deletion.
2 changes: 1 addition & 1 deletion cub/benchmarks/CMakeLists.txt
@@ -106,7 +106,7 @@ function(add_bench_dir bench_dir)
       add_bench(bench_target ${tuning_name} "${bench_src}")
       # for convenience, make tuning variant buildable by default
       file(WRITE "${tuning_path}" "#pragma once\n#define TUNE_BASE 1\n")
-      target_compile_options(${bench_target} PRIVATE "--extended-lambda -include${tuning_path}")
+      target_compile_options(${bench_target} PRIVATE "--extended-lambda" "-include${tuning_path}")
     else()
       # benchmarking
       register_cccl_benchmark("${bench_name}" "")
178 changes: 178 additions & 0 deletions cub/benchmarks/bench/transform/babelstream.cu
@@ -0,0 +1,178 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

#include <cub/device/device_copy.cuh>

// %RANGE% TUNE_THREADS tpb 128:1024:32
// %RANGE% TUNE_ALGORITHM alg 0:4:1
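// (tuning sweep declarations for the CUB benchmark harness; presumably <macro> <short label> <start>:<end>:<step>)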

#include <cub/device/dispatch/dispatch_transform.cuh>

#include <nvbench_helper.cuh>

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif

# if TUNE_ALGORITHM == 4 && __CUDA_ARCH_LIST__ < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

// When tuning (TUNE_BASE == 0), this policy hub pins the algorithm and block size to the
// TUNE_ALGORITHM / TUNE_THREADS values swept by the %RANGE% lines above.
struct policy_hub_t
{
  struct max_policy : cub::ChainedPolicy<350, max_policy, max_policy>
  {
    static constexpr auto alg_addr_unstable = static_cast<cub::detail::transform::Algorithm>(TUNE_ALGORITHM);
    static constexpr auto alg_addr_stable   = static_cast<cub::detail::transform::Algorithm>(TUNE_ALGORITHM);
    using prefetch_policy = cub::detail::transform::prefetch_policy_t<TUNE_THREADS>;
    using unrolled_policy = cub::detail::transform::unrolled_policy_t<TUNE_THREADS>;
    using ublkcp_policy   = cub::detail::transform::async_copy_policy_t<TUNE_THREADS>;
    static constexpr int min_bif = cub::detail::transform::arch_to_min_bif(__CUDA_ARCH_LIST__);
  };
};
#endif

#ifdef TUNE_T
using element_types = nvbench::type_list<TUNE_T>;
#else
using element_types =
  nvbench::type_list<std::int8_t,
                     std::int16_t,
                     float,
                     double
# ifdef NVBENCH_HELPER_HAS_I128
                     ,
                     __int128
# endif
                     >;
#endif

// BabelStream uses 2^25; an H200 can fit 2^31 int128s
// 2^20 chars / 2^16 int128s saturate V100 (min_bif = 12 * SM count = 80)
// 2^21 chars / 2^17 int128s saturate A100 (min_bif = 16 * SM count = 108)
// 2^23 chars / 2^19 int128s saturate H100/H200 HBM3 (min_bif = 32 or 48 * SM count = 132)
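// (reading of the numbers above, assuming min_bif counts KiB per SM: e.g. V100 needs ~12 KiB * 80 SMs ~= 960 KiB
//  in flight, so the next power of two, 2^20 bytes, i.e. 2^20 chars or 2^16 int128s, saturates it)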
auto array_size_powers = nvbench::range(16, 28, 4);

template <typename OffsetT, typename RandomAccessIteratorsInTuple, typename RandomAccessIteratorOut, typename TransformOp>
CUB_RUNTIME_FUNCTION static void bench_transform(
  nvbench::state& state,
  OffsetT count,
  RandomAccessIteratorsInTuple inputs,
  RandomAccessIteratorOut output,
  TransformOp transform_op)
{
#if !TUNE_BASE
  using policy_t = policy_hub_t;
#else
  using policy_t = cub::detail::transform::policy_hub<
    decltype(cub::detail::transform::iterator_value_tuple(RandomAccessIteratorsInTuple{})),
    cub::detail::value_t<RandomAccessIteratorOut>>;
#endif

  state.exec(nvbench::exec_tag::no_batch, [&](const nvbench::launch& launch) {
    cub::detail::transform::
      dispatch_t<true, OffsetT, RandomAccessIteratorsInTuple, RandomAccessIteratorOut, TransformOp, policy_t>::dispatch(
        count, inputs, output, transform_op, launch.get_stream());
  });
}

// Modified from BabelStream to also work for integers
constexpr auto startA = 1; // BabelStream: 0.1
constexpr auto startB = 2; // BabelStream: 0.2
constexpr auto startC = 3; // BabelStream: 0.1
constexpr auto startScalar = 4; // BabelStream: 0.4
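
// The four BabelStream kernels benchmarked below (with scalar = startScalar):
//   mul:     b[i] = scalar * c[i]
//   add:     c[i] = a[i] + b[i]
//   triad:   a[i] = b[i] + scalar * c[i]
//   nstream: a[i] = a[i] + b[i] + scalar * c[i]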

template <typename T>
static void mul(nvbench::state& state, nvbench::type_list<T>)
{
  const auto n = static_cast<std::size_t>(state.get_int64("Elements{io}"));
  thrust::device_vector<T> b(n, startB);
  thrust::device_vector<T> c(n, startC);

  state.add_element_count(n);
  state.add_global_memory_reads<T>(n);
  state.add_global_memory_writes<T>(n);

  const T scalar = startScalar;
  bench_transform(state, n, ::cuda::std::tuple{c.begin()}, b.begin(), [=] __device__ __host__(const T& ci) {
    return ci * scalar;
  });
}

NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types))
  .set_name("mul")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);

template <typename T>
static void add(nvbench::state& state, nvbench::type_list<T>)
{
  const auto n = static_cast<std::size_t>(state.get_int64("Elements{io}"));
  thrust::device_vector<T> a(n, startA);
  thrust::device_vector<T> b(n, startB);
  thrust::device_vector<T> c(n, startC);

  state.add_element_count(n);
  state.add_global_memory_reads<T>(2 * n);
  state.add_global_memory_writes<T>(n);
  bench_transform(
    state, n, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), [] _CCCL_DEVICE(const T& ai, const T& bi) -> T {
      return ai + bi;
    });
}

NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types))
  .set_name("add")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);

template <typename T>
static void triad(nvbench::state& state, nvbench::type_list<T>)
{
  const auto n = static_cast<std::size_t>(state.get_int64("Elements{io}"));
  thrust::device_vector<T> a(n, startA);
  thrust::device_vector<T> b(n, startB);
  thrust::device_vector<T> c(n, startC);

  state.add_element_count(n);
  state.add_global_memory_reads<T>(2 * n);
  state.add_global_memory_writes<T>(n);
  const T scalar = startScalar;
  bench_transform(
    state, n, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), [=] _CCCL_DEVICE(const T& bi, const T& ci) {
      return bi + scalar * ci;
    });
}

NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types))
  .set_name("triad")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);

template <typename T>
static void nstream(nvbench::state& state, nvbench::type_list<T>)
{
  const auto n = static_cast<std::size_t>(state.get_int64("Elements{io}"));
  thrust::device_vector<T> a(n, startA);
  thrust::device_vector<T> b(n, startB);
  thrust::device_vector<T> c(n, startC);

  state.add_element_count(n);
  state.add_global_memory_reads<T>(3 * n);
  state.add_global_memory_writes<T>(n);
  const T scalar = startScalar;
  bench_transform(
    state,
    n,
    ::cuda::std::tuple{a.begin(), b.begin(), c.begin()},
    a.begin(),
    [=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) {
      return ai + bi + scalar * ci;
    });
}

NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types))
  .set_name("nstream")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", array_size_powers);
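
For context, a minimal sketch of how the public cub::DeviceTransform interface that this benchmark exercises through its dispatch layer could be called directly; the header name, overload, and argument order shown here are assumptions rather than taken from this diff:

// Sketch only: header, overload, and argument order are assumptions.
#include <cub/device/device_transform.cuh>

#include <thrust/device_vector.h>

#include <cuda/std/tuple>

struct triad_op
{
  float scalar;
  __host__ __device__ float operator()(float bi, float ci) const
  {
    return bi + scalar * ci; // same computation as the triad benchmark above
  }
};

void run_triad(thrust::device_vector<float>& a,
               const thrust::device_vector<float>& b,
               const thrust::device_vector<float>& c,
               float scalar,
               cudaStream_t stream)
{
  // a[i] = b[i] + scalar * c[i]
  cub::DeviceTransform::Transform(
    ::cuda::std::make_tuple(b.begin(), c.begin()), a.begin(), static_cast<int>(a.size()), triad_op{scalar}, stream);
}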
