-
Notifications
You must be signed in to change notification settings - Fork 180
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add cub::DeviceTransform benchmark based on BabelStream
- Loading branch information
1 parent
29b7cb8
commit 84bb0ad
Showing
2 changed files
with
179 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. | ||
// SPDX-License-Identifier: BSD-3-Clause | ||
|
||
#include <cub/device/device_copy.cuh> | ||
|
||
// %RANGE% TUNE_THREADS tpb 128:1024:32 | ||
// %RANGE% TUNE_ALGORITHM alg 0:4:1 | ||
|
||
#include <cub/device/dispatch/dispatch_transform.cuh> | ||
|
||
#include <nvbench_helper.cuh> | ||
|
||
#if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 | ||
# error "This benchmark does not support being compiled for multiple architectures" | ||
#endif | ||
|
||
#if !TUNE_BASE | ||
# if TUNE_ALGORITHM == 4 && __CUDA_ARCH_LIST__ < 900 | ||
# error "Cannot compile algorithm 4 (ublkcp) below sm90" | ||
# endif | ||
|
||
struct policy_hub_t | ||
{ | ||
struct max_policy : cub::ChainedPolicy<350, max_policy, max_policy> | ||
{ | ||
static constexpr auto alg_addr_unstable = static_cast<cub::detail::transform::Algorithm>(TUNE_ALGORITHM); | ||
static constexpr auto alg_addr_stable = static_cast<cub::detail::transform::Algorithm>(TUNE_ALGORITHM); | ||
using prefetch_policy = cub::detail::transform::prefetch_policy_t<TUNE_THREADS>; | ||
using unrolled_policy = cub::detail::transform::unrolled_policy_t<TUNE_THREADS>; | ||
using ublkcp_policy = cub::detail::transform::async_copy_policy_t<TUNE_THREADS>; | ||
static constexpr int min_bif = cub::detail::transform::arch_to_min_bif(__CUDA_ARCH_LIST__); | ||
}; | ||
}; | ||
#endif | ||
|
||
#ifdef TUNE_T | ||
using element_types = nvbench::type_list<TUNE_T>; | ||
#else | ||
using element_types = | ||
nvbench::type_list<std::int8_t, | ||
std::int16_t, | ||
float, | ||
double | ||
# ifdef NVBENCH_HELPER_HAS_I128 | ||
, | ||
__int128 | ||
# endif | ||
>; | ||
#endif | ||
|
||
// BabelStream uses 2^25, H200 can fit 2^31 int128s | ||
// 2^20 chars / 2^16 int128 saturate V100 (min_bif =12 * SM count =80) | ||
// 2^21 chars / 2^17 int128 saturate A100 (min_bif =16 * SM count =108) | ||
// 2^23 chars / 2^19 int128 saturate H100/H200 HBM3 (min_bif =32or48 * SM count =132) | ||
auto array_size_powers = nvbench::range(16, 28, 4); | ||
|
||
template <typename OffsetT, typename RandomAccessIteratorsInTuple, typename RandomAccessIteratorOut, typename TransformOp> | ||
CUB_RUNTIME_FUNCTION static void bench_transform( | ||
nvbench::state& state, | ||
OffsetT count, | ||
RandomAccessIteratorsInTuple inputs, | ||
RandomAccessIteratorOut output, | ||
TransformOp transform_op) | ||
{ | ||
#if !TUNE_BASE | ||
using policy_t = policy_hub_t; | ||
#else | ||
using policy_t = cub::detail::transform::policy_hub< | ||
decltype(cub::detail::transform::iterator_value_tuple(RandomAccessIteratorsInTuple{})), | ||
cub::detail::value_t<RandomAccessIteratorOut>>; | ||
#endif | ||
|
||
state.exec(nvbench::exec_tag::no_batch, [&](const nvbench::launch& launch) { | ||
cub::detail::transform:: | ||
dispatch_t<true, OffsetT, RandomAccessIteratorsInTuple, RandomAccessIteratorOut, TransformOp, policy_t>::dispatch( | ||
count, inputs, output, transform_op, launch.get_stream()); | ||
}); | ||
} | ||
|
||
// Modified from BabelStream to also work for integers | ||
constexpr auto startA = 1; // BabelStream: 0.1 | ||
constexpr auto startB = 2; // BabelStream: 0.2 | ||
constexpr auto startC = 3; // BabelStream: 0.1 | ||
constexpr auto startScalar = 4; // BabelStream: 0.4 | ||
|
||
template <typename T> | ||
static void mul(nvbench::state& state, nvbench::type_list<T>) | ||
{ | ||
const auto n = static_cast<std::size_t>(state.get_int64("Elements{io}")); | ||
thrust::device_vector<T> b(n, startB); | ||
thrust::device_vector<T> c(n, startC); | ||
|
||
state.add_element_count(n); | ||
state.add_global_memory_reads<T>(n); | ||
state.add_global_memory_writes<T>(n); | ||
|
||
const T scalar = startScalar; | ||
bench_transform(state, n, ::cuda::std::tuple{c.begin()}, b.begin(), [=] __device__ __host__(const T& ci) { | ||
return ci * scalar; | ||
}); | ||
} | ||
|
||
NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types)) | ||
.set_name("mul") | ||
.set_type_axes_names({"T{ct}"}) | ||
.add_int64_power_of_two_axis("Elements{io}", array_size_powers); | ||
|
||
template <typename T> | ||
static void add(nvbench::state& state, nvbench::type_list<T>) | ||
{ | ||
const auto n = static_cast<std::size_t>(state.get_int64("Elements{io}")); | ||
thrust::device_vector<T> a(n, startA); | ||
thrust::device_vector<T> b(n, startB); | ||
thrust::device_vector<T> c(n, startC); | ||
|
||
state.add_element_count(n); | ||
state.add_global_memory_reads<T>(2 * n); | ||
state.add_global_memory_writes<T>(n); | ||
bench_transform( | ||
state, n, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), [] _CCCL_DEVICE(const T& ai, const T& bi) -> T { | ||
return ai + bi; | ||
}); | ||
} | ||
|
||
NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types)) | ||
.set_name("add") | ||
.set_type_axes_names({"T{ct}"}) | ||
.add_int64_power_of_two_axis("Elements{io}", array_size_powers); | ||
|
||
template <typename T> | ||
static void triad(nvbench::state& state, nvbench::type_list<T>) | ||
{ | ||
const auto n = static_cast<std::size_t>(state.get_int64("Elements{io}")); | ||
thrust::device_vector<T> a(n, startA); | ||
thrust::device_vector<T> b(n, startB); | ||
thrust::device_vector<T> c(n, startC); | ||
|
||
state.add_element_count(n); | ||
state.add_global_memory_reads<T>(2 * n); | ||
state.add_global_memory_writes<T>(n); | ||
const T scalar = startScalar; | ||
bench_transform( | ||
state, n, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), [=] _CCCL_DEVICE(const T& bi, const T& ci) { | ||
return bi + scalar * ci; | ||
}); | ||
} | ||
|
||
NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types)) | ||
.set_name("triad") | ||
.set_type_axes_names({"T{ct}"}) | ||
.add_int64_power_of_two_axis("Elements{io}", array_size_powers); | ||
|
||
template <typename T> | ||
static void nstream(nvbench::state& state, nvbench::type_list<T>) | ||
{ | ||
const auto n = static_cast<std::size_t>(state.get_int64("Elements{io}")); | ||
thrust::device_vector<T> a(n, startA); | ||
thrust::device_vector<T> b(n, startB); | ||
thrust::device_vector<T> c(n, startC); | ||
|
||
state.add_element_count(n); | ||
state.add_global_memory_reads<T>(3 * n); | ||
state.add_global_memory_writes<T>(n); | ||
const T scalar = startScalar; | ||
bench_transform( | ||
state, | ||
n, | ||
::cuda::std::tuple{a.begin(), b.begin(), c.begin()}, | ||
a.begin(), | ||
[=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) { | ||
return ai + bi + scalar * ci; | ||
}); | ||
} | ||
|
||
NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types)) | ||
.set_name("nstream") | ||
.set_type_axes_names({"T{ct}"}) | ||
.add_int64_power_of_two_axis("Elements{io}", array_size_powers); |