Add cub::DeviceTransform #2086

Merged · 72 commits · Sep 8, 2024

Commits:
66b84ad
Drive-by fix
bernhardmgruber Aug 22, 2024
51a08e6
Add cub::DeviceTransform
bernhardmgruber Jul 18, 2024
116face
Split BabelStream benchmark by stream count and extend
bernhardmgruber Aug 22, 2024
f1d5ae6
ptr_set simplification and fixes
bernhardmgruber Aug 22, 2024
64187cc
Refactor ptr_set
bernhardmgruber Aug 22, 2024
492fa42
Fix Catch
bernhardmgruber Aug 22, 2024
6586134
Refactor
bernhardmgruber Aug 23, 2024
16b813e
Align memcpy_async transactions to 16 bytes
bernhardmgruber Aug 23, 2024
f7fe22d
Use custom aligned_size_t
bernhardmgruber Aug 23, 2024
8451f51
Apply feedback from fbusato
bernhardmgruber Aug 26, 2024
acd5bd4
Update cub/cub/device/dispatch/dispatch_transform.cuh
bernhardmgruber Aug 26, 2024
eb51002
Apply feedback from gevtushenko
bernhardmgruber Aug 26, 2024
809fa38
Apply feedback from miscco
bernhardmgruber Aug 26, 2024
f90afe2
Fix failing tests
bernhardmgruber Aug 26, 2024
b5e954b
Fix CI
bernhardmgruber Aug 26, 2024
b84034f
Fix tests
bernhardmgruber Aug 26, 2024
4d5b2a8
Fix typos
bernhardmgruber Aug 26, 2024
1a374f0
Cache element counts and guard against SMEM exhaustion
bernhardmgruber Aug 26, 2024
7667675
Document out of bounds reads
bernhardmgruber Aug 27, 2024
8162b94
Fix memcpy_async tail padding access for compute-sanitizer
bernhardmgruber Aug 27, 2024
34eb956
Assert
bernhardmgruber Aug 27, 2024
8d61568
Notes, asserts, and alignment fixes
bernhardmgruber Aug 27, 2024
a958fb0
Ensure kernel symbols are not visible
bernhardmgruber Aug 27, 2024
bf612b1
Working version of peeling memcpy_async
bernhardmgruber Aug 28, 2024
50a742c
Randomize huge_t test
bernhardmgruber Aug 28, 2024
61d077e
Refactor
bernhardmgruber Aug 28, 2024
cd12ed8
Draft: avoid OOB access with UBLKCP
bernhardmgruber Aug 28, 2024
64adc57
Avoid printing huge comparison
bernhardmgruber Aug 28, 2024
9c8900d
Improve over-aligned type handling for ublkcp
bernhardmgruber Aug 28, 2024
0c323c5
Fix passing on offset type in tests
bernhardmgruber Aug 28, 2024
dc90114
Useful test output
bernhardmgruber Aug 28, 2024
dd3847f
Fix size calculation
bernhardmgruber Aug 28, 2024
8ea268f
Fix out-of-bounds write check
bernhardmgruber Aug 28, 2024
4f8615e
Fix
bernhardmgruber Aug 28, 2024
8dc472d
Refactor
bernhardmgruber Aug 28, 2024
efa0ebf
Improve comments
bernhardmgruber Aug 28, 2024
401cf23
Simplify kernel_arg to improve codegen
bernhardmgruber Aug 28, 2024
39ac845
Copy peeled bytes using larger types
bernhardmgruber Aug 28, 2024
93ccca4
Fix typo
bernhardmgruber Aug 28, 2024
84ef0b2
Fix
bernhardmgruber Aug 28, 2024
46cc725
Guard static counts
bernhardmgruber Aug 28, 2024
33fe58a
Avoid MSVC compilation error
bernhardmgruber Aug 28, 2024
3b86981
Fix total_bytes_bulk_copied
bernhardmgruber Aug 28, 2024
bbb8ea5
Apply suggestion by fbusato
bernhardmgruber Aug 29, 2024
6ab49a3
tail_elements
bernhardmgruber Aug 29, 2024
650f8a1
less threads for copy
bernhardmgruber Aug 29, 2024
6b6acef
Improve condition
bernhardmgruber Aug 29, 2024
a2b9b5f
Move memcpy async along ublkcp
bernhardmgruber Aug 30, 2024
ad6fe24
fix fix fix
bernhardmgruber Aug 31, 2024
b85b668
double3 and fixes
bernhardmgruber Aug 31, 2024
ff92e43
copy_thread_count opt
bernhardmgruber Aug 31, 2024
59729c7
fix test
bernhardmgruber Aug 31, 2024
08dc29c
rollback fallback copy opt
bernhardmgruber Aug 31, 2024
48b57f6
fix
bernhardmgruber Aug 31, 2024
fe0e5a8
Disable heavy functor test
bernhardmgruber Sep 2, 2024
1ff5fd4
Process first and last tiles separately
bernhardmgruber Sep 2, 2024
790a2b3
Fix typo
bernhardmgruber Sep 3, 2024
a5d8c3f
C++11
bernhardmgruber Sep 3, 2024
5707596
Add code path for full_tile
bernhardmgruber Sep 4, 2024
b319673
Remove all algorithms except fallback_for and ublkcp
bernhardmgruber Sep 4, 2024
233acb1
Drop heavy functor for now
bernhardmgruber Sep 4, 2024
36ed917
Fix API example
bernhardmgruber Sep 4, 2024
ebb3d29
Fix CI errors
bernhardmgruber Sep 4, 2024
7245f1d
Fix policy
bernhardmgruber Sep 5, 2024
93bd981
Fallback if any input T is too aligned
bernhardmgruber Sep 5, 2024
bff304d
Address some reviewer comments
bernhardmgruber Sep 6, 2024
12ba6ca
Address some reviewer comments
bernhardmgruber Sep 6, 2024
9daae96
Address some reviewer comments
bernhardmgruber Sep 6, 2024
2764c05
Avoid preprocessor directives as macro arguments
bernhardmgruber Sep 6, 2024
56b1802
Avoid compiling ublkcp kernel below sm90
bernhardmgruber Sep 6, 2024
98312f4
Try to workaround MSVC issue
bernhardmgruber Sep 6, 2024
5e7a134
Fix babelstream3
bernhardmgruber Sep 6, 2024
Files changed:
2 changes: 1 addition & 1 deletion cub/benchmarks/CMakeLists.txt
@@ -106,7 +106,7 @@ function(add_bench_dir bench_dir)
add_bench(bench_target ${tuning_name} "${bench_src}")
# for convenience, make tuning variant buildable by default
file(WRITE "${tuning_path}" "#pragma once\n#define TUNE_BASE 1\n")
-target_compile_options(${bench_target} PRIVATE "--extended-lambda -include${tuning_path}")
+target_compile_options(${bench_target} PRIVATE "--extended-lambda" "-include${tuning_path}")
else()
# benchmarking
register_cccl_benchmark("${bench_name}" "")
104 changes: 104 additions & 0 deletions cub/benchmarks/bench/transform/babelstream.h
@@ -0,0 +1,104 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

#include <cub/device/dispatch/dispatch_transform.cuh>
#include <cub/util_namespace.cuh>

#include <cuda/std/type_traits>

#include <stdexcept>

#include <nvbench_helper.cuh>

template <typename... RandomAccessIteratorsIn>
#if TUNE_BASE
using policy_hub_t = cub::detail::transform::policy_hub<false, ::cuda::std::tuple<RandomAccessIteratorsIn...>>;
#else
struct policy_hub_t
{
struct max_policy : cub::ChainedPolicy<350, max_policy, max_policy>
{
static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__);
static constexpr auto algorithm = static_cast<cub::detail::transform::Algorithm>(TUNE_ALGORITHM);
using algo_policy =
::cuda::std::_If<algorithm == cub::detail::transform::Algorithm::fallback_for,
cub::detail::transform::fallback_for_policy,
cub::detail::transform::async_copy_policy_t<TUNE_THREADS>>;
};
};
#endif

#ifdef TUNE_T
using element_types = nvbench::type_list<TUNE_T>;
#else
using element_types =
nvbench::type_list<std::int8_t,
std::int16_t,
float,
double
# ifdef NVBENCH_HELPER_HAS_I128
,
__int128
# endif
>;
#endif

// BabelStream uses 2^25, H200 can fit 2^31 int128s
// 2^20 chars / 2^16 int128 saturate V100 (min_bif = 12 KiB * SM count = 80)
// 2^21 chars / 2^17 int128 saturate A100 (min_bif = 16 KiB * SM count = 108)
// 2^23 chars / 2^19 int128 saturate H100/H200 HBM3 (min_bif = 32 or 48 KiB * SM count = 132)
// inline auto array_size_powers = std::vector<nvbench::int64_t>{28};
inline auto array_size_powers = nvbench::range(16, 28, 4);
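
// Sanity check on the saturation figures above, assuming min_bif is a byte
// count on a KiB scale (e.g. 12 KiB on V100) — an assumption, but one the
// arithmetic supports:
//   V100:      12 KiB * 80 SMs  ~= 983 KB ~= 2^20 B -> 2^20 chars / 2^16 int128
//   A100:      16 KiB * 108 SMs ~= 1.8 MB ~= 2^21 B -> 2^21 chars / 2^17 int128
//   H100/H200: 48 KiB * 132 SMs ~= 6.5 MB ~= 2^23 B -> 2^23 chars / 2^19 int128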

template <typename OffsetT,
typename... RandomAccessIteratorsIn,
typename RandomAccessIteratorOut,
typename TransformOp,
typename ExecTag = decltype(nvbench::exec_tag::no_batch)>
void bench_transform(
nvbench::state& state,
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
RandomAccessIteratorOut output,
OffsetT num_items,
TransformOp transform_op,
ExecTag exec_tag = nvbench::exec_tag::no_batch)
{
state.exec(exec_tag, [&](const nvbench::launch& launch) {
cub::detail::transform::dispatch_t<
false,
OffsetT,
::cuda::std::tuple<RandomAccessIteratorsIn...>,
RandomAccessIteratorOut,
TransformOp,
policy_hub_t<RandomAccessIteratorsIn...>>::dispatch(inputs, output, num_items, transform_op, launch.get_stream());
});
}

// Modified from BabelStream to also work for integers
inline constexpr auto startA = 1; // BabelStream: 0.1
inline constexpr auto startB = 2; // BabelStream: 0.2
inline constexpr auto startC = 3; // BabelStream: 0.0
inline constexpr auto startScalar = 4; // BabelStream: 0.4

// TODO(bgruber): we should put those somewhere into libcu++:
// from C++ GSL
struct narrowing_error : std::runtime_error
{
narrowing_error()
: std::runtime_error("Narrowing error")
{}
};

// from C++ GSL
// implementation inspired by: https://github.com/microsoft/GSL/blob/main/include/gsl/narrow
template <typename DstT, typename SrcT, ::cuda::std::__enable_if_t<::cuda::std::is_arithmetic<SrcT>::value, int> = 0>
constexpr DstT narrow(SrcT value)
{
constexpr bool is_different_signedness = ::cuda::std::is_signed<SrcT>::value != ::cuda::std::is_signed<DstT>::value;
const auto converted = static_cast<DstT>(value);
if (static_cast<SrcT>(converted) != value || (is_different_signedness && ((converted < DstT{}) != (value < SrcT{}))))
{
throw narrowing_error{};
}
return converted;
}
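
For illustration, a minimal sketch of how this narrow helper behaves; the calls below are hypothetical and not part of the diff:

// Hypothetical usage of narrow<> (mirrors the GSL semantics):
const auto ok = narrow<std::int8_t>(42); // round-trips, so it returns int8_t{42}
try
{
  narrow<std::int8_t>(300); // 300 truncates to 44, and 44 != 300, so this throws
}
catch (const narrowing_error&)
{
  // reached: the round-trip check in narrow() failed
}
narrow<unsigned>(-1); // also throws: the value round-trips, but the sign flips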
46 changes: 46 additions & 0 deletions cub/benchmarks/bench/transform/babelstream1.cu
@@ -0,0 +1,46 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "babelstream.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void mul(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

state.add_element_count(n);
state.add_global_memory_reads<T>(n);
state.add_global_memory_writes<T>(n);

const T scalar = startScalar;
bench_transform(state, ::cuda::std::tuple{c.begin()}, b.begin(), n, [=] _CCCL_DEVICE(const T& ci) {
return ci * scalar;
});
}

NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types, offset_types))
.set_name("mul")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers);
69 changes: 69 additions & 0 deletions cub/benchmarks/bench/transform/babelstream2.cu
@@ -0,0 +1,69 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "babelstream.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void add(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
thrust::device_vector<T> a(n, startA);
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

state.add_element_count(n);
state.add_global_memory_reads<T>(2 * n);
state.add_global_memory_writes<T>(n);
bench_transform(
state, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, [] _CCCL_DEVICE(const T& ai, const T& bi) -> T {
return ai + bi;
});
}

NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types, offset_types))
.set_name("add")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers);

template <typename T, typename OffsetT>
static void triad(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
thrust::device_vector<T> a(n, startA);
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

state.add_element_count(n);
state.add_global_memory_reads<T>(2 * n);
state.add_global_memory_writes<T>(n);
const T scalar = startScalar;
bench_transform(
state, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, [=] _CCCL_DEVICE(const T& bi, const T& ci) {
return bi + scalar * ci;
});
}

NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types, offset_types))
.set_name("triad")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers);
64 changes: 64 additions & 0 deletions cub/benchmarks/bench/transform/babelstream3.cu
@@ -0,0 +1,64 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "babelstream.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void nstream(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
const auto overwrite = static_cast<bool>(state.get_int64("OverwriteInput"));
thrust::device_vector<T> a(n, startA);
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

// The BabelStream nstream overwrites one input array to avoid write-allocation of cache lines. However, this changes
// the data that is computed for each iteration and results in an unstable workload. Therefore, we added an axis to
// choose a different output array. Pass `-a OverwriteInput=0` to the benchmark to disable overwriting the input.
thrust::device_vector<T> d;
if (!overwrite)
{
d.resize(n);
}

state.add_element_count(n);
state.add_global_memory_reads<T>(3 * n);
state.add_global_memory_writes<T>(n);
const T scalar = startScalar;
bench_transform(
state,
::cuda::std::tuple{a.begin(), b.begin(), c.begin()},
overwrite ? a.begin() : d.begin(),
n,
[=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) {
return ai + bi + scalar * ci;
},
nvbench::exec_tag::none); // Use batch mode for benchmarking since the workload changes. Not necessary when
// OverwriteInput=0, but doesn't hurt
}

NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types, offset_types))
.set_name("nstream")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers)
.add_int64_axis("OverwriteInput", {1});
1 change: 1 addition & 0 deletions cub/cub/cub.cuh
@@ -76,6 +76,7 @@
#include <cub/device/device_segmented_sort.cuh>
#include <cub/device/device_select.cuh>
#include <cub/device/device_spmv.cuh>
+#include <cub/device/device_transform.cuh>

// Grid
// #include <cub/grid/grid_barrier.cuh>
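
The header added above exposes the PR's public entry point, cub::DeviceTransform. A minimal usage sketch follows; it mirrors the tuple-of-iterators dispatch exercised by the benchmarks, and the exact public overload should be treated as an assumption:

// Sketch: c[i] = a[i] + b[i] via cub::DeviceTransform (overload assumed).
// Compile with: nvcc --extended-lambda
#include <cub/device/device_transform.cuh>

#include <thrust/device_vector.h>

#include <cuda/std/tuple>

int main()
{
  const int num_items = 1 << 20;
  thrust::device_vector<int> a(num_items, 1);
  thrust::device_vector<int> b(num_items, 2);
  thrust::device_vector<int> c(num_items);

  cub::DeviceTransform::Transform(
    ::cuda::std::make_tuple(a.begin(), b.begin()),
    c.begin(),
    num_items,
    [] __device__(int ai, int bi) {
      return ai + bi;
    });

  cudaDeviceSynchronize(); // the transform runs asynchronously on the default stream
  // every element of c now holds 3
  return 0;
}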