Add cub::DeviceTransform #2086

Merged · 72 commits · Sep 8, 2024

Commits:
66b84ad
Drive-by fix
bernhardmgruber Aug 22, 2024
51a08e6
Add cub::DeviceTransform
bernhardmgruber Jul 18, 2024
116face
Split BabelStream benchmark by stream count and extend
bernhardmgruber Aug 22, 2024
f1d5ae6
ptr_set simplification and fixes
bernhardmgruber Aug 22, 2024
64187cc
Refactor ptr_set
bernhardmgruber Aug 22, 2024
492fa42
Fix Catch
bernhardmgruber Aug 22, 2024
6586134
Refactor
bernhardmgruber Aug 23, 2024
16b813e
Align memcpy_async transactions to 16 bytes
bernhardmgruber Aug 23, 2024
f7fe22d
Use custom aligned_size_t
bernhardmgruber Aug 23, 2024
8451f51
Apply feedback from fbusato
bernhardmgruber Aug 26, 2024
acd5bd4
Update cub/cub/device/dispatch/dispatch_transform.cuh
bernhardmgruber Aug 26, 2024
eb51002
Apply feedback from gevtushenko
bernhardmgruber Aug 26, 2024
809fa38
Apply feedback from miscco
bernhardmgruber Aug 26, 2024
f90afe2
Fix failing tests
bernhardmgruber Aug 26, 2024
b5e954b
Fix CI
bernhardmgruber Aug 26, 2024
b84034f
Fix tests
bernhardmgruber Aug 26, 2024
4d5b2a8
Fix typos
bernhardmgruber Aug 26, 2024
1a374f0
Cache element counts and guard against SMEM exhaustion
bernhardmgruber Aug 26, 2024
7667675
Document out of bounds reads
bernhardmgruber Aug 27, 2024
8162b94
Fix memcpy_async tail padding access for compute-sanitizer
bernhardmgruber Aug 27, 2024
34eb956
Assert
bernhardmgruber Aug 27, 2024
8d61568
Notes, asserts, and alignment fixes
bernhardmgruber Aug 27, 2024
a958fb0
Ensure kernel symbols are not visible
bernhardmgruber Aug 27, 2024
bf612b1
Working version of peeling memcpy_async
bernhardmgruber Aug 28, 2024
50a742c
Randomize huge_t test
bernhardmgruber Aug 28, 2024
61d077e
Refactor
bernhardmgruber Aug 28, 2024
cd12ed8
Draft: avoid OOB access with UBLKCP
bernhardmgruber Aug 28, 2024
64adc57
Avoid printing huge comparison
bernhardmgruber Aug 28, 2024
9c8900d
Improve over-aligned type handling for ublkcp
bernhardmgruber Aug 28, 2024
0c323c5
Fix passing on offset type in tests
bernhardmgruber Aug 28, 2024
dc90114
Useful test output
bernhardmgruber Aug 28, 2024
dd3847f
Fix size calculation
bernhardmgruber Aug 28, 2024
8ea268f
Fix out-of-bounds write check
bernhardmgruber Aug 28, 2024
4f8615e
Fix
bernhardmgruber Aug 28, 2024
8dc472d
Refactor
bernhardmgruber Aug 28, 2024
efa0ebf
Improve comments
bernhardmgruber Aug 28, 2024
401cf23
Simplify kernel_arg to improve codegen
bernhardmgruber Aug 28, 2024
39ac845
Copy peeled bytes using larger types
bernhardmgruber Aug 28, 2024
93ccca4
Fix typo
bernhardmgruber Aug 28, 2024
84ef0b2
Fix
bernhardmgruber Aug 28, 2024
46cc725
Guard static counts
bernhardmgruber Aug 28, 2024
33fe58a
Avoid MSVC compilation error
bernhardmgruber Aug 28, 2024
3b86981
Fix total_bytes_bulk_copied
bernhardmgruber Aug 28, 2024
bbb8ea5
Apply suggestion by fbusato
bernhardmgruber Aug 29, 2024
6ab49a3
tail_elements
bernhardmgruber Aug 29, 2024
650f8a1
less threads for copy
bernhardmgruber Aug 29, 2024
6b6acef
Improve condition
bernhardmgruber Aug 29, 2024
a2b9b5f
Move memcpy async along ublkcp
bernhardmgruber Aug 30, 2024
ad6fe24
fix fix fix
bernhardmgruber Aug 31, 2024
b85b668
double3 and fixes
bernhardmgruber Aug 31, 2024
ff92e43
copy_thread_count opt
bernhardmgruber Aug 31, 2024
59729c7
fix test
bernhardmgruber Aug 31, 2024
08dc29c
rollback fallback copy opt
bernhardmgruber Aug 31, 2024
48b57f6
fix
bernhardmgruber Aug 31, 2024
fe0e5a8
Disable heavy functor test
bernhardmgruber Sep 2, 2024
1ff5fd4
Process first and last tiles separately
bernhardmgruber Sep 2, 2024
790a2b3
Fix typo
bernhardmgruber Sep 3, 2024
a5d8c3f
C++11
bernhardmgruber Sep 3, 2024
5707596
Add code path for full_tile
bernhardmgruber Sep 4, 2024
b319673
Remove all algorithms except fallback_for and ublkcp
bernhardmgruber Sep 4, 2024
233acb1
Drop heavy functor for now
bernhardmgruber Sep 4, 2024
36ed917
Fix API example
bernhardmgruber Sep 4, 2024
ebb3d29
Fix CI errors
bernhardmgruber Sep 4, 2024
7245f1d
Fix policy
bernhardmgruber Sep 5, 2024
93bd981
Fallback if any input T is too aligned
bernhardmgruber Sep 5, 2024
bff304d
Address some reviewer comments
bernhardmgruber Sep 6, 2024
12ba6ca
Address some reviewer comments
bernhardmgruber Sep 6, 2024
9daae96
Address some reviewer comments
bernhardmgruber Sep 6, 2024
2764c05
Avoid preprocessor directives as macro arguments
bernhardmgruber Sep 6, 2024
56b1802
Avoid compiling ublkcp kernel below sm90
bernhardmgruber Sep 6, 2024
98312f4
Try to workaround MSVC issue
bernhardmgruber Sep 6, 2024
5e7a134
Fix babelstream3
bernhardmgruber Sep 6, 2024
Files changed:
2 changes: 1 addition & 1 deletion cub/benchmarks/CMakeLists.txt
@@ -106,7 +106,7 @@ function(add_bench_dir bench_dir)
add_bench(bench_target ${tuning_name} "${bench_src}")
# for convenience, make tuning variant buildable by default
file(WRITE "${tuning_path}" "#pragma once\n#define TUNE_BASE 1\n")
-target_compile_options(${bench_target} PRIVATE "--extended-lambda -include${tuning_path}")
+target_compile_options(${bench_target} PRIVATE "--extended-lambda" "-include${tuning_path}")
else()
# benchmarking
register_cccl_benchmark("${bench_name}" "")
104 changes: 104 additions & 0 deletions cub/benchmarks/bench/transform/babelstream.h
@@ -0,0 +1,104 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

#include <cub/device/dispatch/dispatch_transform.cuh>
#include <cub/util_namespace.cuh>

#include <cuda/std/type_traits>

#include <stdexcept>

#include <nvbench_helper.cuh>

template <typename... RandomAccessIteratorsIn>
#if TUNE_BASE
using policy_hub_t = cub::detail::transform::policy_hub<false, ::cuda::std::tuple<RandomAccessIteratorsIn...>>;
#else
struct policy_hub_t
{
struct max_policy : cub::ChainedPolicy<350, max_policy, max_policy>
{
static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__);
static constexpr auto algorithm = static_cast<cub::detail::transform::Algorithm>(TUNE_ALGORITHM);
using algo_policy =
::cuda::std::_If<algorithm == cub::detail::transform::Algorithm::fallback_for,
cub::detail::transform::fallback_for_policy,
cub::detail::transform::async_copy_policy_t<TUNE_THREADS>>;
};
};
#endif

#ifdef TUNE_T
using element_types = nvbench::type_list<TUNE_T>;
#else
using element_types =
nvbench::type_list<std::int8_t,
std::int16_t,
float,
double
# ifdef NVBENCH_HELPER_HAS_I128
,
__int128
# endif
>;
#endif

// BabelStream uses 2^25, H200 can fit 2^31 int128s
// 2^20 chars / 2^16 int128 saturate V100 (min_bif = 12 KiB * SM count = 80)
// 2^21 chars / 2^17 int128 saturate A100 (min_bif = 16 KiB * SM count = 108)
// 2^23 chars / 2^19 int128 saturate H100/H200 HBM3 (min_bif = 32 or 48 KiB * SM count = 132)
// inline auto array_size_powers = std::vector<nvbench::int64_t>{28};
inline auto array_size_powers = nvbench::range(16, 28, 4);
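
// Sanity check on the saturation figures above, assuming min_bif is a byte
// count on a KiB scale (e.g. 12 KiB on V100) — an assumption, but one the
// arithmetic supports:
//   V100:      12 KiB * 80 SMs  ~= 983 KB ~= 2^20 B -> 2^20 chars / 2^16 int128
//   A100:      16 KiB * 108 SMs ~= 1.8 MB ~= 2^21 B -> 2^21 chars / 2^17 int128
//   H100/H200: 48 KiB * 132 SMs ~= 6.5 MB ~= 2^23 B -> 2^23 chars / 2^19 int128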

template <typename OffsetT,
typename... RandomAccessIteratorsIn,
typename RandomAccessIteratorOut,
typename TransformOp,
typename ExecTag = decltype(nvbench::exec_tag::no_batch)>
void bench_transform(
nvbench::state& state,
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
RandomAccessIteratorOut output,
OffsetT num_items,
TransformOp transform_op,
ExecTag exec_tag = nvbench::exec_tag::no_batch)
{
state.exec(exec_tag, [&](const nvbench::launch& launch) {
cub::detail::transform::dispatch_t<
false,
OffsetT,
::cuda::std::tuple<RandomAccessIteratorsIn...>,
RandomAccessIteratorOut,
TransformOp,
policy_hub_t<RandomAccessIteratorsIn...>>::dispatch(inputs, output, num_items, transform_op, launch.get_stream());
});
}

// Modified from BabelStream to also work for integers
inline constexpr auto startA = 1; // BabelStream: 0.1
inline constexpr auto startB = 2; // BabelStream: 0.2
inline constexpr auto startC = 3; // BabelStream: 0.0
inline constexpr auto startScalar = 4; // BabelStream: 0.4

// TODO(bgruber): we should put those somewhere into libcu++:
// from C++ GSL
struct narrowing_error : std::runtime_error
{
narrowing_error()
: std::runtime_error("Narrowing error")
{}
};

// from C++ GSL
// implementation inspired by: https://github.com/microsoft/GSL/blob/main/include/gsl/narrow
template <typename DstT, typename SrcT, ::cuda::std::__enable_if_t<::cuda::std::is_arithmetic<SrcT>::value, int> = 0>
constexpr DstT narrow(SrcT value)
{
constexpr bool is_different_signedness = ::cuda::std::is_signed<SrcT>::value != ::cuda::std::is_signed<DstT>::value;
const auto converted = static_cast<DstT>(value);
if (static_cast<SrcT>(converted) != value || (is_different_signedness && ((converted < DstT{}) != (value < SrcT{}))))
{
throw narrowing_error{};
}
return converted;
}
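
For illustration, a minimal sketch of how this narrow helper behaves; the calls below are hypothetical and not part of the diff:

// Hypothetical usage of narrow<> (mirrors the GSL semantics):
const auto ok = narrow<std::int8_t>(42); // round-trips, so it returns int8_t{42}
try
{
  narrow<std::int8_t>(300); // 300 truncates to 44, and 44 != 300, so this throws
}
catch (const narrowing_error&)
{
  // reached: the round-trip check in narrow() failed
}
narrow<unsigned>(-1); // also throws: the value round-trips, but the sign flips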
46 changes: 46 additions & 0 deletions cub/benchmarks/bench/transform/babelstream1.cu
@@ -0,0 +1,46 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "babelstream.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void mul(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

state.add_element_count(n);
state.add_global_memory_reads<T>(n);
state.add_global_memory_writes<T>(n);

const T scalar = startScalar;
bench_transform(state, ::cuda::std::tuple{c.begin()}, b.begin(), n, [=] _CCCL_DEVICE(const T& ci) {
return ci * scalar;
});
}

NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types, offset_types))
.set_name("mul")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers);
69 changes: 69 additions & 0 deletions cub/benchmarks/bench/transform/babelstream2.cu
@@ -0,0 +1,69 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "babelstream.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void add(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
thrust::device_vector<T> a(n, startA);
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

state.add_element_count(n);
state.add_global_memory_reads<T>(2 * n);
state.add_global_memory_writes<T>(n);
bench_transform(
state, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, [] _CCCL_DEVICE(const T& ai, const T& bi) -> T {
return ai + bi;
});
}

NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types, offset_types))
.set_name("add")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers);

template <typename T, typename OffsetT>
static void triad(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
thrust::device_vector<T> a(n, startA);
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

state.add_element_count(n);
state.add_global_memory_reads<T>(2 * n);
state.add_global_memory_writes<T>(n);
const T scalar = startScalar;
bench_transform(
state, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, [=] _CCCL_DEVICE(const T& bi, const T& ci) {
return bi + scalar * ci;
});
}

NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types, offset_types))
.set_name("triad")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers);
64 changes: 64 additions & 0 deletions cub/benchmarks/bench/transform/babelstream3.cu
@@ -0,0 +1,64 @@
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

// %RANGE% TUNE_THREADS tpb 128:1024:128
// %RANGE% TUNE_ALGORITHM alg 0:1:1

// keep checks at the top so compilation of discarded variants fails really fast
#if !TUNE_BASE
# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 4 (ublkcp) below sm90"
# endif

# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP)
# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)"
# endif
#endif

#include "babelstream.h"

#if !TUNE_BASE
# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1
# error "This benchmark does not support being compiled for multiple architectures"
# endif
#endif

template <typename T, typename OffsetT>
static void nstream(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
const auto n = narrow<OffsetT>(state.get_int64("Elements{io}"));
const auto overwrite = static_cast<bool>(state.get_int64("OverwriteInput"));
thrust::device_vector<T> a(n, startA);
thrust::device_vector<T> b(n, startB);
thrust::device_vector<T> c(n, startC);

// The BabelStream nstream overwrites one input array to avoid write-allocation of cache lines. However, this changes
// the data that is computed for each iteration and results in an unstable workload. Therefore, we added an axis to
// choose a different output array. Pass `-a OverwriteInput=0` to the benchmark to disable overwriting the input.
thrust::device_vector<T> d;
if (!overwrite)
{
d.resize(n);
}

state.add_element_count(n);
state.add_global_memory_reads<T>(3 * n);
state.add_global_memory_writes<T>(n);
const T scalar = startScalar;
bench_transform(
state,
::cuda::std::tuple{a.begin(), b.begin(), c.begin()},
overwrite ? a.begin() : d.begin(),
n,
[=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) {
return ai + bi + scalar * ci;
},
nvbench::exec_tag::none); // Use batch mode for benchmarking since the workload changes. Not necessary when
// OverwriteInput=0, but doesn't hurt
}

NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types, offset_types))
.set_name("nstream")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", array_size_powers)
.add_int64_axis("OverwriteInput", {1});
1 change: 1 addition & 0 deletions cub/cub/cub.cuh
@@ -76,6 +76,7 @@
#include <cub/device/device_segmented_sort.cuh>
#include <cub/device/device_select.cuh>
#include <cub/device/device_spmv.cuh>
+#include <cub/device/device_transform.cuh>

// Grid
// #include <cub/grid/grid_barrier.cuh>
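
The header added above exposes the PR's public entry point, cub::DeviceTransform. A minimal usage sketch follows; it mirrors the tuple-of-iterators dispatch exercised by the benchmarks, and the exact public overload should be treated as an assumption:

// Sketch: c[i] = a[i] + b[i] via cub::DeviceTransform (overload assumed).
// Compile with: nvcc --extended-lambda
#include <cub/device/device_transform.cuh>

#include <thrust/device_vector.h>

#include <cuda/std/tuple>

int main()
{
  const int num_items = 1 << 20;
  thrust::device_vector<int> a(num_items, 1);
  thrust::device_vector<int> b(num_items, 2);
  thrust::device_vector<int> c(num_items);

  cub::DeviceTransform::Transform(
    ::cuda::std::make_tuple(a.begin(), b.begin()),
    c.begin(),
    num_items,
    [] __device__(int ai, int bi) {
      return ai + bi;
    });

  cudaDeviceSynchronize(); // the transform runs asynchronously on the default stream
  // every element of c now holds 3
  return 0;
}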