diff --git a/cub/benchmarks/CMakeLists.txt b/cub/benchmarks/CMakeLists.txt index 1c3102d0d71..3932fffea6f 100644 --- a/cub/benchmarks/CMakeLists.txt +++ b/cub/benchmarks/CMakeLists.txt @@ -106,7 +106,7 @@ function(add_bench_dir bench_dir) add_bench(bench_target ${tuning_name} "${bench_src}") # for convenience, make tuning variant buildable by default file(WRITE "${tuning_path}" "#pragma once\n#define TUNE_BASE 1\n") - target_compile_options(${bench_target} PRIVATE "--extended-lambda -include${tuning_path}") + target_compile_options(${bench_target} PRIVATE "--extended-lambda" "-include${tuning_path}") else() # benchmarking register_cccl_benchmark("${bench_name}" "") diff --git a/cub/benchmarks/bench/transform/babelstream.h b/cub/benchmarks/bench/transform/babelstream.h new file mode 100644 index 00000000000..0f482d59e2f --- /dev/null +++ b/cub/benchmarks/bench/transform/babelstream.h @@ -0,0 +1,104 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include + +#include + +#include + +#include + +template +#if TUNE_BASE +using policy_hub_t = cub::detail::transform::policy_hub>; +#else +struct policy_hub_t +{ + struct max_policy : cub::ChainedPolicy<350, max_policy, max_policy> + { + static constexpr int min_bif = cub::detail::transform::arch_to_min_bytes_in_flight(__CUDA_ARCH_LIST__); + static constexpr auto algorithm = static_cast(TUNE_ALGORITHM); + using algo_policy = + ::cuda::std::_If>; + }; +}; +#endif + +#ifdef TUNE_T +using element_types = nvbench::type_list; +#else +using element_types = + nvbench::type_list; +#endif + +// BabelStream uses 2^25, H200 can fit 2^31 int128s +// 2^20 chars / 2^16 int128 saturate V100 (min_bif = 12 * SM count = 80) +// 2^21 chars / 2^17 int128 saturate A100 (min_bif = 16 * SM count = 108) +// 2^23 chars / 2^19 int128 saturate H100/H200 HBM3 (min_bif = 32 or 48 * SM count = 132) +// inline auto array_size_powers = std::vector{28}; +inline auto array_size_powers = nvbench::range(16, 28, 4); + +template +void bench_transform( + nvbench::state& state, + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + OffsetT num_items, + TransformOp transform_op, + ExecTag exec_tag = nvbench::exec_tag::no_batch) +{ + state.exec(exec_tag, [&](const nvbench::launch& launch) { + cub::detail::transform::dispatch_t< + false, + OffsetT, + ::cuda::std::tuple, + RandomAccessIteratorOut, + TransformOp, + policy_hub_t>::dispatch(inputs, output, num_items, transform_op, launch.get_stream()); + }); +} + +// Modified from BabelStream to also work for integers +inline constexpr auto startA = 1; // BabelStream: 0.1 +inline constexpr auto startB = 2; // BabelStream: 0.2 +inline constexpr auto startC = 3; // BabelStream: 0.1 +inline constexpr auto startScalar = 4; // BabelStream: 0.4 + +// TODO(bgruber): we should put those somewhere into libcu++: +// from C++ GSL +struct narrowing_error : std::runtime_error +{ + narrowing_error() + : std::runtime_error("Narrowing error") + {} +}; + +// from C++ GSL +// implementation inspired by: https://github.com/microsoft/GSL/blob/main/include/gsl/narrow +template ::value, int> = 0> +constexpr DstT narrow(SrcT value) +{ + constexpr bool is_different_signedness = ::cuda::std::is_signed::value != ::cuda::std::is_signed::value; + const auto converted = static_cast(value); + if (static_cast(converted) != value || (is_different_signedness && ((converted < DstT{}) != (value < SrcT{})))) + { + throw narrowing_error{}; + } + return converted; +} diff
--git a/cub/benchmarks/bench/transform/babelstream1.cu b/cub/benchmarks/bench/transform/babelstream1.cu new file mode 100644 index 00000000000..87abdfef6ff --- /dev/null +++ b/cub/benchmarks/bench/transform/babelstream1.cu @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +// %RANGE% TUNE_THREADS tpb 128:1024:128 +// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// keep checks at the top so compilation of discarded variants fails really fast +#if !TUNE_BASE +# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 +# error "Cannot compile algorithm 4 (ublkcp) below sm90" +# endif + +# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) +# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" +# endif +#endif + +#include "babelstream.h" + +#if !TUNE_BASE +# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 +# error "This benchmark does not support being compiled for multiple architectures" +# endif +#endif + +template +static void mul(nvbench::state& state, nvbench::type_list) +{ + const auto n = narrow(state.get_int64("Elements{io}")); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(n); + state.add_global_memory_writes(n); + + const T scalar = startScalar; + bench_transform(state, ::cuda::std::tuple{c.begin()}, b.begin(), n, [=] _CCCL_DEVICE(const T& ci) { + return ci * scalar; + }); +} + +NVBENCH_BENCH_TYPES(mul, NVBENCH_TYPE_AXES(element_types, offset_types)) + .set_name("mul") + .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", array_size_powers); diff --git a/cub/benchmarks/bench/transform/babelstream2.cu b/cub/benchmarks/bench/transform/babelstream2.cu new file mode 100644 index 00000000000..c8fa017b788 --- /dev/null +++ b/cub/benchmarks/bench/transform/babelstream2.cu @@ -0,0 +1,69 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause + +// %RANGE% TUNE_THREADS tpb 128:1024:128 +// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// keep checks at the top so compilation of discarded variants fails really fast +#if !TUNE_BASE +# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 +# error "Cannot compile algorithm 4 (ublkcp) below sm90" +# endif + +# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) +# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" +# endif +#endif + +#include "babelstream.h" + +#if !TUNE_BASE +# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 +# error "This benchmark does not support being compiled for multiple architectures" +# endif +#endif + +template +static void add(nvbench::state& state, nvbench::type_list) +{ + const auto n = narrow(state.get_int64("Elements{io}")); + thrust::device_vector a(n, startA); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(2 * n); + state.add_global_memory_writes(n); + bench_transform( + state, ::cuda::std::tuple{a.begin(), b.begin()}, c.begin(), n, [] _CCCL_DEVICE(const T& ai, const T& bi) -> T { + return ai + bi; + }); +} + +NVBENCH_BENCH_TYPES(add, NVBENCH_TYPE_AXES(element_types, offset_types)) + .set_name("add") + .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", array_size_powers); + +template +static void triad(nvbench::state& state, nvbench::type_list) +{ + const auto n = narrow(state.get_int64("Elements{io}")); + thrust::device_vector a(n, startA); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + state.add_element_count(n); + state.add_global_memory_reads(2 * n); + state.add_global_memory_writes(n); + const T scalar = startScalar; + bench_transform( + state, ::cuda::std::tuple{b.begin(), c.begin()}, a.begin(), n, [=] _CCCL_DEVICE(const T& bi, const T& ci) { + return bi + scalar * ci; + }); +} + +NVBENCH_BENCH_TYPES(triad, NVBENCH_TYPE_AXES(element_types, offset_types)) + .set_name("triad") + .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", array_size_powers); diff --git a/cub/benchmarks/bench/transform/babelstream3.cu b/cub/benchmarks/bench/transform/babelstream3.cu new file mode 100644 index 00000000000..db541554210 --- /dev/null +++ b/cub/benchmarks/bench/transform/babelstream3.cu @@ -0,0 +1,64 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause + +// %RANGE% TUNE_THREADS tpb 128:1024:128 +// %RANGE% TUNE_ALGORITHM alg 0:1:1 + +// keep checks at the top so compilation of discarded variants fails really fast +#if !TUNE_BASE +# if TUNE_ALGORITHM == 1 && (__CUDA_ARCH_LIST__) < 900 +# error "Cannot compile algorithm 4 (ublkcp) below sm90" +# endif + +# if TUNE_ALGORITHM == 1 && !defined(_CUB_HAS_TRANSFORM_UBLKCP) +# error "Cannot tune for ublkcp algorithm, which is not provided by CUB (old CTK?)" +# endif +#endif + +#include "babelstream.h" + +#if !TUNE_BASE +# if CUB_DETAIL_COUNT(__CUDA_ARCH_LIST__) != 1 +# error "This benchmark does not support being compiled for multiple architectures" +# endif +#endif + +template +static void nstream(nvbench::state& state, nvbench::type_list) +{ + const auto n = narrow(state.get_int64("Elements{io}")); + const auto overwrite = static_cast(state.get_int64("OverwriteInput")); + thrust::device_vector a(n, startA); + thrust::device_vector b(n, startB); + thrust::device_vector c(n, startC); + + // The BabelStream nstream overwrites one input array to avoid write-allocation of cache lines. However, this changes + // the data that is computed for each iteration and results in an unstable workload. Therefore, we added an axis to + // choose a different output array. Pass `-a OverwriteInput=0` to the benchmark to disable overwriting the input. + thrust::device_vector d; + if (!overwrite) + { + d.resize(n); + } + + state.add_element_count(n); + state.add_global_memory_reads(3 * n); + state.add_global_memory_writes(n); + const T scalar = startScalar; + bench_transform( + state, + ::cuda::std::tuple{a.begin(), b.begin(), c.begin()}, + overwrite ? a.begin() : d.begin(), + n, + [=] _CCCL_DEVICE(const T& ai, const T& bi, const T& ci) { + return ai + bi + scalar * ci; + }, + nvbench::exec_tag::none); // Use batch mode for benchmarking since the workload changes. Not necessary when + // OverwriteInput=0, but doesn't hurt +} + +NVBENCH_BENCH_TYPES(nstream, NVBENCH_TYPE_AXES(element_types, offset_types)) + .set_name("nstream") + .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", array_size_powers) + .add_int64_axis("OverwriteInput", {1}); diff --git a/cub/cub/cub.cuh b/cub/cub/cub.cuh index f02ae6c0024..2c4d6dd5f4e 100644 --- a/cub/cub/cub.cuh +++ b/cub/cub/cub.cuh @@ -76,6 +76,7 @@ #include #include #include +#include // Grid // #include diff --git a/cub/cub/device/device_transform.cuh b/cub/cub/device/device_transform.cuh new file mode 100644 index 00000000000..984109692f6 --- /dev/null +++ b/cub/cub/device/device_transform.cuh @@ -0,0 +1,271 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +CUB_NAMESPACE_BEGIN + +//! DeviceTransform provides device-wide, parallel operations for transforming elements tuple-wise from multiple input +//! sequences into an output sequence. +struct DeviceTransform +{ + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding + //! 
input elements and writing the result to the corresponding output element. No guarantee is given on the identity + //! (i.e. address) of the objects passed to the call operator of the transformation operation. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin transform-many + //! :end-before: example-end transform-many + //! + //! @endrst + //! + //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The + //! iterators' value types must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in each input sequence. + //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value + //! types must be convertible to the parameters of the function object's call operator. The return type of the call + //! operator must be assignable to the dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + template + CUB_RUNTIME_FUNCTION static cudaError_t Transform( + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform"); + return detail::transform:: + dispatch_t, RandomAccessIteratorOut, TransformOp>:: + dispatch( + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); + } + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB + // APIs. + template + CUB_RUNTIME_FUNCTION static cudaError_t Transform( + void* d_temp_storage, + size_t& temp_storage_bytes, + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + return Transform( + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); + } +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //! @rst + //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding + //! input elements and writing the result to the corresponding output element. No guarantee is given on the identity + //! (i.e. address) of the objects passed to the call operator of the transformation operation. + //! @endrst + //! + //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type + //! must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in each input sequence. + //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value + //! types must be convertible to the parameters of the function object's call operator. The return type of the call + //! operator must be assignable to the dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. 
Default is stream\ :sub:`0`. + template + CUB_RUNTIME_FUNCTION static cudaError_t Transform( + RandomAccessIteratorIn input, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + return Transform( + ::cuda::std::make_tuple(::cuda::std::move(input)), + ::cuda::std::move(output), + num_items, + ::cuda::std::move(transform_op), + stream); + } + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + // This overload has additional parameters to specify temporary storage. Provided for compatibility with other CUB + // APIs. + template + CUB_RUNTIME_FUNCTION static cudaError_t Transform( + void* d_temp_storage, + size_t& temp_storage_bytes, + RandomAccessIteratorIn input, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + return Transform( + ::cuda::std::make_tuple(::cuda::std::move(input)), + ::cuda::std::move(output), + num_items, + ::cuda::std::move(transform_op), + stream); + } +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! Transforms many input sequences into one output sequence, by applying a transformation operation on corresponding + //! input elements and writing the result to the corresponding output element. The objects passed to the call operator + //! of the transformation operation are guaranteed to reside in the input sequences and are never copied. + //! + //! A Simple Example + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_transform_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin transform-many-stable + //! :end-before: example-end transform-many-stable + //! + //! @endrst + //! + //! @param inputs A tuple of iterators to the input sequences where num_items elements are read from each. The + //! iterators' value types must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in each input sequence. + //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value + //! types must be convertible to the parameters of the function object's call operator. The return type of the call + //! operator must be assignable to the dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. 
+ template + CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + CUB_DETAIL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses"); + return detail::transform:: + dispatch_t, RandomAccessIteratorOut, TransformOp>:: + dispatch( + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); + } + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + template + CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( + void* d_temp_storage, + size_t& temp_storage_bytes, + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + return TransformStableArgumentAddresses( + ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream); + } +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //! @rst + //! Transforms one input sequence into one output sequence, by applying a transformation operation on corresponding + //! input elements and writing the result to the corresponding output element. The objects passed to the call operator + //! of the transformation operation are guaranteed to reside in the input sequences and are never copied. + //! @endrst + //! + //! @param input An iterator to the input sequence where num_items elements are read from. The iterator's value type + //! must be trivially relocatable. + //! @param output An iterator to the output sequence where num_items results are written to. + //! @param num_items The number of elements in each input sequence. + //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value + //! types must be convertible to the parameters of the function object's call operator. The return type of the call + //! operator must be assignable to the dereferenced output iterator. + //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. 
+ template + CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( + RandomAccessIteratorIn input, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + return TransformStableArgumentAddresses( + ::cuda::std::make_tuple(::cuda::std::move(input)), + ::cuda::std::move(output), + num_items, + ::cuda::std::move(transform_op), + stream); + } + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + template + CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses( + void* d_temp_storage, + size_t& temp_storage_bytes, + RandomAccessIteratorIn input, + RandomAccessIteratorOut output, + int num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) + { + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + return TransformStableArgumentAddresses( + ::cuda::std::make_tuple(::cuda::std::move(input)), + ::cuda::std::move(output), + num_items, + ::cuda::std::move(transform_op), + stream); + } +#endif // DOXYGEN_SHOULD_SKIP_THIS +}; + +CUB_NAMESPACE_END diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh new file mode 100644 index 00000000000..8fb596da075 --- /dev/null +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -0,0 +1,866 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_CUDA_COMPILER) && _CCCL_CUDACC_VER < 1105000 +_CCCL_NV_DIAG_SUPPRESS(186) +# include +// we cannot re-enable the warning here, because it is triggered outside the translation unit +// see also: https://godbolt.org/z/1x8b4hn3G +#endif // defined(_CCCL_CUDA_COMPILER) && _CCCL_CUDACC_VER < 1105000 + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +CUB_NAMESPACE_BEGIN + +// the ublkcp kernel needs PTX features that are only available and understood by CTK 12 and later +#if _CCCL_CUDACC_VER_MAJOR >= 12 +# define _CUB_HAS_TRANSFORM_UBLKCP +#endif // _CCCL_CUDACC_VER_MAJOR >= 12 + +namespace detail +{ +namespace transform +{ +_CCCL_HOST_DEVICE constexpr int sum() +{ + return 0; +} + +// TODO(bgruber): remove with C++17 +template +_CCCL_HOST_DEVICE constexpr int sum(int head, Ts... tail) +{ + return head + sum(tail...); +} + +#if _CCCL_STD_VER >= 2017 +template +_CCCL_HOST_DEVICE constexpr auto loaded_bytes_per_iteration() -> int +{ + return (int{sizeof(value_t)} + ... 
+ 0); +} +#else // ^^^ C++17 ^^^ / vvv C++11 vvv +template +_CCCL_HOST_DEVICE constexpr auto loaded_bytes_per_iteration() -> int +{ + return sum(int{sizeof(value_t)}...); +} +#endif // _CCCL_STD_VER >= 2017 + +enum class Algorithm +{ + fallback_for, +#ifdef _CUB_HAS_TRANSFORM_UBLKCP + ublkcp, +#endif // _CUB_HAS_TRANSFORM_UBLKCP +}; + +// this kernel replicates the behavior of cub::DeviceFor::Bulk +template +_CCCL_DEVICE void transform_kernel_impl( + ::cuda::std::integral_constant, + Offset num_items, + int /* items_per_thread */, + F transform_op, + RandomAccessIteratorOut out, + RandomAccessIteratorsIn... ins) +{ + auto op = [&](Offset i) { + out[i] = transform_op(ins[i]...); + }; + using OpT = decltype(op); + + // TODO(bgruber): verbatim copy from for_each's static_kernel below: + using agent_t = for_each::agent_block_striped_t; + + constexpr auto block_threads = ForPolicy::block_threads; + constexpr auto items_per_tile = ForPolicy::items_per_thread * block_threads; + + const auto tile_base = static_cast(blockIdx.x) * items_per_tile; + const auto num_remaining = num_items - tile_base; + const auto items_in_tile = static_cast(num_remaining < items_per_tile ? num_remaining : items_per_tile); + + if (items_in_tile == items_per_tile) + { + agent_t{tile_base, op}.template consume_tile(items_per_tile, block_threads); + } + else + { + agent_t{tile_base, op}.template consume_tile(items_in_tile, block_threads); + } +} + +template +struct async_copy_policy_t +{ + static constexpr int block_threads = BlockThreads; + // items per tile are determined at runtime. these (inclusive) bounds allow overriding that value via a tuning policy + static constexpr int min_items_per_thread = 1; + static constexpr int max_items_per_thread = 32; +}; + +// TODO(bgruber) cheap copy of ::cuda::std::apply, which requires C++17. 
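// [Editor's note, not part of this patch] poor_apply below is a C++11-compatible stand-in for ::cuda::std::apply:
// given a hypothetical callable f and a tuple t = ::cuda::std::tuple<int, float>{1, 2.f}, the call
//   poor_apply(f, t)
// expands through the index_sequence overload into f(::cuda::std::get<0>(t), ::cuda::std::get<1>(t)).
// The UBLKCP kernel further down uses it to pass one operand per input stream (fetched from shared memory) to the
// user's transform_op, after first materializing the fetches into a tuple to guarantee left-to-right evaluation.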
+template +_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply_impl(F&& f, Tuple&& t, ::cuda::std::index_sequence) + -> decltype(::cuda::std::forward(f)(::cuda::std::get(::cuda::std::forward(t))...)) +{ + return ::cuda::std::forward(f)(::cuda::std::get(::cuda::std::forward(t))...); +} + +template +_CCCL_DEVICE _CCCL_FORCEINLINE auto poor_apply(F&& f, Tuple&& t) + -> decltype(poor_apply_impl( + ::cuda::std::forward(f), + ::cuda::std::forward(t), + ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::__libcpp_remove_reference_t>::value>{})) +{ + return poor_apply_impl( + ::cuda::std::forward(f), + ::cuda::std::forward(t), + ::cuda::std::make_index_sequence<::cuda::std::tuple_size<::cuda::std::__libcpp_remove_reference_t>::value>{}); +} + +// mult must be a power of 2 +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr auto round_up_to_po2_multiple(Integral x, Integral mult) -> Integral +{ +#if _CCCL_STD_VER > 2011 + _LIBCUDACXX_ASSERT(::cuda::std::has_single_bit(static_cast<::cuda::std::__make_unsigned_t>(mult)), ""); +#endif // _CCCL_STD_VER > 2011 + return (x + mult - 1) & ~(mult - 1); +} + +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment) +{ +#if _CCCL_STD_VER > 2011 + _LIBCUDACXX_ASSERT(::cuda::std::has_single_bit(alignment), ""); +#endif // _CCCL_STD_VER > 2011 + return reinterpret_cast( + reinterpret_cast<::cuda::std::uintptr_t>(ptr) & ~::cuda::std::uintptr_t{alignment - 1}); +} + +// Implementation notes on memcpy_async and UBLKCP kernels regarding copy alignment and padding +// +// For performance considerations of memcpy_async: +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#performance-guidance-for-memcpy-async +// +// We basically have to align the base pointer to 16 bytes, and copy a multiple of 16 bytes. To achieve this, when we +// copy a tile of data from an input buffer, we round down the pointer to the start of the tile to the next lower +// address that is a multiple of 16 bytes. This introduces head padding. We also round up the total number of bytes to +// copy (including head padding) to a multiple of 16 bytes, which introduces tail padding. For the bulk copy kernel, we +// have to align to 128 bytes instead of 16. +// +// However, padding memory copies like that may access the input buffer out-of-bounds. Here are some thoughts: +// * According to the CUDA programming guide +// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses), "any address of a variable +// residing in global memory or returned by one of the memory allocation routines from the driver or runtime API is +// always aligned to at least 256 bytes." +// * Memory protection is usually done at memory page granularity, which is even larger than 256 bytes for CUDA and 4 KiB on +// Intel x86 and at least 4 KiB on ARM. Head and tail padding thus never leave the memory page of the input buffer. +// * This should hold for device memory, but also for device-accessible memory living on the host. +// * The base pointer alignment and size rounding also never exceed the size of a cache line. +// +// Copying larger data blocks with head and tail padding should thus be legal. Nevertheless, an out-of-bounds read is +// still technically undefined behavior in C++. Also, compute-sanitizer flags at least those reads that go past the end of a +// buffer. Therefore, we err on the side of caution and protect against out-of-bounds reads at the beginning and end.
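// [Editor's note, not part of this patch] A small numeric illustration of the padding scheme described above, using
// the aligned_base_ptr helper defined just below. Assume a hypothetical input buffer of 12-byte elements starting at
// address 0x1004, and a tile that begins at element 10:
//   element address       = 0x1004 + 10 * 12     = 0x107C
//   rounded-down address  = 0x107C & ~(16 - 1)   = 0x1070   (16-byte memcpy_async case; 128 for bulk copy)
//   head_padding          = 0x107C - 0x1070      = 12 bytes
//   bytes to copy         = round_up_to_po2_multiple(head_padding + tile bytes, 16)
// so the copy may start a few bytes before the tile and end a few bytes after it, which is exactly the out-of-bounds
// concern discussed above.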
+ +// A note on size and alignment: The size of a type is at least as large as its alignment. We rely on this fact in some +// conditions. +// This is guaranteed by the C++ standard, and follows from the definition of arrays: the difference between neighboring +// array element addresses is sizeof element type and each array element needs to fulfill the alignment requirement of +// the element type. + +// Pointer with metadata to describe readonly input memory for memcpy_async and UBLKCP kernels. +// cg::memcpy_async is most efficient when the data is 16-byte aligned and the size a multiple of 16 bytes +// UBLKCP is most efficient when the data is 128-byte aligned and the size a multiple of 16 bytes +template // Cannot add alignment to signature, because we need a uniform kernel template instantiation +struct aligned_base_ptr +{ + using value_type = T; + + const char* ptr; // aligned pointer before the original pointer (16-byte or 128-byte). May not be aligned to + // alignof(T). E.g.: array of int3 starting at address 4, ptr == 0 + int head_padding; // byte offset between ptr and the original pointer. Value inside [0;15] or [0;127]. + + _CCCL_HOST_DEVICE const T* ptr_to_elements() const + { + return reinterpret_cast(ptr + head_padding); + } + + _CCCL_HOST_DEVICE friend bool operator==(const aligned_base_ptr& a, const aligned_base_ptr& b) + { + return a.ptr == b.ptr && a.head_padding == b.head_padding; + } +}; + +template +_CCCL_HOST_DEVICE auto make_aligned_base_ptr(const T* ptr, int alignment) -> aligned_base_ptr +{ + const char* base_ptr = round_down_ptr(ptr, alignment); + return aligned_base_ptr{base_ptr, static_cast(reinterpret_cast(ptr) - base_ptr)}; +} + +constexpr int bulk_copy_alignment = 128; +constexpr int bulk_copy_size_multiple = 16; + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP +_CCCL_DEVICE _CCCL_FORCEINLINE static bool elect_one() +{ + const ::cuda::std::uint32_t membermask = ~0; + ::cuda::std::uint32_t is_elected; + asm volatile( + "{\n\t .reg .pred P_OUT; \n\t" + "elect.sync _|P_OUT, %1;\n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(is_elected) + : "r"(membermask) + :); + return threadIdx.x < 32 && static_cast(is_elected); +} + +// TODO(bgruber): inline this as lambda in C++14 +template +_CCCL_DEVICE void bulk_copy_tile( + ::cuda::std::uint64_t& bar, + int tile_stride, + char* smem, + int& smem_offset, + ::cuda::std::uint32_t& total_bytes_bulk_copied, + Offset global_offset, + const aligned_base_ptr& aligned_ptr) +{ + static_assert(alignof(T) <= bulk_copy_alignment, ""); + + const char* src = aligned_ptr.ptr + global_offset * sizeof(T); + char* dst = smem + smem_offset; + _LIBCUDACXX_ASSERT(reinterpret_cast(src) % bulk_copy_alignment == 0, ""); + _LIBCUDACXX_ASSERT(reinterpret_cast(dst) % bulk_copy_alignment == 0, ""); + + // TODO(bgruber): we could precompute bytes_to_copy on the host + const int bytes_to_copy = round_up_to_po2_multiple( + aligned_ptr.head_padding + static_cast(sizeof(T)) * tile_stride, bulk_copy_size_multiple); + + ::cuda::ptx::cp_async_bulk(::cuda::ptx::space_cluster, ::cuda::ptx::space_global, dst, src, bytes_to_copy, &bar); + total_bytes_bulk_copied += bytes_to_copy; + + // add bulk_copy_alignment to make space for the next tile's head padding + smem_offset += static_cast(sizeof(T)) * tile_stride + bulk_copy_alignment; +} + +template +_CCCL_DEVICE void bulk_copy_tile_fallback( + int tile_size, + int tile_stride, + char* smem, + int& smem_offset, + Offset global_offset, + const aligned_base_ptr& aligned_ptr) +{ + const T* src = 
aligned_ptr.ptr_to_elements() + global_offset; + T* dst = reinterpret_cast(smem + smem_offset + aligned_ptr.head_padding); + _LIBCUDACXX_ASSERT(reinterpret_cast(src) % alignof(T) == 0, ""); + _LIBCUDACXX_ASSERT(reinterpret_cast(dst) % alignof(T) == 0, ""); + + const int bytes_to_copy = static_cast(sizeof(T)) * tile_size; + cooperative_groups::memcpy_async(cooperative_groups::this_thread_block(), dst, src, bytes_to_copy); + + // add bulk_copy_alignment to make space for the next tile's head padding + smem_offset += static_cast(sizeof(T)) * tile_stride + bulk_copy_alignment; +} + +// TODO(bgruber): inline this as lambda in C++14 +template +_CCCL_DEVICE _CCCL_FORCEINLINE const T& +fetch_operand(int tile_stride, const char* smem, int& smem_offset, int smem_idx, const aligned_base_ptr& aligned_ptr) +{ + const T* smem_operand_tile_base = reinterpret_cast(smem + smem_offset + aligned_ptr.head_padding); + smem_offset += int{sizeof(T)} * tile_stride + bulk_copy_alignment; + return smem_operand_tile_base[smem_idx]; +} + +template +_CCCL_DEVICE void transform_kernel_ublkcp( + Offset num_items, int num_elem_per_thread, F f, RandomAccessIteratorOut out, aligned_base_ptr... aligned_ptrs) +{ + __shared__ uint64_t bar; + extern __shared__ char __align__(bulk_copy_alignment) smem[]; + + namespace ptx = ::cuda::ptx; + + constexpr int block_dim = BulkCopyPolicy::block_threads; + const int tile_stride = block_dim * num_elem_per_thread; + const Offset offset = static_cast(blockIdx.x) * tile_stride; + const int tile_size = ::cuda::std::min(num_items - offset, Offset{tile_stride}); + + const bool inner_blocks = 0 < blockIdx.x && blockIdx.x + 2 < gridDim.x; + if (inner_blocks) + { + // use one thread to setup the entire bulk copy + if (elect_one()) + { + ptx::mbarrier_init(&bar, 1); + ptx::fence_proxy_async(ptx::space_shared); + + int smem_offset = 0; + ::cuda::std::uint32_t total_copied = 0; + + // TODO(bgruber): use a fold over comma in C++17 + // Order of evaluation is left-to-right + int dummy[] = {(bulk_copy_tile(bar, tile_stride, smem, smem_offset, total_copied, offset, aligned_ptrs), 0)..., + 0}; + (void) dummy; + + // TODO(ahendriksen): this could only have ptx::sem_relaxed, but this is not available yet + ptx::mbarrier_arrive_expect_tx(ptx::sem_release, ptx::scope_cta, ptx::space_shared, &bar, total_copied); + } + + // all threads wait for bulk copy + __syncthreads(); + while (!ptx::mbarrier_try_wait_parity(&bar, 0)) + ; + } + else + { + // use all threads to schedule an async_memcpy + int smem_offset = 0; + + // TODO(bgruber): use a fold over comma in C++17 + // Order of evaluation is left-to-right + int dummy[] = {(bulk_copy_tile_fallback(tile_size, tile_stride, smem, smem_offset, offset, aligned_ptrs), 0)..., 0}; + (void) dummy; + + cooperative_groups::wait(cooperative_groups::this_thread_block()); + } + + // move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below + out += offset; + + // note: I tried expressing the UBLKCP_AGENT as a function object but it adds a lot of code to handle the variadics + // TODO(bgruber): use a polymorphic lambda in C++14 +# define UBLKCP_AGENT(full_tile) \ + _Pragma("unroll 1") /* Unroll 1 tends to improve performance, especially for smaller data types (confirmed by \ + benchmark) */ \ + for (int j = 0; j < num_elem_per_thread; ++j) \ + { \ + const int idx = j * block_dim + threadIdx.x; \ + if (full_tile || idx < tile_size) \ + { \ + int smem_offset = 0; \ + /* need to expand into a tuple for guaranteed order of evaluation*/ \ 
+ out[idx] = poor_apply( \ + [&](const InTs&... values) { \ + return f(values...); \ + }, \ + ::cuda::std::tuple{fetch_operand(tile_stride, smem, smem_offset, idx, aligned_ptrs)...}); \ + } \ + } + if (tile_stride == tile_size) + { + UBLKCP_AGENT(true); + } + else + { + UBLKCP_AGENT(false); + } +# undef UBLKCP_AGENT +} + +template +_CCCL_DEVICE void transform_kernel_impl( + ::cuda::std::integral_constant, + Offset num_items, + int num_elem_per_thread, + F f, + RandomAccessIteratorOut out, + aligned_base_ptr... aligned_ptrs) +{ + // only call the real kernel for sm90 and later + NV_IF_TARGET(NV_PROVIDES_SM_90, + (transform_kernel_ublkcp(num_items, num_elem_per_thread, f, out, aligned_ptrs...);)); +} +#endif // _CUB_HAS_TRANSFORM_UBLKCP + +template +union kernel_arg +{ + aligned_base_ptr> aligned_ptr; + It iterator; + + _CCCL_HOST_DEVICE kernel_arg() {} // in case It is not default-constructible +}; + +template +_CCCL_HOST_DEVICE auto make_iterator_kernel_arg(It it) -> kernel_arg +{ + kernel_arg arg; + arg.iterator = it; + return arg; +} + +template +_CCCL_HOST_DEVICE auto make_aligned_base_ptr_kernel_arg(It ptr, int alignment) -> kernel_arg +{ + kernel_arg arg; + arg.aligned_ptr = make_aligned_base_ptr(ptr, alignment); + return arg; +} + +// TODO(bgruber): make a variable template in C++14 +template +using needs_aligned_ptr_t = + ::cuda::std::bool_constant; + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP +template ::value, int> = 0> +_CCCL_DEVICE _CCCL_FORCEINLINE auto select_kernel_arg( + ::cuda::std::integral_constant, kernel_arg&& arg) -> aligned_base_ptr>&& +{ + return ::cuda::std::move(arg.aligned_ptr); +} +#endif // _CUB_HAS_TRANSFORM_UBLKCP + +template ::value, int> = 0> +_CCCL_DEVICE _CCCL_FORCEINLINE auto +select_kernel_arg(::cuda::std::integral_constant, kernel_arg&& arg) -> It&& +{ + return ::cuda::std::move(arg.iterator); +} + +// There is only one kernel for all algorithms, that dispatches based on the selected policy. It must be instantiated +// with the same arguments for each algorithm. Only the device compiler will then select the implementation. This +// saves some compile-time and binary size. +template +__launch_bounds__(MaxPolicy::ActivePolicy::algo_policy::block_threads) + CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel( + Offset num_items, + int num_elem_per_thread, + F f, + RandomAccessIteratorOut out, + kernel_arg... ins) +{ + constexpr auto alg = ::cuda::std::integral_constant{}; + transform_kernel_impl( + alg, + num_items, + num_elem_per_thread, + ::cuda::std::move(f), + ::cuda::std::move(out), + select_kernel_arg(alg, ::cuda::std::move(ins))...); +} + +constexpr int arch_to_min_bytes_in_flight(int sm_arch) +{ + // TODO(bgruber): use if-else in C++14 for better readability + return sm_arch >= 900 ? 48 * 1024 // 32 for H100, 48 for H200 + : sm_arch >= 800 ? 
16 * 1024 // A100 + : 12 * 1024; // V100 and below +} + +template +_CCCL_HOST_DEVICE constexpr auto bulk_copy_smem_for_tile_size(int tile_size) -> int +{ + return round_up_to_po2_multiple(int{sizeof(int64_t)}, bulk_copy_alignment) /* bar */ + // 128 bytes of padding for each input tile (handles before + after) + + tile_size * loaded_bytes_per_iteration() + + sizeof...(RandomAccessIteratorsIn) * bulk_copy_alignment; +} + +using fallback_for_policy = for_each::policy_hub_t::policy_350_t::for_policy_t; + +template +struct policy_hub +{ + static_assert(sizeof(RandomAccessIteratorTupleIn) == 0, "Second parameter must be a tuple"); +}; + +template +struct policy_hub> +{ + static constexpr bool no_input_streams = sizeof...(RandomAccessIteratorsIn) == 0; + static constexpr bool all_contiguous = + ::cuda::std::conjunction...>::value; + static constexpr bool all_values_trivially_reloc = + ::cuda::std::conjunction>...>::value; + + static constexpr bool can_memcpy = all_contiguous && all_values_trivially_reloc; + + // TODO(bgruber): consider a separate kernel for just filling + + struct policy300 : ChainedPolicy<300, policy300, policy300> + { + static constexpr int min_bif = arch_to_min_bytes_in_flight(300); + // TODO(bgruber): we don't need algo, because we can just detect the type of algo_policy + static constexpr auto algorithm = Algorithm::fallback_for; + using algo_policy = fallback_for_policy; + }; + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP + // H100 and H200 + struct policy900 : ChainedPolicy<900, policy900, policy300> + { + static constexpr int min_bif = arch_to_min_bytes_in_flight(900); + using async_policy = async_copy_policy_t<256>; + static constexpr bool exhaust_smem = + bulk_copy_smem_for_tile_size( + async_policy::block_threads * async_policy::min_items_per_thread) + > 48 * 1024; + static constexpr bool any_type_is_overalinged = +# if _CCCL_STD_VER >= 2017 + ((alignof(value_t) > bulk_copy_alignment) || ...); +# else + sum((alignof(value_t) > bulk_copy_alignment)...) > 0; +# endif + + static constexpr bool use_fallback = + RequiresStableAddress || !can_memcpy || no_input_streams || exhaust_smem || any_type_is_overalinged; + static constexpr auto algorithm = use_fallback ? Algorithm::fallback_for : Algorithm::ublkcp; + using algo_policy = ::cuda::std::_If; + }; + + using max_policy = policy900; +#else // _CUB_HAS_TRANSFORM_UBLKCP + using max_policy = policy300; +#endif // _CUB_HAS_TRANSFORM_UBLKCP +}; + +// TODO(bgruber): replace by ::cuda::std::expected in C++14 +template +struct PoorExpected +{ + alignas(T) char storage[sizeof(T)]; + cudaError_t error; + + _CCCL_HOST_DEVICE PoorExpected(T value) + : error(cudaSuccess) + { + new (storage) T(::cuda::std::move(value)); + } + + _CCCL_HOST_DEVICE PoorExpected(cudaError_t error) + : error(error) + {} + + _CCCL_HOST_DEVICE explicit operator bool() const + { + return error == cudaSuccess; + } + + _CCCL_HOST_DEVICE T& operator*() + { + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_GCC("-Wstrict-aliasing") + return reinterpret_cast(storage); + _CCCL_DIAG_POP + } + + _CCCL_HOST_DEVICE const T& operator*() const + { + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_GCC("-Wstrict-aliasing") + return reinterpret_cast(storage); + _CCCL_DIAG_POP + } + + _CCCL_HOST_DEVICE T* operator->() + { + return &**this; + } + + _CCCL_HOST_DEVICE const T* operator->() const + { + return &**this; + } +}; + +// TODO(bgruber): this is very similar to thrust::cuda_cub::core::get_max_shared_memory_per_block. We should unify this. 
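// [Editor's note, not part of this patch] PoorExpected carries either a value or a cudaError_t. The host/device code
// below uses it in the following sketched pattern (get_max_shared_memory is defined next; the int payload is an
// assumption for illustration):
//   PoorExpected<int> smem = get_max_shared_memory();
//   if (!smem)
//   {
//     return smem.error; // propagate the CUDA error
//   }
//   use(*smem);          // otherwise dereference the contained value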
+_CCCL_HOST_DEVICE inline PoorExpected get_max_shared_memory() +{ + // gevtushenko promised me that I can assume that the stream passed to the CUB API entry point (where the kernels + // will later be launched on) belongs to the currently active device. So we can just query the active device here. + int device = 0; + auto error = CubDebug(cudaGetDevice(&device)); + if (error != cudaSuccess) + { + return error; + } + + int max_smem = 0; + error = CubDebug(cudaDeviceGetAttribute(&max_smem, cudaDevAttrMaxSharedMemoryPerBlock, device)); + if (error != cudaSuccess) + { + return error; + } + + return max_smem; +} + +struct elem_counts +{ + int elem_per_thread; + int tile_size; + int smem_size; +}; + +template > +struct dispatch_t; + +template +struct dispatch_t, + RandomAccessIteratorOut, + TransformOp, + PolicyHub> +{ + static_assert(::cuda::std::is_same::value + || ::cuda::std::is_same::value, + "cub::DeviceTransform is only tested and tuned for 32-bit or 64-bit signed offset types"); + + ::cuda::std::tuple in; + RandomAccessIteratorOut out; + Offset num_items; + TransformOp op; + cudaStream_t stream; + +#define CUB_DETAIL_TRANSFORM_KERNEL_PTR \ + &transform_kernel...> + + static constexpr int loaded_bytes_per_iter = loaded_bytes_per_iteration(); + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP + // TODO(bgruber): I want to write tests for this but those are highly depending on the architecture we are running + // on? + template + CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel() + -> PoorExpected< + ::cuda::std:: + tuple> + { + using policy_t = typename ActivePolicy::algo_policy; + constexpr int block_dim = policy_t::block_threads; + static_assert(block_dim % bulk_copy_alignment == 0, + "block_threads needs to be a multiple of bulk_copy_alignment (128)"); // then tile_size is a multiple + // of 128-byte + + auto determine_element_counts = [&]() -> PoorExpected { + const auto max_smem = get_max_shared_memory(); + if (!max_smem) + { + return max_smem.error; + } + + elem_counts last_counts{}; + // Increase the number of output elements per thread until we reach the required bytes in flight. 
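// [Editor's note, not part of this patch] In the loop below, each candidate elem_per_thread yields a tile_size and an
// smem_size; a candidate is accepted once max_occupancy * tile_size * loaded_bytes_per_iter reaches
// ActivePolicy::min_bif, or earlier if a single tile already covers num_items. If the required shared memory exceeds
// the device limit, the last configuration that still fit is returned instead.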
+ static_assert(policy_t::min_items_per_thread <= policy_t::max_items_per_thread, ""); // ensures the loop below + // runs at least once + for (int elem_per_thread = +policy_t::min_items_per_thread; elem_per_thread < +policy_t::max_items_per_thread; + ++elem_per_thread) + { + const int tile_size = block_dim * elem_per_thread; + const int smem_size = bulk_copy_smem_for_tile_size(tile_size); + if (smem_size > *max_smem) + { +# ifdef CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + // assert should be prevented by smem check in policy + assert(last_counts.elem_per_thread > 0 && "min_items_per_thread exceeds available shared memory"); +# endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + return last_counts; + } + + if (tile_size >= num_items) + { + return elem_counts{elem_per_thread, tile_size, smem_size}; + } + + int max_occupancy = 0; + const auto error = + CubDebug(MaxSmOccupancy(max_occupancy, CUB_DETAIL_TRANSFORM_KERNEL_PTR, block_dim, smem_size)); + if (error != cudaSuccess) + { + return error; + } + + const int bytes_in_flight_SM = max_occupancy * tile_size * loaded_bytes_per_iter; + if (ActivePolicy::min_bif <= bytes_in_flight_SM) + { + return elem_counts{elem_per_thread, tile_size, smem_size}; + } + + last_counts = elem_counts{elem_per_thread, tile_size, smem_size}; + } + return last_counts; + }; + PoorExpected config = [&]() { + NV_IF_TARGET( + NV_IS_HOST, + ( + // this static variable exists for each template instantiation of the surrounding function and class, on which + // the chosen element count solely depends (assuming max SMEM is constant during a program execution) + static auto cached_config = determine_element_counts(); return cached_config;), + ( + // we cannot cache the determined element count in device code + return determine_element_counts();)); + }(); + if (!config) + { + return config.error; + } +# ifdef CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + assert(config->elem_per_thread > 0); + assert(config->tile_size > 0); + assert(config->tile_size % bulk_copy_alignment == 0); + assert((sizeof...(RandomAccessIteratorsIn) == 0) != (config->smem_size != 0)); // logical xor +# endif // CUB_DETAIL_DEBUG_ENABLE_HOST_ASSERTIONS + + const auto grid_dim = static_cast(::cuda::ceil_div(num_items, Offset{config->tile_size})); + return ::cuda::std::make_tuple( + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_dim, config->smem_size, stream), + CUB_DETAIL_TRANSFORM_KERNEL_PTR, + config->elem_per_thread); + } + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t + invoke_algorithm(cuda::std::index_sequence, ::cuda::std::integral_constant) + { + auto ret = configure_ublkcp_kernel(); + if (!ret) + { + return ret.error; + } + // TODO(bgruber): use a structured binding in C++17 + // auto [launcher, kernel, elem_per_thread] = *ret; + + return ::cuda::std::get<0>(*ret).doit( + ::cuda::std::get<1>(*ret), + num_items, + ::cuda::std::get<2>(*ret), + op, + out, + make_aligned_base_ptr_kernel_arg( + THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)), bulk_copy_alignment)...); + } +#endif // _CUB_HAS_TRANSFORM_UBLKCP + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t + invoke_algorithm(cuda::std::index_sequence, ::cuda::std::integral_constant) + { + constexpr int block_threads = ActivePolicy::algo_policy::block_threads; + constexpr int items_per_thread = ActivePolicy::algo_policy::items_per_thread; + constexpr int tile_size = block_threads * items_per_thread; + const auto grid_dim = static_cast(::cuda::ceil_div(num_items, 
Offset{tile_size})); + return CubDebug( + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_threads, 0, stream) + .doit( + CUB_DETAIL_TRANSFORM_KERNEL_PTR, + num_items, + items_per_thread, + op, + out, + make_iterator_kernel_arg(THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(::cuda::std::get(in)))...)); + } + + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke() + { + // // TODO(bgruber): replace the overload set by if constexpr in C++17 + return invoke_algorithm(::cuda::std::index_sequence_for{}, + ::cuda::std::integral_constant{}); + } + + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( + ::cuda::std::tuple in, + RandomAccessIteratorOut out, + Offset num_items, + TransformOp op, + cudaStream_t stream) + { + if (num_items == 0) + { + return cudaSuccess; + } + + int ptx_version = 0; + auto error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) + { + return error; + } + + dispatch_t dispatch{::cuda::std::move(in), ::cuda::std::move(out), num_items, ::cuda::std::move(op), stream}; + return CubDebug(PolicyHub::max_policy::Invoke(ptx_version, dispatch)); + } + +#undef CUB_DETAIL_TRANSFORM_KERNEL_PTR +}; +} // namespace transform +} // namespace detail +CUB_NAMESPACE_END diff --git a/cub/test/CMakeLists.txt b/cub/test/CMakeLists.txt index 48a0142801a..3ec8c94eef1 100644 --- a/cub/test/CMakeLists.txt +++ b/cub/test/CMakeLists.txt @@ -261,6 +261,11 @@ function(cub_add_test target_name_var test_name test_src cub_target launcher_id) target_compile_options(${test_target} PRIVATE -ftemplate-depth=1000) # for handling large type lists endif() + # enable lambdas for all API examples + if ("${test_target}" MATCHES "test.[A-Za-z0-9_]+_api") + target_compile_options(${test_target} PRIVATE $<$:--extended-lambda>) + endif() + target_link_libraries(${test_target} PRIVATE ${cub_target} ${config_c2h_target} diff --git a/cub/test/catch2_test_device_transform.cu b/cub/test/catch2_test_device_transform.cu new file mode 100644 index 00000000000..50f253ef5c3 --- /dev/null +++ b/cub/test/catch2_test_device_transform.cu @@ -0,0 +1,556 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "insert_nested_NVTX_range_guard.h" +// above header needs to be included first + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "c2h/custom_type.cuh" +#include "catch2_test_helper.h" +#include "catch2_test_launch_helper.h" +#include "test/test_util_vec.h" + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +using cub::detail::transform::Algorithm; + +template +struct policy_hub_for_alg +{ + struct max_policy : cub::ChainedPolicy<300, max_policy, max_policy> + { + static constexpr int min_bif = 64 * 1024; + static constexpr Algorithm algorithm = Alg; + using algo_policy = + ::cuda::std::_If>; + }; +}; + +template +CUB_RUNTIME_FUNCTION static cudaError_t transform_many_with_alg_entry_point( + void* d_temp_storage, + size_t& temp_storage_bytes, + ::cuda::std::tuple inputs, + RandomAccessIteratorOut output, + Offset num_items, + TransformOp transform_op, + cudaStream_t stream = nullptr) +{ + if (d_temp_storage == nullptr) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + constexpr bool RequiresStableAddress = false; + return cub::detail::transform::dispatch_t, + RandomAccessIteratorOut, + TransformOp, + policy_hub_for_alg>{} + .dispatch(inputs, output, num_items, transform_op, stream); +} + +DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many); +DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::TransformStableArgumentAddresses, transform_many_stable); +DECLARE_TMPL_LAUNCH_WRAPPER(transform_many_with_alg_entry_point, + transform_many_with_alg, + ESCAPE_LIST(Algorithm Alg, typename Offset), + ESCAPE_LIST(Alg, Offset)); + +using algorithms = + c2h::enum_type_list; + +using offset_types = c2h::type_list; + +#ifdef _CUB_HAS_TRANSFORM_UBLKCP +# define FILTER_UBLKCP \ + if (alg == Algorithm::ublkcp && ptx_version < 900) \ + { \ + return; \ + } +#else // _CUB_HAS_TRANSFORM_UBLKCP +# define FILTER_UBLKCP +#endif // _CUB_HAS_TRANSFORM_UBLKCP + +#define FILTER_UNSUPPORTED_ALGS \ + int ptx_version = 0; \ + REQUIRE(cub::PtxVersion(ptx_version) == cudaSuccess); \ + _CCCL_DIAG_PUSH \ + _CCCL_DIAG_SUPPRESS_MSVC(4127) /* conditional expression is constant */ \ + FILTER_UBLKCP \ + _CCCL_DIAG_POP + +CUB_TEST("DeviceTransform::Transform BabelStream add", + "[device][device_transform]", + c2h::type_list, + offset_types, + algorithms) +{ + using type = typename c2h::get<0, TestType>; + using offset_t = typename c2h::get<1, TestType>; + constexpr auto alg = c2h::get<2, TestType>::value; + FILTER_UNSUPPORTED_ALGS + const int num_items = GENERATE(0, 1, 15, 16, 17, 127, 128, 129, 4095, 4096, 4097); // edge cases around 16 and 128 + CAPTURE(c2h::demangle(typeid(type).name()), c2h::demangle(typeid(offset_t).name()), alg, num_items); + + c2h::device_vector a(num_items); + c2h::device_vector b(num_items); + c2h::gen(CUB_SEED(1), a); + c2h::gen(CUB_SEED(1), b); + + c2h::device_vector result(num_items); + transform_many_with_alg( + ::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), num_items, ::cuda::std::plus{}); + + // compute reference and verify + c2h::host_vector a_h = a; + c2h::host_vector b_h = b; + c2h::host_vector reference_h(num_items); + std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), std::plus{}); + REQUIRE(reference_h == result); +} + +template +struct alignas(Alignment) overaligned_addable_t +{ + int value; + + overaligned_addable_t() = default; + + _CCCL_HOST_DEVICE overaligned_addable_t(int val) + : value{val} + {} + + _CCCL_HOST_DEVICE static 
void check(const overaligned_addable_t& obj) + { + if (reinterpret_cast(&obj) % Alignment != 0) + { + printf("Error: object not aligned to %d: %p\n", Alignment, &obj); + ::cuda::std::terminate(); + } + } + + _CCCL_HOST_DEVICE friend auto operator==(const overaligned_addable_t& a, const overaligned_addable_t& b) -> bool + { + check(a); + check(b); + return a.value == b.value; + } + + _CCCL_HOST_DEVICE friend auto + operator+(const overaligned_addable_t& a, const overaligned_addable_t& b) -> overaligned_addable_t + { + check(a); + check(b); + return overaligned_addable_t{a.value + b.value}; + } + + _CCCL_HOST friend auto operator<<(std::ostream& os, const overaligned_addable_t& obj) -> std::ostream& + { + check(obj); + return os << "over{" << obj.value << "}"; + } +}; + +using overaligned_types = + c2h::type_list +#ifndef _CCCL_COMPILER_MSVC // error C2719: [...] formal parameter with requested alignment of 256 won't be aligned + , + overaligned_addable_t<256> +#endif // _CCCL_COMPILER_MSVC + >; + +// test with types exceeding the memcpy_async and bulk copy alignments (16 and 128 bytes respectively) +CUB_TEST("DeviceTransform::Transform overaligned type", "[device][device_transform]", overaligned_types) +{ + using type = c2h::get<0, TestType>; + CAPTURE(c2h::demangle(typeid(type).name())); + + const int num_items = GENERATE(0, 1, 100, 1000); + c2h::device_vector a(num_items, 3); // put some integers at the front, so SMEM has to handle different alignments + c2h::device_vector b(num_items, 4); + + c2h::device_vector result(num_items); + // we need raw pointers here to halfen the conversion sequence from device_reference -> int -> type when calling + // plus(...), which is too long to compile + transform_many(::cuda::std::make_tuple(thrust::raw_pointer_cast(a.data()), thrust::raw_pointer_cast(b.data())), + result.begin(), + num_items, + ::cuda::std::plus{}); + + REQUIRE(result == c2h::device_vector(num_items, 7)); +} + +CUB_TEST("DeviceTransform::Transform huge type", "[device][device_transform]") +{ + using huge_t = c2h::custom_type_t::type>; + static_assert(alignof(huge_t) == 8, "Need a large type with alignment < 16"); + CAPTURE(c2h::demangle(typeid(huge_t).name())); + + const int num_items = GENERATE(0, 1, 100, 1000); + c2h::device_vector a(num_items); + c2h::device_vector b(num_items); + c2h::gen(CUB_SEED(1), a); + c2h::gen(CUB_SEED(1), b); + + c2h::device_vector result(num_items); + transform_many(::cuda::std::make_tuple(a.begin(), b.begin()), result.begin(), num_items, ::cuda::std::plus{}); + + c2h::host_vector a_h = a; + c2h::host_vector b_h = b; + c2h::host_vector reference_h(num_items); + std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), std::plus{}); + REQUIRE(result == reference_h); +} + +struct times_seven +{ + _CCCL_HOST_DEVICE auto operator()(unsigned char v) const -> char + { + return static_cast(v * 7); + } +}; + +CUB_TEST("DeviceTransform::Transform with large input", "[device][device_transform]", algorithms) +try +{ + using type = unsigned char; + using offset_t = cuda::std::int64_t; + constexpr auto alg = c2h::get<0, TestType>::value; + FILTER_UNSUPPORTED_ALGS + CAPTURE(alg); + + constexpr offset_t num_items = (offset_t{1} << 32) + 123456; // a few thread blocks beyond 4GiB + c2h::device_vector input(num_items); + c2h::gen(CUB_SEED(1), input); + + c2h::device_vector result(num_items); + transform_many_with_alg( + ::cuda::std::make_tuple(input.begin()), result.begin(), num_items, times_seven{}); + + // compute reference and verify + c2h::host_vector 
input_h = input; + c2h::host_vector reference_h(num_items); + std::transform(input_h.begin(), input_h.end(), reference_h.begin(), times_seven{}); + REQUIRE((reference_h == result)); +} +catch (const std::bad_alloc&) +{ + // allocation failure is not a test failure, so we can run tests on smaller GPUs +} + +template +struct nstream_kernel +{ + static constexpr T scalar = 42; + + _CCCL_HOST_DEVICE T operator()(const T& ai, const T& bi, const T& ci) const + { + return ai + bi + scalar * ci; + } +}; + +// overwrites one input stream +CUB_TEST("DeviceTransform::Transform BabelStream nstream", + "[device][device_transform]", + c2h::type_list, + offset_types, + algorithms) +{ + using type = typename c2h::get<0, TestType>; + using offset_t = typename c2h::get<1, TestType>; + constexpr auto alg = c2h::get<2, TestType>::value; + FILTER_UNSUPPORTED_ALGS + CAPTURE(c2h::demangle(typeid(type).name()), c2h::demangle(typeid(offset_t).name()), alg); + + const int num_items = GENERATE(0, 1, 100, 1000, 10000); + c2h::device_vector a(num_items); + c2h::device_vector b(num_items); + c2h::device_vector c(num_items); + c2h::gen(CUB_SEED(1), a, type{10}, type{100}); + c2h::gen(CUB_SEED(1), b, type{10}, type{100}); + c2h::gen(CUB_SEED(1), c, type{10}, type{100}); + + // copy to host before changing + c2h::host_vector a_h = a; + c2h::host_vector b_h = b; + c2h::host_vector c_h = c; + + transform_many_with_alg( + ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin()), a.begin(), num_items, nstream_kernel{}); + + // compute reference and verify + auto z = thrust::make_zip_iterator(a_h.begin(), b_h.begin(), c_h.begin()); + std::transform(z, z + num_items, a_h.begin(), thrust::make_zip_function(nstream_kernel{})); + REQUIRE(a_h == a); +} + +struct sum_five +{ + __device__ auto operator()(std::int8_t a, std::int16_t b, std::int32_t c, std::int64_t d, float e) const -> double + { + return a + b + c + d + e; + } +}; + +CUB_TEST("DeviceTransform::Transform add five streams", "[device][device_transform]", algorithms) +{ + using offset_t = int; + constexpr auto alg = c2h::get<0, TestType>::value; + FILTER_UNSUPPORTED_ALGS + + constexpr int num_items = 100; + c2h::device_vector a(num_items, 1); + c2h::device_vector b(num_items, 2); + c2h::device_vector c(num_items, 3); + c2h::device_vector d(num_items, 4); + c2h::device_vector e(num_items, 5); + + c2h::device_vector result(num_items); + transform_many_with_alg( + ::cuda::std::make_tuple(a.begin(), b.begin(), c.begin(), d.begin(), e.begin()), + result.begin(), + num_items, + sum_five{}); + + // compute reference and verify + c2h::device_vector reference(num_items, 1 + 2 + 3 + 4 + 5); + REQUIRE(reference == result); +} + +struct give_me_five +{ + __device__ auto operator()() const -> int + { + return 5; + } +}; + +CUB_TEST("DeviceTransform::Transform no streams", "[device][device_transform]") +{ + constexpr int num_items = 100; + c2h::device_vector result(num_items); + transform_many(::cuda::std::tuple<>{}, result.begin(), num_items, give_me_five{}); + + // compute reference and verify + c2h::device_vector reference(num_items, 5); + REQUIRE(reference == result); +} + +CUB_TEST("DeviceTransform::Transform fancy input iterator types", "[device][device_transform]") +{ + using type = int; + + constexpr int num_items = 100; + thrust::counting_iterator a{0}; + thrust::counting_iterator b{10}; + + c2h::device_vector result(num_items); + transform_many(::cuda::std::make_tuple(a, b), result.begin(), num_items, ::cuda::std::plus{}); + + // compute reference and verify + 
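+ // (descriptive note, assumed from the setup above: counting_iterator works on the host as well, so the same
+ // iterators a and b can drive the host-side reference below; with a starting at 0 and b at 10, the expected
+ // value at index i is i + (10 + i) = 2 * i + 10)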
c2h::host_vector reference_h(num_items); + std::transform(a, a + num_items, b, reference_h.begin(), std::plus{}); + REQUIRE(reference_h == result); +} + +CUB_TEST("DeviceTransform::Transform fancy output iterator type", "[device][device_transform]", algorithms) +{ + using type = int; + using offset_t = int; + constexpr auto alg = c2h::get<0, TestType>::value; + FILTER_UNSUPPORTED_ALGS + + constexpr int num_items = 100; + c2h::device_vector a(num_items, 13); + c2h::device_vector b(num_items, 35); + c2h::device_vector result(num_items); + + using thrust::placeholders::_1; + auto out = thrust::make_transform_output_iterator(result.begin(), _1 + 4); + transform_many_with_alg( + ::cuda::std::make_tuple(a.begin(), b.begin()), out, num_items, ::cuda::std::plus{}); + REQUIRE(result == c2h::device_vector(num_items, (13 + 35) + 4)); +} + +CUB_TEST("DeviceTransform::Transform mixed input iterator types", "[device][device_transform]") +{ + using type = int; + + constexpr int num_items = 100; + thrust::counting_iterator a{0}; + c2h::device_vector b(num_items, 10); + + c2h::device_vector result(num_items); + transform_many(::cuda::std::make_tuple(a, b.begin()), result.begin(), num_items, ::cuda::std::plus{}); + + // compute reference and verify + c2h::host_vector b_h = b; + c2h::host_vector reference_h(num_items); + std::transform(a, a + num_items, b_h.begin(), reference_h.begin(), std::plus{}); + REQUIRE(reference_h == result); +} + +struct plus_needs_stable_address +{ + int* a; + int* b; + + _CCCL_HOST_DEVICE int operator()(const int& v) const + { + const auto i = &v - a; + return v + b[i]; + } +}; + +CUB_TEST("DeviceTransform::Transform address stability", "[device][device_transform]") +{ + using type = int; + + constexpr int num_items = 100; + c2h::device_vector a(num_items); + c2h::device_vector b(num_items); + thrust::sequence(a.begin(), a.end()); + thrust::sequence(b.begin(), b.end(), 42); + + c2h::device_vector result(num_items); + transform_many_stable( + ::cuda::std::make_tuple(thrust::raw_pointer_cast(a.data())), + result.begin(), + num_items, + plus_needs_stable_address{thrust::raw_pointer_cast(a.data()), thrust::raw_pointer_cast(b.data())}); + + // compute reference and verify + c2h::device_vector a_h = a; + c2h::device_vector b_h = b; + c2h::host_vector reference_h(num_items); + std::transform(a_h.begin(), a_h.end(), b_h.begin(), reference_h.begin(), std::plus{}); + REQUIRE(reference_h == result); +} + +// Non-trivially-copyable/relocatable type which cannot be copied using std::memcpy or cudaMemcpy +struct non_trivial +{ + int data; + + non_trivial() = default; + + _CCCL_HOST_DEVICE explicit non_trivial(int data) + : data(data) + {} + + _CCCL_HOST_DEVICE non_trivial(const non_trivial& nt) + : data(nt.data) + {} + + _CCCL_HOST_DEVICE auto operator=(const non_trivial& nt) -> non_trivial& + { + data = nt.data; + return *this; + } + + _CCCL_HOST_DEVICE auto operator-() const -> non_trivial + { + return non_trivial{-data}; + } + + friend _CCCL_HOST_DEVICE auto operator==(non_trivial a, non_trivial b) -> bool + { + return a.data == b.data; + } +}; +static_assert(!::cuda::std::is_trivially_copyable::value, ""); // as required by the standard +static_assert(!thrust::is_trivially_relocatable::value, ""); // CUB uses this check internally + +// Note(bgruber): I gave up on writing a test that checks whether the copy ctor/assignment operator is actually called +// (e.g. by tracking/counting invocations of those), since C++ allows (but does not guarantee) elision of these operations. 
+// Also thrust algorithms perform a lot of copies in-between, so the test needs to use only raw allocations and +// iteration for setup and checking. +CUB_TEST("DeviceTransform::Transform not trivially relocatable", "[device][device_transform]") +{ + constexpr int num_items = 100; + c2h::device_vector input(num_items, non_trivial{42}); + c2h::device_vector result(num_items); + transform_many( + ::cuda::std::make_tuple(thrust::raw_pointer_cast(input.data())), result.begin(), num_items, ::cuda::std::negate<>{}); + + const auto reference = c2h::device_vector(num_items, non_trivial{-42}); + REQUIRE((reference == result)); +} + +CUB_TEST("DeviceTransform::Transform buffer start alignment", + "[device][device_transform]", + c2h::type_list) +{ + using type = typename c2h::get<0, TestType>; + + constexpr int num_items = 1000; + const int offset = GENERATE(1, 2, 4, 8, 16, 32, 64, 128); // global memory is always at least 256 byte aligned + CAPTURE(c2h::demangle(typeid(type).name()), offset); + c2h::device_vector input(num_items); + thrust::sequence(input.begin(), input.end()); + c2h::device_vector result(num_items); + using thrust::placeholders::_1; + transform_many(::cuda::std::make_tuple(input.begin() + offset), + result.begin() + offset, + num_items - offset, + _1 * 10); // FIXME(bgruber): does not work on negative + + c2h::device_vector reference(num_items); + thrust::tabulate(reference.begin() + offset, reference.end(), (_1 + offset) * 10); + REQUIRE(reference == result); +} + +namespace Catch +{ +template +struct StringMaker> +{ + static auto convert(cub::detail::transform::aligned_base_ptr abp) -> std::string + { + std::stringstream ss; + ss << "{ptr: " << abp.ptr << ", head_padding: " << abp.head_padding << "}"; + return ss.str(); + } +}; +} // namespace Catch + +// TODO(bgruber): rewrite this example using int3 +CUB_TEST("DeviceTransform::Transform aligned_base_ptr", "[device][device_transform]") +{ + alignas(128) int arr[256]; + using namespace cub::detail::transform; + CHECK(make_aligned_base_ptr(&arr[0], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 0}); + CHECK(make_aligned_base_ptr(&arr[1], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 4}); + CHECK(make_aligned_base_ptr(&arr[5], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 20}); + CHECK(make_aligned_base_ptr(&arr[31], 128) == aligned_base_ptr{reinterpret_cast(&arr[0]), 124}); + CHECK(make_aligned_base_ptr(&arr[32], 128) == aligned_base_ptr{reinterpret_cast(&arr[32]), 0}); + CHECK(make_aligned_base_ptr(&arr[33], 128) == aligned_base_ptr{reinterpret_cast(&arr[32]), 4}); + CHECK(make_aligned_base_ptr(&arr[127], 128) == aligned_base_ptr{reinterpret_cast(&arr[96]), 124}); + CHECK(make_aligned_base_ptr(&arr[128], 128) == aligned_base_ptr{reinterpret_cast(&arr[128]), 0}); + CHECK(make_aligned_base_ptr(&arr[129], 128) == aligned_base_ptr{reinterpret_cast(&arr[128]), 4}); +} diff --git a/cub/test/catch2_test_device_transform_api.cu b/cub/test/catch2_test_device_transform_api.cu new file mode 100644 index 00000000000..46388ed6b23 --- /dev/null +++ b/cub/test/catch2_test_device_transform_api.cu @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include +#include + +#include "catch2_test_helper.h" + +// need a separate function because the ext. 
lambda needs to be enclosed by a function with external linkage on Windows +void test_transform_api() +{ + // example-begin transform-many + constexpr auto num_items = 4; + auto input1 = thrust::device_vector{0, -2, 5, 3}; + auto input2 = thrust::device_vector{5.2f, 3.1f, -1.1f, 3.0f}; + auto input3 = thrust::counting_iterator{100}; + auto op = [] __device__(int a, float b, int c) { + return (a + b) * c; + }; + + auto result = thrust::device_vector(num_items); + cub::DeviceTransform::Transform( + ::cuda::std::make_tuple(input1.begin(), input2.begin(), input3), result.begin(), num_items, op); + + const auto expected = thrust::host_vector{520, 111, 397, 618}; + // example-end transform-many + CHECK(result == expected); +} + +CUB_TEST("DeviceTransform::Transform API example", "[device][device_transform]") +{ + test_transform_api(); +} + +// need a separate function because the ext. lambda needs to be enclosed by a function with external linkage on Windows +void test_transform_stable_api() +{ + // example-begin transform-many-stable + constexpr auto num_items = 4; + auto input1 = thrust::device_vector{0, -2, 5, 3}; + auto input2 = thrust::device_vector{52, 31, -11, 30}; + + auto* input1_ptr = thrust::raw_pointer_cast(input1.data()); + auto* input2_ptr = thrust::raw_pointer_cast(input2.data()); + + auto op = [input1_ptr, input2_ptr] __device__(const int& a) -> int { + const auto i = &a - input1_ptr; // we depend on the address of a + return a + input2_ptr[i]; + }; + + auto result = thrust::device_vector(num_items); + cub::DeviceTransform::TransformStableArgumentAddresses( + ::cuda::std::make_tuple(input1_ptr), result.begin(), num_items, op); + + const auto expected = thrust::host_vector{52, 29, -6, 33}; + // example-end transform-many-stable + CHECK(result == expected); +} + +CUB_TEST("DeviceTransform::TransformStableArgumentAddresses API example", "[device][device_transform]") +{ + test_transform_stable_api(); +} diff --git a/cub/test/catch2_test_launch_helper.h b/cub/test/catch2_test_launch_helper.h index 11da1a32e3b..4add1d15d11 100644 --- a/cub/test/catch2_test_launch_helper.h +++ b/cub/test/catch2_test_launch_helper.h @@ -74,7 +74,7 @@ //! Consult with `test/catch2_test_cdp_wrapper.cu` for more usage examples. #if !defined(TEST_LAUNCH) -# error Test file should contain %PARAM% TEST_LAUNCH lid 0:1 +# error Test file should contain %PARAM% TEST_LAUNCH lid 0:1:2 #endif #define DECLARE_INVOCABLE(API, WRAPPED_API_NAME, TMPL_HEAD_OPT, TMPL_ARGS_OPT) \