diff --git a/docs/thrust/cmake_options.rst b/docs/thrust/cmake_options.rst index a3dab2487b5..f5dfb2209a5 100644 --- a/docs/thrust/cmake_options.rst +++ b/docs/thrust/cmake_options.rst @@ -69,6 +69,19 @@ Generic CMake Options - If true, installation rules will be generated for thrust. Default is ``ON``. +- ``THRUST_DISPATCH_TYPE={Dynamic, Force32bit, Force64bit}`` + + - Allows the user to force Thrust to use a specific size for the offset type. Default + is ``Dynamic``. + + - ``Dynamic`` lets Thrust choose the index type based on input size, allowing + large inputs and optimal performance at the cost of increased compile time and binary size, + as Thrust will compile each kernel twice, once for 32 bit and once for 64 bit. + - ``Force32bit`` forces Thrust to use a 32 bit offset type. This improves compile time and + binary size but limits the input size. + - ``Force64bit`` forces Thrust to use a 64 bit offset type. This improves compile time and + binary size and allows large input sizes. However, it might degrade runtime performance. + Single Config CMake Options --------------------------- diff --git a/thrust/CMakeLists.txt b/thrust/CMakeLists.txt index ca372d3677f..a70d8a1c693 100644 --- a/thrust/CMakeLists.txt +++ b/thrust/CMakeLists.txt @@ -59,6 +59,10 @@ option(THRUST_ENABLE_TESTING "Build Thrust testing suite." "ON") option(THRUST_ENABLE_EXAMPLES "Build Thrust examples." "ON") option(THRUST_ENABLE_BENCHMARKS "Build Thrust runtime benchmarks." "${CCCL_ENABLE_BENCHMARKS}") +# Allow the user to optionally select offset type dispatch to fixed 32 or 64 bit types +set(THRUST_DISPATCH_TYPE "Dynamic" CACHE STRING "Select Thrust offset type dispatch." FORCE) +set_property(CACHE THRUST_DISPATCH_TYPE PROPERTY STRINGS "Dynamic" "Force32bit" "Force64bit") + # Check if we're actually building anything before continuing. If not, no need # to search for deps, etc. This is a common approach for packagers that just # need the install rules. See GH issue NVIDIA/thrust#1211. diff --git a/thrust/cmake/ThrustBuildCompilerTargets.cmake b/thrust/cmake/ThrustBuildCompilerTargets.cmake index 158ca04faf1..c5be03ecf5c 100644 --- a/thrust/cmake/ThrustBuildCompilerTargets.cmake +++ b/thrust/cmake/ThrustBuildCompilerTargets.cmake @@ -131,6 +131,12 @@ function(thrust_build_compiler_targets) ) endforeach() + if (THRUST_DISPATCH_TYPE STREQUAL "Force32bit") + list(APPEND cxx_compile_definitions "THRUST_FORCE_32_BIT_OFFSET_TYPE") + elseif (THRUST_DISPATCH_TYPE STREQUAL "Force64bit") + list(APPEND cxx_compile_definitions "THRUST_FORCE_64_BIT_OFFSET_TYPE") + endif() + foreach (cxx_definition IN LISTS cxx_compile_definitions) # Add these for both CUDA and CXX targets: target_compile_definitions(thrust.compiler_interface INTERFACE diff --git a/thrust/cmake/ThrustHeaderTesting.cmake b/thrust/cmake/ThrustHeaderTesting.cmake index 4c1d07f744b..5a10912864b 100644 --- a/thrust/cmake/ThrustHeaderTesting.cmake +++ b/thrust/cmake/ThrustHeaderTesting.cmake @@ -146,6 +146,19 @@ foreach(thrust_target IN LISTS THRUST_TARGETS) "CUB_WRAPPED_NAMESPACE=wrapped_cub") thrust_add_header_test(${thrust_target} base "${header_definitions}") + # We need to ensure that the different dispatch mechanisms work + set(header_definitions + "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" + "CUB_WRAPPED_NAMESPACE=wrapped_cub" + "THRUST_FORCE_32_BIT_OFFSET_TYPE") + thrust_add_header_test(${thrust_target} offset_32 "${header_definitions}") + + set(header_definitions + "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" + "CUB_WRAPPED_NAMESPACE=wrapped_cub" + "THRUST_FORCE_64_BIT_OFFSET_TYPE") + thrust_add_header_test(${thrust_target} offset_64 "${header_definitions}") + thrust_get_target_property(config_device ${thrust_target} DEVICE) if ("CUDA" STREQUAL "${config_device}") # Check that BF16 support can be disabled diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h index 971b93d6281..3d004aa5531 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h @@ -26,96 +26,186 @@ # pragma system_header #endif // no system header +#include #include #include +#include +#include + #include +#include -/** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm implementation. This version assumes - * that callables for both branches consist of the same tokens, and is intended to be used with Thrust-style dispatch - * interfaces, that always deduce the size type from the arguments. - */ -#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ - if (count <= thrust::detail::integer_traits::const_max) \ - { \ - auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ - status = call arguments; \ - } \ - else \ - { \ - auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ - status = call arguments; \ - } +#if defined(THRUST_FORCE_32_BIT_OFFSET_TYPE) && defined(THRUST_FORCE_64_BIT_OFFSET_TYPE) +# error "Only THRUST_FORCE_32_BIT_OFFSET_TYPE or THRUST_FORCE_64_BIT_OFFSET_TYPE may be defined!" +#endif // THRUST_FORCE_32_BIT_OFFSET_TYPE && THRUST_FORCE_64_BIT_OFFSET_TYPE -/** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm implementation. This version assumes - * that callables for both branches consist of the same tokens, and is intended to be used with Thrust-style dispatch - * interfaces, that always deduce the size type from the arguments. - * - * This version of the macro supports providing two count variables, which is necessary for set algorithms. - */ -#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \ - if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ - { \ - auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ - auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ - status = call arguments; \ - } \ - else \ - { \ - auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ - auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ - status = call arguments; \ - } - -/** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm implementation. This version allows - * using different token sequences for callables in both branches, and is intended to be used with CUB-style dispatch - * interfaces, where the "simple" interface always forces the size to be `int` (making it harder for us to use), but the - * complex interface that we end up using doesn't actually provide a way to fully deduce the type from just the call, - * making the size type appear in the token sequence of the callable. - * - * See reduce_n_impl to see an example of how this is meant to be used. - */ -#define THRUST_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \ - if (count <= thrust::detail::integer_traits::const_max) \ +#define _THRUST_INDEX_TYPE_DISPATCH(index_type, status, call, count, arguments) \ { \ - auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ - status = call_32 arguments; \ - } \ - else \ - { \ - auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ - status = call_64 arguments; \ + auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ + status = call arguments; \ } -/// Like \ref THRUST_INDEX_TYPE_DISPATCH2 but dispatching to uint32_t and uint64_t, respectively, depending on the -/// `count` argument. `count` must not be negative. -#define THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \ - if (static_cast(count) \ - <= static_cast(thrust::detail::integer_traits::const_max)) \ - { \ - auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ - status = call_32 arguments; \ - } \ - else \ - { \ - auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ - status = call_64 arguments; \ +#define _THRUST_INDEX_TYPE_DISPATCH2(index_type, status, call, count1, count2, arguments) \ + { \ + auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ + auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ + status = call arguments; \ } -/// Like \ref THRUST_INDEX_TYPE_DISPATCH2 but uses two counts. -#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count1, count2, arguments) \ - if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ - { \ - auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ - auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ - status = call_32 arguments; \ - } \ - else \ - { \ - auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ - auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ - status = call_64 arguments; \ +#define _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ + if (thrust::detail::is_negative(count)) \ + { \ + ::cuda::std::__throw_runtime_error("Invalid input range, passed negative size"); \ } + +#define _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW2(count1, count2) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count1) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count2) + +#if defined(THRUST_FORCE_64_BIT_OFFSET_TYPE) +//! @brief Always dispatches to 64 bit offset version of an algorithm +# define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ + _THRUST_INDEX_TYPE_DISPATCH(std::int64_t, status, call, count, arguments) + +//! Like \ref THRUST_INDEX_TYPE_DISPATCH but with two counts +# define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW2(count1, count2) \ + _THRUST_INDEX_TYPE_DISPATCH2(std::int64_t, status, call, count1, count2, arguments) + +//! Like \ref THRUST_INDEX_TYPE_DISPATCH but with two different call implementations +# define THRUST_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ + _THRUST_INDEX_TYPE_DISPATCH(std::int64_t, status, call_64, count, arguments) + +//! Like \ref THRUST_INDEX_TYPE_DISPATCH2 but uses two counts. +# define THRUST_DOUBLE_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count1, count2, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW2(count1, count2) \ + _THRUST_INDEX_TYPE_DISPATCH2(std::int64_t, status, call_64, count1, count2, arguments) + +//! Like \ref THRUST_INDEX_TYPE_DISPATCH2 but always dispatching to uint64_t. `count` must not be negative. +# define THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ + _THRUST_INDEX_TYPE_DISPATCH(std::uint64_t, status, call_64, count, arguments) + +#elif defined(THRUST_FORCE_32_BIT_OFFSET_TYPE) + +//! @brief Ensures that the size of the input does not overflow the offset type +# define _THRUST_INDEX_TYPE_DISPATCH_GUARD_OVERFLOW(index_type, count) \ + if (static_cast(count) \ + > static_cast(thrust::detail::integer_traits::const_max)) \ + { \ + ::cuda::std::__throw_runtime_error( \ + "Input size exceeds the maximum allowable value for " #index_type \ + ". It was used because the macro THRUST_FORCE_32_BIT_OFFSET_TYPE was defined. " \ + "To handle larger input sizes, either remove this macro to dynamically dispatch " \ + "between 32-bit and 64-bit index types, or define THRUST_FORCE_64_BIT_OFFSET_TYPE."); \ + } + +//! @brief Ensures that the sizes of the inputs do not overflow the offset type, but two counts +# define _THRUST_INDEX_TYPE_DISPATCH_GUARD_OVERFLOW2(index_type, count1, count2) \ + if (static_cast(count1) + static_cast(count2) \ + > static_cast(thrust::detail::integer_traits::const_max)) \ + { \ + ::cuda::std::__throw_runtime_error( \ + "Input size exceeds the maximum allowable value for " #index_type \ + ". It was used because the macro THRUST_FORCE_32_BIT_OFFSET_TYPE was defined. " \ + "To handle larger input sizes, either remove this macro to dynamically dispatch " \ + "between 32-bit and 64-bit index types, or define THRUST_FORCE_64_BIT_OFFSET_TYPE."); \ + } + +//! @brief Always dispatches to 32 bit offset version of an algorithm but throws if count would overflow +# define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_OVERFLOW(std::int32_t, count) \ + _THRUST_INDEX_TYPE_DISPATCH(std::int32_t, status, call, count, arguments) + +//! Like \ref THRUST_INDEX_TYPE_DISPATCH but with two counts +# define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW2(count1, count2) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_OVERFLOW2(std::int32_t, count1, count2) \ + _THRUST_INDEX_TYPE_DISPATCH2(std::int32_t, status, call, count1, count2, arguments) + +//! Like \ref THRUST_INDEX_TYPE_DISPATCH but with two different call implementations +# define THRUST_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_OVERFLOW(std::int32_t, count) \ + _THRUST_INDEX_TYPE_DISPATCH(std::int32_t, status, call_32, count, arguments) + +//! Like \ref THRUST_INDEX_TYPE_DISPATCH2 but uses two counts. +# define THRUST_DOUBLE_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count1, count2, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW2(count1, count2) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_OVERFLOW2(std::int32_t, count1, count2) \ + _THRUST_INDEX_TYPE_DISPATCH2(std::int32_t, status, call_32, count1, count2, arguments) + +//! Like \ref THRUST_INDEX_TYPE_DISPATCH but always dispatching to uint64_t. `count` must not be negative. +# define THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_OVERFLOW(std::uint32_t, count) \ + _THRUST_INDEX_TYPE_DISPATCH(std::uint32_t, status, call_32, count, arguments) + +#else // ^^^ THRUST_FORCE_32_BIT_OFFSET_TYPE ^^^ / vvv !THRUST_FORCE_32_BIT_OFFSET_TYPE vvv + +# define _THRUST_INDEX_TYPE_DISPATCH_SELECT(index_type, count) \ + (static_cast(count) \ + <= static_cast(thrust::detail::integer_traits::const_max)) + +# define _THRUST_INDEX_TYPE_DISPATCH_SELECT2(index_type, count1, count2) \ + (static_cast(count1) + static_cast(count2) \ + <= static_cast(thrust::detail::integer_traits::const_max)) + +//! Dispatch between 32-bit and 64-bit index_type based versions of the same algorithm implementation. This version +//! assumes that callables for both branches consist of the same tokens, and is intended to be used with Thrust-style +//! dispatch interfaces, that always deduce the size type from the arguments. +# define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ + if _THRUST_INDEX_TYPE_DISPATCH_SELECT (std::int32_t, count) \ + _THRUST_INDEX_TYPE_DISPATCH(std::int32_t, status, call, count, arguments) \ + else \ + _THRUST_INDEX_TYPE_DISPATCH(std::int64_t, status, call, count, arguments) + +//! Dispatch between 32-bit and 64-bit index_type based versions of the same algorithm implementation. This version +//! assumes that callables for both branches consist of the same tokens, and is intended to be used with Thrust-style +//! dispatch interfaces, that always deduce the size type from the arguments. +//! +//! This version of the macro supports providing two count variables, which is necessary for set algorithms. +# define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW2(count1, count2) \ + if _THRUST_INDEX_TYPE_DISPATCH_SELECT2 (std::int32_t, count1, count2) \ + _THRUST_INDEX_TYPE_DISPATCH2(std::int32_t, status, call, count1, count2, arguments) \ + else \ + _THRUST_INDEX_TYPE_DISPATCH2(std::int64_t, status, call, count1, count2, arguments) + +//! Dispatch between 32-bit and 64-bit index_type based versions of the same algorithm implementation. This version +//! allows using different token sequences for callables in both branches, and is intended to be used with CUB-style +//! dispatch interfaces, where the "simple" interface always forces the size to be `int` (making it harder for us to +//! use), but the complex interface that we end up using doesn't actually provide a way to fully deduce the type from +//! just the call, making the size type appear in the token sequence of the callable. +//! +//! See reduce_n_impl to see an example of how this is meant to be used. +# define THRUST_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ + if _THRUST_INDEX_TYPE_DISPATCH_SELECT (std::int32_t, count) \ + _THRUST_INDEX_TYPE_DISPATCH(std::int32_t, status, call_32, count, arguments) \ + else \ + _THRUST_INDEX_TYPE_DISPATCH(std::int64_t, status, call_64, count, arguments) + +//! Like \ref THRUST_INDEX_TYPE_DISPATCH2 but uses two counts. +# define THRUST_DOUBLE_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count1, count2, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW2(count1, count2) \ + if _THRUST_INDEX_TYPE_DISPATCH_SELECT2 (std::int32_t, count1, count2) \ + _THRUST_INDEX_TYPE_DISPATCH2(std::int32_t, status, call_32, count1, count2, arguments) \ + else \ + _THRUST_INDEX_TYPE_DISPATCH2(std::int64_t, status, call_64, count1, count2, arguments) + +//! Like \ref THRUST_INDEX_TYPE_DISPATCH2 but dispatching to uint32_t and uint64_t, respectively, depending on the +//! `count` argument. `count` must not be negative. +# define THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ + if _THRUST_INDEX_TYPE_DISPATCH_SELECT (std::uint32_t, count) \ + _THRUST_INDEX_TYPE_DISPATCH(std::uint32_t, status, call_32, count, arguments) \ + else \ + _THRUST_INDEX_TYPE_DISPATCH(std::uint64_t, status, call_64, count, arguments) + +#endif // !THRUST_FORCE_32_BIT_OFFSET_TYPE