From 06e334f3460088e5305369d5ed7bc4c9d960dcc3 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 21 Aug 2024 06:58:25 +0200 Subject: [PATCH] Adds support for large number of items to `DeviceScan` (#2171) * make DeviceScan offset type a template parameter * updates tests to use device interface * moves thrust scan to unsigned offset types * adjusts benchmarks to account for used offset types * uses dynamic dispatch to unsigned type * adds tparam docs for NumItemsT * fixes warning about different signedness comparison * adds check for negative num_items in thrust::scan * fixes unused param in is_negative --- cub/benchmarks/bench/scan/exclusive/base.cuh | 2 +- cub/cub/device/device_scan.cuh | 152 +++++++++++------- .../catch2_test_device_scan_large_offsets.cu | 41 ++--- thrust/thrust/detail/integer_math.h | 14 ++ thrust/thrust/system/cuda/detail/dispatch.h | 15 ++ thrust/thrust/system/cuda/detail/scan.h | 27 +++- 6 files changed, 159 insertions(+), 92 deletions(-) diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index 65b760fba26..42897931679 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -129,7 +129,7 @@ static void basic(nvbench::state& state, nvbench::type_list) }); } -using some_offset_types = nvbench::type_list; +using some_offset_types = nvbench::type_list; NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, some_offset_types)) .set_name("base") diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 1ec282d978c..c9d93c935b5 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -42,6 +42,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -153,6 +154,9 @@ struct DeviceScan //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -173,19 +177,19 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; using InitT = cub::detail::value_t; // Initial value @@ -196,13 +200,13 @@ struct DeviceScan } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -262,6 +266,9 @@ struct DeviceScan //! @tparam IteratorT //! **[inferred]** Random-access iterator type for reading scan inputs and wrigin scan outputs //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -279,20 +286,20 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( - void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0) + void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0) { return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -378,6 +385,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -404,7 +414,7 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -412,13 +422,13 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan, OffsetT>::Dispatch( d_temp_storage, @@ -432,7 +442,7 @@ struct DeviceScan } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -440,7 +450,7 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -521,6 +531,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -544,28 +557,28 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -656,6 +669,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -686,7 +702,8 @@ struct DeviceScan typename OutputIteratorT, typename ScanOpT, typename InitValueT, - typename InitValueIterT = InitValueT*> + typename InitValueIterT = InitValueT*, + typename NumItemsT = int> CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -694,13 +711,13 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan, OffsetT>::Dispatch( d_temp_storage, @@ -718,7 +735,8 @@ struct DeviceScan typename OutputIteratorT, typename ScanOpT, typename InitValueT, - typename InitValueIterT = InitValueT*> + typename InitValueIterT = InitValueT*, + typename NumItemsT = int> CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -726,7 +744,7 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -810,6 +828,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -833,28 +854,36 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -924,6 +953,9 @@ struct DeviceScan //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -944,32 +976,32 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), NullType(), num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -1028,6 +1060,9 @@ struct DeviceScan //! @tparam IteratorT //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -1045,20 +1080,20 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( - void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0) + void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0) { return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -1138,6 +1173,9 @@ struct DeviceScan //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] //! d_temp_storage Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to @@ -1162,20 +1200,20 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream); @@ -1221,6 +1259,9 @@ struct DeviceScan //! @tparam InitValueT //! **[inferred]** Type of the `init_value` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to @@ -1247,7 +1288,7 @@ struct DeviceScan //! //! @param[in] stream //! CUDA stream to launch kernels within. - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanInit( void* d_temp_storage, size_t& temp_storage_bytes, @@ -1255,13 +1296,13 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanInit"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; using AccumT = cub::detail::accumulator_t>; constexpr bool ForceInclusive = true; @@ -1284,14 +1325,14 @@ struct DeviceScan } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -1365,6 +1406,9 @@ struct DeviceScan //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] //! d_temp_storage Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to @@ -1386,26 +1430,26 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { diff --git a/cub/test/catch2_test_device_scan_large_offsets.cu b/cub/test/catch2_test_device_scan_large_offsets.cu index 9d00d89e144..0c0854e21e1 100644 --- a/cub/test/catch2_test_device_scan_large_offsets.cu +++ b/cub/test/catch2_test_device_scan_large_offsets.cu @@ -35,33 +35,12 @@ #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" -// TODO(elstehle) replace with DeviceScan interface once https://github.com/NVIDIA/cccl/issues/50 is addressed -// Temporary wrapper that allows specializing the DeviceScan algorithm for different offset types -template -CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_scan_wrapper( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitValueT init_value, - OffsetT num_items, - cudaStream_t stream = 0) -{ - using init_value_t = cub::detail::InputValue; - init_value_t init_value_wrapper{init_value}; - - return cub::DispatchScan::Dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value_wrapper, num_items, stream); -} - -DECLARE_LAUNCH_WRAPPER(dispatch_scan_wrapper, dispatch_exclusive_scan); +DECLARE_LAUNCH_WRAPPER(cub::DeviceScan::ExclusiveScan, device_exclusive_scan); // %PARAM% TEST_LAUNCH lid 0:1:2 -// TODO(elstehle) replace with actual offset types, once https://github.com/NVIDIA/cccl/issues/50 is addresed // List of offset types to be used for testing large number of items -using offset_types = c2h::type_list; +using offset_types = c2h::type_list; template struct expected_sum_op @@ -106,12 +85,12 @@ try offset_t num_items_max = static_cast(num_items_max_ull); offset_t num_items_min = num_items_max_ull > 10000 ? static_cast(num_items_max_ull - 10000ULL) : offset_t{0}; - // TODO(elstehle) remove single-item size, once https://github.com/NVIDIA/cccl/issues/50 is addresed - offset_t num_items = - GENERATE_COPY(values({num_items_max, static_cast(num_items_max - 1), static_cast(1)}), - take(2, random(num_items_min, num_items_max))); + offset_t num_items = GENERATE_COPY( + values( + {num_items_max, static_cast(num_items_max - 1), static_cast(1), static_cast(3)}), + take(2, random(num_items_min, num_items_max))); - // Prepare input + // Prepare input (generate a series of: 0, 1, 2, ..., , 0, 1, 2, ..., , 0, 1, ...) constexpr index_t segment_size = 1000; auto index_it = thrust::make_counting_iterator(index_t{}); auto items_it = thrust::make_transform_iterator(index_it, mod_op{segment_size}); @@ -120,8 +99,12 @@ try c2h::device_vector d_items_out(num_items); auto d_items_out_it = thrust::raw_pointer_cast(d_items_out.data()); + c2h::device_vector d_initial_value(1); + d_initial_value[0] = item_t{}; + auto future_init_value = cub::FutureValue(thrust::raw_pointer_cast(d_initial_value.data())); + // Run test - dispatch_exclusive_scan(items_it, d_items_out_it, op_t{}, item_t{}, num_items); + device_exclusive_scan(items_it, d_items_out_it, op_t{}, future_init_value, num_items); // Ensure that we created the correct output auto expected_out_it = diff --git a/thrust/thrust/detail/integer_math.h b/thrust/thrust/detail/integer_math.h index ab37d9a3a9f..730b0847674 100644 --- a/thrust/thrust/detail/integer_math.h +++ b/thrust/thrust/detail/integer_math.h @@ -27,6 +27,8 @@ #endif // no system header #include +#include + #include #include @@ -60,6 +62,18 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_power_of_2(Integer x) return 0 == (x & (x - 1)); } +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE typename std::enable_if::value, bool>::type is_negative(T x) +{ + return x < 0; +} + +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE typename std::enable_if::value, bool>::type is_negative(T) +{ + return false; +} + template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_odd(Integer x) { diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h index 90c99688f7c..971b93d6281 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h @@ -90,6 +90,21 @@ status = call_64 arguments; \ } +/// Like \ref THRUST_INDEX_TYPE_DISPATCH2 but dispatching to uint32_t and uint64_t, respectively, depending on the +/// `count` argument. `count` must not be negative. +#define THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \ + if (static_cast(count) \ + <= static_cast(thrust::detail::integer_traits::const_max)) \ + { \ + auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ + status = call_32 arguments; \ + } \ + else \ + { \ + auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ + status = call_64 arguments; \ + } + /// Like \ref THRUST_INDEX_TYPE_DISPATCH2 but uses two counts. #define THRUST_DOUBLE_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count1, count2, arguments) \ if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h index e225f2cfe4e..e9405776db7 100644 --- a/thrust/thrust/system/cuda/detail/scan.h +++ b/thrust/thrust/system/cuda/detail/scan.h @@ -42,6 +42,7 @@ # include +# include # include # include # include @@ -63,16 +64,21 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( thrust::cuda_cub::execution_policy& policy, InputIt first, Size num_items, OutputIt result, ScanOp scan_op) { using AccumT = typename thrust::iterator_traits::value_type; - using Dispatch32 = cub::DispatchScan; - using Dispatch64 = cub::DispatchScan; + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; cudaStream_t stream = thrust::cuda_cub::stream(policy); cudaError_t status; + // Negative number of items are normalized to `0` + if(thrust::detail::is_negative(num_items)){ + num_items = 0; + } + // Determine temporary storage requirements: size_t tmp_size = 0; { - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch, @@ -88,7 +94,7 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( { // Allocate temporary storage: thrust::detail::temporary_array tmp{policy, tmp_size}; - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch, @@ -113,16 +119,21 @@ _CCCL_HOST_DEVICE OutputIt exclusive_scan_n_impl( ScanOp scan_op) { using InputValueT = cub::detail::InputValue; - using Dispatch32 = cub::DispatchScan; - using Dispatch64 = cub::DispatchScan; + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; cudaStream_t stream = thrust::cuda_cub::stream(policy); cudaError_t status; + // Negative number of items are normalized to `0` + if(thrust::detail::is_negative(num_items)){ + num_items = 0; + } + // Determine temporary storage requirements: size_t tmp_size = 0; { - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch, @@ -138,7 +149,7 @@ _CCCL_HOST_DEVICE OutputIt exclusive_scan_n_impl( { // Allocate temporary storage: thrust::detail::temporary_array tmp{policy, tmp_size}; - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch,