diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index 65b760fba26..42897931679 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -129,7 +129,7 @@ static void basic(nvbench::state& state, nvbench::type_list) }); } -using some_offset_types = nvbench::type_list; +using some_offset_types = nvbench::type_list; NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(all_types, some_offset_types)) .set_name("base") diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 29f3cf6c1e1..2786e6e7a6f 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -41,6 +41,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -152,6 +153,9 @@ struct DeviceScan //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -172,19 +176,19 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; using InitT = cub::detail::value_t; // Initial value @@ -195,13 +199,13 @@ struct DeviceScan } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -261,6 +265,9 @@ struct DeviceScan //! @tparam IteratorT //! **[inferred]** Random-access iterator type for reading scan inputs and wrigin scan outputs //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -278,20 +285,20 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( - void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0) + void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0) { return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -377,6 +384,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -403,7 +413,7 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -411,13 +421,13 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan, OffsetT>::Dispatch( d_temp_storage, @@ -431,7 +441,7 @@ struct DeviceScan } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -439,7 +449,7 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -520,6 +530,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -543,28 +556,28 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -655,6 +668,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -685,7 +701,8 @@ struct DeviceScan typename OutputIteratorT, typename ScanOpT, typename InitValueT, - typename InitValueIterT = InitValueT*> + typename InitValueIterT = InitValueT*, + typename NumItemsT = int> CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -693,13 +710,13 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan, OffsetT>::Dispatch( d_temp_storage, @@ -717,7 +734,8 @@ struct DeviceScan typename OutputIteratorT, typename ScanOpT, typename InitValueT, - typename InitValueIterT = InitValueT*> + typename InitValueIterT = InitValueT*, + typename NumItemsT = int> CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, @@ -725,7 +743,7 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -809,6 +827,9 @@ struct DeviceScan //! **[inferred]** Type of the `init_value` used Binary scan functor type //! having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -832,28 +853,36 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, FutureValue init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -923,6 +952,9 @@ struct DeviceScan //! @tparam OutputIteratorT //! **[inferred]** Random-access output iterator type for writing scan outputs @iterator //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -943,32 +975,32 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), NullType(), num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -1027,6 +1059,9 @@ struct DeviceScan //! @tparam IteratorT //! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to `temp_storage_bytes` and no work is done. @@ -1044,20 +1079,20 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( - void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, int num_items, cudaStream_t stream = 0) + void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0) { return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -1137,6 +1172,9 @@ struct DeviceScan //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] //! d_temp_storage Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to @@ -1161,20 +1199,20 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; return DispatchScan::Dispatch( d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream); @@ -1220,6 +1258,9 @@ struct DeviceScan //! @tparam InitValueT //! **[inferred]** Type of the `init_value` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] d_temp_storage //! Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to @@ -1246,7 +1287,7 @@ struct DeviceScan //! //! @param[in] stream //! CUDA stream to launch kernels within. - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanInit( void* d_temp_storage, size_t& temp_storage_bytes, @@ -1254,13 +1295,13 @@ struct DeviceScan OutputIteratorT d_out, ScanOpT scan_op, InitValueT init_value, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { CUB_DETAIL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanInit"); - // Signed integer type for global offsets - using OffsetT = int; + // Unsigned integer type for global offsets + using OffsetT = detail::choose_offset_t; using AccumT = cub::detail::accumulator_t>; constexpr bool ForceInclusive = true; @@ -1283,14 +1324,14 @@ struct DeviceScan } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { @@ -1364,6 +1405,9 @@ struct DeviceScan //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! //! @param[in] //! d_temp_storage Device-accessible allocation of temporary storage. //! When `nullptr`, the required allocation size is written to @@ -1385,26 +1429,26 @@ struct DeviceScan //! @rst //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. //! @endrst - template + template CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream = 0) { return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream); } #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template + template CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan( void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, ScanOpT scan_op, - int num_items, + NumItemsT num_items, cudaStream_t stream, bool debug_synchronous) { diff --git a/cub/test/catch2_test_device_scan_large_offsets.cu b/cub/test/catch2_test_device_scan_large_offsets.cu index 9d00d89e144..0c0854e21e1 100644 --- a/cub/test/catch2_test_device_scan_large_offsets.cu +++ b/cub/test/catch2_test_device_scan_large_offsets.cu @@ -35,33 +35,12 @@ #include "catch2_test_helper.h" #include "catch2_test_launch_helper.h" -// TODO(elstehle) replace with DeviceScan interface once https://github.com/NVIDIA/cccl/issues/50 is addressed -// Temporary wrapper that allows specializing the DeviceScan algorithm for different offset types -template -CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch_scan_wrapper( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - ScanOpT scan_op, - InitValueT init_value, - OffsetT num_items, - cudaStream_t stream = 0) -{ - using init_value_t = cub::detail::InputValue; - init_value_t init_value_wrapper{init_value}; - - return cub::DispatchScan::Dispatch( - d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init_value_wrapper, num_items, stream); -} - -DECLARE_LAUNCH_WRAPPER(dispatch_scan_wrapper, dispatch_exclusive_scan); +DECLARE_LAUNCH_WRAPPER(cub::DeviceScan::ExclusiveScan, device_exclusive_scan); // %PARAM% TEST_LAUNCH lid 0:1:2 -// TODO(elstehle) replace with actual offset types, once https://github.com/NVIDIA/cccl/issues/50 is addresed // List of offset types to be used for testing large number of items -using offset_types = c2h::type_list; +using offset_types = c2h::type_list; template struct expected_sum_op @@ -106,12 +85,12 @@ try offset_t num_items_max = static_cast(num_items_max_ull); offset_t num_items_min = num_items_max_ull > 10000 ? static_cast(num_items_max_ull - 10000ULL) : offset_t{0}; - // TODO(elstehle) remove single-item size, once https://github.com/NVIDIA/cccl/issues/50 is addresed - offset_t num_items = - GENERATE_COPY(values({num_items_max, static_cast(num_items_max - 1), static_cast(1)}), - take(2, random(num_items_min, num_items_max))); + offset_t num_items = GENERATE_COPY( + values( + {num_items_max, static_cast(num_items_max - 1), static_cast(1), static_cast(3)}), + take(2, random(num_items_min, num_items_max))); - // Prepare input + // Prepare input (generate a series of: 0, 1, 2, ..., , 0, 1, 2, ..., , 0, 1, ...) constexpr index_t segment_size = 1000; auto index_it = thrust::make_counting_iterator(index_t{}); auto items_it = thrust::make_transform_iterator(index_it, mod_op{segment_size}); @@ -120,8 +99,12 @@ try c2h::device_vector d_items_out(num_items); auto d_items_out_it = thrust::raw_pointer_cast(d_items_out.data()); + c2h::device_vector d_initial_value(1); + d_initial_value[0] = item_t{}; + auto future_init_value = cub::FutureValue(thrust::raw_pointer_cast(d_initial_value.data())); + // Run test - dispatch_exclusive_scan(items_it, d_items_out_it, op_t{}, item_t{}, num_items); + device_exclusive_scan(items_it, d_items_out_it, op_t{}, future_init_value, num_items); // Ensure that we created the correct output auto expected_out_it = diff --git a/thrust/thrust/detail/integer_math.h b/thrust/thrust/detail/integer_math.h index ab37d9a3a9f..730b0847674 100644 --- a/thrust/thrust/detail/integer_math.h +++ b/thrust/thrust/detail/integer_math.h @@ -27,6 +27,8 @@ #endif // no system header #include +#include + #include #include @@ -60,6 +62,18 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_power_of_2(Integer x) return 0 == (x & (x - 1)); } +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE typename std::enable_if::value, bool>::type is_negative(T x) +{ + return x < 0; +} + +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE typename std::enable_if::value, bool>::type is_negative(T) +{ + return false; +} + template _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_odd(Integer x) { diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h index 90c99688f7c..971b93d6281 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h @@ -90,6 +90,21 @@ status = call_64 arguments; \ } +/// Like \ref THRUST_INDEX_TYPE_DISPATCH2 but dispatching to uint32_t and uint64_t, respectively, depending on the +/// `count` argument. `count` must not be negative. +#define THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \ + if (static_cast(count) \ + <= static_cast(thrust::detail::integer_traits::const_max)) \ + { \ + auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ + status = call_32 arguments; \ + } \ + else \ + { \ + auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ + status = call_64 arguments; \ + } + /// Like \ref THRUST_INDEX_TYPE_DISPATCH2 but uses two counts. #define THRUST_DOUBLE_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count1, count2, arguments) \ if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ diff --git a/thrust/thrust/system/cuda/detail/scan.h b/thrust/thrust/system/cuda/detail/scan.h index e225f2cfe4e..e9405776db7 100644 --- a/thrust/thrust/system/cuda/detail/scan.h +++ b/thrust/thrust/system/cuda/detail/scan.h @@ -42,6 +42,7 @@ # include +# include # include # include # include @@ -63,16 +64,21 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( thrust::cuda_cub::execution_policy& policy, InputIt first, Size num_items, OutputIt result, ScanOp scan_op) { using AccumT = typename thrust::iterator_traits::value_type; - using Dispatch32 = cub::DispatchScan; - using Dispatch64 = cub::DispatchScan; + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; cudaStream_t stream = thrust::cuda_cub::stream(policy); cudaError_t status; + // Negative number of items are normalized to `0` + if(thrust::detail::is_negative(num_items)){ + num_items = 0; + } + // Determine temporary storage requirements: size_t tmp_size = 0; { - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch, @@ -88,7 +94,7 @@ _CCCL_HOST_DEVICE OutputIt inclusive_scan_n_impl( { // Allocate temporary storage: thrust::detail::temporary_array tmp{policy, tmp_size}; - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch, @@ -113,16 +119,21 @@ _CCCL_HOST_DEVICE OutputIt exclusive_scan_n_impl( ScanOp scan_op) { using InputValueT = cub::detail::InputValue; - using Dispatch32 = cub::DispatchScan; - using Dispatch64 = cub::DispatchScan; + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; cudaStream_t stream = thrust::cuda_cub::stream(policy); cudaError_t status; + // Negative number of items are normalized to `0` + if(thrust::detail::is_negative(num_items)){ + num_items = 0; + } + // Determine temporary storage requirements: size_t tmp_size = 0; { - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch, @@ -138,7 +149,7 @@ _CCCL_HOST_DEVICE OutputIt exclusive_scan_n_impl( { // Allocate temporary storage: thrust::detail::temporary_array tmp{policy, tmp_size}; - THRUST_INDEX_TYPE_DISPATCH2( + THRUST_UNSIGNED_INDEX_TYPE_DISPATCH2( status, Dispatch32::Dispatch, Dispatch64::Dispatch,