Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Support reduction for more than 2^31 items
Browse files Browse the repository at this point in the history
  • Loading branch information
gevtushenko committed Nov 13, 2022
1 parent 945820c commit 8b9f0b0
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 29 deletions.
67 changes: 44 additions & 23 deletions cub/device/device_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include <limits>

#include <cub/config.cuh>
#include <cub/detail/choose_offset.cuh>
#include <cub/device/dispatch/dispatch_reduce.cuh>
#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
Expand Down Expand Up @@ -158,6 +159,8 @@ struct DeviceReduce
* **[inferred]** Data element type that is convertible to the `value` type
* of `InputIteratorT`
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
Expand Down Expand Up @@ -188,18 +191,19 @@ struct DeviceReduce
template <typename InputIteratorT,
typename OutputIteratorT,
typename ReductionOpT,
typename T>
typename T,
typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t Reduce(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
NumItemsT num_items,
ReductionOpT reduction_op,
T init,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

return DispatchReduce<InputIteratorT,
OutputIteratorT,
Expand All @@ -209,7 +213,7 @@ struct DeviceReduce
temp_storage_bytes,
d_in,
d_out,
num_items,
static_cast<OffsetT>(num_items),
reduction_op,
init,
stream);
Expand Down Expand Up @@ -303,6 +307,8 @@ struct DeviceReduce
* **[inferred]** Output iterator type for recording the reduced
* aggregate \iterator
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
Expand All @@ -324,16 +330,18 @@ struct DeviceReduce
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT, typename OutputIteratorT>
template <typename InputIteratorT,
typename OutputIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
cudaStream_t stream = 0)
NumItemsT num_items,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

// The output value type
using OutputT =
Expand All @@ -350,7 +358,7 @@ struct DeviceReduce
temp_storage_bytes,
d_in,
d_out,
num_items,
static_cast<OffsetT>(num_items),
cub::Sum(),
InitT{}, // zero-initialize
stream);
Expand Down Expand Up @@ -429,6 +437,8 @@ struct DeviceReduce
* **[inferred]** Output iterator type for recording the reduced
* aggregate \iterator
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
Expand All @@ -450,16 +460,18 @@ struct DeviceReduce
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT, typename OutputIteratorT>
template <typename InputIteratorT,
typename OutputIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t Min(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
NumItemsT num_items,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

// The input value type
using InputT = cub::detail::value_t<InputIteratorT>;
Expand All @@ -474,7 +486,7 @@ struct DeviceReduce
temp_storage_bytes,
d_in,
d_out,
num_items,
static_cast<OffsetT>(num_items),
cub::Min(),
// replace with
// std::numeric_limits<T>::max() when
Expand Down Expand Up @@ -583,7 +595,8 @@ struct DeviceReduce
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT, typename OutputIteratorT>
template <typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
Expand Down Expand Up @@ -707,6 +720,8 @@ struct DeviceReduce
* **[inferred]** Output iterator type for recording the reduced
* aggregate \iterator
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
Expand All @@ -728,16 +743,18 @@ struct DeviceReduce
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT, typename OutputIteratorT>
template <typename InputIteratorT,
typename OutputIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
NumItemsT num_items,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

// The input value type
using InputT = cub::detail::value_t<InputIteratorT>;
Expand All @@ -752,7 +769,7 @@ struct DeviceReduce
temp_storage_bytes,
d_in,
d_out,
num_items,
static_cast<OffsetT>(num_items),
cub::Max(),
// replace with
// std::numeric_limits<T>::lowest()
Expand Down Expand Up @@ -863,7 +880,8 @@ struct DeviceReduce
* **[optional]** CUDA stream to launch kernels within.
* Default is stream<sub>0</sub>.
*/
template <typename InputIteratorT, typename OutputIteratorT>
template <typename InputIteratorT,
typename OutputIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
Expand Down Expand Up @@ -1054,6 +1072,8 @@ struct DeviceReduce
* **[inferred]** Binary reduction functor type having member
* `T operator()(const T &a, const T &b)`
*
* @tparam NumItemsT **[inferred]** Type of num_items
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage. When `nullptr`, the
* required allocation size is written to `temp_storage_bytes` and no work
Expand Down Expand Up @@ -1095,7 +1115,8 @@ struct DeviceReduce
typename ValuesInputIteratorT,
typename AggregatesOutputIteratorT,
typename NumRunsOutputIteratorT,
typename ReductionOpT>
typename ReductionOpT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
ReduceByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
Expand All @@ -1105,11 +1126,11 @@ struct DeviceReduce
AggregatesOutputIteratorT d_aggregates_out,
NumRunsOutputIteratorT d_num_runs_out,
ReductionOpT reduction_op,
int num_items,
NumItemsT num_items,
cudaStream_t stream = 0)
{
// Signed integer type for global offsets
using OffsetT = int;
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

// FlagT iterator type (not used)

Expand All @@ -1134,7 +1155,7 @@ struct DeviceReduce
d_num_runs_out,
EqualityOp(),
reduction_op,
num_items,
static_cast<OffsetT>(num_items),
stream);
}

Expand Down
46 changes: 40 additions & 6 deletions test/test_device_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,25 +33,25 @@
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <cub/util_allocator.cuh>
#include <cub/util_math.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_segmented_reduce.cuh>
#include <cub/iterator/constant_input_iterator.cuh>
#include <cub/iterator/discard_output_iterator.cuh>
#include <cub/iterator/transform_input_iterator.cuh>
#include <cub/util_allocator.cuh>
#include <cub/util_math.cuh>
#include <cub/util_type.cuh>

#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <nv/target>

#include "test_util.h"

#include <cstdio>
#include <limits>
#include <typeinfo>

#include "test_util.h"
#include <nv/target>

using namespace cub;

Expand Down Expand Up @@ -1333,6 +1333,39 @@ __global__ void InitializeTestAccumulatorTypes(int num_items,
}
}

/**
 * Run DeviceReduce::Sum over 2^magnitude items of constant value 1 and
 * verify the 64-bit result equals the item count. Exercises offset types
 * wider than int when magnitude >= 31.
 */
template <typename T>
void TestBigIndicesHelper(int magnitude)
{
  // Item count may exceed INT_MAX; keep it in a 64-bit unsigned type.
  const std::size_t num_items = std::size_t{1} << magnitude;

  // Constant stream of ones, so the expected sum is exactly num_items.
  thrust::constant_iterator<T> const_iter(T{1});
  thrust::device_vector<std::size_t> out(1);
  std::size_t *d_out = thrust::raw_pointer_cast(out.data());

  std::uint8_t *d_temp_storage{};
  std::size_t temp_storage_bytes{};

  // First invocation with a null temp pointer only queries the scratch size.
  CubDebugExit(cub::DeviceReduce::Sum(d_temp_storage,
                                      temp_storage_bytes,
                                      const_iter,
                                      d_out,
                                      num_items));

  // Allocate the scratch space and perform the actual reduction.
  thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
  d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

  CubDebugExit(cub::DeviceReduce::Sum(d_temp_storage,
                                      temp_storage_bytes,
                                      const_iter,
                                      d_out,
                                      num_items));

  // Element access on a device_vector copies back synchronously.
  const std::size_t result = out[0];

  AssertEquals(result, num_items);
}

/**
 * Sweep problem sizes across the 2^31 (signed int) and 2^32 (unsigned int)
 * boundaries to confirm reductions work with more than 2^31 items.
 */
template <typename T>
void TestBigIndices()
{
  for (int magnitude : {30, 31, 32, 33})
  {
    TestBigIndicesHelper<T>(magnitude);
  }
}

void TestAccumulatorTypes()
{
const int num_items = 2 * 1024 * 1024;
Expand Down Expand Up @@ -1491,6 +1524,7 @@ int main(int argc, char** argv)
TestType<TestBar, TestBar>(max_items, max_segments);

TestAccumulatorTypes();
TestBigIndices<std::size_t>();
#endif
printf("\n");
return 0;
Expand Down

0 comments on commit 8b9f0b0

Please sign in to comment.