diff --git a/dependencies/cub b/dependencies/cub index a39e385cc6..dd02b11a34 160000 --- a/dependencies/cub +++ b/dependencies/cub @@ -1 +1 @@ -Subproject commit a39e385cc6be20754f859dd266021ab1d88459d3 +Subproject commit dd02b11a34f1b799d191dca280d78549e42fd7e1 diff --git a/thrust/system/cuda/detail/scan.h b/thrust/system/cuda/detail/scan.h index 4c3cfefec7..07b71a4e9d 100644 --- a/thrust/system/cuda/detail/scan.h +++ b/thrust/system/cuda/detail/scan.h @@ -26,762 +26,204 @@ ******************************************************************************/ #pragma once - -#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include -#include -#include - -#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include #include +#include +#include -namespace thrust -{ -template -__host__ __device__ OutputIterator -inclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - AssociativeOperator binary_op); - -template -__host__ __device__ OutputIterator -exclusive_scan(const thrust::detail::execution_policy_base &exec, - InputIterator first, - InputIterator last, - OutputIterator result, - T init, - AssociativeOperator binary_op); -} // end namespace thrust +#include namespace thrust { -namespace cuda_cub { - -namespace __scan { - - namespace mpl = thrust::detail::mpl::math; - - template - struct WarpSize { enum { value = 32 }; }; - - template - struct PtxPolicy - { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; - static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; - }; // struct PtxPolicy - - - // Scale the number of warps to keep same amount of "tile" storage - // as the nominal configuration for 4B data. Minimum of two warps. - // - template - struct THRUST_BLOCK_THREADS - { - enum - { - value = mpl::min::value) * - 4) / - sizeof(T)>::value * - WarpSize::value>::value - }; - }; // struct THRUST_BLOCK_THREADS - - // If necessary, scale down number of items per thread to keep - // the same amount of "tile" storage as the nominal configuration for 4B data. - // Minimum 1 item per thread - // - template - struct THRUST_ITEMS_PER_THREAD - { - enum - { - value = mpl::min< - int, - NOMINAL_4B_ITEMS_PER_THREAD, - mpl::max< - int, - 1, - (NOMINAL_4B_ITEMS_PER_THREAD * - NOMINAL_4B_BLOCK_THREADS * 4 / sizeof(T)) / - THRUST_BLOCK_THREADS::value>::value>::value - }; - }; - - - template - struct Tuning; - - template - struct Tuning - { - typedef sm30 Arch; - enum - { - NOMINAL_4B_BLOCK_THREADS = 256, - NOMINAL_4B_ITEMS_PER_THREAD = 9, - }; - - typedef PtxPolicy::value, - THRUST_ITEMS_PER_THREAD::value, - cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, - cub::LOAD_DEFAULT, - cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, - cub::BLOCK_SCAN_RAKING_MEMOIZE> - type; - }; // struct Tuning for sm30 - - template - struct Tuning - { - typedef sm35 Arch; - enum - { - NOMINAL_4B_BLOCK_THREADS = 128, - NOMINAL_4B_ITEMS_PER_THREAD = 12, - }; - - typedef PtxPolicy::value, - THRUST_ITEMS_PER_THREAD::value, - cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, - cub::LOAD_LDG, - cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, - cub::BLOCK_SCAN_RAKING> - type; - }; // struct Tuning for sm35 - - template - struct Tuning - { - typedef sm52 Arch; - enum - { - NOMINAL_4B_BLOCK_THREADS = 128, - NOMINAL_4B_ITEMS_PER_THREAD = 12, - }; - - typedef PtxPolicy::value, - THRUST_ITEMS_PER_THREAD::value, - cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, - cub::LOAD_LDG, - cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, - cub::BLOCK_SCAN_RAKING> - type; - }; // struct Tuning for sm52 - - template - struct ScanAgent - { - typedef cub::ScanTileState ScanTileState; - typedef cub::BlockScanRunningPrefixOp RunningPrefixCallback; - - template - struct PtxPlan : Tuning::type - { - typedef Tuning tuning; - - - typedef typename core::LoadIterator::type LoadIt; - typedef typename core::BlockLoad::type BlockLoad; - typedef typename core::BlockStore::type BlockStore; - - typedef cub::TilePrefixCallbackOp - TilePrefixCallback; - typedef cub::BlockScan - BlockScan; - - union TempStorage - { - typename BlockLoad::TempStorage load; - typename BlockStore::TempStorage store; - - struct - { - typename TilePrefixCallback::TempStorage prefix; - typename BlockScan::TempStorage scan; - }; - }; // struct TempStorage - }; // struct PtxPlan - typedef typename core::specialize_plan_msvc10_war::type::type ptx_plan; - - typedef typename ptx_plan::LoadIt LoadIt; - typedef typename ptx_plan::BlockLoad BlockLoad; - typedef typename ptx_plan::BlockStore BlockStore; - typedef typename ptx_plan::TilePrefixCallback TilePrefixCallback; - typedef typename ptx_plan::BlockScan BlockScan; - typedef typename ptx_plan::TempStorage TempStorage; - - enum - { - INCLUSIVE = Inclusive::value, - BLOCK_THREADS = ptx_plan::BLOCK_THREADS, - ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD, - ITEMS_PER_TILE = ptx_plan::ITEMS_PER_TILE, - - SYNC_AFTER_LOAD = (ptx_plan::LOAD_ALGORITHM != cub::BLOCK_LOAD_DIRECT), - }; - - struct impl - { - //--------------------------------------------------------------------- - // Per thread data - //--------------------------------------------------------------------- - - TempStorage &storage; - ScanTileState &tile_state; - LoadIt load_it; - OutputIt output_it; - ScanOp scan_op; - - //--------------------------------------------------------------------- - // Block scan utility methods (first tile) - //--------------------------------------------------------------------- - - // Exclusive scan specialization - // - template - void THRUST_DEVICE_FUNCTION scan_tile(T (&items)[ITEMS_PER_THREAD], - _ScanOp scan_op, - T & block_aggregate, - thrust::detail::false_type /* is_inclusive */) - { - BlockScan(storage.scan).ExclusiveScan(items, items, scan_op, block_aggregate); - } - - // Exclusive sum specialization - // - void THRUST_DEVICE_FUNCTION scan_tile(T (&items)[ITEMS_PER_THREAD], - plus /*scan_op*/, - T & block_aggregate, - thrust::detail::false_type /* is_inclusive */) - { - BlockScan(storage.scan).ExclusiveSum(items, items, block_aggregate); - } - - // Inclusive scan specialization - // - template - void THRUST_DEVICE_FUNCTION scan_tile(T (&items)[ITEMS_PER_THREAD], - _ScanOp scan_op, - T & block_aggregate, - thrust::detail::true_type /* is_inclusive */) - { - BlockScan(storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); - } - - - // Inclusive sum specialization - // - void THRUST_DEVICE_FUNCTION scan_tile(T (&items)[ITEMS_PER_THREAD], - plus /*scan_op*/, - T & block_aggregate, - thrust::detail::true_type /* is_inclusive */) - { - BlockScan(storage.scan).InclusiveSum(items, items, block_aggregate); - } - - //--------------------------------------------------------------------- - // Block scan utility methods (subsequent tiles) - //--------------------------------------------------------------------- - - // Exclusive scan specialization (with prefix from predecessors) - // - template - void THRUST_DEVICE_FUNCTION scan_tile(T (&items)[ITEMS_PER_THREAD], - _ScanOp scan_op, - T & block_aggregate, - PrefixCallback &prefix_op, - thrust::detail::false_type /* is_inclusive */) - { - BlockScan(storage.scan).ExclusiveScan(items, items, scan_op, prefix_op); - block_aggregate = prefix_op.GetBlockAggregate(); - } - - // Exclusive sum specialization (with prefix from predecessors) - // - template - THRUST_DEVICE_FUNCTION void scan_tile(T (&items)[ITEMS_PER_THREAD], - plus /*scan_op*/, - T & block_aggregate, - PrefixCallback &prefix_op, - thrust::detail::false_type /* is_inclusive */) - { - BlockScan(storage.scan).ExclusiveSum(items, items, prefix_op); - block_aggregate = prefix_op.GetBlockAggregate(); - } - - // Inclusive scan specialization (with prefix from predecessors) - // - template - THRUST_DEVICE_FUNCTION void scan_tile(T (&items)[ITEMS_PER_THREAD], - _ScanOp scan_op, - T & block_aggregate, - PrefixCallback &prefix_op, - thrust::detail::true_type /* is_inclusive */) - { - BlockScan(storage.scan).InclusiveScan(items, items, scan_op, prefix_op); - block_aggregate = prefix_op.GetBlockAggregate(); - } - - // Inclusive sum specialization (with prefix from predecessors) - // - template - THRUST_DEVICE_FUNCTION void scan_tile(T (&items)[ITEMS_PER_THREAD], - plus /*scan_op*/, - T & block_aggregate, - PrefixCallback &prefix_op, - thrust::detail::true_type /* is_inclusive */) - { - BlockScan(storage.scan).InclusiveSum(items, items, prefix_op); - block_aggregate = prefix_op.GetBlockAggregate(); - } - - //--------------------------------------------------------------------- - // Cooperatively scan a device-wide sequence of tiles with other CTAs - //--------------------------------------------------------------------- - - // Process a tile of input (dynamic chained scan) - // - template - THRUST_DEVICE_FUNCTION void - consume_tile(Size /*num_items*/, - Size num_remaining, - int tile_idx, - Size tile_base, - AddInitToExclusive add_init_to_exclusive_scan) - { - using core::sync_threadblock; - - // Load items - T items[ITEMS_PER_THREAD]; - - if (IS_FULL_TILE) - { - BlockLoad(storage.load).Load(load_it + tile_base, items); - } - else - { - // Fill last element with the first element - // because collectives are not suffix guarded - BlockLoad(storage.load) - .Load(load_it + tile_base, - items, - num_remaining, - *(load_it + tile_base)); - } - - if (SYNC_AFTER_LOAD) - sync_threadblock(); - - // Perform tile scan - if (tile_idx == 0) - { - // Scan first tile - T block_aggregate; - scan_tile(items, scan_op, block_aggregate, Inclusive()); - - // Update tile status if there may be successor tiles (i.e., this tile is full) - if (IS_FULL_TILE && (threadIdx.x == 0)) - tile_state.SetInclusive(0, block_aggregate); - } - else - { - // Scan non-first tile - T block_aggregate; - TilePrefixCallback prefix_op(tile_state, storage.prefix, scan_op, tile_idx); - scan_tile(items, scan_op, block_aggregate, prefix_op, Inclusive()); - } - - sync_threadblock(); - - add_init_to_exclusive_scan(items, tile_idx); - - // Store items - if (IS_FULL_TILE) - { - BlockStore(storage.store).Store(output_it + tile_base, items); - } - else - { - BlockStore(storage.store).Store(output_it + tile_base, items, num_remaining); - } - } - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Dequeue and scan tiles of items as part of a dynamic chained scan - // with Init - template - THRUST_DEVICE_FUNCTION - impl(TempStorage & storage_, - ScanTileState & tile_state_, - InputIt input_it, - OutputIt output_it_, - ScanOp scan_op_, - Size num_items, - AddInitToExclusiveScan add_init_to_exclusive_scan) - : storage(storage_), - tile_state(tile_state_), - load_it(core::make_load_iterator(ptx_plan(), input_it)), - output_it(output_it_), - scan_op(scan_op_) - { - int tile_idx = blockIdx.x; - Size tile_base = ITEMS_PER_TILE * tile_idx; - Size num_remaining = num_items - tile_base; - - if (num_remaining > ITEMS_PER_TILE) - { - // Full tile - consume_tile(num_items, - num_remaining, - tile_idx, - tile_base, - add_init_to_exclusive_scan); - } - else if (num_remaining > 0) - { - // Partially-full tile - consume_tile(num_items, - num_remaining, - tile_idx, - tile_base, - add_init_to_exclusive_scan); - } - } - }; // struct impl - - //--------------------------------------------------------------------- - // Agent entry point - //--------------------------------------------------------------------- - - template - THRUST_AGENT_ENTRY(InputIt input_it, - OutputIt output_it, - ScanOp scan_op, - Size num_items, - ScanTileState tile_state, - AddInitToExclusiveScan add_init_to_exclusive_scan, - char * shmem) - { - TempStorage &storage = *reinterpret_cast(shmem); - impl(storage, - tile_state, - input_it, - output_it, - scan_op, - num_items, - add_init_to_exclusive_scan); - } - }; // struct ScanAgent - - template - struct InitAgent - { - template - struct PtxPlan : PtxPolicy<128> {}; - - typedef core::specialize_plan ptx_plan; - - //--------------------------------------------------------------------- - // Agent entry point - //--------------------------------------------------------------------- - - THRUST_AGENT_ENTRY(ScanTileState tile_state, - Size num_tiles, - char * /*shmem*/) - { - tile_state.InitializeStatus(num_tiles); - } - - }; // struct InitAgent - - template - struct DoNothing - { - typedef T type; - template - THRUST_DEVICE_FUNCTION void - operator()(T (&items)[ITEMS_PER_THREAD], int /*tile_idx*/) - { - THRUST_UNUSED_VAR(items); - } - }; // struct DoNothing +namespace cuda_cub +{ +namespace detail +{ - template - struct AddInitToExclusiveScan +__thrust_exec_check_disable__ +template +__host__ __device__ +OutputIt inclusive_scan_n_impl(thrust::cuda_cub::execution_policy &policy, + InputIt first, + Size num_items, + OutputIt result, + ScanOp scan_op) +{ + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; + + cudaStream_t stream = thrust::cuda_cub::stream(policy); + cudaError_t status; + + // Determine temporary storage requirements: + size_t tmp_size = 0; { - typedef T type; - T init; - ScanOp scan_op; - - THRUST_RUNTIME_FUNCTION - AddInitToExclusiveScan(T init_, ScanOp scan_op_) - : init(init_), scan_op(scan_op_) {} - - template - THRUST_DEVICE_FUNCTION void - operator()(T (&items)[ITEMS_PER_THREAD], int tile_idx) - { - if (tile_idx == 0 && threadIdx.x == 0) - { - items[0] = init; - for (int i = 1; i < ITEMS_PER_THREAD; ++i) - items[i] = scan_op(init, items[i]); - } - else - { - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - items[i] = scan_op(init, items[i]); - } - } - }; // struct AddInitToExclusiveScan + THRUST_INDEX_TYPE_DISPATCH2(status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + num_items, + (nullptr, + tmp_size, + first, + result, + scan_op, + cub::NullType{}, + num_items_fixed, + stream, + THRUST_DEBUG_SYNC_FLAG)); + thrust::cuda_cub::throw_on_error(status, + "after determining tmp storage " + "requirements for inclusive_scan"); + } - template - static cudaError_t THRUST_RUNTIME_FUNCTION - doit_step(void * d_temp_storage, - size_t & temp_storage_bytes, - InputIt input_it, - Size num_items, - AddInitToExclusiveScan add_init_to_exclusive_scan, - OutputIt output_it, - ScanOp scan_op, - cudaStream_t stream, - bool debug_sync) + // Run scan: { - using core::AgentPlan; - using core::AgentLauncher; - - cudaError_t status = cudaSuccess; - if (num_items == 0) - return cudaErrorNotSupported; - - typedef typename AddInitToExclusiveScan::type T; - - typedef AgentLauncher< - ScanAgent > - scan_agent; - - typedef typename scan_agent::ScanTileState ScanTileState; - - typedef AgentLauncher > init_agent; - - AgentPlan scan_plan = scan_agent::get_plan(stream); - AgentPlan init_plan = init_agent::get_plan(); - - int tile_size = scan_plan.items_per_tile; - Size num_tiles = static_cast((num_items + tile_size - 1) / tile_size); - - size_t vshmem_size = core::vshmem_size(scan_plan.shared_memory_size, - num_tiles); - - size_t allocation_sizes[2] = {0, vshmem_size}; - status = ScanTileState::AllocationSize(static_cast(num_tiles), allocation_sizes[0]); - CUDA_CUB_RET_IF_FAIL(status); - - void* allocations[2] = {NULL, NULL}; - - status = core::alias_storage(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes); - CUDA_CUB_RET_IF_FAIL(status); - - if (d_temp_storage == NULL) - { - return status; - } - - ScanTileState tile_state; - status = tile_state.Init(static_cast(num_tiles), allocations[0], allocation_sizes[0]); - CUDA_CUB_RET_IF_FAIL(status); - - char *vshmem_ptr = vshmem_size > 0 ? (char*)allocations[1] : NULL; - - init_agent ia(init_plan, num_tiles, stream, "scan::init_agent", debug_sync); - ia.launch(tile_state, num_tiles); - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); + // Allocate temporary storage: + thrust::detail::temporary_array tmp{ + policy, + tmp_size}; + THRUST_INDEX_TYPE_DISPATCH2(status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + num_items, + (tmp.data().get(), + tmp_size, + first, + result, + scan_op, + cub::NullType{}, + num_items_fixed, + stream, + THRUST_DEBUG_SYNC_FLAG)); + thrust::cuda_cub::throw_on_error(status, + "after dispatching inclusive_scan kernel"); + thrust::cuda_cub::throw_on_error(thrust::cuda_cub::synchronize(policy), + "inclusive_scan failed to synchronize"); + } - scan_agent sa(scan_plan, num_items, stream, vshmem_ptr, "scan::scan_agent", debug_sync); - sa.launch(input_it, - output_it, - scan_op, - num_items, - tile_state, - add_init_to_exclusive_scan); - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - return status; - } // func doit_step + return result + num_items; +} - template - THRUST_RUNTIME_FUNCTION - OutputIt scan(execution_policy& policy, - InputIt input_it, - OutputIt output_it, - Size num_items, - ScanOp scan_op, - AddInitToExclusiveScan add_init_to_exclusive_scan) +__thrust_exec_check_disable__ +template +__host__ __device__ +OutputIt exclusive_scan_n_impl(thrust::cuda_cub::execution_policy &policy, + InputIt first, + Size num_items, + OutputIt result, + InitValueT init, + ScanOp scan_op) +{ + using Dispatch32 = cub::DispatchScan; + using Dispatch64 = cub::DispatchScan; + + cudaStream_t stream = thrust::cuda_cub::stream(policy); + cudaError_t status; + + // Determine temporary storage requirements: + size_t tmp_size = 0; { - if (num_items == 0) - return output_it; - - size_t storage_size = 0; - cudaStream_t stream = cuda_cub::stream(policy); - bool debug_sync = THRUST_DEBUG_SYNC_FLAG; - - cudaError_t status; - THRUST_INDEX_TYPE_DISPATCH(status, - doit_step, + THRUST_INDEX_TYPE_DISPATCH2(status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, num_items, - (NULL, - storage_size, - input_it, - num_items_fixed, - add_init_to_exclusive_scan, - output_it, - scan_op, - stream, - debug_sync)); - cuda_cub::throw_on_error(status, "scan failed on 1st step"); - - // Allocate temporary storage. - thrust::detail::temporary_array - tmp(policy, storage_size); - void *ptr = static_cast(tmp.data().get()); + (nullptr, + tmp_size, + first, + result, + scan_op, + init, + num_items_fixed, + stream, + THRUST_DEBUG_SYNC_FLAG)); + thrust::cuda_cub::throw_on_error(status, + "after determining tmp storage " + "requirements for exclusive_scan"); + } - THRUST_INDEX_TYPE_DISPATCH(status, - doit_step, + // Run scan: + { + // Allocate temporary storage: + thrust::detail::temporary_array tmp{ + policy, + tmp_size}; + THRUST_INDEX_TYPE_DISPATCH2(status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, num_items, - (ptr, - storage_size, - input_it, - num_items_fixed, - add_init_to_exclusive_scan, - output_it, - scan_op, - stream, - debug_sync)); - cuda_cub::throw_on_error(status, "scan failed on 2nd step"); - - status = cuda_cub::synchronize(policy); - cuda_cub::throw_on_error(status, "scan failed to synchronize"); + (tmp.data().get(), + tmp_size, + first, + result, + scan_op, + init, + num_items_fixed, + stream, + THRUST_DEBUG_SYNC_FLAG)); + thrust::cuda_cub::throw_on_error(status, + "after dispatching exclusive_scan kernel"); + thrust::cuda_cub::throw_on_error(thrust::cuda_cub::synchronize(policy), + "exclusive_scan failed to synchronize"); + } - return output_it + num_items; - } // func scan + return result + num_items; +} -} // namespace __scan +} // namespace detail //------------------------- // Thrust API entry points //------------------------- __thrust_exec_check_disable__ -template -OutputIt __host__ __device__ -inclusive_scan_n(execution_policy &policy, - InputIt first, - Size num_items, - OutputIt result, - ScanOp scan_op) +template +__host__ __device__ +OutputIt inclusive_scan_n(thrust::cuda_cub::execution_policy &policy, + InputIt first, + Size num_items, + OutputIt result, + ScanOp scan_op) { OutputIt ret = result; if (__THRUST_HAS_CUDART__) { - typedef typename iterator_traits::value_type T; - ret = __scan::scan(policy, - first, - result, - num_items, - scan_op, - __scan::DoNothing()); + ret = thrust::cuda_cub::detail::inclusive_scan_n_impl(policy, + first, + num_items, + result, + scan_op); } else { @@ -796,66 +238,61 @@ inclusive_scan_n(execution_policy &policy, return ret; } - -template -OutputIt __host__ __device__ -inclusive_scan(execution_policy &policy, - InputIt first, - InputIt last, - OutputIt result, - ScanOp scan_op) +template +__host__ __device__ +OutputIt inclusive_scan(thrust::cuda_cub::execution_policy &policy, + InputIt first, + InputIt last, + OutputIt result, + ScanOp scan_op) { - typedef typename thrust::iterator_traits::difference_type diff_t; - diff_t num_items = thrust::distance(first, last); - return cuda_cub::inclusive_scan_n(policy, first, num_items, result, scan_op); + using diff_t = typename thrust::iterator_traits::difference_type; + diff_t const num_items = thrust::distance(first, last); + return thrust::cuda_cub::inclusive_scan_n(policy, + first, + num_items, + result, + scan_op); } - -template -OutputIt __host__ __device__ -inclusive_scan(execution_policy &policy, - InputIt first, - OutputIt last, - OutputIt result) +template +__host__ __device__ +OutputIt inclusive_scan(thrust::cuda_cub::execution_policy &policy, + InputIt first, + InputIt last, + OutputIt result) { - - typedef typename thrust::detail::eval_if< - thrust::detail::is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value >::type result_type; - return cuda_cub::inclusive_scan(policy, first, last, result, plus()); -}; + return thrust::cuda_cub::inclusive_scan(policy, + first, + last, + result, + thrust::plus<>{}); +} __thrust_exec_check_disable__ -template -OutputIt __host__ __device__ -exclusive_scan_n(execution_policy &policy, - InputIt first, - Size num_items, - OutputIt result, - T init, - ScanOp scan_op) +template +__host__ __device__ +OutputIt exclusive_scan_n(thrust::cuda_cub::execution_policy &policy, + InputIt first, + Size num_items, + OutputIt result, + T init, + ScanOp scan_op) { OutputIt ret = result; if (__THRUST_HAS_CUDART__) { - ret = __scan::scan( - policy, - first, - result, - num_items, - scan_op, - __scan::AddInitToExclusiveScan(init, scan_op)); + ret = thrust::cuda_cub::detail::exclusive_scan_n_impl(policy, + first, + num_items, + result, + init, + scan_op); } else { @@ -871,58 +308,57 @@ exclusive_scan_n(execution_policy &policy, return ret; } -template -OutputIt __host__ __device__ -exclusive_scan(execution_policy &policy, - InputIt first, - InputIt last, - OutputIt result, - T init, - ScanOp scan_op) +template +__host__ __device__ +OutputIt exclusive_scan(thrust::cuda_cub::execution_policy &policy, + InputIt first, + InputIt last, + OutputIt result, + T init, + ScanOp scan_op) { - typedef typename thrust::iterator_traits::difference_type diff_t; - diff_t num_items = thrust::distance(first, last); - return cuda_cub::exclusive_scan_n(policy, first, num_items, result, init, scan_op); + using diff_t = typename thrust::iterator_traits::difference_type; + diff_t const num_items = thrust::distance(first, last); + return thrust::cuda_cub::exclusive_scan_n(policy, + first, + num_items, + result, + init, + scan_op); } -template -OutputIt __host__ __device__ -exclusive_scan(execution_policy &policy, - InputIt first, - OutputIt last, - OutputIt result, - T init) +template +__host__ __device__ +OutputIt exclusive_scan(thrust::cuda_cub::execution_policy &policy, + InputIt first, + InputIt last, + OutputIt result, + T init) { - return cuda_cub::exclusive_scan(policy, first, last, result, init, plus()); + return thrust::cuda_cub::exclusive_scan(policy, + first, + last, + result, + init, + thrust::plus<>{}); } -template -OutputIt __host__ __device__ -exclusive_scan(execution_policy &policy, - InputIt first, - OutputIt last, - OutputIt result) +template +__host__ __device__ +OutputIt exclusive_scan(thrust::cuda_cub::execution_policy &policy, + InputIt first, + InputIt last, + OutputIt result) { - typedef typename thrust::detail::eval_if< - thrust::detail::is_output_iterator::value, - thrust::iterator_value, - thrust::iterator_value - >::type result_type; - return cuda_cub::exclusive_scan(policy, first, last, result, result_type(0)); + using init_type = typename thrust::iterator_traits::value_type; + return cuda_cub::exclusive_scan(policy, first, last, result, init_type{}); }; } // namespace cuda_cub -} // end namespace thrust +} // namespace thrust #include - -#endif