From b7e6717679567ae8bd03cd1fe671fb94f419e53f Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 10 Oct 2023 15:08:26 +0200 Subject: [PATCH] Add `_CCCL_IMPLICIT_SYSTEM_HEADER` to cub headers --- cub/cub/agent/agent_adjacent_difference.cuh | 5 +- cub/cub/agent/agent_batch_memcpy.cuh | 4 + cub/cub/agent/agent_histogram.cuh | 11 +- cub/cub/agent/agent_merge_sort.cuh | 3 + cub/cub/agent/agent_radix_sort_downsweep.cuh | 15 +- cub/cub/agent/agent_radix_sort_histogram.cuh | 11 +- cub/cub/agent/agent_radix_sort_onesweep.cuh | 29 +- cub/cub/agent/agent_radix_sort_upsweep.cuh | 9 +- cub/cub/agent/agent_reduce.cuh | 17 +- cub/cub/agent/agent_reduce_by_key.cuh | 11 +- cub/cub/agent/agent_rle.cuh | 39 +- cub/cub/agent/agent_scan.cuh | 19 +- cub/cub/agent/agent_scan_by_key.cuh | 19 +- cub/cub/agent/agent_segment_fixup.cuh | 7 +- cub/cub/agent/agent_segmented_radix_sort.cuh | 7 +- cub/cub/agent/agent_select_if.cuh | 21 +- cub/cub/agent/agent_spmv_orig.cuh | 5 +- cub/cub/agent/agent_sub_warp_merge_sort.cuh | 13 +- cub/cub/agent/agent_three_way_partition.cuh | 5 +- cub/cub/agent/agent_unique_by_key.cuh | 16 +- cub/cub/agent/single_pass_scan_operators.cuh | 63 +- cub/cub/block/block_adjacent_difference.cuh | 23 +- cub/cub/block/block_discontinuity.cuh | 7 +- cub/cub/block/block_exchange.cuh | 9 +- cub/cub/block/block_histogram.cuh | 9 +- cub/cub/block/block_load.cuh | 5 +- cub/cub/block/block_merge_sort.cuh | 4 + cub/cub/block/block_radix_rank.cuh | 43 +- cub/cub/block/block_radix_sort.cuh | 363 ++--- cub/cub/block/block_raking_layout.cuh | 7 +- cub/cub/block/block_reduce.cuh | 15 +- cub/cub/block/block_run_length_decode.cuh | 3 + cub/cub/block/block_scan.cuh | 27 +- cub/cub/block/block_shuffle.cuh | 7 +- cub/cub/block/block_store.cuh | 11 +- cub/cub/block/radix_rank_sort_operations.cuh | 23 +- .../block_histogram_atomic.cuh | 8 +- .../specializations/block_histogram_sort.cuh | 9 +- .../specializations/block_reduce_raking.cuh | 9 +- .../block_reduce_raking_commutative_only.cuh | 9 +- .../block_reduce_warp_reductions.cuh | 9 +- .../specializations/block_scan_raking.cuh | 10 +- .../specializations/block_scan_warp_scans.cuh | 9 +- cub/cub/config.cuh | 5 + cub/cub/cub.cuh | 4 +- cub/cub/detail/choose_offset.cuh | 5 +- cub/cub/detail/cpp_compatibility.cuh | 4 +- cub/cub/detail/detect_cuda_runtime.cuh | 5 + cub/cub/detail/device_double_buffer.cuh | 4 + cub/cub/detail/device_synchronize.cuh | 6 +- cub/cub/detail/exec_check_disable.cuh | 4 +- cub/cub/detail/strong_load.cuh | 5 +- cub/cub/detail/strong_store.cuh | 5 +- cub/cub/detail/temporary_storage.cuh | 4 + cub/cub/detail/type_traits.cuh | 6 +- cub/cub/detail/uninitialized_copy.cuh | 15 +- cub/cub/device/device_adjacent_difference.cuh | 5 +- cub/cub/device/device_copy.cuh | 5 +- cub/cub/device/device_histogram.cuh | 631 ++++---- cub/cub/device/device_memcpy.cuh | 5 +- cub/cub/device/device_merge_sort.cuh | 5 +- cub/cub/device/device_partition.cuh | 23 +- cub/cub/device/device_radix_sort.cuh | 1347 +++++++++-------- cub/cub/device/device_reduce.cuh | 399 ++--- cub/cub/device/device_run_length_encode.cuh | 5 +- cub/cub/device/device_scan.cuh | 741 ++++----- .../device/device_segmented_radix_sort.cuh | 773 +++++----- cub/cub/device/device_segmented_reduce.cuh | 5 +- cub/cub/device/device_segmented_sort.cuh | 169 ++- cub/cub/device/device_select.cuh | 377 ++--- cub/cub/device/device_spmv.cuh | 4 + .../dispatch/dispatch_adjacent_difference.cuh | 13 +- .../device/dispatch/dispatch_batch_memcpy.cuh | 5 +- .../device/dispatch/dispatch_histogram.cuh | 9 +- .../device/dispatch/dispatch_merge_sort.cuh | 4 + .../device/dispatch/dispatch_radix_sort.cuh | 85 +- cub/cub/device/dispatch/dispatch_reduce.cuh | 5 +- .../dispatch/dispatch_reduce_by_key.cuh | 9 +- cub/cub/device/dispatch/dispatch_rle.cuh | 9 +- cub/cub/device/dispatch/dispatch_scan.cuh | 17 +- .../device/dispatch/dispatch_scan_by_key.cuh | 17 +- .../dispatch/dispatch_segmented_sort.cuh | 4 + .../device/dispatch/dispatch_select_if.cuh | 103 +- .../device/dispatch/dispatch_spmv_orig.cuh | 11 +- .../dispatch/dispatch_three_way_partition.cuh | 5 +- .../dispatch/dispatch_unique_by_key.cuh | 2 +- .../dispatch/tuning/tuning_histogram.cuh | 5 +- .../dispatch/tuning/tuning_reduce_by_key.cuh | 5 +- .../tuning/tuning_run_length_encode.cuh | 19 +- .../device/dispatch/tuning/tuning_scan.cuh | 9 +- .../dispatch/tuning/tuning_scan_by_key.cuh | 25 +- .../dispatch/tuning/tuning_select_if.cuh | 25 +- .../tuning/tuning_three_way_partition.cuh | 5 +- .../dispatch/tuning/tuning_unique_by_key.cuh | 7 +- cub/cub/grid/grid_barrier.cuh | 13 +- cub/cub/grid/grid_even_share.cuh | 9 +- cub/cub/grid/grid_mapping.cuh | 3 + cub/cub/grid/grid_queue.cuh | 5 +- cub/cub/host/mutex.cuh | 9 +- cub/cub/iterator/arg_index_input_iterator.cuh | 9 +- .../cache_modified_input_iterator.cuh | 5 +- .../cache_modified_output_iterator.cuh | 9 +- cub/cub/iterator/constant_input_iterator.cuh | 9 +- cub/cub/iterator/counting_input_iterator.cuh | 9 +- cub/cub/iterator/discard_output_iterator.cuh | 10 +- cub/cub/iterator/tex_obj_input_iterator.cuh | 5 +- cub/cub/iterator/tex_ref_input_iterator.cuh | 5 +- cub/cub/iterator/transform_input_iterator.cuh | 9 +- cub/cub/thread/thread_load.cuh | 5 +- cub/cub/thread/thread_operators.cuh | 33 +- cub/cub/thread/thread_reduce.cuh | 15 +- cub/cub/thread/thread_scan.cuh | 7 +- cub/cub/thread/thread_search.cuh | 9 +- cub/cub/thread/thread_sort.cuh | 3 + cub/cub/thread/thread_store.cuh | 5 +- cub/cub/util_allocator.cuh | 36 +- cub/cub/util_arch.cuh | 12 +- cub/cub/util_compiler.cuh | 5 + cub/cub/util_cpp_dialect.cuh | 4 + cub/cub/util_debug.cuh | 27 +- cub/cub/util_deprecated.cuh | 6 +- cub/cub/util_device.cuh | 12 +- cub/cub/util_macro.cuh | 10 +- cub/cub/util_math.cuh | 7 +- cub/cub/util_namespace.cuh | 5 +- cub/cub/util_ptx.cuh | 50 +- cub/cub/util_temporary_storage.cuh | 4 + cub/cub/util_type.cuh | 15 +- cub/cub/version.cuh | 5 + .../specializations/warp_exchange_shfl.cuh | 9 +- .../specializations/warp_exchange_smem.cuh | 5 +- .../warp/specializations/warp_reduce_shfl.cuh | 27 +- .../warp/specializations/warp_reduce_smem.cuh | 7 +- .../warp/specializations/warp_scan_shfl.cuh | 7 +- .../warp/specializations/warp_scan_smem.cuh | 7 +- cub/cub/warp/warp_exchange.cuh | 5 +- cub/cub/warp/warp_load.cuh | 5 +- cub/cub/warp/warp_merge_sort.cuh | 5 +- cub/cub/warp/warp_reduce.cuh | 5 +- cub/cub/warp/warp_scan.cuh | 5 +- cub/cub/warp/warp_store.cuh | 5 +- thrust/thrust/system/cuda/detail/util.h | 2 +- 142 files changed, 3402 insertions(+), 2988 deletions(-) diff --git a/cub/cub/agent/agent_adjacent_difference.cuh b/cub/cub/agent/agent_adjacent_difference.cuh index b135fbbf53b..2855242e7eb 100644 --- a/cub/cub/agent/agent_adjacent_difference.cuh +++ b/cub/cub/agent/agent_adjacent_difference.cuh @@ -28,6 +28,9 @@ #pragma once #include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../util_type.cuh" #include "../util_namespace.cuh" #include "../block/block_load.cuh" @@ -159,7 +162,7 @@ struct AgentDifference } else { - InputT tile_prev_input = MayAlias + InputT tile_prev_input = MayAlias ? first_tile_previous[tile_idx] : *(input_it + tile_base - 1); diff --git a/cub/cub/agent/agent_batch_memcpy.cuh b/cub/cub/agent/agent_batch_memcpy.cuh index 20db5bb6aec..a7602f7786c 100644 --- a/cub/cub/agent/agent_batch_memcpy.cuh +++ b/cub/cub/agent/agent_batch_memcpy.cuh @@ -33,6 +33,10 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index 94c2efcaf4e..ddae3ec9cad 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -33,11 +33,14 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include "../util_type.cuh" #include "../block/block_load.cuh" -#include "../config.cuh" #include "../grid/grid_queue.cuh" #include "../iterator/cache_modified_input_iterator.cuh" @@ -301,8 +304,8 @@ struct AgentHistogram for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) { int channel_bins = num_privatized_bins[CHANNEL]; - for (int privatized_bin = threadIdx.x; - privatized_bin < channel_bins; + for (int privatized_bin = threadIdx.x; + privatized_bin < channel_bins; privatized_bin += BLOCK_THREADS) { int output_bin = -1; @@ -631,7 +634,7 @@ struct AgentHistogram // Consume a partially-full tile at the end of the row OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset; ConsumeTile(tile_offset, num_remaining); - } + } else { // Consume full tile diff --git a/cub/cub/agent/agent_merge_sort.cuh b/cub/cub/agent/agent_merge_sort.cuh index 2e994dd97e1..adbaa572d2b 100644 --- a/cub/cub/agent/agent_merge_sort.cuh +++ b/cub/cub/agent/agent_merge_sort.cuh @@ -28,6 +28,9 @@ #pragma once #include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../util_type.cuh" #include "../util_namespace.cuh" #include "../block/block_load.cuh" diff --git a/cub/cub/agent/agent_radix_sort_downsweep.cuh b/cub/cub/agent/agent_radix_sort_downsweep.cuh index d7e77bb882c..b66ad972a5c 100644 --- a/cub/cub/agent/agent_radix_sort_downsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_downsweep.cuh @@ -35,6 +35,10 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include @@ -44,7 +48,6 @@ #include #include #include -#include #include #include @@ -135,7 +138,7 @@ struct AgentRadixSortDownsweep using ValuesItr = CacheModifiedInputIterator; // Radix ranking type to use - using BlockRadixRankT = + using BlockRadixRankT = cub::detail::block_radix_rank_t< RANK_ALGORITHM, BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, SCAN_ALGORITHM>; @@ -202,7 +205,7 @@ struct AgentRadixSortDownsweep // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; - std::uint32_t current_bit; + std::uint32_t current_bit; std::uint32_t num_bits; // Whether to short-cirucit @@ -488,15 +491,15 @@ struct AgentRadixSortDownsweep OffsetT relative_bin_offsets[ITEMS_PER_THREAD]; // Assign default (min/max) value to all keys - bit_ordered_type default_key = IS_DESCENDING - ? traits::min_raw_binary_key(decomposer) + bit_ordered_type default_key = IS_DESCENDING + ? traits::min_raw_binary_key(decomposer) : traits::max_raw_binary_key(decomposer); // Load tile of keys LoadKeys( keys, block_offset, - valid_items, + valid_items, default_key, Int2Type(), Int2Type()); diff --git a/cub/cub/agent/agent_radix_sort_histogram.cuh b/cub/cub/agent/agent_radix_sort_histogram.cuh index 9e895b8d0d2..b5af14e8145 100644 --- a/cub/cub/agent/agent_radix_sort_histogram.cuh +++ b/cub/cub/agent/agent_radix_sort_histogram.cuh @@ -34,9 +34,12 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../block/block_load.cuh" #include "../block/radix_rank_sort_operations.cuh" -#include "../config.cuh" #include "../thread/thread_reduce.cuh" #include "../util_math.cuh" #include "../util_type.cuh" @@ -120,7 +123,7 @@ struct AgentRadixSortHistogram // thread fields // shared memory storage _TempStorage& s; - + // bins for the histogram OffsetT* d_bins_out; @@ -175,7 +178,7 @@ struct AgentRadixSortHistogram } __device__ __forceinline__ - void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD]) + void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD]) { // tile_offset < num_items always, hence the line below works bool full_tile = num_items - tile_offset >= TILE_ITEMS; @@ -264,7 +267,7 @@ struct AgentRadixSortHistogram AccumulateSharedHistograms(tile_offset, keys); } CTA_SYNC(); - + // Accumulate the result in global memory. AccumulateGlobalHistograms(); CTA_SYNC(); diff --git a/cub/cub/agent/agent_radix_sort_onesweep.cuh b/cub/cub/agent/agent_radix_sort_onesweep.cuh index 6c57d1f0215..e4d47c9797f 100644 --- a/cub/cub/agent/agent_radix_sort_onesweep.cuh +++ b/cub/cub/agent/agent_radix_sort_onesweep.cuh @@ -33,10 +33,13 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../block/block_radix_rank.cuh" #include "../block/radix_rank_sort_operations.cuh" #include "../block/block_store.cuh" -#include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" @@ -64,7 +67,7 @@ template < int NOMINAL_BLOCK_THREADS_4B, int NOMINAL_ITEMS_PER_THREAD_4B, typename ComputeT, - /** \brief Number of private histograms to use in the ranker; + /** \brief Number of private histograms to use in the ranker; ignored if the ranking algorithm is not one of RADIX_RANK_MATCH_EARLY_COUNTS_* */ int _RANK_NUM_PARTS, /** \brief Ranking algorithm used in the onesweep kernel. Only algorithms that @@ -106,7 +109,7 @@ struct AgentRadixSortOnesweep RANK_NUM_PARTS = AgentRadixSortOnesweepPolicy::RANK_NUM_PARTS, TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, RADIX_BITS = AgentRadixSortOnesweepPolicy::RADIX_BITS, - RADIX_DIGITS = 1 << RADIX_BITS, + RADIX_DIGITS = 1 << RADIX_BITS, BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS, FULL_BINS = BINS_PER_THREAD * BLOCK_THREADS == RADIX_DIGITS, WARP_THREADS = CUB_PTX_WARP_THREADS, @@ -127,7 +130,7 @@ struct AgentRadixSortOnesweep typename traits::template digit_extractor_t; typedef PortionOffsetT AtomicOffsetT; - + static constexpr RadixRankAlgorithm RANK_ALGORITHM = AgentRadixSortOnesweepPolicy::RANK_ALGORITHM; static constexpr BlockScanAlgorithm SCAN_ALGORITHM = @@ -224,7 +227,7 @@ struct AgentRadixSortOnesweep __device__ __forceinline__ void LookbackPartial(int (&bins)[BINS_PER_THREAD]) { #pragma unroll - for (int u = 0; u < BINS_PER_THREAD; ++u) + for (int u = 0; u < BINS_PER_THREAD; ++u) { int bin = ThreadBin(u); if (FULL_BINS || bin < RADIX_DIGITS) @@ -260,7 +263,7 @@ struct AgentRadixSortOnesweep agent.TryShortCircuit(keys, bins); } }; - + __device__ __forceinline__ void LookbackGlobal(int (&bins)[BINS_PER_THREAD]) { #pragma unroll @@ -452,7 +455,7 @@ struct AgentRadixSortOnesweep { s.global_offsets[bin] = d_bins_in[bin] - offsets[u]; } - } + } } __device__ __forceinline__ void UpdateBinsGlobal(int (&bins)[BINS_PER_THREAD], @@ -512,7 +515,7 @@ struct AgentRadixSortOnesweep constexpr int ITEMS_PER_WARP = TILE_ITEMS / BLOCK_WARPS; constexpr int ALIGN = 8; constexpr auto CACHE_MODIFIER = STORE_CG; - + int warp_start = warp * ITEMS_PER_WARP; int warp_end = (warp + 1) * ITEMS_PER_WARP; int warp_offset = warp_start; @@ -596,11 +599,11 @@ struct AgentRadixSortOnesweep // compute digits corresponding to the keys int digits[ITEMS_PER_THREAD]; ComputeKeyDigits(digits); - + // load values ValueT values[ITEMS_PER_THREAD]; LoadValues(block_idx * TILE_ITEMS, values); - + // scatter values CTA_SYNC(); ScatterValuesShared(values, ranks); @@ -608,7 +611,7 @@ struct AgentRadixSortOnesweep CTA_SYNC(); ScatterValuesGlobal(digits); } - + __device__ __forceinline__ void GatherScatterValues( int (&ranks)[ITEMS_PER_THREAD], Int2Type keys_only) {} @@ -628,7 +631,7 @@ struct AgentRadixSortOnesweep BlockRadixRankT(s.rank_temp_storage).RankKeys( keys, ranks, digit_extractor(), exclusive_digit_prefix, CountsCallback(*this, bins, keys)); - + // scatter keys in shared memory CTA_SYNC(); ScatterKeysShared(keys, ranks); @@ -637,7 +640,7 @@ struct AgentRadixSortOnesweep LoadBinsToOffsetsGlobal(exclusive_digit_prefix); LookbackGlobal(bins); UpdateBinsGlobal(bins, exclusive_digit_prefix); - + // scatter keys in global memory CTA_SYNC(); ScatterKeysGlobal(); diff --git a/cub/cub/agent/agent_radix_sort_upsweep.cuh b/cub/cub/agent/agent_radix_sort_upsweep.cuh index 1a534454627..6b387b0b771 100644 --- a/cub/cub/agent/agent_radix_sort_upsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_upsweep.cuh @@ -33,12 +33,15 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../thread/thread_reduce.cuh" #include "../thread/thread_load.cuh" #include "../warp/warp_reduce.cuh" #include "../block/block_load.cuh" #include "../block/radix_rank_sort_operations.cuh" -#include "../config.cuh" #include "../util_type.cuh" #include "../iterator/cache_modified_input_iterator.cuh" @@ -321,7 +324,7 @@ struct AgentRadixSortUpsweep const OffsetT &block_end) { // Process partial tile if necessary using single loads - for (OffsetT offset = threadIdx.x; offset < block_end - block_offset; offset += BLOCK_THREADS) + for (OffsetT offset = threadIdx.x; offset < block_end - block_offset; offset += BLOCK_THREADS) { // Load and bucket key bit_ordered_type key = d_keys_in[block_offset + offset]; @@ -346,7 +349,7 @@ struct AgentRadixSortUpsweep : temp_storage(temp_storage.Alias()), d_keys_in(reinterpret_cast(d_keys_in)), - current_bit(current_bit), + current_bit(current_bit), num_bits(num_bits), decomposer(decomposer) {} diff --git a/cub/cub/agent/agent_reduce.cuh b/cub/cub/agent/agent_reduce.cuh index 4a29d707b30..15da056b57b 100644 --- a/cub/cub/agent/agent_reduce.cuh +++ b/cub/cub/agent/agent_reduce.cuh @@ -13,9 +13,9 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; @@ -27,17 +27,20 @@ ******************************************************************************/ /** - * @file cub::AgentReduce implements a stateful abstraction of CUDA thread + * @file cub::AgentReduce implements a stateful abstraction of CUDA thread * blocks for participating in device-wide reduction. */ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include -#include #include #include #include @@ -368,7 +371,7 @@ struct AgentReduce .Reduce(thread_aggregate, reduction_op, valid_items); } - // Extracting this into a function saves 8% of generated kernel size by allowing to reuse + // Extracting this into a function saves 8% of generated kernel size by allowing to reuse // the block reduction below. This also workaround hang in nvcc. ConsumeFullTileRange(thread_aggregate, even_share, can_vectorize); @@ -439,7 +442,7 @@ private: even_share.block_offset += even_share.block_stride; - // Consume subsequent full tiles of input, at least one full tile was processed, so + // Consume subsequent full tiles of input, at least one full tile was processed, so // `even_share.block_end >= TILE_ITEMS` while (even_share.block_offset <= even_share.block_end - TILE_ITEMS) { diff --git a/cub/cub/agent/agent_reduce_by_key.cuh b/cub/cub/agent/agent_reduce_by_key.cuh index f04fb73f053..428bc06591e 100644 --- a/cub/cub/agent/agent_reduce_by_key.cuh +++ b/cub/cub/agent/agent_reduce_by_key.cuh @@ -33,12 +33,15 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include #include #include -#include #include #include @@ -68,8 +71,8 @@ CUB_NAMESPACE_BEGIN * @tparam _SCAN_ALGORITHM * The BlockScan algorithm to use * - * @tparam DelayConstructorT - * Implementation detail, do not specify directly, requirements on the + * @tparam DelayConstructorT + * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template #include "single_pass_scan_operators.cuh" @@ -41,7 +45,6 @@ #include "../block/block_scan.cuh" #include "../block/block_exchange.cuh" #include "../block/block_discontinuity.cuh" -#include "../config.cuh" #include "../grid/grid_queue.cuh" #include "../iterator/cache_modified_input_iterator.cuh" #include "../iterator/constant_input_iterator.cuh" @@ -56,28 +59,28 @@ CUB_NAMESPACE_BEGIN /** * Parameterizable tuning policy type for AgentRle * - * @tparam _BLOCK_THREADS + * @tparam _BLOCK_THREADS * Threads per thread block * - * @tparam _ITEMS_PER_THREAD + * @tparam _ITEMS_PER_THREAD * Items per thread (per tile of input) * - * @tparam _LOAD_ALGORITHM + * @tparam _LOAD_ALGORITHM * The BlockLoad algorithm to use * - * @tparam _LOAD_MODIFIER + * @tparam _LOAD_MODIFIER * Cache load modifier for reading input elements * - * @tparam _STORE_WARP_TIME_SLICING - * Whether or not only one warp's worth of shared memory should be allocated and time-sliced among - * block-warps during any store-related data transpositions + * @tparam _STORE_WARP_TIME_SLICING + * Whether or not only one warp's worth of shared memory should be allocated and time-sliced among + * block-warps during any store-related data transpositions * (versus each warp having its own storage) * - * @tparam _SCAN_ALGORITHM + * @tparam _SCAN_ALGORITHM * The BlockScan algorithm to use * - * @tparam DelayConstructorT - * Implementation detail, do not specify directly, requirements on the + * @tparam DelayConstructorT + * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template #include #include #include #include -#include #include #include @@ -73,8 +76,8 @@ CUB_NAMESPACE_BEGIN * @tparam _SCAN_ALGORITHM * The BlockScan algorithm to use * - * @tparam DelayConstructorT - * Implementation detail, do not specify directly, requirements on the + * @tparam DelayConstructorT + * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template #include #include #include #include -#include #include #include @@ -52,8 +55,8 @@ CUB_NAMESPACE_BEGIN /** * Parameterizable tuning policy type for AgentScanByKey * - * @tparam DelayConstructorT - * Implementation detail, do not specify directly, requirements on the + * @tparam DelayConstructorT + * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template #include "single_pass_scan_operators.cuh" @@ -40,7 +44,6 @@ #include "../block/block_store.cuh" #include "../block/block_scan.cuh" #include "../block/block_discontinuity.cuh" -#include "../config.cuh" #include "../iterator/cache_modified_input_iterator.cuh" #include "../iterator/constant_input_iterator.cuh" @@ -240,7 +243,7 @@ struct AgentSegmentFixup else BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); - // RLE + // RLE #pragma unroll for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) { diff --git a/cub/cub/agent/agent_segmented_radix_sort.cuh b/cub/cub/agent/agent_segmented_radix_sort.cuh index a629771120e..f791e2b609e 100644 --- a/cub/cub/agent/agent_segmented_radix_sort.cuh +++ b/cub/cub/agent/agent_segmented_radix_sort.cuh @@ -27,10 +27,13 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include -#include #include #include @@ -152,7 +155,7 @@ struct AgentSegmentedRadixSort // Lowest() -> -1.79769e+308 = 00...00b -> TwiddleIn -> -0 = 10...00b // LOWEST -> -nan = 11...11b -> TwiddleIn -> 0 = 00...00b - bit_ordered_type default_key_bits = IS_DESCENDING + bit_ordered_type default_key_bits = IS_DESCENDING ? traits::min_raw_binary_key(decomposer) : traits::max_raw_binary_key(decomposer); KeyT oob_default = reinterpret_cast(default_key_bits); diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index fcccffe7f6a..6faa04cbed0 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -33,6 +33,10 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include "single_pass_scan_operators.cuh" @@ -41,7 +45,6 @@ #include "../block/block_scan.cuh" #include "../block/block_exchange.cuh" #include "../block/block_discontinuity.cuh" -#include "../config.cuh" #include "../grid/grid_queue.cuh" #include "../iterator/cache_modified_input_iterator.cuh" @@ -55,23 +58,23 @@ CUB_NAMESPACE_BEGIN /** * Parameterizable tuning policy type for AgentSelectIf * - * @tparam _BLOCK_THREADS + * @tparam _BLOCK_THREADS * Threads per thread block * - * @tparam _ITEMS_PER_THREAD + * @tparam _ITEMS_PER_THREAD * Items per thread (per tile of input) * - * @tparam _LOAD_ALGORITHM + * @tparam _LOAD_ALGORITHM * The BlockLoad algorithm to use * - * @tparam _LOAD_MODIFIER + * @tparam _LOAD_MODIFIER * Cache load modifier for reading input elements * - * @tparam _SCAN_ALGORITHM + * @tparam _SCAN_ALGORITHM * The BlockScan algorithm to use * - * @tparam DelayConstructorT - * Implementation detail, do not specify directly, requirements on the + * @tparam DelayConstructorT + * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template #include "../util_type.cuh" #include "../block/block_reduce.cuh" #include "../block/block_scan.cuh" #include "../block/block_exchange.cuh" -#include "../config.cuh" #include "../thread/thread_search.cuh" #include "../thread/thread_operators.cuh" #include "../iterator/cache_modified_input_iterator.cuh" diff --git a/cub/cub/agent/agent_sub_warp_merge_sort.cuh b/cub/cub/agent/agent_sub_warp_merge_sort.cuh index 21ca2e5030c..80c8c7caa95 100644 --- a/cub/cub/agent/agent_sub_warp_merge_sort.cuh +++ b/cub/cub/agent/agent_sub_warp_merge_sort.cuh @@ -27,8 +27,11 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include -#include #include #include #include @@ -159,21 +162,21 @@ class AgentSubWarpSort return lhs == rhs; } - __device__ static bool get_oob_default(Int2Type /* is bool */) + __device__ static bool get_oob_default(Int2Type /* is bool */) { // Traits::MAX_KEY for `bool` is 0xFF which is different from `true` and makes // comparison with oob unreliable. return !IS_DESCENDING; } - __device__ static KeyT get_oob_default(Int2Type /* is bool */) + __device__ static KeyT get_oob_default(Int2Type /* is bool */) { // For FP64 the difference is: // Lowest() -> -1.79769e+308 = 00...00b -> TwiddleIn -> -0 = 10...00b // LOWEST -> -nan = 11...11b -> TwiddleIn -> 0 = 00...00b // Segmented sort doesn't support custom types at the moment. - bit_ordered_type default_key_bits = IS_DESCENDING + bit_ordered_type default_key_bits = IS_DESCENDING ? traits::min_raw_binary_key(detail::identity_decomposer_t{}) : traits::max_raw_binary_key(detail::identity_decomposer_t{}); return reinterpret_cast(default_key_bits); @@ -253,7 +256,7 @@ public: KeyT keys[PolicyT::ITEMS_PER_THREAD]; ValueT values[PolicyT::ITEMS_PER_THREAD]; - KeyT oob_default = + KeyT oob_default = AgentSubWarpSort::get_oob_default(Int2Type::value>{}); WarpLoadKeysT(storage.load_keys) diff --git a/cub/cub/agent/agent_three_way_partition.cuh b/cub/cub/agent/agent_three_way_partition.cuh index 93cc8d3966e..4186f61895c 100644 --- a/cub/cub/agent/agent_three_way_partition.cuh +++ b/cub/cub/agent/agent_three_way_partition.cuh @@ -27,6 +27,10 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include @@ -36,7 +40,6 @@ #include #include #include -#include #include diff --git a/cub/cub/agent/agent_unique_by_key.cuh b/cub/cub/agent/agent_unique_by_key.cuh index bf943a00b09..4d406f653a6 100644 --- a/cub/cub/agent/agent_unique_by_key.cuh +++ b/cub/cub/agent/agent_unique_by_key.cuh @@ -32,6 +32,10 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include @@ -51,8 +55,8 @@ CUB_NAMESPACE_BEGIN /** * Parameterizable tuning policy type for AgentUniqueByKey * - * @tparam DelayConstructorT - * Implementation detail, do not specify directly, requirements on the + * @tparam DelayConstructorT + * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template (num_items - tile_offset); OffsetT num_selections = ConsumeTile(num_remaining, - tile_idx, + tile_idx, tile_offset, tile_state); - if (threadIdx.x == 0) + if (threadIdx.x == 0) { *d_num_selected_out = num_selections; } diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index 342e859246c..c222718601e 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -33,9 +33,12 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include -#include #include #include #include @@ -110,22 +113,22 @@ enum ScanTileStatus SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available }; -namespace detail +namespace detail { template __device__ __forceinline__ void delay() { NV_IF_TARGET(NV_PROVIDES_SM_70, - (if (Delay > 0) + (if (Delay > 0) { - if (gridDim.x < GridThreshold) + if (gridDim.x < GridThreshold) { __threadfence_block(); } - else + else { - __nanosleep(Delay); + __nanosleep(Delay); } })); } @@ -134,15 +137,15 @@ template __device__ __forceinline__ void delay(int ns) { NV_IF_TARGET(NV_PROVIDES_SM_70, - (if (ns > 0) + (if (ns > 0) { - if (gridDim.x < GridThreshold) + if (gridDim.x < GridThreshold) { __threadfence_block(); } - else + else { - __nanosleep(ns); + __nanosleep(ns); } })); } @@ -194,7 +197,7 @@ struct no_delay_constructor_t { struct delay_t { - __device__ __forceinline__ void operator()() + __device__ __forceinline__ void operator()() { NV_IF_TARGET(NV_PROVIDES_SM_70, (), @@ -215,7 +218,7 @@ struct reduce_by_key_delay_constructor_t { struct delay_t { - __device__ __forceinline__ void operator()() + __device__ __forceinline__ void operator()() { NV_DISPATCH_TARGET( NV_IS_EXACTLY_SM_80, (delay();), @@ -262,7 +265,7 @@ struct exponential_backoff_constructor_t } }; - __device__ __forceinline__ exponential_backoff_constructor_t(unsigned int /* seed */) + __device__ __forceinline__ exponential_backoff_constructor_t(unsigned int /* seed */) { always_delay(); } @@ -437,7 +440,7 @@ struct exponential_backon_constructor_t unsigned int max_delay = InitialDelay; - __device__ __forceinline__ exponential_backon_constructor_t(unsigned int /* seed */) + __device__ __forceinline__ exponential_backon_constructor_t(unsigned int /* seed */) { always_delay(); } @@ -613,7 +616,7 @@ struct ScanTileState /** * Wait for the corresponding tile to become non-invalid */ - template > + template > __device__ __forceinline__ void WaitForValid( int tile_idx, StatusWord &status, @@ -628,7 +631,7 @@ struct ScanTileState } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)) - { + { delay_or_prevent_hoisting(); TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); tile_descriptor = reinterpret_cast(alias); @@ -642,11 +645,11 @@ struct ScanTileState * Loads and returns the tile's value. The returned value is undefined if either (a) the tile's status is invalid or * (b) there is no memory fence between reading a non-invalid status and the call to LoadValid. */ - __device__ __forceinline__ T LoadValid(int tile_idx) - { + __device__ __forceinline__ T LoadValid(int tile_idx) + { TxnWord alias = d_tile_descriptors[TILE_STATUS_PADDING + tile_idx]; TileDescriptor tile_descriptor = reinterpret_cast(alias); - return tile_descriptor.value; + return tile_descriptor.value; } }; @@ -704,7 +707,7 @@ struct ScanTileState error = CubDebug( AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -784,7 +787,7 @@ struct ScanTileState /** * Wait for the corresponding tile to become non-invalid */ - template + template __device__ __forceinline__ void WaitForValid( int tile_idx, StatusWord &status, @@ -798,7 +801,7 @@ struct ScanTileState __threadfence(); } while (WARP_ANY((status == SCAN_TILE_INVALID), 0xffffffff)); - if (status == StatusWord(SCAN_TILE_PARTIAL)) + if (status == StatusWord(SCAN_TILE_PARTIAL)) { value = ThreadLoad(d_tile_partial + TILE_STATUS_PADDING + tile_idx); } @@ -812,9 +815,9 @@ struct ScanTileState * Loads and returns the tile's value. The returned value is undefined if either (a) the tile's status is invalid or * (b) there is no memory fence between reading a non-invalid status and the call to LoadValid. */ - __device__ __forceinline__ T LoadValid(int tile_idx) - { - return d_tile_inclusive[TILE_STATUS_PADDING + tile_idx]; + __device__ __forceinline__ T LoadValid(int tile_idx) + { + return d_tile_inclusive[TILE_STATUS_PADDING + tile_idx]; } }; @@ -1008,7 +1011,7 @@ struct ReduceByKeyScanTileState /** * Wait for the corresponding tile to become non-invalid */ - template ::delay_t> + template ::delay_t> __device__ __forceinline__ void WaitForValid( int tile_idx, StatusWord &status, @@ -1058,8 +1061,8 @@ struct ReduceByKeyScanTileState * the current tile by using the call-back warp to wait on on * aggregates/prefixes from predecessor tiles to become available. * - * @tparam DelayConstructorT - * Implementation detail, do not specify directly, requirements on the + * @tparam DelayConstructorT + * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template < @@ -1096,7 +1099,7 @@ struct TilePrefixCallbackOp T exclusive_prefix; ///< Exclusive prefix for the tile T inclusive_prefix; ///< Inclusive prefix for the tile - // Constructs prefix functor for a given tile index. + // Constructs prefix functor for a given tile index. // Precondition: thread blocks processing all of the predecessor tiles were scheduled. __device__ __forceinline__ TilePrefixCallbackOp(ScanTileStateT &tile_status, TempStorage &temp_storage, @@ -1117,7 +1120,7 @@ struct TilePrefixCallbackOp {} // Block until all predecessors within the warp-wide window have non-invalid status - template > + template > __device__ __forceinline__ void ProcessWindow( int predecessor_idx, ///< Preceding tile index to inspect diff --git a/cub/cub/block/block_adjacent_difference.cuh b/cub/cub/block/block_adjacent_difference.cuh index 524ffbebfaa..33f2c321a55 100644 --- a/cub/cub/block/block_adjacent_difference.cuh +++ b/cub/cub/block/block_adjacent_difference.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -36,6 +36,9 @@ #pragma once #include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../util_type.cuh" #include "../util_ptx.cuh" @@ -489,7 +492,7 @@ public: } /** - * @brief Subtracts the left element of each adjacent pair of elements + * @brief Subtracts the left element of each adjacent pair of elements * partitioned across a CUDA thread block. * * @par @@ -497,7 +500,7 @@ public: * - \smemreuse * * @par Snippet - * The code snippet below illustrates how to use @p BlockAdjacentDifference + * The code snippet below illustrates how to use @p BlockAdjacentDifference * to compute the left difference between adjacent elements. * * @par @@ -516,7 +519,7 @@ public: * * __global__ void ExampleKernel(...) * { - * // Specialize BlockAdjacentDifference for a 1D block of + * // Specialize BlockAdjacentDifference for a 1D block of * // 128 threads of type int * using BlockAdjacentDifferenceT = * cub::BlockAdjacentDifference; @@ -607,7 +610,7 @@ public: } /** - * @brief Subtracts the left element of each adjacent pair of elements + * @brief Subtracts the left element of each adjacent pair of elements * partitioned across a CUDA thread block. * * @par @@ -615,7 +618,7 @@ public: * - \smemreuse * * @par Snippet - * The code snippet below illustrates how to use @p BlockAdjacentDifference + * The code snippet below illustrates how to use @p BlockAdjacentDifference * to compute the left difference between adjacent elements. * * @par @@ -634,7 +637,7 @@ public: * * __global__ void ExampleKernel(...) * { - * // Specialize BlockAdjacentDifference for a 1D block of + * // Specialize BlockAdjacentDifference for a 1D block of * // 128 threads of type int * using BlockAdjacentDifferenceT = * cub::BlockAdjacentDifference; @@ -725,9 +728,9 @@ public: { output[0] = input[0]; } - else if (linear_tid == 0) + else if (linear_tid == 0) { - output[0] = difference_op(input[0], + output[0] = difference_op(input[0], tile_predecessor_item); } else diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh index a3bf17f3190..98801ef3c88 100644 --- a/cub/cub/block/block_discontinuity.cuh +++ b/cub/cub/block/block_discontinuity.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -34,6 +34,9 @@ #pragma once #include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../util_type.cuh" #include "../util_ptx.cuh" diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index 5f4bc2e84a8..ae96f8abfe3 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,7 +33,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include diff --git a/cub/cub/block/block_histogram.cuh b/cub/cub/block/block_histogram.cuh index b9ab759607d..ee0c851e892 100644 --- a/cub/cub/block/block_histogram.cuh +++ b/cub/cub/block/block_histogram.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,9 +33,12 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "specializations/block_histogram_sort.cuh" #include "specializations/block_histogram_atomic.cuh" -#include "../config.cuh" #include "../util_ptx.cuh" CUB_NAMESPACE_BEGIN diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index b419ab6e726..bb6081b5b57 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -33,12 +33,15 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include "../block/block_exchange.cuh" #include "../iterator/cache_modified_input_iterator.cuh" -#include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh index dc07ef6c294..49693540fba 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -27,6 +27,10 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 18655af4f0b..64fcd7979c7 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,13 +33,16 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include "../thread/thread_reduce.cuh" #include "../thread/thread_scan.cuh" #include "../block/block_scan.cuh" #include "../block/radix_rank_sort_operations.cuh" -#include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" @@ -102,7 +105,7 @@ struct warp_in_block_matcher_t { static __device__ std::uint32_t match_any(std::uint32_t label, std::uint32_t warp_id) { - if (warp_id == static_cast(PartialWarpId)) + if (warp_id == static_cast(PartialWarpId)) { return MatchAny(label); } @@ -155,7 +158,7 @@ struct warp_in_block_matcher_t * constexpr int block_threads = 2; * constexpr int radix_bits = 5; * - * // Specialize BlockRadixRank for a 1D block of 2 threads + * // Specialize BlockRadixRank for a 1D block of 2 threads * using block_radix_rank = cub::BlockRadixRank; * using storage_t = typename block_radix_rank::TempStorage; * @@ -172,7 +175,7 @@ struct warp_in_block_matcher_t * * ... * \endcode - * Suppose the set of input `keys` across the block of threads is `{ [16,10], [9,11] }`. + * Suppose the set of input `keys` across the block of threads is `{ [16,10], [9,11] }`. * The corresponding output `ranks` in those threads will be `{ [3,1], [0,2] }`. * * \par Re-using dynamically allocating shared memory @@ -758,8 +761,8 @@ public: // Mask of peers who have same digit as me uint32_t peer_mask = detail::warp_in_block_matcher_t< - RADIX_BITS, - PARTIAL_WARP_THREADS, + RADIX_BITS, + PARTIAL_WARP_THREADS, WARPS - 1>::match_any(digit, warp_id); // Pointer to smem digit counter for this key @@ -918,7 +921,7 @@ struct BlockRadixRankMatchEarlyCounts // types typedef cub::BlockScan BlockScan; - + // temporary storage struct TempStorage @@ -981,7 +984,7 @@ struct BlockRadixRankMatchEarlyCounts for (int bin = lane; bin < RADIX_DIGITS; bin += WARP_THREADS) { match_masks[bin] = 0; - } + } } WARP_SYNC(WARP_MASK); @@ -992,7 +995,7 @@ struct BlockRadixRankMatchEarlyCounts { atomicAdd(&warp_histograms[Digit(keys[u])][part], 1); } - + // sum different parts; // no extra work is necessary if NUM_PARTS == 1 if (NUM_PARTS > 1) @@ -1025,7 +1028,7 @@ struct BlockRadixRankMatchEarlyCounts { // sum up warp-private histograms #pragma unroll - for (int u = 0; u < BINS_PER_THREAD; ++u) + for (int u = 0; u < BINS_PER_THREAD; ++u) { bins[u] = 0; int bin = ThreadBin(u); @@ -1127,12 +1130,12 @@ struct BlockRadixRankMatchEarlyCounts int (&exclusive_digit_prefix)[BINS_PER_THREAD]) { ComputeHistogramsWarp(keys); - + CTA_SYNC(); int bins[BINS_PER_THREAD]; ComputeOffsetsWarpUpsweep(bins); callback(bins); - + BlockScan(s.prefix_tmp).ExclusiveSum(bins, exclusive_digit_prefix); ComputeOffsetsWarpDownsweep(exclusive_digit_prefix); @@ -1164,7 +1167,7 @@ struct BlockRadixRankMatchEarlyCounts { BlockRadixRankMatchInternal internal(temp_storage, digit_extractor, callback); - internal.RankKeys(keys, ranks, exclusive_digit_prefix); + internal.RankKeys(keys, ranks, exclusive_digit_prefix); } template @@ -1193,13 +1196,13 @@ struct BlockRadixRankMatchEarlyCounts #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document -namespace detail +namespace detail { -// `BlockRadixRank` doesn't conform to the typical pattern, not exposing the algorithm -// template parameter. Other algorithms don't provide the same template parameters, not allowing -// multi-dimensional thread block specializations. -// +// `BlockRadixRank` doesn't conform to the typical pattern, not exposing the algorithm +// template parameter. Other algorithms don't provide the same template parameters, not allowing +// multi-dimensional thread block specializations. +// // TODO(senior-zero) for 3.0: // - Put existing implementations into the detail namespace // - Support multi-dimensional thread blocks in the rest of implementations diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index e275d6f611e..0a7dc571c1b 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -34,18 +34,21 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "block_exchange.cuh" #include "block_radix_rank.cuh" #include "radix_rank_sort_operations.cuh" -#include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" CUB_NAMESPACE_BEGIN //! @rst -//! BlockRadixSort class provides :ref:`collective ` methods for sorting -//! items partitioned across a CUDA thread block using a radix sorting method. +//! BlockRadixSort class provides :ref:`collective ` methods for sorting +//! items partitioned across a CUDA thread block using a radix sorting method. //! //! .. image:: ../img/sorting_logo.png //! :align: center @@ -156,7 +159,7 @@ CUB_NAMESPACE_BEGIN //! ... //! //! Suppose the set of input ``thread_keys`` across the block of threads is -//! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. +//! ``{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }``. //! The corresponding output ``thread_keys`` in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }``. //! @@ -173,40 +176,40 @@ CUB_NAMESPACE_BEGIN //! //! @ingroup BlockModule //! -//! @tparam KeyT +//! @tparam KeyT //! KeyT type //! -//! @tparam BLOCK_DIM_X +//! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension //! -//! @tparam ITEMS_PER_THREAD +//! @tparam ITEMS_PER_THREAD //! The number of items per thread //! -//! @tparam ValueT +//! @tparam ValueT //! **[optional]** ValueT type (default: cub::NullType, which indicates a keys-only sort) //! -//! @tparam RADIX_BITS +//! @tparam RADIX_BITS //! **[optional]** The number of radix bits per digit place (default: 4 bits) //! -//! @tparam MEMOIZE_OUTER_SCAN -//! **[optional]** Whether or not to buffer outer raking scan partials to incur fewer shared memory -//! reads at the expense of higher register pressure (default: true for architectures SM35 and +//! @tparam MEMOIZE_OUTER_SCAN +//! **[optional]** Whether or not to buffer outer raking scan partials to incur fewer shared memory +//! reads at the expense of higher register pressure (default: true for architectures SM35 and //! newer, false otherwise). //! -//! @tparam INNER_SCAN_ALGORITHM -//! **[optional]** The cub::BlockScanAlgorithm algorithm to use +//! @tparam INNER_SCAN_ALGORITHM +//! **[optional]** The cub::BlockScanAlgorithm algorithm to use //! (default: cub::BLOCK_SCAN_WARP_SCANS) //! -//! @tparam SMEM_CONFIG +//! @tparam SMEM_CONFIG //! **[optional]*8 Shared memory bank mode (default: `cudaSharedMemBankSizeFourByte`) //! -//! @tparam BLOCK_DIM_Y +//! @tparam BLOCK_DIM_Y //! **[optional]** The thread block length in threads along the Y dimension (default: 1) //! -//! @tparam BLOCK_DIM_Z +//! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! -//! @tparam LEGACY_PTX_ARCH +//! @tparam LEGACY_PTX_ARCH //! **[optional]** Unused template < typename KeyT, @@ -575,7 +578,7 @@ public: } //! @rst - //! Performs an ascending block-wide radix sort over a + //! Performs an ascending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys. //! //! * @granularity @@ -608,27 +611,27 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template __device__ __forceinline__ // @@ -648,7 +651,7 @@ public: } //! @rst - //! Performs an ascending block-wide radix sort over a + //! Performs an ascending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys. //! //! * @granularity @@ -681,19 +684,19 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template __device__ __forceinline__ // @@ -758,7 +761,7 @@ public: } //! @rst - //! Performs an ascending block-wide radix sort over a + //! Performs an ascending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys and values. //! //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along" @@ -796,13 +799,13 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values @@ -810,16 +813,16 @@ public: //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template __device__ __forceinline__ // @@ -841,7 +844,7 @@ public: } //! @rst - //! Performs an ascending block-wide radix sort over a + //! Performs an ascending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys and values. //! //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along" @@ -879,13 +882,13 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values @@ -893,8 +896,8 @@ public: //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template __device__ __forceinline__ // @@ -959,7 +962,7 @@ public: } //! @rst - //! Performs a descending block-wide radix sort over a + //! Performs a descending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys. //! //! * @granularity @@ -992,27 +995,27 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template __device__ __forceinline__ // @@ -1035,7 +1038,7 @@ public: } //! @rst - //! Performs a descending block-wide radix sort over a + //! Performs a descending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys. //! //! * @granularity @@ -1068,19 +1071,19 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template __device__ __forceinline__ // @@ -1153,7 +1156,7 @@ public: } //! @rst - //! Performs a descending block-wide radix sort over a + //! Performs a descending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys and values. //! //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along" @@ -1191,13 +1194,13 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values @@ -1205,16 +1208,16 @@ public: //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template __device__ __forceinline__ // @@ -1236,7 +1239,7 @@ public: } //! @rst - //! Performs a descending block-wide radix sort over a + //! Performs a descending block-wide radix sort over a //! :ref:`blocked arrangement ` of keys and values. //! //! * BlockRadixSort can only accommodate one associated tile of values. To "truck along" @@ -1274,13 +1277,13 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values @@ -1288,8 +1291,8 @@ public: //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template __device__ __forceinline__ // @@ -1364,8 +1367,8 @@ public: } //! @rst - //! Performs an ascending block-wide radix sort over a - //! :ref:`blocked arrangement ` of keys, leaving them in a + //! Performs an ascending block-wide radix sort over a + //! :ref:`blocked arrangement ` of keys, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity @@ -1377,7 +1380,7 @@ public: //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a - //! tuple of references to relevant members of the key. + //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ @@ -1398,27 +1401,27 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template __device__ __forceinline__ // @@ -1441,8 +1444,8 @@ public: } //! @rst - //! Performs an ascending block-wide radix sort over a - //! :ref:`blocked arrangement ` of keys, leaving them in a + //! Performs an ascending block-wide radix sort over a + //! :ref:`blocked arrangement ` of keys, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity @@ -1454,7 +1457,7 @@ public: //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a - //! tuple of references to relevant members of the key. + //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ @@ -1475,19 +1478,19 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template __device__ __forceinline__ // @@ -1560,8 +1563,8 @@ public: } //! @rst - //! Performs an ascending block-wide radix sort over a - //! :ref:`blocked arrangement ` of keys and values, leaving them in a + //! Performs an ascending block-wide radix sort over a + //! :ref:`blocked arrangement ` of keys and values, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity @@ -1573,7 +1576,7 @@ public: //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a - //! tuple of references to relevant members of the key. + //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ @@ -1594,13 +1597,13 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values @@ -1608,16 +1611,16 @@ public: //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template __device__ __forceinline__ // @@ -1639,8 +1642,8 @@ public: } //! @rst - //! Performs an ascending block-wide radix sort over a - //! :ref:`blocked arrangement ` of keys and values, leaving them in a + //! Performs an ascending block-wide radix sort over a + //! :ref:`blocked arrangement ` of keys and values, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity @@ -1652,7 +1655,7 @@ public: //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a - //! tuple of references to relevant members of the key. + //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ @@ -1673,13 +1676,13 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values @@ -1687,8 +1690,8 @@ public: //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template __device__ __forceinline__ // @@ -1756,8 +1759,8 @@ public: } //! @rst - //! Performs a descending block-wide radix sort over a - //! :ref:`blocked arrangement ` of keys, leaving them in a + //! Performs a descending block-wide radix sort over a + //! :ref:`blocked arrangement ` of keys, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity @@ -1769,7 +1772,7 @@ public: //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a - //! tuple of references to relevant members of the key. + //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ @@ -1790,27 +1793,27 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template __device__ __forceinline__ // @@ -1833,8 +1836,8 @@ public: } //! @rst - //! Performs a descending block-wide radix sort over a - //! :ref:`blocked arrangement ` of keys, leaving them in a + //! Performs a descending block-wide radix sort over a + //! :ref:`blocked arrangement ` of keys, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity @@ -1846,7 +1849,7 @@ public: //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a - //! tuple of references to relevant members of the key. + //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ @@ -1867,19 +1870,19 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template __device__ __forceinline__ // @@ -1952,8 +1955,8 @@ public: } //! @rst - //! Performs a descending block-wide radix sort over a - //! :ref:`blocked arrangement ` of keys and values, leaving them in a + //! Performs a descending block-wide radix sort over a + //! :ref:`blocked arrangement ` of keys and values, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity @@ -1965,7 +1968,7 @@ public: //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a - //! tuple of references to relevant members of the key. + //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ @@ -1986,13 +1989,13 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values @@ -2000,16 +2003,16 @@ public: //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) template __device__ __forceinline__ // @@ -2031,8 +2034,8 @@ public: } //! @rst - //! Performs a descending block-wide radix sort over a - //! :ref:`blocked arrangement ` of keys and values, leaving them in a + //! Performs a descending block-wide radix sort over a + //! :ref:`blocked arrangement ` of keys and values, leaving them in a //! :ref:`striped arrangement `. //! //! * @granularity @@ -2044,7 +2047,7 @@ public: //! Let's consider a user-defined ``custom_t`` type below. To sort an array of //! ``custom_t`` objects, we have to tell CUB about relevant members of the //! ``custom_t`` type. We do this by providing a decomposer that returns a - //! tuple of references to relevant members of the key. + //! tuple of references to relevant members of the key. //! //! .. literalinclude:: ../../test/catch2_test_block_radix_sort_custom.cu //! :language: c++ @@ -2065,13 +2068,13 @@ public: //! @endrst //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in,out] keys + //! @param[in,out] keys //! Keys to sort //! //! @param[in,out] values @@ -2079,8 +2082,8 @@ public: //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. template __device__ __forceinline__ // diff --git a/cub/cub/block/block_raking_layout.cuh b/cub/cub/block/block_raking_layout.cuh index 4d49f54f2c8..573252bae44 100644 --- a/cub/cub/block/block_raking_layout.cuh +++ b/cub/cub/block/block_raking_layout.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -35,6 +35,9 @@ #pragma once #include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../util_type.cuh" CUB_NAMESPACE_BEGIN diff --git a/cub/cub/block/block_reduce.cuh b/cub/cub/block/block_reduce.cuh index 5a9db703db9..eae6257009f 100644 --- a/cub/cub/block/block_reduce.cuh +++ b/cub/cub/block/block_reduce.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,10 +33,13 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "specializations/block_reduce_raking.cuh" #include "specializations/block_reduce_raking_commutative_only.cuh" #include "specializations/block_reduce_warp_reductions.cuh" -#include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" #include "../thread/thread_operators.cuh" @@ -349,7 +352,7 @@ public: template __device__ __forceinline__ T Reduce( T input, ///< [in] Calling thread's input - ReductionOp reduction_op) ///< [in] Binary reduction functor + ReductionOp reduction_op) ///< [in] Binary reduction functor { return InternalBlockReduce(temp_storage).template Reduce(input, BLOCK_THREADS, reduction_op); } @@ -396,7 +399,7 @@ public: typename ReductionOp> __device__ __forceinline__ T Reduce( T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment - ReductionOp reduction_op) ///< [in] Binary reduction functor + ReductionOp reduction_op) ///< [in] Binary reduction functor { // Reduce partials T partial = internal::ThreadReduce(inputs, reduction_op); @@ -441,7 +444,7 @@ public: template __device__ __forceinline__ T Reduce( T input, ///< [in] Calling thread's input - ReductionOp reduction_op, ///< [in] Binary reduction functor + ReductionOp reduction_op, ///< [in] Binary reduction functor int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) { // Determine if we skip bounds checking diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index 41a3ab22b15..dd0340a25ee 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -28,6 +28,9 @@ #pragma once #include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../thread/thread_search.cuh" #include "../util_math.cuh" #include "../util_namespace.cuh" diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh index 544c15f0dbb..22689972cd5 100644 --- a/cub/cub/block/block_scan.cuh +++ b/cub/cub/block/block_scan.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,9 +33,12 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "specializations/block_scan_raking.cuh" #include "specializations/block_scan_warp_scans.cuh" -#include "../config.cuh" #include "../util_type.cuh" #include "../util_ptx.cuh" @@ -735,7 +738,7 @@ public: T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op) ///< [in] Binary scan functor + ScanOp scan_op) ///< [in] Binary scan functor { InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); } @@ -785,7 +788,7 @@ public: T input, ///< [in] Calling thread's input items T &output, ///< [out] Calling thread's output items (may be aliased to \p input) T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op, ///< [in] Binary scan functor + ScanOp scan_op, ///< [in] Binary scan functor T &block_aggregate) ///< [out] block-wide aggregate reduction of input items { InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); @@ -873,7 +876,7 @@ public: __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor + ScanOp scan_op, ///< [in] Binary scan functor BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); @@ -1717,7 +1720,7 @@ public: __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor + ScanOp scan_op) ///< [in] Binary scan functor { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); } @@ -1766,7 +1769,7 @@ public: __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor + ScanOp scan_op, ///< [in] Binary scan functor T &block_aggregate) ///< [out] block-wide aggregate reduction of input items { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); @@ -1854,7 +1857,7 @@ public: __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor + ScanOp scan_op, ///< [in] Binary scan functor BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op); @@ -1914,7 +1917,7 @@ public: __device__ __forceinline__ void InclusiveScan( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor + ScanOp scan_op) ///< [in] Binary scan functor { if (ITEMS_PER_THREAD == 1) { @@ -1984,7 +1987,7 @@ public: __device__ __forceinline__ void InclusiveScan( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor + ScanOp scan_op, ///< [in] Binary scan functor T &block_aggregate) ///< [out] block-wide aggregate reduction of input items { if (ITEMS_PER_THREAD == 1) @@ -2098,7 +2101,7 @@ public: __device__ __forceinline__ void InclusiveScan( T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor + ScanOp scan_op, ///< [in] Binary scan functor BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. { if (ITEMS_PER_THREAD == 1) diff --git a/cub/cub/block/block_shuffle.cuh b/cub/cub/block/block_shuffle.cuh index 58938301c11..c96b8066ae6 100644 --- a/cub/cub/block/block_shuffle.cuh +++ b/cub/cub/block/block_shuffle.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -34,6 +34,9 @@ #pragma once #include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../util_ptx.cuh" #include "../util_type.cuh" diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh index 2cb6bee4337..adbb5506ebe 100644 --- a/cub/cub/block/block_store.cuh +++ b/cub/cub/block/block_store.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,11 +33,14 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include "block_exchange.cuh" -#include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" @@ -878,7 +881,7 @@ private: StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; - + /****************************************************************************** * Type definitions diff --git a/cub/cub/block/radix_rank_sort_operations.cuh b/cub/cub/block/radix_rank_sort_operations.cuh index 679dfd4230e..31c0225488d 100644 --- a/cub/cub/block/radix_rank_sort_operations.cuh +++ b/cub/cub/block/radix_rank_sort_operations.cuh @@ -33,6 +33,10 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include @@ -40,7 +44,6 @@ #include -#include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" #include "cub/detail/cpp_compatibility.cuh" @@ -96,14 +99,14 @@ struct BaseDigitExtractor * key from a digit. */ template struct BFEDigitExtractor : BaseDigitExtractor -{ +{ using typename BaseDigitExtractor::UnsignedBits; - std::uint32_t bit_start; + std::uint32_t bit_start; std::uint32_t num_bits; explicit __device__ __forceinline__ BFEDigitExtractor( - std::uint32_t bit_start = 0, + std::uint32_t bit_start = 0, std::uint32_t num_bits = 0) : bit_start(bit_start) , num_bits(num_bits) @@ -139,14 +142,14 @@ struct ShiftDigitExtractor : BaseDigitExtractor #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document -namespace detail +namespace detail { template struct logic_helper_t; template -struct true_t +struct true_t { static constexpr bool value = true; }; @@ -181,9 +184,9 @@ for_each_member_impl(F f, const ::cuda::std::tuple& tpl) static_assert(sizeof...(Ts), "Empty aggregates are not supported"); // Most radix operations are indifferent to the order of operations. - // Conversely, the digit extractor traverses fields from the least significant - // to the most significant to imitate bitset printing where higher bits are on - // the left. It also maps to intuition, where something coming first is more + // Conversely, the digit extractor traverses fields from the least significant + // to the most significant to imitate bitset printing where higher bits are on + // the left. It also maps to intuition, where something coming first is more // important. Therefore, we traverse fields on the opposite order. for_each_member_impl_helper(f, tpl, THRUST_NS_QUALIFIER::make_reversed_index_sequence{}); } @@ -578,7 +581,7 @@ struct traits_t } // namespace detail #endif // DOXYGEN_SHOULD_SKIP_THIS -//! Twiddling keys for radix sort +//! Twiddling keys for radix sort template struct RadixSortTwiddle { diff --git a/cub/cub/block/specializations/block_histogram_atomic.cuh b/cub/cub/block/specializations/block_histogram_atomic.cuh index 93299fa7192..1e777ee4a7f 100644 --- a/cub/cub/block/specializations/block_histogram_atomic.cuh +++ b/cub/cub/block/specializations/block_histogram_atomic.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -35,6 +35,8 @@ #include "../../config.cuh" +_CCCL_IMPLICIT_SYSTEM_HEADER + CUB_NAMESPACE_BEGIN @@ -57,7 +59,7 @@ struct BlockHistogramAtomic /// Composite data onto an existing histogram template < typename T, - typename CounterT, + typename CounterT, int ITEMS_PER_THREAD> __device__ __forceinline__ void Composite( T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram diff --git a/cub/cub/block/specializations/block_histogram_sort.cuh b/cub/cub/block/specializations/block_histogram_sort.cuh index 79659ae106d..c23cdb0f6e1 100644 --- a/cub/cub/block/specializations/block_histogram_sort.cuh +++ b/cub/cub/block/specializations/block_histogram_sort.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,9 +33,12 @@ #pragma once +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../../block/block_radix_sort.cuh" #include "../../block/block_discontinuity.cuh" -#include "../../config.cuh" #include "../../util_ptx.cuh" CUB_NAMESPACE_BEGIN diff --git a/cub/cub/block/specializations/block_reduce_raking.cuh b/cub/cub/block/specializations/block_reduce_raking.cuh index 423f18f8679..7790a9de7c1 100644 --- a/cub/cub/block/specializations/block_reduce_raking.cuh +++ b/cub/cub/block/specializations/block_reduce_raking.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,10 +33,13 @@ #pragma once +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../../block/block_raking_layout.cuh" #include "../../warp/warp_reduce.cuh" #include "../../thread/thread_reduce.cuh" -#include "../../config.cuh" #include "../../util_ptx.cuh" CUB_NAMESPACE_BEGIN diff --git a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh index 1fc1caab15a..f0d119a01ea 100644 --- a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh +++ b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,10 +33,13 @@ #pragma once +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "block_reduce_raking.cuh" #include "../../warp/warp_reduce.cuh" #include "../../thread/thread_reduce.cuh" -#include "../../config.cuh" #include "../../util_ptx.cuh" CUB_NAMESPACE_BEGIN diff --git a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh index 1e51a9fcfe1..d7db7b8f1d2 100644 --- a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,7 +33,10 @@ #pragma once -#include +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include diff --git a/cub/cub/block/specializations/block_scan_raking.cuh b/cub/cub/block/specializations/block_scan_raking.cuh index 4891dad141d..b35e71d45c4 100644 --- a/cub/cub/block/specializations/block_scan_raking.cuh +++ b/cub/cub/block/specializations/block_scan_raking.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -34,7 +34,9 @@ #pragma once -#include +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER #include #include @@ -156,7 +158,7 @@ struct BlockScanRaking CopySegment(out, in, Int2Type()); } - + /// Templated copy (base case) __device__ __forceinline__ void CopySegment( T* /*out*/, ///< [out] Out array diff --git a/cub/cub/block/specializations/block_scan_warp_scans.cuh b/cub/cub/block/specializations/block_scan_warp_scans.cuh index f76131a7856..e050a9b88b4 100644 --- a/cub/cub/block/specializations/block_scan_warp_scans.cuh +++ b/cub/cub/block/specializations/block_scan_warp_scans.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,7 +33,10 @@ #pragma once -#include +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include diff --git a/cub/cub/config.cuh b/cub/cub/config.cuh index b909bbf7237..88aa182d02f 100644 --- a/cub/cub/config.cuh +++ b/cub/cub/config.cuh @@ -32,6 +32,11 @@ #pragma once +// For `_CCCL_IMPLICIT_SYSTEM_HEADER` +#include + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "util_arch.cuh" #include "util_compiler.cuh" #include "util_cpp_dialect.cuh" diff --git a/cub/cub/cub.cuh b/cub/cub/cub.cuh index c5cac22cf14..81c3d38a98d 100644 --- a/cub/cub/cub.cuh +++ b/cub/cub/cub.cuh @@ -36,6 +36,8 @@ // Static configuration #include "config.cuh" +_CCCL_IMPLICIT_SYSTEM_HEADER + // Block #include "block/block_adjacent_difference.cuh" #include "block/block_discontinuity.cuh" @@ -101,10 +103,8 @@ // Util #include "util_allocator.cuh" -#include "util_arch.cuh" #include "util_debug.cuh" #include "util_device.cuh" -#include "util_macro.cuh" #include "util_ptx.cuh" #include "util_temporary_storage.cuh" #include "util_type.cuh" diff --git a/cub/cub/detail/choose_offset.cuh b/cub/cub/detail/choose_offset.cuh index 4154123973a..b3ea6bcf27f 100644 --- a/cub/cub/detail/choose_offset.cuh +++ b/cub/cub/detail/choose_offset.cuh @@ -27,7 +27,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include diff --git a/cub/cub/detail/cpp_compatibility.cuh b/cub/cub/detail/cpp_compatibility.cuh index c4fbe649692..c0770598141 100644 --- a/cub/cub/detail/cpp_compatibility.cuh +++ b/cub/cub/detail/cpp_compatibility.cuh @@ -17,7 +17,9 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER #if CUB_CPP_DIALECT >= 2017 && __cpp_if_constexpr # define CUB_IF_CONSTEXPR if constexpr diff --git a/cub/cub/detail/detect_cuda_runtime.cuh b/cub/cub/detail/detect_cuda_runtime.cuh index b8e776db748..a7b025e4cdc 100644 --- a/cub/cub/detail/detect_cuda_runtime.cuh +++ b/cub/cub/detail/detect_cuda_runtime.cuh @@ -33,6 +33,11 @@ #pragma once +// We cannot use `cub/config.cuh` here due to circular dependencies +#include + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: diff --git a/cub/cub/detail/device_double_buffer.cuh b/cub/cub/detail/device_double_buffer.cuh index c427dcb438b..ee7670a06ba 100644 --- a/cub/cub/detail/device_double_buffer.cuh +++ b/cub/cub/detail/device_double_buffer.cuh @@ -16,6 +16,10 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include diff --git a/cub/cub/detail/device_synchronize.cuh b/cub/cub/detail/device_synchronize.cuh index 9da0a361aff..273d03fd374 100644 --- a/cub/cub/detail/device_synchronize.cuh +++ b/cub/cub/detail/device_synchronize.cuh @@ -16,10 +16,12 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include -#include -#include #include diff --git a/cub/cub/detail/exec_check_disable.cuh b/cub/cub/detail/exec_check_disable.cuh index c5f4b4572c7..8a4fe75fad8 100644 --- a/cub/cub/detail/exec_check_disable.cuh +++ b/cub/cub/detail/exec_check_disable.cuh @@ -16,7 +16,9 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER /** * @def CUB_EXEC_CHECK_DISABLE diff --git a/cub/cub/detail/strong_load.cuh b/cub/cub/detail/strong_load.cuh index 12e6672b9eb..5dcf6d1f8ea 100644 --- a/cub/cub/detail/strong_load.cuh +++ b/cub/cub/detail/strong_load.cuh @@ -32,7 +32,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include diff --git a/cub/cub/detail/strong_store.cuh b/cub/cub/detail/strong_store.cuh index fd293519a38..ab9805218e1 100644 --- a/cub/cub/detail/strong_store.cuh +++ b/cub/cub/detail/strong_store.cuh @@ -32,7 +32,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include diff --git a/cub/cub/detail/temporary_storage.cuh b/cub/cub/detail/temporary_storage.cuh index 51cb3cc855c..9881b0950bd 100644 --- a/cub/cub/detail/temporary_storage.cuh +++ b/cub/cub/detail/temporary_storage.cuh @@ -16,6 +16,10 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include diff --git a/cub/cub/detail/type_traits.cuh b/cub/cub/detail/type_traits.cuh index 77903306250..c07ee7a5aad 100644 --- a/cub/cub/detail/type_traits.cuh +++ b/cub/cub/detail/type_traits.cuh @@ -32,6 +32,10 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include @@ -51,7 +55,7 @@ using invoke_result_t = /// The type of intermediate accumulator (according to P2322R6) template -using accumulator_t = +using accumulator_t = typename ::cuda::std::decay>::type; } // namespace detail diff --git a/cub/cub/detail/uninitialized_copy.cuh b/cub/cub/detail/uninitialized_copy.cuh index 2b3e4b1da26..807a458011e 100644 --- a/cub/cub/detail/uninitialized_copy.cuh +++ b/cub/cub/detail/uninitialized_copy.cuh @@ -12,9 +12,9 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; @@ -27,7 +27,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include @@ -48,7 +51,7 @@ __host__ __device__ void uninitialized_copy(T *ptr, U &&val) template ::value, + ::cuda::std::is_trivially_copyable::value, int >::type = 0> __host__ __device__ void uninitialized_copy(T *ptr, U &&val) @@ -56,7 +59,7 @@ __host__ __device__ void uninitialized_copy(T *ptr, U &&val) *ptr = ::cuda::std::forward(val); } -template ::value, diff --git a/cub/cub/device/device_adjacent_difference.cuh b/cub/cub/device/device_adjacent_difference.cuh index 0f614d1876d..aedac4f69f9 100644 --- a/cub/cub/device/device_adjacent_difference.cuh +++ b/cub/cub/device/device_adjacent_difference.cuh @@ -27,7 +27,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include diff --git a/cub/cub/device/device_copy.cuh b/cub/cub/device/device_copy.cuh index 445f5d862bc..54ac962f273 100644 --- a/cub/cub/device/device_copy.cuh +++ b/cub/cub/device/device_copy.cuh @@ -32,7 +32,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh index 4e71c04ed82..a4cddfd105d 100644 --- a/cub/cub/device/device_histogram.cuh +++ b/cub/cub/device/device_histogram.cuh @@ -13,9 +13,9 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; @@ -27,18 +27,21 @@ ******************************************************************************/ /** - * @file cub::DeviceHistogram provides device-wide parallel operations for - * constructing histogram(s) from a sequence of samples data residing + * @file cub::DeviceHistogram provides device-wide parallel operations for + * constructing histogram(s) from a sequence of samples data residing * within device-accessible memory. */ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include -#include #include #include @@ -46,8 +49,8 @@ CUB_NAMESPACE_BEGIN /** - * @brief DeviceHistogram provides device-wide parallel operations for - * constructing histogram(s) from a sequence of samples data residing + * @brief DeviceHistogram provides device-wide parallel operations for + * constructing histogram(s) from a sequence of samples data residing * within device-accessible memory. ![](histogram_logo.png) * @ingroup SingleModule * @@ -67,7 +70,7 @@ struct DeviceHistogram //@{ /** - * @brief Computes an intensity histogram from a sequence of data samples + * @brief Computes an intensity histogram from a sequence of data samples * using equal-width bins. * * @par @@ -97,7 +100,7 @@ struct DeviceHistogram * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input samples and output histogram * int num_samples; // e.g., 10 * float* d_samples; // e.g., [2.2, 6.1, 7.1, 2.9, 3.5, 0.3, 2.9, 2.1, 6.1, 999.5] @@ -112,7 +115,7 @@ struct DeviceHistogram * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::HistogramEven( * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, + * d_samples, d_histogram, num_levels, * lower_level, upper_level, num_samples); * * // Allocate temporary storage @@ -121,56 +124,56 @@ struct DeviceHistogram * // Compute histograms * cub::DeviceHistogram::HistogramEven( * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, + * d_samples, d_histogram, num_levels, * lower_level, upper_level, num_samples); * * // d_histogram <-- [1, 5, 0, 3, 0, 0]; * @endcode * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam SampleIteratorT + * **[inferred]** Random-access input iterator type for reading input * samples \iterator * - * @tparam CounterT + * @tparam CounterT * **[inferred]** Integer type for histogram bin counters * - * @tparam LevelT + * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, + * @tparam OffsetT + * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no * work is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_samples + * @param[in] d_samples * The pointer to the input sequence of data samples. * - * @param[out] d_histogram - * The pointer to the histogram counter output array of length + * @param[out] d_histogram + * The pointer to the histogram counter output array of length * `num_levels - 1`. * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples. + * @param[in] num_levels + * The number of boundaries (levels) for delineating histogram samples. * Implies that the number of bins is `num_levels - 1`. * - * @param[in] lower_level + * @param[in] lower_level * The lower sample value bound (inclusive) for the lowest histogram bin. * - * @param[in] upper_level + * @param[in] upper_level * The upper sample value bound (exclusive) for the highest histogram bin. * - * @param[in] num_samples + * @param[in] num_samples * The number of input samples (i.e., the length of `d_samples`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template ` must be valid, and both LevelT * and SampleT must be valid arithmetic types. The common type must be @@ -277,7 +280,7 @@ struct DeviceHistogram * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input samples and output histogram * int num_row_samples; // e.g., 5 * int num_rows; // e.g., 2; @@ -310,57 +313,57 @@ struct DeviceHistogram * // d_histogram <-- [1, 5, 0, 3, 0, 0]; * @endcode * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading + * @tparam SampleIteratorT + * **[inferred]** Random-access input iterator type for reading * input samples. \iterator * - * @tparam CounterT + * @tparam CounterT * **[inferred]** Integer type for histogram bin counters * - * @tparam LevelT + * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) * - * @tparam OffsetT + * @tparam OffsetT * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no * work is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_samples + * @param[in] d_samples * The pointer to the input sequence of data samples. * - * @param[out] d_histogram - * The pointer to the histogram counter output array of + * @param[out] d_histogram + * The pointer to the histogram counter output array of * length `num_levels - 1`. * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples. + * @param[in] num_levels + * The number of boundaries (levels) for delineating histogram samples. * Implies that the number of bins is `num_levels - 1`. * - * @param[in] lower_level + * @param[in] lower_level * The lower sample value bound (inclusive) for the lowest histogram bin. * - * @param[in] upper_level + * @param[in] upper_level * The upper sample value bound (exclusive) for the highest histogram bin. * - * @param[in] num_row_samples + * @param[in] num_row_samples * The number of data samples per row in the region of interest * - * @param[in] num_rows + * @param[in] num_rows * The number of rows in the region of interest * - * @param[in] row_stride_bytes - * The number of bytes between starts of consecutive rows in + * @param[in] row_stride_bytes + * The number of bytes between starts of consecutive rows in * the region of interest * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template i is + * - The number of histogram bins for channeli is * `num_levels[i] - 1`. * - For channeli, the range of values for all histogram bins - * have the same width: + * have the same width: * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` * - If the common type of sample and level is of integral type, the bin for a sample is * computed as `(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - @@ -473,7 +476,7 @@ struct DeviceHistogram * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input samples and output histograms * int num_pixels; // e.g., 5 * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), @@ -490,7 +493,7 @@ struct DeviceHistogram * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::MultiHistogramEven<4, 3>( * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, + * d_samples, d_histogram, num_levels, * lower_level, upper_level, num_pixels); * * // Allocate temporary storage @@ -499,7 +502,7 @@ struct DeviceHistogram * // Compute histograms * cub::DeviceHistogram::MultiHistogramEven<4, 3>( * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, + * d_samples, d_histogram, num_levels, * lower_level, upper_level, num_pixels); * * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], @@ -507,65 +510,65 @@ struct DeviceHistogram * // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] * @endcode * - * @tparam NUM_CHANNELS - * Number of channels interleaved in the input data (may be greater than + * @tparam NUM_CHANNELS + * Number of channels interleaved in the input data (may be greater than * the number of channels being actively histogrammed) * - * @tparam NUM_ACTIVE_CHANNELS + * @tparam NUM_ACTIVE_CHANNELS * **[inferred]** Number of channels actively being histogrammed * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading + * @tparam SampleIteratorT + * **[inferred]** Random-access input iterator type for reading * input samples. \iterator * - * @tparam CounterT + * @tparam CounterT * **[inferred]** Integer type for histogram bin counters * - * @tparam LevelT + * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, + * @tparam OffsetT + * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no * work is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_samples - * The pointer to the multi-channel input sequence of data samples. - * The samples from different channels are assumed to be interleaved - * (e.g., an array of 32-bit pixels where each pixel consists of four + * @param[in] d_samples + * The pointer to the multi-channel input sequence of data samples. + * The samples from different channels are assumed to be interleaved + * (e.g., an array of 32-bit pixels where each pixel consists of four * *RGBA* 8-bit samples). * * @param[out] d_histogram - * The pointers to the histogram counter output arrays, one for each active - * channel. For channeli, the allocation length of + * The pointers to the histogram counter output arrays, one for each active + * channel. For channeli, the allocation length of * `d_histogram[i]` should be `num_levels[i] - 1`. * * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples in - * each active channel. Implies that the number of bins for + * The number of boundaries (levels) for delineating histogram samples in + * each active channel. Implies that the number of bins for * channeli is `num_levels[i] - 1`. * * @param[in] lower_level - * The lower sample value bound (inclusive) for the lowest histogram bin in + * The lower sample value bound (inclusive) for the lowest histogram bin in * each active channel. * * @param[in] upper_level - * The upper sample value bound (exclusive) for the highest histogram bin + * The upper sample value bound (exclusive) for the highest histogram bin * in each active channel. * - * @param[in] num_pixels - * The number of multi-channel pixels + * @param[in] num_pixels + * The number of multi-channel pixels * (i.e., the length of `d_samples / NUM_CHANNELS`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template i is + * - The number of histogram bins for channeli is * `num_levels[i] - 1`. - * - For channeli, the range of values for all histogram - * bins have the same width: + * - For channeli, the range of values for all histogram + * bins have the same width: * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` * - If the common type of sample and level is of integral type, the bin for a sample is * computed as `(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - @@ -663,14 +666,14 @@ struct DeviceHistogram * `cudaErrorInvalidValue` is returned. If the common type is 128 bits wide, bin computation * will use 128-bit arithmetic and `cudaErrorInvalidValue` will only be returned if bin * computation would overflow for 128-bit arithmetic. - * - For a given row `r` in `[0, num_rows)`, and sample `s` in - * `[0, num_row_pixels)`, let - * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`, + * - For a given row `r` in `[0, num_rows)`, and sample `s` in + * `[0, num_row_pixels)`, let + * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`, * `sample_begin = row_begin + s * NUM_CHANNELS`, and * `sample_end = sample_begin + NUM_ACTIVE_CHANNELS`. For a given channel - * `c` in `[0, NUM_ACTIVE_CHANNELS)`, the ranges - * `[sample_begin, sample_end)` and - * `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap + * `c` in `[0, NUM_ACTIVE_CHANNELS)`, the ranges + * `[sample_begin, sample_end)` and + * `[d_histogram[c], d_histogram[c] + num_levels[c] - 1)` shall not overlap * in any way. * - `cuda::std::common_type` must be valid, and both LevelT * and SampleT must be valid arithmetic types. The common type must be @@ -678,15 +681,15 @@ struct DeviceHistogram * - @devicestorage * * @par Snippet - * The code snippet below illustrates the computation of three 256-bin - * *RGB* histograms from a 2x3 region of interest of within a flattened 2x4 + * The code snippet below illustrates the computation of three 256-bin + * *RGB* histograms from a 2x3 region of interest of within a flattened 2x4 * array of quad-channel *RGBA* pixels (8 bits per channel per pixel). * * @par * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for input + * // Declare, allocate, and initialize device-accessible pointers for input * // samples and output histograms * int num_row_pixels; // e.g., 3 * int num_rows; // e.g., 2 @@ -722,71 +725,71 @@ struct DeviceHistogram * // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] * @endcode * - * @tparam NUM_CHANNELS - * Number of channels interleaved in the input data (may be greater than + * @tparam NUM_CHANNELS + * Number of channels interleaved in the input data (may be greater than * the number of channels being actively histogrammed) * - * @tparam NUM_ACTIVE_CHANNELS + * @tparam NUM_ACTIVE_CHANNELS * **[inferred]** Number of channels actively being histogrammed * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam SampleIteratorT + * **[inferred]** Random-access input iterator type for reading input * samples. \iterator * - * @tparam CounterT + * @tparam CounterT * **[inferred]** Integer type for histogram bin counters * - * @tparam LevelT + * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, + * @tparam OffsetT + * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no * work is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_samples - * The pointer to the multi-channel input sequence of data samples. The - * samples from different channels are assumed to be interleaved (e.g., - * an array of 32-bit pixels where each pixel consists of four + * @param[in] d_samples + * The pointer to the multi-channel input sequence of data samples. The + * samples from different channels are assumed to be interleaved (e.g., + * an array of 32-bit pixels where each pixel consists of four * *RGBA* 8-bit samples). * - * @param[out] d_histogram - * The pointers to the histogram counter output arrays, one for each - * active channel. For channeli, the allocation length + * @param[out] d_histogram + * The pointers to the histogram counter output arrays, one for each + * active channel. For channeli, the allocation length * of `d_histogram[i]` should be `num_levels[i] - 1`. * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples in - * each active channel. Implies that the number of bins for + * @param[in] num_levels + * The number of boundaries (levels) for delineating histogram samples in + * each active channel. Implies that the number of bins for * channeli is `num_levels[i] - 1`. * - * @param[in] lower_level - * The lower sample value bound (inclusive) for the lowest histogram bin in + * @param[in] lower_level + * The lower sample value bound (inclusive) for the lowest histogram bin in * each active channel. * - * @param[in] upper_level - * The upper sample value bound (exclusive) for the highest histogram bin + * @param[in] upper_level + * The upper sample value bound (exclusive) for the highest histogram bin * in each active channel. * - * @param[in] num_row_pixels + * @param[in] num_row_pixels * The number of multi-channel pixels per row in the region of interest * - * @param[in] num_rows + * @param[in] num_rows * The number of rows in the region of interest * - * @param[in] row_stride_bytes - * The number of bytes between starts of consecutive rows in the region of + * @param[in] row_stride_bytes + * The number of bytes between starts of consecutive rows in the region of * interest * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template i is `[level[i], level[i+1])` - * - The range `[d_histogram, d_histogram + num_levels - 1)` shall not - * overlap `[d_samples, d_samples + num_samples)` nor - * `[d_levels, d_levels + num_levels)` in any way. The ranges - * `[d_levels, d_levels + num_levels)` and + * - The range `[d_histogram, d_histogram + num_levels - 1)` shall not + * overlap `[d_samples, d_samples + num_samples)` nor + * `[d_levels, d_levels + num_levels)` in any way. The ranges + * `[d_levels, d_levels + num_levels)` and * `[d_samples, d_samples + num_samples)` may overlap. * - @devicestorage * @@ -921,7 +924,7 @@ struct DeviceHistogram * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for input + * // Declare, allocate, and initialize device-accessible pointers for input * // samples and output histogram * int num_samples; // e.g., 10 * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] @@ -949,49 +952,49 @@ struct DeviceHistogram * * @endcode * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading + * @tparam SampleIteratorT + * **[inferred]** Random-access input iterator type for reading * input samples.\iterator * - * @tparam CounterT + * @tparam CounterT * **[inferred]** Integer type for histogram bin counters * - * @tparam LevelT + * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, + * @tparam OffsetT + * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_samples + * @param[in] d_samples * The pointer to the input sequence of data samples. * - * @param[out] d_histogram - * The pointer to the histogram counter output array of length + * @param[out] d_histogram + * The pointer to the histogram counter output array of length * `num_levels - 1`. * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples. + * @param[in] num_levels + * The number of boundaries (levels) for delineating histogram samples. * Implies that the number of bins is `num_levels - 1`. * - * @param[in] d_levels - * The pointer to the array of boundaries (levels). Bin ranges are defined - * by consecutive boundary pairings: lower sample value boundaries are + * @param[in] d_levels + * The pointer to the array of boundaries (levels). Bin ranges are defined + * by consecutive boundary pairings: lower sample value boundaries are * inclusive and upper sample value boundaries are exclusive. * - * @param[in] num_samples + * @param[in] num_samples * The number of data samples per row in the region of interest * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template i is `[level[i], level[i+1])` - * - For a given row `r` in `[0, num_rows)`, let - * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)` and + * - For a given row `r` in `[0, num_rows)`, let + * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)` and * `row_end = row_begin + num_row_samples`. The range * `[d_histogram, d_histogram + num_levels - 1)` shall not overlap * `[row_begin, row_end)` nor `[d_levels, d_levels + num_levels)`. @@ -1116,55 +1119,55 @@ struct DeviceHistogram * // d_histogram <-- [1, 5, 0, 3, 0, 0]; * @endcode * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading + * @tparam SampleIteratorT + * **[inferred]** Random-access input iterator type for reading * input samples. \iterator - * - * @tparam CounterT + * + * @tparam CounterT * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT + * + * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, + * + * @tparam OffsetT + * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no * work is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_samples + * @param[in] d_samples * The pointer to the input sequence of data samples. * - * @param[out] d_histogram - * The pointer to the histogram counter output array of length + * @param[out] d_histogram + * The pointer to the histogram counter output array of length * `num_levels - 1`. * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples. + * @param[in] num_levels + * The number of boundaries (levels) for delineating histogram samples. * Implies that the number of bins is `num_levels - 1`. * - * @param[in] d_levels - * The pointer to the array of boundaries (levels). Bin ranges are defined - * by consecutive boundary pairings: lower sample value boundaries are + * @param[in] d_levels + * The pointer to the array of boundaries (levels). Bin ranges are defined + * by consecutive boundary pairings: lower sample value boundaries are * inclusive and upper sample value boundaries are exclusive. * - * @param[in] num_row_samples + * @param[in] num_row_samples * The number of data samples per row in the region of interest * - * @param[in] num_rows + * @param[in] num_rows * The number of rows in the region of interest * - * @param[in] row_stride_bytes - * The number of bytes between starts of consecutive rows in the region + * @param[in] row_stride_bytes + * The number of bytes between starts of consecutive rows in the region * of interest * - * @param[in] stream + * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ @@ -1233,24 +1236,24 @@ struct DeviceHistogram } /** - * @brief Computes per-channel intensity histograms from a sequence of - * multi-channel "pixel" data samples using the specified bin + * @brief Computes per-channel intensity histograms from a sequence of + * multi-channel "pixel" data samples using the specified bin * boundary levels. * * @par - * - The input is a sequence of *pixel* structures, where each pixel - * comprises a record of `NUM_CHANNELS` consecutive data samples + * - The input is a sequence of *pixel* structures, where each pixel + * comprises a record of `NUM_CHANNELS` consecutive data samples * (e.g., an *RGBA* pixel). - * - Of the `NUM_CHANNELS` specified, the function will only compute - * histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., *RGB* histograms + * - Of the `NUM_CHANNELS` specified, the function will only compute + * histograms for the first `NUM_ACTIVE_CHANNELS` (e.g., *RGB* histograms * from *RGBA* pixel samples). - * - The number of histogram bins for channeli is + * - The number of histogram bins for channeli is * `num_levels[i] - 1`. - * - For channeli, the range of values for all histogram - * bins have the same width: + * - For channeli, the range of values for all histogram + * bins have the same width: * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` - * - For given channels `c1` and `c2` in `[0, NUM_ACTIVE_CHANNELS)`, the - * range `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall + * - For given channels `c1` and `c2` in `[0, NUM_ACTIVE_CHANNELS)`, the + * range `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall * not overlap `[d_samples, d_samples + NUM_CHANNELS * num_pixels)` nor * `[d_levels[c2], d_levels[c2] + num_levels[c2])` in any way. * The ranges `[d_levels[c2], d_levels[c2] + num_levels[c2])` and @@ -1258,15 +1261,15 @@ struct DeviceHistogram * - @devicestorage * * @par Snippet - * The code snippet below illustrates the computation of three 4-bin *RGB* - * histograms from a quad-channel sequence of *RGBA* pixels + * The code snippet below illustrates the computation of three 4-bin *RGB* + * histograms from a quad-channel sequence of *RGBA* pixels * (8 bits per channel per pixel) * * @par * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input samples and output histograms * int num_pixels; // e.g., 5 * unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), @@ -1299,63 +1302,63 @@ struct DeviceHistogram * * @endcode * - * @tparam NUM_CHANNELS - * Number of channels interleaved in the input data (may be greater than + * @tparam NUM_CHANNELS + * Number of channels interleaved in the input data (may be greater than * the number of channels being actively histogrammed) - * - * @tparam NUM_ACTIVE_CHANNELS + * + * @tparam NUM_ACTIVE_CHANNELS * **[inferred]** Number of channels actively being histogrammed - * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading + * + * @tparam SampleIteratorT + * **[inferred]** Random-access input iterator type for reading * input samples. \iterator - * - * @tparam CounterT + * + * @tparam CounterT * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT + * + * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, + * + * @tparam OffsetT + * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no * work is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_samples - * The pointer to the multi-channel input sequence of data samples. - * The samples from different channels are assumed to be interleaved (e.g., - * an array of 32-bit pixels where each pixel consists of four *RGBA* + * @param[in] d_samples + * The pointer to the multi-channel input sequence of data samples. + * The samples from different channels are assumed to be interleaved (e.g., + * an array of 32-bit pixels where each pixel consists of four *RGBA* * 8-bit samples). * - * @param[out] d_histogram - * The pointers to the histogram counter output arrays, one for each active - * channel. For channeli, the allocation length of + * @param[out] d_histogram + * The pointers to the histogram counter output arrays, one for each active + * channel. For channeli, the allocation length of * `d_histogram[i]` should be `num_levels[i] - 1`. * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples in - * each active channel. Implies that the number of bins for + * @param[in] num_levels + * The number of boundaries (levels) for delineating histogram samples in + * each active channel. Implies that the number of bins for * channeli is `num_levels[i] - 1`. * - * @param[in] d_levels - * The pointers to the arrays of boundaries (levels), one for each active - * channel. Bin ranges are defined by consecutive boundary pairings: lower - * sample value boundaries are inclusive and upper sample value boundaries + * @param[in] d_levels + * The pointers to the arrays of boundaries (levels), one for each active + * channel. Bin ranges are defined by consecutive boundary pairings: lower + * sample value boundaries are inclusive and upper sample value boundaries * are exclusive. * - * @param[in] num_pixels - * The number of multi-channel pixels + * @param[in] num_pixels + * The number of multi-channel pixels * (i.e., the length of `d_samples / NUM_CHANNELS`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template i is + * - The number of histogram bins for channeli is * `num_levels[i] - 1`. - * - For channeli, the range of values for all histogram - * bins have the same width: + * - For channeli, the range of values for all histogram + * bins have the same width: * `(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)` - * - For a given row `r` in `[0, num_rows)`, and sample `s` in - * `[0, num_row_pixels)`, let - * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`, + * - For a given row `r` in `[0, num_rows)`, and sample `s` in + * `[0, num_row_pixels)`, let + * `row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`, * `sample_begin = row_begin + s * NUM_CHANNELS`, and * `sample_end = sample_begin + NUM_ACTIVE_CHANNELS`. For given channels * `c1` and `c2` in `[0, NUM_ACTIVE_CHANNELS)`, the range - * `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall not + * `[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)` shall not * overlap `[sample_begin, sample_end)` nor * `[d_levels[c2], d_levels[c2] + num_levels[c2])` in any way. The ranges - * `[d_levels[c2], d_levels[c2] + num_levels[c2])` and + * `[d_levels[c2], d_levels[c2] + num_levels[c2])` and * `[sample_begin, sample_end)` may overlap. * - @devicestorage * * @par Snippet - * The code snippet below illustrates the computation of three 4-bin *RGB* - * histograms from a 2x3 region of interest of within a flattened 2x4 array + * The code snippet below illustrates the computation of three 4-bin *RGB* + * histograms from a 2x3 region of interest of within a flattened 2x4 array * of quad-channel *RGBA* pixels (8 bits per channel per pixel). * * @par * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for input + * // Declare, allocate, and initialize device-accessible pointers for input * // samples and output histograms * int num_row_pixels; // e.g., 3 * int num_rows; // e.g., 2 @@ -1483,7 +1486,7 @@ struct DeviceHistogram * size_t temp_storage_bytes = 0; * cub::DeviceHistogram::MultiHistogramRange<4, 3>( * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, + * d_samples, d_histogram, num_levels, d_levels, * num_row_pixels, num_rows, row_stride_bytes); * * // Allocate temporary storage @@ -1492,7 +1495,7 @@ struct DeviceHistogram * // Compute histograms * cub::DeviceHistogram::MultiHistogramRange<4, 3>( * d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, + * d_samples, d_histogram, num_levels, * d_levels, num_row_pixels, num_rows, row_stride_bytes); * * // d_histogram <-- [ [2, 3, 0, 1], @@ -1501,68 +1504,68 @@ struct DeviceHistogram * * @endcode * - * @tparam NUM_CHANNELS - * Number of channels interleaved in the input data (may be greater than + * @tparam NUM_CHANNELS + * Number of channels interleaved in the input data (may be greater than * the number of channels being actively histogrammed) - * - * @tparam NUM_ACTIVE_CHANNELS + * + * @tparam NUM_ACTIVE_CHANNELS * **[inferred]** Number of channels actively being histogrammed - * - * @tparam SampleIteratorT - * **[inferred]** Random-access input iterator type for reading input + * + * @tparam SampleIteratorT + * **[inferred]** Random-access input iterator type for reading input * samples. \iterator - * - * @tparam CounterT + * + * @tparam CounterT * **[inferred]** Integer type for histogram bin counters - * - * @tparam LevelT + * + * @tparam LevelT * **[inferred]** Type for specifying boundaries (levels) - * - * @tparam OffsetT - * **[inferred]** Signed integer type for sequence offsets, list lengths, + * + * @tparam OffsetT + * **[inferred]** Signed integer type for sequence offsets, list lengths, * pointer differences, etc. \offset_size1 * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to \p temp_storage_bytes and no work is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_samples - * The pointer to the multi-channel input sequence of data samples. The - * samples from different channels are assumed to be interleaved (e.g., an - * array of 32-bit pixels where each pixel consists of four + * @param[in] d_samples + * The pointer to the multi-channel input sequence of data samples. The + * samples from different channels are assumed to be interleaved (e.g., an + * array of 32-bit pixels where each pixel consists of four * *RGBA* 8-bit samples). * - * @param[out] d_histogram - * The pointers to the histogram counter output arrays, one for each active - * channel. For channeli, the allocation length of + * @param[out] d_histogram + * The pointers to the histogram counter output arrays, one for each active + * channel. For channeli, the allocation length of * `d_histogram[i]` should be `num_levels[i] - 1`. * - * @param[in] num_levels - * The number of boundaries (levels) for delineating histogram samples in - * each active channel. Implies that the number of bins for + * @param[in] num_levels + * The number of boundaries (levels) for delineating histogram samples in + * each active channel. Implies that the number of bins for * channeli is `num_levels[i] - 1`. * - * @param[in] d_levels - * The pointers to the arrays of boundaries (levels), one for each active - * channel. Bin ranges are defined by consecutive boundary pairings: lower - * sample value boundaries are inclusive and upper sample value boundaries + * @param[in] d_levels + * The pointers to the arrays of boundaries (levels), one for each active + * channel. Bin ranges are defined by consecutive boundary pairings: lower + * sample value boundaries are inclusive and upper sample value boundaries * are exclusive. * - * @param[in] num_row_pixels + * @param[in] num_row_pixels * The number of multi-channel pixels per row in the region of interest * - * @param[in] num_rows + * @param[in] num_rows * The number of rows in the region of interest * - * @param[in] row_stride_bytes - * The number of bytes between starts of consecutive rows in the + * @param[in] row_stride_bytes + * The number of bytes between starts of consecutive rows in the * region of interest * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include diff --git a/cub/cub/device/device_merge_sort.cuh b/cub/cub/device/device_merge_sort.cuh index 9b17ac39125..7806921ab12 100644 --- a/cub/cub/device/device_merge_sort.cuh +++ b/cub/cub/device/device_merge_sort.cuh @@ -27,7 +27,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include diff --git a/cub/cub/device/device_partition.cuh b/cub/cub/device/device_partition.cuh index 6692a489b20..491e8b2e56b 100644 --- a/cub/cub/device/device_partition.cuh +++ b/cub/cub/device/device_partition.cuh @@ -34,10 +34,13 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include -#include #include #include #include @@ -85,9 +88,9 @@ struct DevicePartition * - Copies of the selected items are compacted into @p d_out and maintain * their original relative ordering, however copies of the unselected * items are compacted into the rear of @p d_out in reverse order. - * - The range `[d_out, d_out + num_items)` shall not overlap - * `[d_in, d_in + num_items)` nor `[d_flags, d_flags + num_items)` in any - * way. The range `[d_in, d_in + num_items)` may overlap + * - The range `[d_out, d_out + num_items)` shall not overlap + * `[d_in, d_in + num_items)` nor `[d_flags, d_flags + num_items)` in any + * way. The range `[d_in, d_in + num_items)` may overlap * `[d_flags, d_flags + num_items)`. * - \devicestorage * @@ -251,8 +254,8 @@ struct DevicePartition * - Copies of the selected items are compacted into @p d_out and maintain * their original relative ordering, however copies of the unselected * items are compacted into the rear of @p d_out in reverse order. - * - The range `[d_out, d_out + num_items)` shall not overlap - * `[d_in, d_in + num_items)` in any way. + * - The range `[d_out, d_out + num_items)` shall not overlap + * `[d_in, d_in + num_items)` in any way. * - \devicestorage * * @par Performance @@ -451,10 +454,10 @@ struct DevicePartition * - Copies of the unselected items are compacted into the * @p d_unselected_out in reverse order. * - The ranges `[d_out, d_out + num_items)`, - * `[d_first_part_out, d_first_part_out + d_num_selected_out[0])`, - * `[d_second_part_out, d_second_part_out + d_num_selected_out[1])`, - * `[d_unselected_out, d_unselected_out + num_items - d_num_selected_out[0] - d_num_selected_out[1])`, - * shall not overlap in any way. + * `[d_first_part_out, d_first_part_out + d_num_selected_out[0])`, + * `[d_second_part_out, d_second_part_out + d_num_selected_out[1])`, + * `[d_unselected_out, d_unselected_out + num_items - d_num_selected_out[0] - d_num_selected_out[1])`, + * shall not overlap in any way. * * @par Snippet * The code snippet below illustrates how this algorithm can partition an diff --git a/cub/cub/device/device_radix_sort.cuh b/cub/cub/device/device_radix_sort.cuh index e9d1b4d1d91..6644adff2a7 100644 --- a/cub/cub/device/device_radix_sort.cuh +++ b/cub/cub/device/device_radix_sort.cuh @@ -13,9 +13,9 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; @@ -26,13 +26,16 @@ * ******************************************************************************/ -//! @file cub::DeviceRadixSort provides device-wide, parallel operations for -//! computing a radix sort across a sequence of data items residing within +//! @file cub::DeviceRadixSort provides device-wide, parallel operations for +//! computing a radix sort across a sequence of data items residing within //! device-accessible memory. #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include @@ -43,33 +46,33 @@ CUB_NAMESPACE_BEGIN -//! @brief DeviceRadixSort provides device-wide, parallel operations for -//! computing a radix sort across a sequence of data items residing +//! @brief DeviceRadixSort provides device-wide, parallel operations for +//! computing a radix sort across a sequence of data items residing //! within device-accessible memory. ![](sorting_logo.png) //! @ingroup SingleModule -//! +//! //! @par Overview -//! The [*radix sorting method*](http://en.wikipedia.org/wiki/Radix_sort) -//! arranges items into ascending (or descending) order. The algorithm relies -//! upon a positional representation for keys, i.e., each key is comprised of an -//! ordered sequence of symbols (e.g., digits, characters, etc.) specified from -//! least-significant to most-significant. For a given input sequence of keys -//! and a set of rules specifying a total ordering of the symbolic alphabet, the +//! The [*radix sorting method*](http://en.wikipedia.org/wiki/Radix_sort) +//! arranges items into ascending (or descending) order. The algorithm relies +//! upon a positional representation for keys, i.e., each key is comprised of an +//! ordered sequence of symbols (e.g., digits, characters, etc.) specified from +//! least-significant to most-significant. For a given input sequence of keys +//! and a set of rules specifying a total ordering of the symbolic alphabet, the //! radix sorting method produces a lexicographic ordering of those keys. -//! +//! //! @par Supported Types //! DeviceRadixSort can sort all of the built-in C++ numeric primitive types //! (`unsigned char`, `int`, `double`, etc.) as well as CUDA's `__half` -//! and `__nv_bfloat16` 16-bit floating-point types. User-defined types are +//! and `__nv_bfloat16` 16-bit floating-point types. User-defined types are //! supported as long as decomposer object is provided. -//! +//! //! @par Floating-Point Special Cases -//! +//! //! - Positive and negative zeros are considered equivalent, and will be treated //! as such in the output. //! - No special handling is implemented for NaN values; these are sorted //! according to their bit representations after any transformations. -//! +//! //! @par Transformations //! Although the direct radix sorting method can only be applied to unsigned //! integral types, DeviceRadixSort is able to sort signed and floating-point @@ -78,41 +81,41 @@ CUB_NAMESPACE_BEGIN //! transformations must be considered when restricting the //! `[begin_bit, end_bit)` range, as the bitwise transformations will occur //! before the bit-range truncation. -//! +//! //! Any transformations applied to the keys prior to sorting are reversed //! while writing to the final output buffer. -//! +//! //! @par Type Specific Bitwise Transformations //! To convert the input values into a radix-sortable bitwise representation, //! the following transformations take place prior to sorting: -//! +//! //! - For unsigned integral values, the keys are used directly. //! - For signed integral values, the sign bit is inverted. //! - For positive floating point values, the sign bit is inverted. //! - For negative floating point values, the full key is inverted. -//! +//! //! For floating point types, positive and negative zero are a special case and //! will be considered equivalent during sorting. -//! +//! //! @par Descending Sort Bitwise Transformations //! If descending sort is used, the keys are inverted after performing any //! type-specific transformations, and the resulting keys are sorted in ascending //! order. -//! +//! //! @par Stability //! DeviceRadixSort is stable. For floating-point types, `-0.0` and `+0.0` are //! considered equal and appear in the result in the same order as they appear in //! the input. -//! +//! //! @par Usage Considerations //! @cdp_class{DeviceRadixSort} -//! +//! //! @par Performance -//! @linear_performance{radix sort} The following chart illustrates -//! DeviceRadixSort::SortKeys performance across different CUDA architectures +//! @linear_performance{radix sort} The following chart illustrates +//! DeviceRadixSort::SortKeys performance across different CUDA architectures //! for uniform-random `uint32` keys. //! @plots_below -//! +//! //! @image html lsb_radix_sort_int32_keys.png struct DeviceRadixSort { @@ -203,7 +206,7 @@ public: //! @name KeyT-value pairs //@{ - //! @brief Sorts key-value pairs into ascending order. + //! @brief Sorts key-value pairs into ascending order. //! (`~2N` auxiliary storage required) //! //! @par @@ -216,15 +219,15 @@ public: //! - `[d_keys_out, d_keys_out + num_items)` //! - `[d_values_in, d_values_in + num_items)` //! - `[d_values_out, d_values_out + num_items)` - //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - //! bits can be specified. This can reduce overall sorting overhead and + //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. - //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see + //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! - @devicestorage //! //! @par Performance - //! The following charts illustrate saturated sorting performance across + //! The following charts illustrate saturated sorting performance across //! different CUDA architectures for uniform-random `uint32, uint32` and //! `uint64, uint64` pairs, respectively. //! @@ -236,10 +239,10 @@ public: //! keys with associated vector of `int` values. //! @par //! @code - //! #include + //! #include //! // or equivalently //! - //! // Declare, allocate, and initialize device-accessible pointers + //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -265,49 +268,49 @@ public: //! // d_values_out <-- [5, 4, 3, 1, 2, 0, 6] //! @endcode //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam ValueT + //! @tparam ValueT //! **[inferred]** ValueT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in] d_keys_in + //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! - //! @param[out] d_keys_out + //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! - //! @param[in] d_values_in + //! @param[in] d_values_in //! Pointer to the corresponding input sequence of associated value items //! - //! @param[out] d_values_out - //! Pointer to the correspondingly-reordered output sequence of associated + //! @param[out] d_values_out + //! Pointer to the correspondingly-reordered output sequence of associated //! value items //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., sizeof(unsigned int) * 8) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t @@ -380,9 +383,9 @@ public: } #endif - //! @rst + //! @rst //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage. - //! + //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. @@ -394,10 +397,10 @@ public: //! * ``[d_values_in, d_values_in + num_items)`` //! * ``[d_values_out, d_values_out + num_items)`` //! - //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify - //! differentiating key bits. This can reduce overall sorting overhead and + //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify + //! differentiating key bits. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. - //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see + //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! @@ -417,7 +420,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairs``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -426,62 +429,62 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam ValueT + //! @tparam ValueT //! **[inferred]** ValueT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in] d_keys_in + //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! - //! @param[out] d_keys_out + //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! - //! @param[in] d_values_in + //! @param[in] d_values_in //! Pointer to the corresponding input sequence of associated value items //! - //! @param[out] d_values_out - //! Pointer to the correspondingly-reordered output sequence of associated + //! @param[out] d_values_out + //! Pointer to the correspondingly-reordered output sequence of associated //! value items //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // @@ -530,9 +533,9 @@ public: stream); } - //! @rst + //! @rst //! Sorts key-value pairs into ascending order using :math:`\approx 2N` auxiliary storage. - //! + //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. @@ -544,7 +547,7 @@ public: //! * ``[d_values_in, d_values_in + num_items)`` //! * ``[d_values_out, d_values_out + num_items)`` //! - //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see + //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! @@ -564,7 +567,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairs``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -573,54 +576,54 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam ValueT + //! @tparam ValueT //! **[inferred]** ValueT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in] d_keys_in + //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! - //! @param[out] d_keys_out + //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! - //! @param[in] d_values_in + //! @param[in] d_values_in //! Pointer to the corresponding input sequence of associated value items //! - //! @param[out] d_values_out - //! Pointer to the correspondingly-reordered output sequence of associated + //! @param[out] d_values_out + //! Pointer to the correspondingly-reordered output sequence of associated //! value items //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // @@ -665,7 +668,7 @@ public: stream); } - //! @brief Sorts key-value pairs into ascending order. + //! @brief Sorts key-value pairs into ascending order. //! (`~N` auxiliary storage required) //! //! @par @@ -673,7 +676,7 @@ public: //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). - //! - The contents of both buffers within each pair may be altered by the + //! - The contents of both buffers within each pair may be altered by the //! sorting operation. //! - In-place operations are not supported. There must be no overlap between //! any of the provided ranges: @@ -681,18 +684,18 @@ public: //! - `[d_keys.Alternate(), d_keys.Alternate() + num_items)` //! - `[d_values.Current(), d_values.Current() + num_items)` //! - `[d_values.Alternate(), d_values.Alternate() + num_items)` - //! - Upon completion, the sorting operation will update the "current" - //! indicator within each DoubleBuffer wrapper to reference which of the two - //! buffers now contains the sorted output sequence (a function of the + //! - Upon completion, the sorting operation will update the "current" + //! indicator within each DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). - //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - //! bits can be specified. This can reduce overall sorting overhead and + //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageP //! - @devicestorage //! //! @par Performance - //! The following charts illustrate saturated sorting performance across + //! The following charts illustrate saturated sorting performance across //! different CUDA architectures for uniform-random `uint32, uint32` and //! `uint64, uint64` pairs, respectively. //! @@ -700,14 +703,14 @@ public: //! @image html lsb_radix_sort_int64_pairs.png //! //! @par Snippet - //! The code snippet below illustrates the sorting of a device vector of `int` + //! The code snippet below illustrates the sorting of a device vector of `int` //! keys with associated vector of `int` values. //! @par //! @code - //! #include + //! #include //! // or equivalently //! - //! // Declare, allocate, and initialize device-accessible pointers for + //! // Declare, allocate, and initialize device-accessible pointers for //! // sorting data //! int num_items; // e.g., 7 //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -738,45 +741,45 @@ public: //! //! @endcode //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam ValueT + //! @tparam ValueT //! **[inferred]** ValueT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the //! required allocation size is written to @p temp_storage_bytes and no work is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in,out] d_keys - //! Reference to the double-buffer of keys whose "current" device-accessible - //! buffer contains the unsorted input keys and, upon return, is updated to + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! - //! @param[in,out] d_values - //! Double-buffer of values whose "current" device-accessible buffer - //! contains the unsorted input values and, upon return, is updated to point + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer + //! contains the unsorted input values and, upon return, is updated to point //! to the sorted output values //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t @@ -833,14 +836,14 @@ public: } #endif - //! @rst + //! @rst //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage. - //! + //! //! * The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). - //! * The contents of both buffers within each pair may be altered by the + //! * The contents of both buffers within each pair may be altered by the //! sorting operation. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: @@ -850,9 +853,9 @@ public: //! - ``[d_values.Current(), d_values.Current() + num_items)`` //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)`` //! - //! - Upon completion, the sorting operation will update the "current" - //! indicator within each DoubleBuffer wrapper to reference which of the two - //! buffers now contains the sorted output sequence (a function of the + //! - Upon completion, the sorting operation will update the "current" + //! indicator within each DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! - @devicestorageP //! - @devicestorage @@ -873,7 +876,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairs``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -882,51 +885,51 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam ValueT + //! @tparam ValueT //! **[inferred]** ValueT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in,out] d_keys - //! Reference to the double-buffer of keys whose "current" device-accessible - //! buffer contains the unsorted input keys and, upon return, is updated to + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! - //! @param[in,out] d_values - //! Double-buffer of values whose "current" device-accessible buffer - //! contains the unsorted input values and, upon return, is updated to point + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer + //! contains the unsorted input values and, upon return, is updated to point //! to the sorted output values //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // @@ -963,14 +966,14 @@ public: stream); } - //! @rst + //! @rst //! Sorts key-value pairs into ascending order using :math:`\approx N` auxiliary storage. - //! + //! //! * The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). - //! * The contents of both buffers within each pair may be altered by the + //! * The contents of both buffers within each pair may be altered by the //! sorting operation. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: @@ -980,12 +983,12 @@ public: //! - ``[d_values.Current(), d_values.Current() + num_items)`` //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)`` //! - //! - Upon completion, the sorting operation will update the "current" - //! indicator within each DoubleBuffer wrapper to reference which of the two - //! buffers now contains the sorted output sequence (a function of the + //! - Upon completion, the sorting operation will update the "current" + //! indicator within each DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). - //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key - //! bits can be specified. This can reduce overall sorting overhead and + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageP //! - @devicestorage @@ -1006,7 +1009,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairs``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -1015,59 +1018,59 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam ValueT + //! @tparam ValueT //! **[inferred]** ValueT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in,out] d_keys - //! Reference to the double-buffer of keys whose "current" device-accessible - //! buffer contains the unsorted input keys and, upon return, is updated to + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! - //! @param[in,out] d_values - //! Double-buffer of values whose "current" device-accessible buffer - //! contains the unsorted input values and, upon return, is updated to point + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer + //! contains the unsorted input values and, upon return, is updated to point //! to the sorted output values //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // @@ -1108,7 +1111,7 @@ public: stream); } - //! @brief Sorts key-value pairs into descending order. + //! @brief Sorts key-value pairs into descending order. //! (`~2N` auxiliary storage required). //! //! @par @@ -1121,10 +1124,10 @@ public: //! - `[d_keys_out, d_keys_out + num_items)` //! - `[d_values_in, d_values_in + num_items)` //! - `[d_values_out, d_values_out + num_items)` - //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - //! bits can be specified. This can reduce overall sorting overhead and + //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. - //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see + //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! - @devicestorage //! @@ -1132,14 +1135,14 @@ public: //! Performance is similar to DeviceRadixSort::SortPairs. //! //! @par Snippet - //! The code snippet below illustrates the sorting of a device vector of `int` + //! The code snippet below illustrates the sorting of a device vector of `int` //! keys with associated vector of `int` values. //! @par //! @code - //! #include + //! #include //! // or equivalently //! - //! // Declare, allocate, and initialize device-accessible pointers + //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -1167,49 +1170,49 @@ public: //! // d_values_out <-- [6, 0, 2, 1, 3, 4, 5] //! @endcode //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam ValueT + //! @tparam ValueT //! **[inferred]** ValueT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of @p d_temp_storage allocation //! - //! @param[in] d_keys_in + //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! - //! @param[out] d_keys_out + //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! - //! @param[in] d_values_in + //! @param[in] d_values_in //! Pointer to the corresponding input sequence of associated value items //! - //! @param[out] d_values_out - //! Pointer to the correspondingly-reordered output sequence of associated + //! @param[out] d_values_out + //! Pointer to the correspondingly-reordered output sequence of associated //! value items //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t @@ -1279,9 +1282,9 @@ public: } #endif - //! @rst + //! @rst //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage. - //! + //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. @@ -1293,10 +1296,10 @@ public: //! * ``[d_values_in, d_values_in + num_items)`` //! * ``[d_values_out, d_values_out + num_items)`` //! - //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify - //! differentiating key bits. This can reduce overall sorting overhead and + //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify + //! differentiating key bits. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. - //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see + //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! @@ -1316,7 +1319,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairsDescending``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -1325,62 +1328,62 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam ValueT + //! @tparam ValueT //! **[inferred]** ValueT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in] d_keys_in + //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! - //! @param[out] d_keys_out + //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! - //! @param[in] d_values_in + //! @param[in] d_values_in //! Pointer to the corresponding input sequence of associated value items //! - //! @param[out] d_values_out - //! Pointer to the correspondingly-reordered output sequence of associated + //! @param[out] d_values_out + //! Pointer to the correspondingly-reordered output sequence of associated //! value items //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // @@ -1429,9 +1432,9 @@ public: stream); } - //! @rst + //! @rst //! Sorts key-value pairs into descending order using :math:`\approx 2N` auxiliary storage. - //! + //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. @@ -1443,7 +1446,7 @@ public: //! * ``[d_values_in, d_values_in + num_items)`` //! * ``[d_values_out, d_values_out + num_items)`` //! - //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see + //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! @@ -1463,7 +1466,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairsDescending``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -1472,54 +1475,54 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam ValueT + //! @tparam ValueT //! **[inferred]** ValueT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in] d_keys_in + //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! - //! @param[out] d_keys_out + //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! - //! @param[in] d_values_in + //! @param[in] d_values_in //! Pointer to the corresponding input sequence of associated value items //! - //! @param[out] d_values_out - //! Pointer to the correspondingly-reordered output sequence of associated + //! @param[out] d_values_out + //! Pointer to the correspondingly-reordered output sequence of associated //! value items //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // @@ -1564,7 +1567,7 @@ public: stream); } - //! @brief Sorts key-value pairs into descending order. + //! @brief Sorts key-value pairs into descending order. //! (`~N` auxiliary storage required). //! //! @par @@ -1572,7 +1575,7 @@ public: //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). - //! - The contents of both buffers within each pair may be altered by the + //! - The contents of both buffers within each pair may be altered by the //! sorting operation. //! - In-place operations are not supported. There must be no overlap between //! any of the provided ranges: @@ -1580,12 +1583,12 @@ public: //! - `[d_keys.Alternate(), d_keys.Alternate() + num_items)` //! - `[d_values.Current(), d_values.Current() + num_items)` //! - `[d_values.Alternate(), d_values.Alternate() + num_items)` - //! - Upon completion, the sorting operation will update the "current" - //! indicator within each DoubleBuffer wrapper to reference which of the two - //! buffers now contains the sorted output sequence (a function of the number + //! - Upon completion, the sorting operation will update the "current" + //! indicator within each DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the number //! of key bits specified and the targeted device architecture). - //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - //! bits can be specified. This can reduce overall sorting overhead and + //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageP //! - @devicestorage @@ -1594,14 +1597,14 @@ public: //! Performance is similar to DeviceRadixSort::SortPairs. //! //! @par Snippet - //! The code snippet below illustrates the sorting of a device vector of `int` + //! The code snippet below illustrates the sorting of a device vector of `int` //! keys with associated vector of `int` values. //! @par //! @code - //! #include + //! #include //! // or equivalently //! - //! // Declare, allocate, and initialize device-accessible pointers + //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -1631,46 +1634,46 @@ public: //! // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5] //! @endcode //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam ValueT + //! @tparam ValueT //! **[inferred]** ValueT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in,out] d_keys - //! Reference to the double-buffer of keys whose "current" device-accessible - //! buffer contains the unsorted input keys and, upon return, is updated to + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! - //! @param[in,out] d_values - //! Double-buffer of values whose "current" device-accessible buffer - //! contains the unsorted input values and, upon return, is updated to point + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer + //! contains the unsorted input values and, upon return, is updated to point //! to the sorted output values //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t @@ -1727,14 +1730,14 @@ public: } #endif - //! @rst + //! @rst //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage. - //! + //! //! * The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). - //! * The contents of both buffers within each pair may be altered by the + //! * The contents of both buffers within each pair may be altered by the //! sorting operation. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: @@ -1744,9 +1747,9 @@ public: //! - ``[d_values.Current(), d_values.Current() + num_items)`` //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)`` //! - //! - Upon completion, the sorting operation will update the "current" - //! indicator within each DoubleBuffer wrapper to reference which of the two - //! buffers now contains the sorted output sequence (a function of the + //! - Upon completion, the sorting operation will update the "current" + //! indicator within each DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! - @devicestorageP //! - @devicestorage @@ -1767,7 +1770,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairsDescending``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -1776,51 +1779,51 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam ValueT + //! @tparam ValueT //! **[inferred]** ValueT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in,out] d_keys - //! Reference to the double-buffer of keys whose "current" device-accessible - //! buffer contains the unsorted input keys and, upon return, is updated to + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! - //! @param[in,out] d_values - //! Double-buffer of values whose "current" device-accessible buffer - //! contains the unsorted input values and, upon return, is updated to point + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer + //! contains the unsorted input values and, upon return, is updated to point //! to the sorted output values //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // @@ -1857,14 +1860,14 @@ public: stream); } - //! @rst + //! @rst //! Sorts key-value pairs into descending order using :math:`\approx N` auxiliary storage. - //! + //! //! * The sorting operation is given a pair of key buffers and a corresponding //! pair of associated value buffers. Each pair is managed by a DoubleBuffer //! structure that indicates which of the two buffers is "current" (and thus //! contains the input data to be sorted). - //! * The contents of both buffers within each pair may be altered by the + //! * The contents of both buffers within each pair may be altered by the //! sorting operation. //! * In-place operations are not supported. There must be no overlap between //! any of the provided ranges: @@ -1874,12 +1877,12 @@ public: //! - ``[d_values.Current(), d_values.Current() + num_items)`` //! - ``[d_values.Alternate(), d_values.Alternate() + num_items)`` //! - //! - Upon completion, the sorting operation will update the "current" - //! indicator within each DoubleBuffer wrapper to reference which of the two - //! buffers now contains the sorted output sequence (a function of the + //! - Upon completion, the sorting operation will update the "current" + //! indicator within each DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). - //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key - //! bits can be specified. This can reduce overall sorting overhead and + //! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageP //! - @devicestorage @@ -1900,7 +1903,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortPairsDescending``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -1909,59 +1912,59 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam ValueT + //! @tparam ValueT //! **[inferred]** ValueT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in,out] d_keys - //! Reference to the double-buffer of keys whose "current" device-accessible - //! buffer contains the unsorted input keys and, upon return, is updated to + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! - //! @param[in,out] d_values - //! Double-buffer of values whose "current" device-accessible buffer - //! contains the unsorted input values and, upon return, is updated to point + //! @param[in,out] d_values + //! Double-buffer of values whose "current" device-accessible buffer + //! contains the unsorted input values and, upon return, is updated to point //! to the sorted output values //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // @@ -2009,7 +2012,7 @@ public: //@{ - //! @brief Sorts keys into ascending order. + //! @brief Sorts keys into ascending order. //! (`~2N` auxiliary storage required) //! //! @par @@ -2020,30 +2023,30 @@ public: //! any of the provided ranges: //! - `[d_keys_in, d_keys_in + num_items)` //! - `[d_keys_out, d_keys_out + num_items)` - //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - //! bits can be specified. This can reduce overall sorting overhead and + //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. - //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see + //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! - @devicestorage //! //! @par Performance - //! The following charts illustrate saturated sorting performance across - //! different CUDA architectures for uniform-random `uint32` and `uint64` + //! The following charts illustrate saturated sorting performance across + //! different CUDA architectures for uniform-random `uint32` and `uint64` //! keys, respectively. //! //! @image html lsb_radix_sort_int32_keys.png //! @image html lsb_radix_sort_int64_keys.png //! //! @par Snippet - //! The code snippet below illustrates the sorting of a device vector of + //! The code snippet below illustrates the sorting of a device vector of //! `int` keys. //! @par //! @code - //! #include + //! #include //! // or equivalently //! - //! // Declare, allocate, and initialize device-accessible pointers + //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -2066,42 +2069,42 @@ public: //! // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] //! @endcode //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in] d_keys_in + //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! - //! @param[out] d_keys_out + //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t @@ -2138,9 +2141,9 @@ public: stream); } - //! @rst + //! @rst //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage. - //! + //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. @@ -2150,10 +2153,10 @@ public: //! * ``[d_keys_in, d_keys_in + num_items)`` //! * ``[d_keys_out, d_keys_out + num_items)`` //! - //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify - //! differentiating key bits. This can reduce overall sorting overhead and + //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify + //! differentiating key bits. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. - //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see + //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! @@ -2173,7 +2176,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortKeys``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -2182,52 +2185,52 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in] d_keys_in + //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! - //! @param[out] d_keys_out + //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // @@ -2274,9 +2277,9 @@ public: stream); } - //! @rst + //! @rst //! Sorts keys into ascending order using :math:`\approx 2N` auxiliary storage. - //! + //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. @@ -2286,10 +2289,10 @@ public: //! * ``[d_keys_in, d_keys_in + num_items)`` //! * ``[d_keys_out, d_keys_out + num_items)`` //! - //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key - //! bits can be specified. This can reduce overall sorting overhead and + //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. - //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see + //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! @@ -2309,7 +2312,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortKeys``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -2318,44 +2321,44 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in] d_keys_in + //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! - //! @param[out] d_keys_out + //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // @@ -2437,33 +2440,33 @@ public: //! any of the provided ranges: //! - `[d_keys.Current(), d_keys.Current() + num_items)` //! - `[d_keys.Alternate(), d_keys.Alternate() + num_items)` - //! - Upon completion, the sorting operation will update the "current" - //! indicator within the DoubleBuffer wrapper to reference which of the two - //! buffers now contains the sorted output sequence (a function of the + //! - Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). - //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - //! bits can be specified. This can reduce overall sorting overhead and + //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageP //! - @devicestorage //! //! @par Performance - //! The following charts illustrate saturated sorting performance across - //! different CUDA architectures for uniform-random `uint32` and `uint64` + //! The following charts illustrate saturated sorting performance across + //! different CUDA architectures for uniform-random `uint32` and `uint64` //! keys, respectively. //! //! @image html lsb_radix_sort_int32_keys.png //! @image html lsb_radix_sort_int64_keys.png //! //! @par Snippet - //! The code snippet below illustrates the sorting of a device vector of + //! The code snippet below illustrates the sorting of a device vector of //! `int` keys. //! @par //! @code - //! #include + //! #include //! // or equivalently //! - //! // Declare, allocate, and initialize device-accessible pointers + //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -2489,38 +2492,38 @@ public: //! // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] //! @endcode //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in,out] d_keys - //! Reference to the double-buffer of keys whose "current" device-accessible - //! buffer contains the unsorted input keys and, upon return, is updated to + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t @@ -2577,9 +2580,9 @@ public: } #endif - //! @rst + //! @rst //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage. - //! + //! //! * The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). @@ -2590,9 +2593,9 @@ public: //! * ``[d_keys.Current(), d_keys.Current() + num_items)`` //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)` //! - //! * Upon completion, the sorting operation will update the "current" - //! indicator within the DoubleBuffer wrapper to reference which of the two - //! buffers now contains the sorted output sequence (a function of the + //! * Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! * @devicestorageP //! * @devicestorage @@ -2613,7 +2616,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortKeys``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -2622,43 +2625,43 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in,out] d_keys - //! Reference to the double-buffer of keys whose "current" device-accessible - //! buffer contains the unsorted input keys and, upon return, is updated to + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // @@ -2695,9 +2698,9 @@ public: stream); } - //! @rst + //! @rst //! Sorts keys into ascending order using :math:`\approx N` auxiliary storage. - //! + //! //! * The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). @@ -2708,12 +2711,12 @@ public: //! * ``[d_keys.Current(), d_keys.Current() + num_items)`` //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)` //! - //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify - //! differentiating key bits. This can reduce overall sorting overhead and + //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify + //! differentiating key bits. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. - //! * Upon completion, the sorting operation will update the "current" - //! indicator within the DoubleBuffer wrapper to reference which of the two - //! buffers now contains the sorted output sequence (a function of the + //! * Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! * @devicestorageP //! * @devicestorage @@ -2734,7 +2737,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortKeys``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -2743,51 +2746,51 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in,out] d_keys - //! Reference to the double-buffer of keys whose "current" device-accessible - //! buffer contains the unsorted input keys and, upon return, is updated to + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // @@ -2827,8 +2830,8 @@ public: end_bit, stream); } - - //! @brief Sorts keys into descending order. + + //! @brief Sorts keys into descending order. //! (`~2N` auxiliary storage required). //! //! @par @@ -2839,10 +2842,10 @@ public: //! any of the provided ranges: //! - `[d_keys_in, d_keys_in + num_items)` //! - `[d_keys_out, d_keys_out + num_items)` - //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - //! bits can be specified. This can reduce overall sorting overhead and + //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. - //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see + //! - @devicestorageNP For sorting using only `O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! - @devicestorage //! @@ -2850,14 +2853,14 @@ public: //! Performance is similar to DeviceRadixSort::SortKeys. //! //! @par Snippet - //! The code snippet below illustrates the sorting of a device vector of + //! The code snippet below illustrates the sorting of a device vector of //! `int` keys. //! @par //! @code - //! #include + //! #include //! // or equivalently //! - //! // Declare, allocate, and initialize device-accessible pointers + //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -2870,7 +2873,7 @@ public: //! // Determine temporary device storage requirements //! void *d_temp_storage = NULL; //! size_t temp_storage_bytes = 0; - //! cub::DeviceRadixSort::SortKeysDescending( + //! cub::DeviceRadixSort::SortKeysDescending( //! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); //! //! // Allocate temporary storage @@ -2884,39 +2887,39 @@ public: //! //! @endcode //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in] d_keys_in + //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! - //! @param[out] d_keys_out + //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t @@ -2979,9 +2982,9 @@ public: } #endif - //! @rst + //! @rst //! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage. - //! + //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. @@ -2991,10 +2994,10 @@ public: //! * ``[d_keys_in, d_keys_in + num_items)`` //! * ``[d_keys_out, d_keys_out + num_items)`` //! - //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key - //! bits can be specified. This can reduce overall sorting overhead and + //! * An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. - //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see + //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! @@ -3014,7 +3017,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortKeysDescending``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -3023,52 +3026,52 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in] d_keys_in + //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! - //! @param[out] d_keys_out + //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // @@ -3115,9 +3118,9 @@ public: stream); } - //! @rst + //! @rst //! Sorts keys into descending order using :math:`\approx 2N` auxiliary storage. - //! + //! //! * The contents of the input data are not altered by the sorting operation. //! * Pointers to contiguous memory must be used; iterators are not currently //! supported. @@ -3127,7 +3130,7 @@ public: //! * ``[d_keys_in, d_keys_in + num_items)`` //! * ``[d_keys_out, d_keys_out + num_items)`` //! - //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see + //! * @devicestorageNP For sorting using only :math:`O(P)` temporary storage, see //! the sorting interface using DoubleBuffer wrappers below. //! * @devicestorage //! @@ -3147,7 +3150,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortKeysDescending``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -3156,44 +3159,44 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in] d_keys_in + //! @param[in] d_keys_in //! Pointer to the input data of key data to sort //! - //! @param[out] d_keys_out + //! @param[out] d_keys_out //! Pointer to the sorted output sequence of key data //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // @@ -3236,7 +3239,7 @@ public: stream); } - //! @brief Sorts keys into descending order. + //! @brief Sorts keys into descending order. //! (`~N` auxiliary storage required). //! //! @par @@ -3248,12 +3251,12 @@ public: //! any of the provided ranges: //! - `[d_keys.Current(), d_keys.Current() + num_items)` //! - `[d_keys.Alternate(), d_keys.Alternate() + num_items)` - //! - Upon completion, the sorting operation will update the "current" - //! indicator within the DoubleBuffer wrapper to reference which of the two - //! buffers now contains the sorted output sequence (a function of the + //! - Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). - //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - //! bits can be specified. This can reduce overall sorting overhead and + //! - An optional bit subrange `[begin_bit, end_bit)` of differentiating key + //! bits can be specified. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. //! - @devicestorageP //! - @devicestorage @@ -3265,10 +3268,10 @@ public: //! The code snippet below illustrates the sorting of a device vector of @p int keys. //! @par //! @code - //! #include + //! #include //! // or equivalently //! - //! // Declare, allocate, and initialize device-accessible pointers + //! // Declare, allocate, and initialize device-accessible pointers //! // for sorting data //! int num_items; // e.g., 7 //! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -3294,38 +3297,38 @@ public: //! // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] //! @endcode //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in,out] d_keys - //! Reference to the double-buffer of keys whose "current" device-accessible - //! buffer contains the unsorted input keys and, upon return, is updated to + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `sizeof(unsigned int) * 8`) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static cudaError_t @@ -3382,9 +3385,9 @@ public: } #endif - //! @rst + //! @rst //! Sorts keys into descending order using :math:`\approx N` auxiliary storage. - //! + //! //! * The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). @@ -3395,9 +3398,9 @@ public: //! * ``[d_keys.Current(), d_keys.Current() + num_items)`` //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)` //! - //! * Upon completion, the sorting operation will update the "current" - //! indicator within the DoubleBuffer wrapper to reference which of the two - //! buffers now contains the sorted output sequence (a function of the + //! * Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! * @devicestorageP //! * @devicestorage @@ -3418,7 +3421,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortKeysDescending``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -3427,43 +3430,43 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in,out] d_keys - //! Reference to the double-buffer of keys whose "current" device-accessible - //! buffer contains the unsorted input keys and, upon return, is updated to + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // @@ -3500,9 +3503,9 @@ public: stream); } - //! @rst + //! @rst //! Sorts keys into descending order using :math:`\approx N` auxiliary storage. - //! + //! //! * The sorting operation is given a pair of key buffers managed by a //! DoubleBuffer structure that indicates which of the two buffers is //! "current" (and thus contains the input data to be sorted). @@ -3513,12 +3516,12 @@ public: //! * ``[d_keys.Current(), d_keys.Current() + num_items)`` //! * ``[d_keys.Alternate(), d_keys.Alternate() + num_items)` //! - //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify - //! differentiating key bits. This can reduce overall sorting overhead and + //! * A bit subrange ``[begin_bit, end_bit)`` is provided to specify + //! differentiating key bits. This can reduce overall sorting overhead and //! yield a corresponding performance improvement. - //! * Upon completion, the sorting operation will update the "current" - //! indicator within the DoubleBuffer wrapper to reference which of the two - //! buffers now contains the sorted output sequence (a function of the + //! * Upon completion, the sorting operation will update the "current" + //! indicator within the DoubleBuffer wrapper to reference which of the two + //! buffers now contains the sorted output sequence (a function of the //! number of key bits specified and the targeted device architecture). //! * @devicestorageP //! * @devicestorage @@ -3539,7 +3542,7 @@ public: //! //! The following snippet shows how to sort an array of ``custom_t`` objects //! using ``cub::DeviceRadixSort::SortKeysDescending``: - //! + //! //! .. literalinclude:: ../../test/catch2_test_device_radix_sort_custom.cu //! :language: c++ //! :dedent: @@ -3548,51 +3551,51 @@ public: //! //! @endrst //! - //! @tparam KeyT + //! @tparam KeyT //! **[inferred]** KeyT type //! - //! @tparam NumItemsT + //! @tparam NumItemsT //! **[inferred]** Type of num_items //! //! @tparam DecomposerT - //! **[inferred]** Type of a callable object responsible for decomposing a + //! **[inferred]** Type of a callable object responsible for decomposing a //! ``KeyT`` into a tuple of references to its constituent arithmetic types: - //! ``::cuda::std::tuple operator()(KeyT &key)``. - //! The leftmost element of the tuple is considered the most significant. + //! ``::cuda::std::tuple operator()(KeyT &key)``. + //! The leftmost element of the tuple is considered the most significant. //! The call operator must not modify members of the key. //! - //! @param[in] d_temp_storage - //! Device-accessible allocation of temporary storage. When `nullptr`, the - //! required allocation size is written to `temp_storage_bytes` and no work + //! @param[in] d_temp_storage + //! Device-accessible allocation of temporary storage. When `nullptr`, the + //! required allocation size is written to `temp_storage_bytes` and no work //! is done. //! - //! @param[in,out] temp_storage_bytes + //! @param[in,out] temp_storage_bytes //! Reference to size in bytes of `d_temp_storage` allocation //! - //! @param[in,out] d_keys - //! Reference to the double-buffer of keys whose "current" device-accessible - //! buffer contains the unsorted input keys and, upon return, is updated to + //! @param[in,out] d_keys + //! Reference to the double-buffer of keys whose "current" device-accessible + //! buffer contains the unsorted input keys and, upon return, is updated to //! point to the sorted output keys //! - //! @param[in] num_items + //! @param[in] num_items //! Number of items to sort //! //! @param decomposer //! Callable object responsible for decomposing a ``KeyT`` into a tuple of - //! references to its constituent arithmetic types. The leftmost element of - //! the tuple is considered the most significant. The call operator must not + //! references to its constituent arithmetic types. The leftmost element of + //! the tuple is considered the most significant. The call operator must not //! modify members of the key. //! - //! @param[in] begin_bit - //! **[optional]** The least-significant bit index (inclusive) needed for + //! @param[in] begin_bit + //! **[optional]** The least-significant bit index (inclusive) needed for //! key comparison //! - //! @param[in] end_bit - //! **[optional]** The most-significant bit index (exclusive) needed for key + //! @param[in] end_bit + //! **[optional]** The most-significant bit index (exclusive) needed for key //! comparison (e.g., `(sizeof(float) + sizeof(long long int)) * 8`) //! - //! @param[in] stream - //! **[optional]** CUDA stream to launch kernels within. + //! @param[in] stream + //! **[optional]** CUDA stream to launch kernels within. //! Default is stream0. template CUB_RUNTIME_FUNCTION static // diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index f2c4090f8b5..1606042f184 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -13,9 +13,9 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; @@ -27,17 +27,20 @@ ******************************************************************************/ /** - * @file cub::DeviceReduce provides device-wide, parallel operations for - * computing a reduction across a sequence of data items residing within + * @file cub::DeviceReduce provides device-wide, parallel operations for + * computing a reduction across a sequence of data items residing within * device-accessible memory. */ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include -#include #include #include #include @@ -50,9 +53,9 @@ CUB_NAMESPACE_BEGIN //! @ingroup SingleModule //! //! @rst -//! DeviceReduce provides device-wide, parallel operations for computing -//! a reduction across a sequence of data items residing within -//! device-accessible memory. +//! DeviceReduce provides device-wide, parallel operations for computing +//! a reduction across a sequence of data items residing within +//! device-accessible memory. //! //! .. image:: ../img/reduce_logo.png //! :align: center @@ -60,7 +63,7 @@ CUB_NAMESPACE_BEGIN //! Overview //! ==================================== //! A `reduction `_ -//! (or *fold*) uses a binary combining operator to compute a single aggregate +//! (or *fold*) uses a binary combining operator to compute a single aggregate //! from a sequence of input elements. //! //! Usage Considerations @@ -79,8 +82,8 @@ CUB_NAMESPACE_BEGIN //! //! @par //! The following chart illustrates DeviceReduce::ReduceByKey (summation) -//! performance across different CUDA architectures for `fp32` values. Segments -//! are identified by `int32` keys, and have lengths uniformly sampled +//! performance across different CUDA architectures for `fp32` values. Segments +//! are identified by `int32` keys, and have lengths uniformly sampled //! from `[1, 1000]`. //! //! .. image:: ../img/reduce_by_key_fp32_len_500.png @@ -90,7 +93,7 @@ CUB_NAMESPACE_BEGIN struct DeviceReduce { /** - * @brief Computes a device-wide reduction using the specified binary + * @brief Computes a device-wide reduction using the specified binary * `reduction_op` functor and initial value `init`. * * @par @@ -104,11 +107,11 @@ struct DeviceReduce * - @devicestorage * * @par Snippet - * The code snippet below illustrates a user-defined min-reduction of a + * The code snippet below illustrates a user-defined min-reduction of a * device vector of `int` data elements. * @par * @code - * #include + * #include * // or equivalently * * // CustomMin functor @@ -121,7 +124,7 @@ struct DeviceReduce * } * }; * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -134,7 +137,7 @@ struct DeviceReduce * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceReduce::Reduce( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, num_items, min_op, init); * * // Allocate temporary storage @@ -142,55 +145,55 @@ struct DeviceReduce * * // Run reduction * cub::DeviceReduce::Reduce( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, num_items, min_op, init); * * // d_out <-- [0] * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input * items \iterator * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced + * @tparam OutputIteratorT + * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * - * @tparam ReductionOpT - * **[inferred]** Binary reduction functor type having member + * @tparam ReductionOpT + * **[inferred]** Binary reduction functor type having member * `T operator()(const T &a, const T &b)` * - * @tparam T - * **[inferred]** Data element type that is convertible to the `value` type + * @tparam T + * **[inferred]** Data element type that is convertible to the `value` type * of `InputIteratorT` * * @tparam NumItemsT **[inferred]** Type of num_items * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param d_in[in] + * @param d_in[in] * Pointer to the input sequence of data items * - * @param d_out[out] + * @param d_out[out] * Pointer to the output aggregate * - * @param num_items[in] + * @param num_items[in] * Total number of input items (i.e., length of `d_in`) * - * @param reduction_op[in] + * @param reduction_op[in] * Binary reduction functor * - * @param[in] init + * @param[in] init * Initial value of the reduction * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -304,39 +307,39 @@ struct DeviceReduce * // d_out <-- [38] * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input * items \iterator * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced + * @tparam OutputIteratorT + * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * * @tparam NumItemsT **[inferred]** Type of num_items * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output aggregate * - * @param[in] num_items + * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ - template CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, @@ -353,12 +356,12 @@ struct DeviceReduce cub::detail::non_void_value_t>; - using InitT = OutputT; + using InitT = OutputT; - return DispatchReduce::Dispatch(d_temp_storage, temp_storage_bytes, d_in, @@ -404,14 +407,14 @@ struct DeviceReduce * - @devicestorage * * @par Snippet - * The code snippet below illustrates the min-reduction of a device vector of + * The code snippet below illustrates the min-reduction of a device vector of * `int` data elements. * @par * @code - * #include + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -434,35 +437,35 @@ struct DeviceReduce * // d_out <-- [0] * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input * items \iterator * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced + * @tparam OutputIteratorT + * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * * @tparam NumItemsT **[inferred]** Type of num_items * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output aggregate * - * @param[in] num_items + * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template ::Dispatch(d_temp_storage, temp_storage_bytes, @@ -493,10 +496,10 @@ struct DeviceReduce d_out, static_cast(num_items), cub::Min(), - // replace with + // replace with // std::numeric_limits::max() when // C++11 support is more prevalent - Traits::Max(), + Traits::Max(), stream); } @@ -521,15 +524,15 @@ struct DeviceReduce } /** - * @brief Finds the first device-wide minimum using the less-than ('<') + * @brief Finds the first device-wide minimum using the less-than ('<') * operator, also returning the index of that item. * * @par - * - The output value type of `d_out` is cub::KeyValuePair `` + * - The output value type of `d_out` is cub::KeyValuePair `` * (assuming the value type of `d_in` is `T`) - * - The minimum is written to `d_out.value` and its offset in the input + * - The minimum is written to `d_out.value` and its offset in the input * array is written to `d_out.key`. - * - The `{1, std::numeric_limits::max()}` tuple is produced for + * - The `{1, std::numeric_limits::max()}` tuple is produced for * zero-length inputs * - Does not support `<` operators that are non-commutative. * - Provides "run-to-run" determinism for pseudo-associative reduction @@ -541,14 +544,14 @@ struct DeviceReduce * - @devicestorage * * @par Snippet - * The code snippet below illustrates the argmin-reduction of a device vector + * The code snippet below illustrates the argmin-reduction of a device vector * of `int` data elements. * @par * @code - * #include + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -572,35 +575,35 @@ struct DeviceReduce * * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input items + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input items * (of some type `T`) \iterator * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced aggregate + * @tparam OutputIteratorT + * **[inferred]** Output iterator type for recording the reduced aggregate * (having value type `cub::KeyValuePair`) \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to \p temp_storage_bytes and no work is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output aggregate * - * @param[in] num_items + * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ - template CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, @@ -620,7 +623,7 @@ struct DeviceReduce cub::detail::non_void_value_t>; using AccumT = OutputTupleT; - + using InitT = detail::reduce::empty_problem_init_t; // The output value type @@ -634,7 +637,7 @@ struct DeviceReduce // Initial value // TODO Address https://github.com/NVIDIA/cub/issues/651 - InitT initial_value{AccumT(1, Traits::Max())}; + InitT initial_value{AccumT(1, Traits::Max())}; return DispatchReduce') operator. * * @par - * - Uses `std::numeric_limits::lowest()` as the initial value of the + * - Uses `std::numeric_limits::lowest()` as the initial value of the * reduction. * - Does not support `>` operators that are non-commutative. * - Provides "run-to-run" determinism for pseudo-associative reduction @@ -687,14 +690,14 @@ struct DeviceReduce * - @devicestorage * * @par Snippet - * The code snippet below illustrates the max-reduction of a device vector of + * The code snippet below illustrates the max-reduction of a device vector of * `int` data elements. * @par * @code - * #include + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -717,38 +720,38 @@ struct DeviceReduce * // d_out <-- [9] * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input * items \iterator * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced + * @tparam OutputIteratorT + * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * * @tparam NumItemsT **[inferred]** Type of num_items * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output aggregate * - * @param[in] num_items + * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ - template CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage, @@ -766,9 +769,9 @@ struct DeviceReduce using InitT = InputT; - return DispatchReduce::Dispatch(d_temp_storage, temp_storage_bytes, @@ -776,11 +779,11 @@ struct DeviceReduce d_out, static_cast(num_items), cub::Max(), - // replace with + // replace with // std::numeric_limits::lowest() - // when C++11 support is more + // when C++11 support is more // prevalent - Traits::Lowest(), + Traits::Lowest(), stream); } @@ -805,15 +808,15 @@ struct DeviceReduce } /** - * @brief Finds the first device-wide maximum using the greater-than ('>') + * @brief Finds the first device-wide maximum using the greater-than ('>') * operator, also returning the index of that item * * @par - * - The output value type of `d_out` is cub::KeyValuePair `` + * - The output value type of `d_out` is cub::KeyValuePair `` * (assuming the value type of `d_in` is `T`) - * - The maximum is written to `d_out.value` and its offset in the input + * - The maximum is written to `d_out.value` and its offset in the input * array is written to `d_out.key`. - * - The `{1, std::numeric_limits::lowest()}` tuple is produced for + * - The `{1, std::numeric_limits::lowest()}` tuple is produced for * zero-length inputs * - Does not support `>` operators that are non-commutative. * - Provides "run-to-run" determinism for pseudo-associative reduction @@ -825,14 +828,14 @@ struct DeviceReduce * - @devicestorage * * @par Snippet - * The code snippet below illustrates the argmax-reduction of a device vector + * The code snippet below illustrates the argmax-reduction of a device vector * of `int` data elements. * @par * @code - * #include + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -856,36 +859,36 @@ struct DeviceReduce * * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input items + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input items * (of some type \p T) \iterator * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced aggregate + * @tparam OutputIteratorT + * **[inferred]** Output iterator type for recording the reduced aggregate * (having value type `cub::KeyValuePair`) \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output aggregate * - * @param[in] num_items + * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ - template CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, @@ -920,7 +923,7 @@ struct DeviceReduce // Initial value // TODO Address https://github.com/NVIDIA/cub/issues/651 - InitT initial_value{AccumT(1, Traits::Lowest())}; + InitT initial_value{AccumT(1, Traits::Lowest())}; return DispatchReduceth run - * encountered, the first key of the run and the corresponding value - * aggregate of that run are written to `d_unique_out[i] and - * `d_aggregates_out[i]`, respectively. The total number of runs encountered + * the specified binary `reduction_op` functor. The segments are identified + * by "runs" of corresponding keys in `d_keys_in`, where runs are maximal + * ranges of consecutive, identical keys. For the *i*th run + * encountered, the first key of the run and the corresponding value + * aggregate of that run are written to `d_unique_out[i] and + * `d_aggregates_out[i]`, respectively. The total number of runs encountered * is written to `d_num_runs_out`. * * @par - * - The `==` equality operator is used to determine whether keys are + * - The `==` equality operator is used to determine whether keys are * equivalent * - Provides "run-to-run" determinism for pseudo-associative reduction * (e.g., addition of floating point types) on the same GPU device. * However, results for pseudo-associative reduction may be inconsistent * from one device to a another device of a different compute-capability * because CUB can employ different tile-sizing for different architectures. - * - Let `out` be any of + * - Let `out` be any of * `[d_unique_out, d_unique_out + *d_num_runs_out)` * `[d_aggregates_out, d_aggregates_out + *d_num_runs_out)` - * `d_num_runs_out`. The ranges represented by `out` shall not overlap + * `d_num_runs_out`. The ranges represented by `out` shall not overlap * `[d_keys_in, d_keys_in + num_items)`, * `[d_values_in, d_values_in + num_items)` nor `out` in any way. * - @devicestorage * * @par Performance * The following chart illustrates reduction-by-key (sum) performance across - * different CUDA architectures for `fp32` and `fp64` values, respectively. - * Segments are identified by `int32` keys, and have lengths uniformly + * different CUDA architectures for `fp32` and `fp64` values, respectively. + * Segments are identified by `int32` keys, and have lengths uniformly * sampled from `[1, 1000]`. * * @image html reduce_by_key_fp32_len_500.png * @image html reduce_by_key_fp64_len_500.png * * @par - * The following charts are similar, but with segment lengths uniformly + * The following charts are similar, but with segment lengths uniformly * sampled from [1,10]: * * @image html reduce_by_key_fp32_len_5.png * @image html reduce_by_key_fp64_len_5.png * * @par Snippet - * The code snippet below illustrates the segmented reduction of `int` values + * The code snippet below illustrates the segmented reduction of `int` values * grouped by runs of associated `int` keys. * @par * @code - * #include + * #include * // or equivalently * * // CustomMin functor @@ -1021,7 +1024,7 @@ struct DeviceReduce * } * }; * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 8 * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] @@ -1036,8 +1039,8 @@ struct DeviceReduce * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceReduce::ReduceByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_unique_out, d_values_in, + * d_temp_storage, temp_storage_bytes, + * d_keys_in, d_unique_out, d_values_in, * d_aggregates_out, d_num_runs_out, reduction_op, num_items); * * // Allocate temporary storage @@ -1045,8 +1048,8 @@ struct DeviceReduce * * // Run reduce-by-key * cub::DeviceReduce::ReduceByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_unique_out, d_values_in, + * d_temp_storage, temp_storage_bytes, + * d_keys_in, d_unique_out, d_values_in, * d_aggregates_out, d_num_runs_out, reduction_op, num_items); * * // d_unique_out <-- [0, 2, 9, 5, 8] @@ -1054,66 +1057,66 @@ struct DeviceReduce * // d_num_runs_out <-- [5] * @endcode * - * @tparam KeysInputIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam KeysInputIteratorT + * **[inferred]** Random-access input iterator type for reading input * keys \iterator * - * @tparam UniqueOutputIteratorT - * **[inferred]** Random-access output iterator type for writing unique + * @tparam UniqueOutputIteratorT + * **[inferred]** Random-access output iterator type for writing unique * output keys \iterator * - * @tparam ValuesInputIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam ValuesInputIteratorT + * **[inferred]** Random-access input iterator type for reading input * values \iterator * - * @tparam AggregatesOutputIterator - * **[inferred]** Random-access output iterator type for writing output + * @tparam AggregatesOutputIterator + * **[inferred]** Random-access output iterator type for writing output * value aggregates \iterator * - * @tparam NumRunsOutputIteratorT - * **[inferred]** Output iterator type for recording the number of runs + * @tparam NumRunsOutputIteratorT + * **[inferred]** Output iterator type for recording the number of runs * encountered \iterator * - * @tparam ReductionOpT - * **[inferred]*8 Binary reduction functor type having member + * @tparam ReductionOpT + * **[inferred]*8 Binary reduction functor type having member * `T operator()(const T &a, const T &b)` * * @tparam NumItemsT **[inferred]** Type of num_items * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_keys_in + * @param[in] d_keys_in * Pointer to the input sequence of keys * - * @param[out] d_unique_out + * @param[out] d_unique_out * Pointer to the output sequence of unique keys (one key per run) * - * @param[in] d_values_in + * @param[in] d_values_in * Pointer to the input sequence of corresponding values * - * @param[out] d_aggregates_out - * Pointer to the output sequence of value aggregates + * @param[out] d_aggregates_out + * Pointer to the output sequence of value aggregates * (one aggregate per run) * - * @param[out] d_num_runs_out - * Pointer to total number of runs encountered + * @param[out] d_num_runs_out + * Pointer to total number of runs encountered * (i.e., the length of `d_unique_out`) * - * @param[in] reduction_op + * @param[in] reduction_op * Binary reduction functor * - * @param[in] num_items - * Total number of associated key+value pairs + * @param[in] num_items + * Total number of associated key+value pairs * (i.e., the length of `d_in_keys` and `d_in_values`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 20cb8ba872f..1f8442fb1d5 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -13,9 +13,9 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; @@ -27,14 +27,17 @@ ******************************************************************************/ /** - * @file cub::DeviceScan provides device-wide, parallel operations for - * computing a prefix scan across a sequence of data items residing + * @file cub::DeviceScan provides device-wide, parallel operations for + * computing a prefix scan across a sequence of data items residing * within device-accessible memory. */ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include @@ -44,32 +47,32 @@ CUB_NAMESPACE_BEGIN /** - * @brief DeviceScan provides device-wide, parallel operations for computing a - * prefix scan across a sequence of data items residing within + * @brief DeviceScan provides device-wide, parallel operations for computing a + * prefix scan across a sequence of data items residing within * device-accessible memory. ![](device_scan.png) * * @ingroup SingleModule * * @par Overview - * Given a sequence of input elements and a binary reduction operator, a - * [*prefix scan*](http://en.wikipedia.org/wiki/Prefix_sum) produces an output - * sequence where each element is computed to be the reduction of the elements - * occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan - * with the addition operator. The term *inclusive* indicates that the + * Given a sequence of input elements and a binary reduction operator, a + * [*prefix scan*](http://en.wikipedia.org/wiki/Prefix_sum) produces an output + * sequence where each element is computed to be the reduction of the elements + * occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan + * with the addition operator. The term *inclusive* indicates that the * *i*th output reduction incorporates the *i*th input. - * The term *exclusive* indicates the *i*th input is not - * incorporated into the *i*th output reduction. When the input and + * The term *exclusive* indicates the *i*th input is not + * incorporated into the *i*th output reduction. When the input and * output sequences are the same, the scan is performed in-place. * * @par - * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our - * *"decoupled look-back"* algorithm for performing global prefix scan with - * only a single pass through the input data, as described in our 2016 technical - * report [1]. The central idea is to leverage a small, constant factor of - * redundant work in order to overlap the latencies of global prefix + * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our + * *"decoupled look-back"* algorithm for performing global prefix scan with + * only a single pass through the input data, as described in our 2016 technical + * report [1]. The central idea is to leverage a small, constant factor of + * redundant work in order to overlap the latencies of global prefix * propagation with local computation. As such, our algorithm requires only - * ~2*n* data movement (*n* inputs are read, *n* outputs are written), and - * typically proceeds at "memcpy" speeds. Our algorithm supports inplace + * ~2*n* data movement (*n* inputs are read, *n* outputs are written), and + * typically proceeds at "memcpy" speeds. Our algorithm supports inplace * operations. * * @par @@ -82,7 +85,7 @@ CUB_NAMESPACE_BEGIN * @linear_performance{prefix scan} * * @par - * The following chart illustrates DeviceScan::ExclusiveSum performance across + * The following chart illustrates DeviceScan::ExclusiveSum performance across * different CUDA architectures for `int32` keys. * @plots_below * @@ -97,7 +100,7 @@ struct DeviceScan //@{ /** - * @brief Computes a device-wide exclusive prefix sum. The value of `0` is + * @brief Computes a device-wide exclusive prefix sum. The value of `0` is * applied as the initial value, and is assigned to `*d_out`. * * @par @@ -106,13 +109,13 @@ struct DeviceScan * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. - * - When `d_in` and `d_out` are equal, the scan is performed in-place. The - * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` + * - When `d_in` and `d_out` are equal, the scan is performed in-place. The + * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` * shall not overlap in any other way. * - @devicestorage * * @par Performance - * The following charts illustrate saturated exclusive sum performance across + * The following charts illustrate saturated exclusive sum performance across * different CUDA architectures for `int32` and `int64` items, respectively. * * @image html scan_int32.png @@ -125,7 +128,7 @@ struct DeviceScan * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -136,7 +139,7 @@ struct DeviceScan * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::ExclusiveSum( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, num_items); * * // Allocate temporary storage @@ -144,24 +147,24 @@ struct DeviceScan * * // Run exclusive prefix sum * cub::DeviceScan::ExclusiveSum( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, num_items); * * // d_out <-- [0, 8, 14, 21, 26, 29, 29] * * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading scan + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading scan * inputs \iterator * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan * outputs \iterator * * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes @@ -177,7 +180,7 @@ struct DeviceScan * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back @@ -227,7 +230,7 @@ struct DeviceScan } /** - * @brief Computes a device-wide exclusive prefix sum in-place. The value of + * @brief Computes a device-wide exclusive prefix sum in-place. The value of * `0` is applied as the initial value, and is assigned to `*d_data`. * * @par @@ -239,7 +242,7 @@ struct DeviceScan * - @devicestorage * * @par Performance - * The following charts illustrate saturated exclusive sum performance across + * The following charts illustrate saturated exclusive sum performance across * different CUDA architectures for `int32` and `int64` items, respectively. * * @image html scan_int32.png @@ -252,7 +255,7 @@ struct DeviceScan * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -262,7 +265,7 @@ struct DeviceScan * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::ExclusiveSum( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_data, num_items); * * // Allocate temporary storage @@ -270,20 +273,20 @@ struct DeviceScan * * // Run exclusive prefix sum * cub::DeviceScan::ExclusiveSum( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_data, num_items); * * // d_data <-- [0, 8, 14, 21, 26, 29, 29] * * @endcode * - * @tparam IteratorT - * **[inferred]** Random-access iterator type for reading scan + * @tparam IteratorT + * **[inferred]** Random-access iterator type for reading scan * inputs and wrigin scan outputs * * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes @@ -296,7 +299,7 @@ struct DeviceScan * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back @@ -337,8 +340,8 @@ struct DeviceScan } /** - * @brief Computes a device-wide exclusive prefix scan using the specified - * binary `scan_op` functor. The `init_value` value is applied as + * @brief Computes a device-wide exclusive prefix scan using the specified + * binary `scan_op` functor. The `init_value` value is applied as * the initial value, and is assigned to `*d_out`. * * @par @@ -347,13 +350,13 @@ struct DeviceScan * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. - * - When `d_in` and `d_out` are equal, the scan is performed in-place. The - * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` + * - When `d_in` and `d_out` are equal, the scan is performed in-place. The + * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` * shall not overlap in any other way. * - @devicestorage * * @par Snippet - * The code snippet below illustrates the exclusive prefix min-scan of an + * The code snippet below illustrates the exclusive prefix min-scan of an * `int` device vector * @par * @code @@ -370,7 +373,7 @@ struct DeviceScan * } * }; * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -378,12 +381,12 @@ struct DeviceScan * CustomMin min_op; * ... * - * // Determine temporary device storage requirements for exclusive + * // Determine temporary device storage requirements for exclusive * // prefix scan * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, min_op, (int) INT_MAX, num_items); * * // Allocate temporary storage for exclusive prefix scan @@ -391,32 +394,32 @@ struct DeviceScan * * // Run exclusive prefix min-scan * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, min_op, (int) INT_MAX, num_items); * * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] * * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading scan + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading scan * inputs \iterator * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan * outputs \iterator * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member * `T operator()(const T &a, const T &b)` - * - * @tparam InitValueT - * **[inferred]** Type of the `init_value` used Binary scan functor type + * + * @tparam InitValueT + * **[inferred]** Type of the `init_value` used Binary scan functor type * having member `T operator()(const T &a, const T &b)` * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes @@ -438,7 +441,7 @@ struct DeviceScan * Total number of input items (i.e., the length of \p d_in) * * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. Default is + * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back @@ -505,8 +508,8 @@ struct DeviceScan } /** - * @brief Computes a device-wide exclusive prefix scan using the specified - * binary `scan_op` functor. The `init_value` value is applied as + * @brief Computes a device-wide exclusive prefix scan using the specified + * binary `scan_op` functor. The `init_value` value is applied as * the initial value, and is assigned to `*d_data`. * * @par @@ -518,7 +521,7 @@ struct DeviceScan * - @devicestorage * * @par Snippet - * The code snippet below illustrates the exclusive prefix min-scan of an + * The code snippet below illustrates the exclusive prefix min-scan of an * `int` device vector * @par * @code @@ -535,19 +538,19 @@ struct DeviceScan * } * }; * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] * CustomMin min_op; * ... * - * // Determine temporary device storage requirements for exclusive + * // Determine temporary device storage requirements for exclusive * // prefix scan * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_data, min_op, (int) INT_MAX, num_items); * * // Allocate temporary storage for exclusive prefix scan @@ -555,28 +558,28 @@ struct DeviceScan * * // Run exclusive prefix min-scan * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_data, min_op, (int) INT_MAX, num_items); * * // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] * * @endcode * - * @tparam IteratorT - * **[inferred]** Random-access input iterator type for reading scan + * @tparam IteratorT + * **[inferred]** Random-access input iterator type for reading scan * inputs and writing scan outputs * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member * `T operator()(const T &a, const T &b)` - * - * @tparam InitValueT - * **[inferred]** Type of the `init_value` used Binary scan functor type + * + * @tparam InitValueT + * **[inferred]** Type of the `init_value` used Binary scan functor type * having member `T operator()(const T &a, const T &b)` * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no * work is done. * * @param[in,out] temp_storage_bytes @@ -595,7 +598,7 @@ struct DeviceScan * Total number of input items (i.e., the length of \p d_in) * * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. Default is + * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back @@ -648,8 +651,8 @@ struct DeviceScan } /** - * @brief Computes a device-wide exclusive prefix scan using the specified - * binary `scan_op` functor. The `init_value` value is provided as + * @brief Computes a device-wide exclusive prefix scan using the specified + * binary `scan_op` functor. The `init_value` value is provided as * a future value. * * @par @@ -658,13 +661,13 @@ struct DeviceScan * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. - * - When `d_in` and `d_out` are equal, the scan is performed in-place. The - * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` + * - When `d_in` and `d_out` are equal, the scan is performed in-place. The + * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` * shall not overlap in any other way. * - @devicestorage * * @par Snippet - * The code snippet below illustrates the exclusive prefix min-scan of an + * The code snippet below illustrates the exclusive prefix min-scan of an * `int` device vector * @par * @code @@ -681,7 +684,7 @@ struct DeviceScan * } * }; * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -689,17 +692,17 @@ struct DeviceScan * int *d_init_iter; // e.g., INT_MAX * CustomMin min_op; * - * auto future_init_value = + * auto future_init_value = * cub::FutureValue(d_init_iter); * * ... * - * // Determine temporary device storage requirements for exclusive + * // Determine temporary device storage requirements for exclusive * // prefix scan * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, min_op, future_init_value, num_items); * * // Allocate temporary storage for exclusive prefix scan @@ -707,54 +710,54 @@ struct DeviceScan * * // Run exclusive prefix min-scan * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, min_op, future_init_value, num_items); * * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] * * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading scan + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading scan * inputs \iterator * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan * outputs \iterator * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member * `T operator()(const T &a, const T &b)` - * - * @tparam InitValueT - * **[inferred]** Type of the `init_value` used Binary scan functor type + * + * @tparam InitValueT + * **[inferred]** Type of the `init_value` used Binary scan functor type * having member `T operator()(const T &a, const T &b)` * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of \p d_temp_storage allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output sequence of data items * - * @param[in] scan_op + * @param[in] scan_op * Binary scan functor * - * @param[in] init_value + * @param[in] init_value * Initial value to seed the exclusive scan (and is assigned to `*d_out`) * - * @param[in] num_items + * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back @@ -826,8 +829,8 @@ struct DeviceScan } /** - * @brief Computes a device-wide exclusive prefix scan using the specified - * binary `scan_op` functor. The `init_value` value is provided as + * @brief Computes a device-wide exclusive prefix scan using the specified + * binary `scan_op` functor. The `init_value` value is provided as * a future value. * * @par @@ -839,7 +842,7 @@ struct DeviceScan * - @devicestorage * * @par Snippet - * The code snippet below illustrates the exclusive prefix min-scan of an + * The code snippet below illustrates the exclusive prefix min-scan of an * `int` device vector * @par * @code @@ -856,24 +859,24 @@ struct DeviceScan * } * }; * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_init_iter; // e.g., INT_MAX * CustomMin min_op; * - * auto future_init_value = + * auto future_init_value = * cub::FutureValue(d_init_iter); * * ... * - * // Determine temporary device storage requirements for exclusive + * // Determine temporary device storage requirements for exclusive * // prefix scan * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_data, min_op, future_init_value, num_items); * * // Allocate temporary storage for exclusive prefix scan @@ -881,47 +884,47 @@ struct DeviceScan * * // Run exclusive prefix min-scan * cub::DeviceScan::ExclusiveScan( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_data, min_op, future_init_value, num_items); * * // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0] * * @endcode * - * @tparam IteratorT - * **[inferred]** Random-access input iterator type for reading scan + * @tparam IteratorT + * **[inferred]** Random-access input iterator type for reading scan * inputs and writing scan outputs * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member * `T operator()(const T &a, const T &b)` - * - * @tparam InitValueT - * **[inferred]** Type of the `init_value` used Binary scan functor type + * + * @tparam InitValueT + * **[inferred]** Type of the `init_value` used Binary scan functor type * having member `T operator()(const T &a, const T &b)` * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of \p d_temp_storage allocation * * @param[in,out] d_data * Pointer to the sequence of data items * - * @param[in] scan_op + * @param[in] scan_op * Binary scan functor * - * @param[in] init_value + * @param[in] init_value * Initial value to seed the exclusive scan (and is assigned to `*d_out`) * - * @param[in] num_items + * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back @@ -992,8 +995,8 @@ struct DeviceScan * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. - * - When `d_in` and `d_out` are equal, the scan is performed in-place. The - * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` + * - When `d_in` and `d_out` are equal, the scan is performed in-place. The + * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` * shall not overlap in any other way. * - @devicestorage * @@ -1005,19 +1008,19 @@ struct DeviceScan * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] * int *d_out; // e.g., [ , , , , , , ] * ... * - * // Determine temporary device storage requirements for inclusive + * // Determine temporary device storage requirements for inclusive * // prefix sum * void *d_temp_storage = nullptr; * size_t temp_storage_bytes = 0; * cub::DeviceScan::InclusiveSum( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, num_items); * * // Allocate temporary storage for inclusive prefix sum @@ -1025,40 +1028,40 @@ struct DeviceScan * * // Run inclusive prefix sum * cub::DeviceScan::InclusiveSum( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, num_items); * * // d_out <-- [8, 14, 21, 26, 29, 29, 38] * * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading scan + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading scan * inputs \iterator * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan * outputs \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no * work is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_in + * @param[in] d_in * Random-access iterator to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Random-access iterator to the output sequence of data items * - * @param[in] num_items + * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back @@ -1129,18 +1132,18 @@ struct DeviceScan * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] * ... * - * // Determine temporary device storage requirements for inclusive + * // Determine temporary device storage requirements for inclusive * // prefix sum * void *d_temp_storage = nullptr; * size_t temp_storage_bytes = 0; * cub::DeviceScan::InclusiveSum( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_data, num_items); * * // Allocate temporary storage for inclusive prefix sum @@ -1148,33 +1151,33 @@ struct DeviceScan * * // Run inclusive prefix sum * cub::DeviceScan::InclusiveSum( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_data, num_items); * * // d_data <-- [8, 14, 21, 26, 29, 29, 38] * * @endcode * - * @tparam IteratorT - * **[inferred]** Random-access input iterator type for reading scan + * @tparam IteratorT + * **[inferred]** Random-access input iterator type for reading scan * inputs and writing scan outputs * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no * work is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_data * Random-access iterator to the sequence of data items * - * @param[in] num_items + * @param[in] num_items * Total number of input items (i.e., the length of `d_in`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back @@ -1215,7 +1218,7 @@ struct DeviceScan } /** - * @brief Computes a device-wide inclusive prefix scan using the specified + * @brief Computes a device-wide inclusive prefix scan using the specified * binary `scan_op` functor. * * @par @@ -1224,13 +1227,13 @@ struct DeviceScan * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. - * - When `d_in` and `d_out` are equal, the scan is performed in-place. The - * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` + * - When `d_in` and `d_out` are equal, the scan is performed in-place. The + * range `[d_in, d_in + num_items)` and `[d_out, d_out + num_items)` * shall not overlap in any other way. * - @devicestorage * * @par Snippet - * The code snippet below illustrates the inclusive prefix min-scan of an + * The code snippet below illustrates the inclusive prefix min-scan of an * `int` device vector. * * @par @@ -1248,7 +1251,7 @@ struct DeviceScan * } * }; * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] @@ -1256,12 +1259,12 @@ struct DeviceScan * CustomMin min_op; * ... * - * // Determine temporary device storage requirements for inclusive + * // Determine temporary device storage requirements for inclusive * // prefix scan * void *d_temp_storage = nullptr; * size_t temp_storage_bytes = 0; * cub::DeviceScan::InclusiveScan( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, min_op, num_items); * * // Allocate temporary storage for inclusive prefix scan @@ -1269,28 +1272,28 @@ struct DeviceScan * * // Run inclusive prefix min-scan * cub::DeviceScan::InclusiveScan( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, min_op, num_items); * * // d_out <-- [8, 6, 6, 5, 3, 0, 0] * * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading scan + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading scan * inputs \iterator * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan * outputs \iterator * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member * `T operator()(const T &a, const T &b)` * - * @param[in] - * d_temp_storage Device-accessible allocation of temporary storage. - * When `nullptr`, the required allocation size is written to + * @param[in] + * d_temp_storage Device-accessible allocation of temporary storage. + * When `nullptr`, the required allocation size is written to * `temp_storage_bytes` and no work is done. * * @param[in,out] temp_storage_bytes @@ -1309,7 +1312,7 @@ struct DeviceScan * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back @@ -1366,7 +1369,7 @@ struct DeviceScan } /** - * @brief Computes a device-wide inclusive prefix scan using the specified + * @brief Computes a device-wide inclusive prefix scan using the specified * binary `scan_op` functor. * * @par @@ -1378,7 +1381,7 @@ struct DeviceScan * - @devicestorage * * @par Snippet - * The code snippet below illustrates the inclusive prefix min-scan of an + * The code snippet below illustrates the inclusive prefix min-scan of an * `int` device vector. * * @par @@ -1396,19 +1399,19 @@ struct DeviceScan * } * }; * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9] * CustomMin min_op; * ... * - * // Determine temporary device storage requirements for inclusive + * // Determine temporary device storage requirements for inclusive * // prefix scan * void *d_temp_storage = nullptr; * size_t temp_storage_bytes = 0; * cub::DeviceScan::InclusiveScan( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_data, min_op, num_items); * * // Allocate temporary storage for inclusive prefix scan @@ -1416,24 +1419,24 @@ struct DeviceScan * * // Run inclusive prefix min-scan * cub::DeviceScan::InclusiveScan( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, min_op, num_items); * * // d_data <-- [8, 6, 6, 5, 3, 0, 0] * * @endcode * - * @tparam IteratorT - * **[inferred]** Random-access input iterator type for reading scan + * @tparam IteratorT + * **[inferred]** Random-access input iterator type for reading scan * inputs and writing scan outputs * - * @tparam ScanOp - * **[inferred]** Binary scan functor type having member + * @tparam ScanOp + * **[inferred]** Binary scan functor type having member * `T operator()(const T &a, const T &b)` * - * @param[in] - * d_temp_storage Device-accessible allocation of temporary storage. - * When `nullptr`, the required allocation size is written to + * @param[in] + * d_temp_storage Device-accessible allocation of temporary storage. + * When `nullptr`, the required allocation size is written to * `temp_storage_bytes` and no work is done. * * @param[in,out] temp_storage_bytes @@ -1449,7 +1452,7 @@ struct DeviceScan * Total number of input items (i.e., the length of `d_in`) * * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back @@ -1495,8 +1498,8 @@ struct DeviceScan /** * @brief Computes a device-wide exclusive prefix sum-by-key with key equality - * defined by `equality_op`. The value of `0` is applied as the initial - * value, and is assigned to the beginning of each segment in + * defined by `equality_op`. The value of `0` is applied as the initial + * value, and is assigned to the beginning of each segment in * `d_values_out`. * * @par @@ -1505,22 +1508,22 @@ struct DeviceScan * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. - * - `d_keys_in` may equal `d_values_out` but the range - * `[d_keys_in, d_keys_in + num_items)` and the range + * - `d_keys_in` may equal `d_values_out` but the range + * `[d_keys_in, d_keys_in + num_items)` and the range * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. - * - `d_values_in` may equal `d_values_out` but the range - * `[d_values_in, d_values_in + num_items)` and the range + * - `d_values_in` may equal `d_values_out` but the range + * `[d_values_in, d_values_in + num_items)` and the range * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. * - @devicestorage * * @par Snippet - * The code snippet below illustrates the exclusive prefix sum-by-key of an + * The code snippet below illustrates the exclusive prefix sum-by-key of an * `int` device vector. * @par * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] @@ -1532,7 +1535,7 @@ struct DeviceScan * void *d_temp_storage = nullptr; * size_t temp_storage_bytes = 0; * cub::DeviceScan::ExclusiveSumByKey( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_keys_in, d_values_in, d_values_out, num_items); * * // Allocate temporary storage @@ -1540,57 +1543,57 @@ struct DeviceScan * * // Run exclusive prefix sum * cub::DeviceScan::ExclusiveSumByKey( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_keys_in, d_values_in, d_values_out, num_items); * * // d_values_out <-- [0, 8, 0, 7, 12, 0, 0] * * @endcode * - * @tparam KeysInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan keys + * @tparam KeysInputIteratorT + * **[inferred]** Random-access input iterator type for reading scan keys * inputs \iterator - * - * @tparam ValuesInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan + * + * @tparam ValuesInputIteratorT + * **[inferred]** Random-access input iterator type for reading scan * values inputs \iterator * - * @tparam ValuesOutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan + * @tparam ValuesOutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan * values outputs \iterator * - * @tparam EqualityOpT - * **[inferred]** Functor type having member - * `T operator()(const T &a, const T &b)` for binary operations that + * @tparam EqualityOpT + * **[inferred]** Functor type having member + * `T operator()(const T &a, const T &b)` for binary operations that * defines the equality of keys * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no * work is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_keys_in + * @param[in] d_keys_in * Random-access input iterator to the input sequence of key items * - * @param[in] d_values_in + * @param[in] d_values_in * Random-access input iterator to the input sequence of value items * - * @param[out] d_values_out + * @param[out] d_values_out * Random-access output iterator to the output sequence of value items * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_keys_in` and + * @param[in] num_items + * Total number of input items (i.e., the length of `d_keys_in` and * `d_values_in`) * - * @param[in] equality_op - * Binary functor that defines the equality of keys. + * @param[in] equality_op + * Binary functor that defines the equality of keys. * Default is cub::Equality(). * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back @@ -1614,7 +1617,7 @@ struct DeviceScan using InitT = cub::detail::value_t; // Initial value - InitT init_value{}; + InitT init_value{}; return DispatchScanByKey0. * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back @@ -1888,7 +1891,7 @@ struct DeviceScan } /** - * @brief Computes a device-wide inclusive prefix sum-by-key with key + * @brief Computes a device-wide inclusive prefix sum-by-key with key * equality defined by `equality_op`. * * @par @@ -1897,22 +1900,22 @@ struct DeviceScan * addition of floating-point types). Results for pseudo-associative * operators may vary from run to run. Additional details can be found in * the [decoupled look-back] description. - * - `d_keys_in` may equal `d_values_out` but the range - * `[d_keys_in, d_keys_in + num_items)` and the range + * - `d_keys_in` may equal `d_values_out` but the range + * `[d_keys_in, d_keys_in + num_items)` and the range * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. - * - `d_values_in` may equal `d_values_out` but the range - * `[d_values_in, d_values_in + num_items)` and the range + * - `d_values_in` may equal `d_values_out` but the range + * `[d_values_in, d_values_in + num_items)` and the range * `[d_values_out, d_values_out + num_items)` shall not overlap otherwise. * - @devicestorage * * @par Snippet - * The code snippet below illustrates the inclusive prefix sum-by-key of an + * The code snippet below illustrates the inclusive prefix sum-by-key of an * `int` device vector. * @par * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // input and output * int num_items; // e.g., 7 * int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2] @@ -1924,7 +1927,7 @@ struct DeviceScan * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceScan::InclusiveSumByKey( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_keys_in, d_values_in, d_values_out, num_items); * * // Allocate temporary storage for inclusive prefix sum @@ -1932,59 +1935,59 @@ struct DeviceScan * * // Run inclusive prefix sum * cub::DeviceScan::InclusiveSumByKey( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_keys_in, d_values_in, d_values_out, num_items); * * // d_out <-- [8, 14, 7, 12, 15, 0, 9] * * @endcode * - * @tparam KeysInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan + * @tparam KeysInputIteratorT + * **[inferred]** Random-access input iterator type for reading scan * keys inputs \iterator - * - * @tparam ValuesInputIteratorT - * **[inferred]** Random-access input iterator type for reading scan + * + * @tparam ValuesInputIteratorT + * **[inferred]** Random-access input iterator type for reading scan * values inputs \iterator - * - * @tparam ValuesOutputIteratorT - * **[inferred]** Random-access output iterator type for writing scan + * + * @tparam ValuesOutputIteratorT + * **[inferred]** Random-access output iterator type for writing scan * values outputs \iterator - * - * @tparam EqualityOpT - * **[inferred]** Functor type having member - * `T operator()(const T &a, const T &b)` for binary operations that + * + * @tparam EqualityOpT + * **[inferred]** Functor type having member + * `T operator()(const T &a, const T &b)` for binary operations that * defines the equality of keys * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. - * When `nullptr`, the required allocation size is written to + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When `nullptr`, the required allocation size is written to * `temp_storage_bytes` and no work is done. - * - * @param[in,out] temp_storage_bytes + * + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation - * - * @param[in] d_keys_in + * + * @param[in] d_keys_in * Random-access input iterator to the input sequence of key items - * - * @param[in] d_values_in + * + * @param[in] d_values_in * Random-access input iterator to the input sequence of value items - * - * @param[out] d_values_out + * + * @param[out] d_values_out * Random-access output iterator to the output sequence of value items - * - * @param[in] num_items - * Total number of input items (i.e., the length of `d_keys_in` and + * + * @param[in] num_items + * Total number of input items (i.e., the length of `d_keys_in` and * `d_values_in`) - * - * @param[in] equality_op - * Binary functor that defines the equality of keys. + * + * @param[in] equality_op + * Binary functor that defines the equality of keys. * Default is cub::Equality(). - * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. - * + * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template 0. - * + * * [decoupled look-back]: https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back */ template #include -#include #include #include @@ -46,27 +49,27 @@ CUB_NAMESPACE_BEGIN /** - * @brief DeviceSegmentedRadixSort provides device-wide, parallel operations - * for computing a batched radix sort across multiple, non-overlapping - * sequences of data items residing within device-accessible memory. + * @brief DeviceSegmentedRadixSort provides device-wide, parallel operations + * for computing a batched radix sort across multiple, non-overlapping + * sequences of data items residing within device-accessible memory. * ![](segmented_sorting_logo.png) * @ingroup SegmentedModule * * @par Overview - * The [*radix sorting method*](http://en.wikipedia.org/wiki/Radix_sort) - * arranges items into ascending (or descending) order. The algorithm relies - * upon a positional representation for keys, i.e., each key is comprised of an - * ordered sequence of symbols (e.g., digits, characters, etc.) specified from - * least-significant to most-significant. For a given input sequence of keys - * and a set of rules specifying a total ordering of the symbolic alphabet, the + * The [*radix sorting method*](http://en.wikipedia.org/wiki/Radix_sort) + * arranges items into ascending (or descending) order. The algorithm relies + * upon a positional representation for keys, i.e., each key is comprised of an + * ordered sequence of symbols (e.g., digits, characters, etc.) specified from + * least-significant to most-significant. For a given input sequence of keys + * and a set of rules specifying a total ordering of the symbolic alphabet, the * radix sorting method produces a lexicographic ordering of those keys. * * @par See Also * DeviceSegmentedRadixSort shares its implementation with DeviceRadixSort. See * that algorithm's documentation for more information. * - * @par Segments are not required to be contiguous. Any element of input(s) or - * output(s) outside the specified segments will not be accessed nor modified. + * @par Segments are not required to be contiguous. Any element of input(s) or + * output(s) outside the specified segments will not be accessed nor modified. * * @par Usage Considerations * @cdp_class{DeviceSegmentedRadixSort} @@ -80,7 +83,7 @@ struct DeviceSegmentedRadixSort //@{ /** - * @brief Sorts segments of key-value pairs into ascending order. + * @brief Sorts segments of key-value pairs into ascending order. * (`~2N` auxiliary storage required) * * @par @@ -89,31 +92,31 @@ struct DeviceSegmentedRadixSort * `segment_offsets` (of length `num_segments + 1`) can be aliased * for both the `d_begin_offsets` and `d_end_offsets` parameters (where * the latter is specified as `segment_offsets + 1`). - * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - * bits can be specified. This can reduce overall sorting overhead and + * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key + * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of - * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall - * not overlap `[in, in + num_items)`, + * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall + * not overlap `[in, in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - @devicestorageNP For sorting using only `O(P)` temporary storage, see + * - @devicestorageNP For sorting using only `O(P)` temporary storage, see * the sorting interface using DoubleBuffer wrappers below. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, - * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, + * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. * - @devicestorage * * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of `int` keys with associated vector of + * The code snippet below illustrates the batched sorting of three segments + * (with one zero-length segment) of `int` keys with associated vector of * `int` values. * @par * @code - * #include + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 @@ -145,72 +148,72 @@ struct DeviceSegmentedRadixSort * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] * @endcode * - * @tparam KeyT + * @tparam KeyT * **[inferred]** Key type * - * @tparam ValueT + * @tparam ValueT * **[inferred]** Value type * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam BeginOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam EndOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_keys_in + * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * - * @param[out] d_keys_out + * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * - * @param[in] d_values_in - * Device-accessible pointer to the corresponding input sequence of + * @param[in] d_values_in + * Device-accessible pointer to the corresponding input sequence of * associated value items * - * @param[out] d_values_out - * Device-accessible pointer to the correspondingly-reordered output + * @param[out] d_values_out + * Device-accessible pointer to the correspondingly-reordered output * sequence of associated value items * - * @param[in] num_items + * @param[in] num_items * The total number of items within the segmented array, including items not * covered by segments. `num_items` should match the largest element within * the range `[d_end_offsets, d_end_offsets + num_segments)`. * - * @param[in] num_segments + * @param[in] num_segments * The number of segments that comprise the sorting data * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. If - * `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. If + * `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for + * @param[in] begin_bit + * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key + * @param[in] end_bit + * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * - * @param[in] stream + * @param[in] stream * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ @@ -299,7 +302,7 @@ struct DeviceSegmentedRadixSort } /** - * @brief Sorts segments of key-value pairs into ascending order. + * @brief Sorts segments of key-value pairs into ascending order. * (`~N` auxiliary storage required) * * @par @@ -307,42 +310,42 @@ struct DeviceSegmentedRadixSort * pair of associated value buffers. Each pair is managed by a DoubleBuffer * structure that indicates which of the two buffers is "current" (and thus * contains the input data to be sorted). - * - The contents of both buffers within each pair may be altered by the + * - The contents of both buffers within each pair may be altered by the * sorting operation. - * - Upon completion, the sorting operation will update the "current" - * indicator within each DoubleBuffer wrapper to reference which of the two - * buffers now contains the sorted output sequence (a function of the number + * - Upon completion, the sorting operation will update the "current" + * indicator within each DoubleBuffer wrapper to reference which of the two + * buffers now contains the sorted output sequence (a function of the number * of key bits specified and the targeted device architecture). * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased for both - * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is + * `segment_offsets` (of length `num_segments + 1`) can be aliased for both + * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is * specified as `segment_offsets + 1`). - * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - * bits can be specified. This can reduce overall sorting overhead and yield + * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key + * bits can be specified. This can reduce overall sorting overhead and yield * a corresponding performance improvement. - * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` - * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range - * `[cur, cur + num_items)` shall not overlap + * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` + * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range + * `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, - * `d_values.Alternate()[i]` will not be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys.Current()[i]`, + * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, + * `d_values.Alternate()[i]` will not be accessed nor modified. * - @devicestorageP * - @devicestorage * * @par Snippet - * The code snippet below illustrates the batched sorting of three segments - * (with one zero-length segment) of `int` keys with associated vector of + * The code snippet below illustrates the batched sorting of three segments + * (with one zero-length segment) of `int` keys with associated vector of * `int` values. * @par * @code - * #include + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 @@ -377,69 +380,69 @@ struct DeviceSegmentedRadixSort * * @endcode * - * @tparam KeyT + * @tparam KeyT * **[inferred]** Key type * - * @tparam ValueT + * @tparam ValueT * **[inferred]** Value type * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam BeginOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam EndOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to + * @param[in,out] d_keys + * Reference to the double-buffer of keys whose "current" device-accessible + * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * - * @param[in,out] d_values - * Double-buffer of values whose "current" device-accessible buffer - * contains the unsorted input values and, upon return, is updated to point + * @param[in,out] d_values + * Double-buffer of values whose "current" device-accessible buffer + * contains the unsorted input values and, upon return, is updated to point * to the sorted output values * - * @param[in] num_items + * @param[in] num_items * The total number of items within the segmented array, including items not * covered by segments. `num_items` should match the largest element within * the range `[d_end_offsets, d_end_offsets + num_segments)`. * - * @param[in] num_segments + * @param[in] num_segments * The number of segments that comprise the sorting data * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for + * @param[in] begin_bit + * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key + * @param[in] end_bit + * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 @@ -582,73 +585,73 @@ struct DeviceSegmentedRadixSort * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] * @endcode * - * @tparam KeyT + * @tparam KeyT * **[inferred]** Key type * - * @tparam ValueT + * @tparam ValueT * **[inferred]** Value type * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam BeginOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam EndOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_keys_in + * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * - * @param[out] d_keys_out + * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * - * @param[in] d_values_in - * Device-accessible pointer to the corresponding input sequence of + * @param[in] d_values_in + * Device-accessible pointer to the corresponding input sequence of * associated value items * - * @param[out] d_values_out - * Device-accessible pointer to the correspondingly-reordered output + * @param[out] d_values_out + * Device-accessible pointer to the correspondingly-reordered output * sequence of associated value items * - * @param[in] num_items + * @param[in] num_items * The total number of items within the segmented array, including items not * covered by segments. `num_items` should match the largest element within * the range `[d_end_offsets, d_end_offsets + num_segments)`. * - * @param[in] num_segments + * @param[in] num_segments * The number of segments that comprise the sorting data * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th * is considered empty. * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for + * @param[in] begin_bit + * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key + * @param[in] end_bit + * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 @@ -816,69 +819,69 @@ struct DeviceSegmentedRadixSort * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] * @endcode * - * @tparam KeyT + * @tparam KeyT * **[inferred]** Key type * - * @tparam ValueT + * @tparam ValueT * **[inferred]** Value type * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam BeginOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam EndOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to + * @param[in,out] d_keys + * Reference to the double-buffer of keys whose "current" device-accessible + * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * - * @param[in,out] d_values - * Double-buffer of values whose "current" device-accessible buffer - * contains the unsorted input values and, upon return, is updated to point + * @param[in,out] d_values + * Double-buffer of values whose "current" device-accessible buffer + * contains the unsorted input values and, upon return, is updated to point * to the sorted output values * - * @param[in] num_items + * @param[in] num_items * The total number of items within the segmented array, including items not * covered by segments. `num_items` should match the largest element within * the range `[d_end_offsets, d_end_offsets + num_segments)`. * - * @param[in] num_segments + * @param[in] num_segments * The number of segments that comprise the sorting data * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th * is considered empty. * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for + * @param[in] begin_bit + * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key + * @param[in] end_bit + * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 @@ -1008,7 +1011,7 @@ struct DeviceSegmentedRadixSort * // Determine temporary device storage requirements * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeys( + * cub::DeviceSegmentedRadixSort::SortKeys( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, * num_items, num_segments, d_offsets, d_offsets + 1); * @@ -1016,7 +1019,7 @@ struct DeviceSegmentedRadixSort * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeys( + * cub::DeviceSegmentedRadixSort::SortKeys( * d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, * num_items, num_segments, d_offsets, d_offsets + 1); * @@ -1024,60 +1027,60 @@ struct DeviceSegmentedRadixSort * * @endcode * - * @tparam KeyT + * @tparam KeyT * **[inferred]** Key type * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam BeginOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam EndOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * - * @param[in] d_temp_storage + * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of \p d_temp_storage allocation * - * @param[in] d_keys_in + * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * - * @param[out] d_keys_out + * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * - * @param[in] num_items + * @param[in] num_items * The total number of items within the segmented array, including items not * covered by segments. `num_items` should match the largest element within * the range `[d_end_offsets, d_end_offsets + num_segments)`. * - * @param[in] num_segments + * @param[in] num_segments * The number of segments that comprise the sorting data * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for + * @param[in] begin_bit + * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key + * @param[in] end_bit + * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template N auxiliary storage required). * * @par - * - The sorting operation is given a pair of key buffers managed by a + * - The sorting operation is given a pair of key buffers managed by a * DoubleBuffer structure that indicates which of the two buffers is * "current" (and thus contains the input data to be sorted). * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" - * indicator within the DoubleBuffer wrapper to reference which of the two - * buffers now contains the sorted output sequence (a function of the + * - Upon completion, the sorting operation will update the "current" + * indicator within the DoubleBuffer wrapper to reference which of the two + * buffers now contains the sorted output sequence (a function of the * number of key bits specified and the targeted device architecture). * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased for both - * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter + * `segment_offsets` (of length `num_segments + 1`) can be aliased for both + * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter * is specified as `segment_offsets + 1`). - * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key - * bits can be specified. This can reduce overall sorting overhead and + * - An optional bit subrange `[begin_bit, end_bit)` of differentiating key + * bits can be specified. This can reduce overall sorting overhead and * yield a corresponding performance improvement. * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. - * The range `[cur, cur + num_items)` shall not overlap + * The range `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys.Current()[i]`, + * `d_keys[i].Alternate()[i]` will not be accessed nor modified. * - @devicestorageP * - @devicestorage * * @par Snippet - * The code snippet below illustrates the batched sorting of three segments + * The code snippet below illustrates the batched sorting of three segments * (with one zero-length segment) of `int` keys. * @par * @code - * #include + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for + * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 @@ -1225,61 +1228,61 @@ struct DeviceSegmentedRadixSort * * @endcode * - * @tparam KeyT + * @tparam KeyT * **[inferred]** Key type * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam BeginOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam EndOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to + * @param[in,out] d_keys + * Reference to the double-buffer of keys whose "current" device-accessible + * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * - * @param[in] num_items + * @param[in] num_items * The total number of items within the segmented array, including items not * covered by segments. `num_items` should match the largest element within * the range `[d_end_offsets, d_end_offsets + num_segments)`. * - * @param[in] num_segments + * @param[in] num_segments * The number of segments that comprise the sorting data * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. * If `d_end_offsets[i] - 1` <= d_begin_offsets[i]`, the *i*th * is considered empty. * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) + * @param[in] begin_bit + * **[optional]** The least-significant bit index (inclusive) * needed for key comparison * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key + * @param[in] end_bit + * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 @@ -1417,62 +1420,62 @@ struct DeviceSegmentedRadixSort * * @endcode * - * @tparam KeyT + * @tparam KeyT * **[inferred]** Key type * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam BeginOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam EndOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_keys_in + * @param[in] d_keys_in * Device-accessible pointer to the input data of key data to sort * - * @param[out] d_keys_out + * @param[out] d_keys_out * Device-accessible pointer to the sorted output sequence of key data * - * @param[in] num_items + * @param[in] num_items * The total number of items within the segmented array, including items not * covered by segments. `num_items` should match the largest element within * the range `[d_end_offsets, d_end_offsets + num_segments)`. * - * @param[in] num_segments + * @param[in] num_segments * The number of segments that comprise the sorting data * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for + * @param[in] begin_bit + * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key + * @param[in] end_bit + * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., sizeof(unsigned int) * 8) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int num_segments; // e.g., 3 @@ -1619,61 +1622,61 @@ struct DeviceSegmentedRadixSort * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] * @endcode * - * @tparam KeyT + * @tparam KeyT * **[inferred]** Key type * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam BeginOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam EndOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in,out] d_keys - * Reference to the double-buffer of keys whose "current" device-accessible - * buffer contains the unsorted input keys and, upon return, is updated to + * @param[in,out] d_keys + * Reference to the double-buffer of keys whose "current" device-accessible + * buffer contains the unsorted input keys and, upon return, is updated to * point to the sorted output keys * - * @param[in] num_items + * @param[in] num_items * The total number of items within the segmented array, including items not * covered by segments. `num_items` should match the largest element within * the range `[d_end_offsets, d_end_offsets + num_segments)`. * - * @param[in] num_segments + * @param[in] num_segments * The number of segments that comprise the sorting data * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i], the *i*th is + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i], the *i*th is * considered empty. * - * @param[in] begin_bit - * **[optional]** The least-significant bit index (inclusive) needed for + * @param[in] begin_bit + * **[optional]** The least-significant bit index (inclusive) needed for * key comparison * - * @param[in] end_bit - * **[optional]** The most-significant bit index (exclusive) needed for key + * @param[in] end_bit + * **[optional]** The most-significant bit index (exclusive) needed for key * comparison (e.g., `sizeof(unsigned int) * 8`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include diff --git a/cub/cub/device/device_segmented_sort.cuh b/cub/cub/device/device_segmented_sort.cuh index 2b86da95078..4d2aebd64f4 100644 --- a/cub/cub/device/device_segmented_sort.cuh +++ b/cub/cub/device/device_segmented_sort.cuh @@ -34,7 +34,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include @@ -72,8 +75,8 @@ CUB_NAMESPACE_BEGIN * (`unsigned char`, `int`, `double`, etc.) as well as CUDA's `__half` and * `__nv_bfloat16` 16-bit floating-point types. * - * @par Segments are not required to be contiguous. Any element of input(s) or - * output(s) outside the specified segments will not be accessed nor modified. + * @par Segments are not required to be contiguous. Any element of input(s) or + * output(s) outside the specified segments will not be accessed nor modified. * * @par A simple example * @code @@ -135,12 +138,12 @@ struct DeviceSegmentedSort * guaranteed that the relative order of these two elements will be * preserved by sort. * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, + * `[d_keys_in, d_keys_in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not - * be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not + * be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments @@ -309,12 +312,12 @@ struct DeviceSegmentedSort * not guaranteed that the relative order of these two elements will be * preserved by sort. * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, + * `[d_keys_in, d_keys_in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not - * be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not + * be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments @@ -490,13 +493,13 @@ struct DeviceSegmentedSort * not guaranteed that the relative order of these two elements will be * preserved by sort. * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. - * The range `[cur, cur + num_items)` shall not overlap + * The range `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys.Current()[i]`, + * `d_keys[i].Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments @@ -671,13 +674,13 @@ struct DeviceSegmentedSort * not guaranteed that the relative order of these two elements will be * preserved by sort. * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. - * The range `[cur, cur + num_items)` shall not overlap + * The range `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys.Current()[i]`, + * `d_keys[i].Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments @@ -846,12 +849,12 @@ struct DeviceSegmentedSort * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, + * `[d_keys_in, d_keys_in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not - * be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not + * be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments @@ -1009,12 +1012,12 @@ struct DeviceSegmentedSort * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. * - The range `[d_keys_out, d_keys_out + num_items)` shall not overlap - * `[d_keys_in, d_keys_in + num_items)`, + * `[d_keys_in, d_keys_in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not - * be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys_in[i]`, `d_keys_out[i]` will not + * be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments @@ -1181,13 +1184,13 @@ struct DeviceSegmentedSort * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. - * The range `[cur, cur + num_items)` shall not overlap + * The range `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys.Current()[i]`, + * `d_keys[i].Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments @@ -1350,13 +1353,13 @@ struct DeviceSegmentedSort * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. * - Let `cur = d_keys.Current()` and `alt = d_keys.Alternate()`. - * The range `[cur, cur + num_items)` shall not overlap + * The range `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_keys[i].Alternate()[i]` will not be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys.Current()[i]`, + * `d_keys[i].Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments @@ -1520,13 +1523,13 @@ struct DeviceSegmentedSort * guaranteed that the relative order of these two elements will be * preserved by sort. * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of - * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall - * not overlap `[in, in + num_items)`, + * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall + * not overlap `[in, in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, - * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, + * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments @@ -1720,13 +1723,13 @@ struct DeviceSegmentedSort * guaranteed that the relative order of these two elements will be * preserved by sort. * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of - * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall - * not overlap `[in, in + num_items)`, + * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall + * not overlap `[in, in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, - * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, + * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments @@ -1930,16 +1933,16 @@ struct DeviceSegmentedSort * @p j are equivalent: neither one is less than the other. It is not * guaranteed that the relative order of these two elements will be * preserved by sort. - * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` - * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range - * `[cur, cur + num_items)` shall not overlap + * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` + * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range + * `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, - * `d_values.Alternate()[i]` will not be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys.Current()[i]`, + * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, + * `d_values.Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments @@ -2131,16 +2134,16 @@ struct DeviceSegmentedSort * @p i and @p j are equivalent: neither one is less than the other. It is * not guaranteed that the relative order of these two elements will be * preserved by sort. - * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` - * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range - * `[cur, cur + num_items)` shall not overlap + * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` + * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range + * `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, - * `d_values.Alternate()[i]` will not be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys.Current()[i]`, + * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, + * `d_values.Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments @@ -2327,13 +2330,13 @@ struct DeviceSegmentedSort * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of - * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall - * not overlap `[in, in + num_items)`, + * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall + * not overlap `[in, in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, - * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, + * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments @@ -2520,13 +2523,13 @@ struct DeviceSegmentedSort * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. * - Let `in` be one of `{d_keys_in, d_values_in}` and `out` be any of - * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall - * not overlap `[in, in + num_items)`, + * `{d_keys_out, d_values_out}`. The range `[out, out + num_items)` shall + * not overlap `[in, in + num_items)`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, - * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys_in[i]`, `d_values_in[i]`, + * `d_keys_out[i]`, `d_values_out[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments @@ -2723,16 +2726,16 @@ struct DeviceSegmentedSort * @p x precedes @p y, and if the two elements are equivalent (neither * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. - * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` - * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range - * `[cur, cur + num_items)` shall not overlap + * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` + * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range + * `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, - * `d_values.Alternate()[i]` will not be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys.Current()[i]`, + * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, + * `d_values.Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments @@ -2918,16 +2921,16 @@ struct DeviceSegmentedSort * @p x precedes @p y, and if the two elements are equivalent (neither * @p x < @p y nor @p y < @p x) then a postcondition of stable sort is that * @p x still precedes @p y. - * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` - * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range - * `[cur, cur + num_items)` shall not overlap + * - Let `cur` be one of `{d_keys.Current(), d_values.Current()}` and `alt` + * be any of `{d_keys.Alternate(), d_values.Alternate()}`. The range + * `[cur, cur + num_items)` shall not overlap * `[alt, alt + num_items)`. Both ranges shall not overlap * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)` in any way. - * - Segments are not required to be contiguous. For all index values `i` - * outside the specified segments `d_keys.Current()[i]`, - * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, - * `d_values.Alternate()[i]` will not be accessed nor modified. + * - Segments are not required to be contiguous. For all index values `i` + * outside the specified segments `d_keys.Current()[i]`, + * `d_values.Current()[i]`, `d_keys.Alternate()[i]`, + * `d_values.Alternate()[i]` will not be accessed nor modified. * * @par Snippet * The code snippet below illustrates the batched sorting of three segments diff --git a/cub/cub/device/device_select.cuh b/cub/cub/device/device_select.cuh index f21431391ab..1c93894c95c 100644 --- a/cub/cub/device/device_select.cuh +++ b/cub/cub/device/device_select.cuh @@ -13,9 +13,9 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; @@ -27,17 +27,20 @@ ******************************************************************************/ /** - * @file cub::DeviceSelect provides device-wide, parallel operations for - * compacting selected items from sequences of data items residing within + * @file cub::DeviceSelect provides device-wide, parallel operations for + * compacting selected items from sequences of data items residing within * device-accessible memory. */ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include -#include #include #include #include @@ -46,8 +49,8 @@ CUB_NAMESPACE_BEGIN /** - * @brief DeviceSelect provides device-wide, parallel operations for compacting - * selected items from sequences of data items residing within + * @brief DeviceSelect provides device-wide, parallel operations for compacting + * selected items from sequences of data items residing within * device-accessible memory. ![](select_logo.png) * @ingroup SingleModule * @@ -62,15 +65,15 @@ CUB_NAMESPACE_BEGIN * @linear_performance{select-flagged, select-if, and select-unique} * * @par - * The following chart illustrates DeviceSelect::If performance across - * different CUDA architectures for `int32` items, where 50% of the items are + * The following chart illustrates DeviceSelect::If performance across + * different CUDA architectures for `int32` items, where 50% of the items are * randomly selected. * * @image html select_if_int32_50_percent.png * * @par - * The following chart illustrates DeviceSelect::Unique performance across - * different CUDA architectures for `int32` items where segments have lengths + * The following chart illustrates DeviceSelect::Unique performance across + * different CUDA architectures for `int32` items where segments have lengths * uniformly sampled from `[1, 1000]`. * * @image html select_unique_int32_len_500.png @@ -82,28 +85,28 @@ CUB_NAMESPACE_BEGIN struct DeviceSelect { /** - * @brief Uses the `d_flags` sequence to selectively copy the corresponding - * items from `d_in` into `d_out`. The total number of items selected + * @brief Uses the `d_flags` sequence to selectively copy the corresponding + * items from `d_in` into `d_out`. The total number of items selected * is written to `d_num_selected_out`. ![](select_flags_logo.png) * * @par - * - The value type of `d_flags` must be castable to `bool` (e.g., `bool`, + * - The value type of `d_flags` must be castable to `bool` (e.g., `bool`, * `char`, `int`, etc.). - * - Copies of the selected items are compacted into `d_out` and maintain + * - Copies of the selected items are compacted into `d_out` and maintain * their original relative ordering. - * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap + * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap * `[d_in, d_in + num_items)`, `[d_flags, d_flags + num_items)` nor * `d_num_selected_out` in any way. * - @devicestorage * * @par Snippet - * The code snippet below illustrates the compaction of items selected from + * The code snippet below illustrates the compaction of items selected from * an `int` device vector. * @par * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for input, + * // Declare, allocate, and initialize device-accessible pointers for input, * // flags, and output * int num_items; // e.g., 8 * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] @@ -116,7 +119,7 @@ struct DeviceSelect * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSelect::Flagged( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_flags, d_out, d_num_selected_out, num_items); * * // Allocate temporary storage @@ -124,7 +127,7 @@ struct DeviceSelect * * // Run selection * cub::DeviceSelect::Flagged( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_flags, d_out, d_num_selected_out, num_items); * * // d_out <-- [1, 4, 6, 7] @@ -132,48 +135,48 @@ struct DeviceSelect * * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input * items \iterator * - * @tparam FlagIterator - * **[inferred]** Random-access input iterator type for reading selection + * @tparam FlagIterator + * **[inferred]** Random-access input iterator type for reading selection * flags \iterator * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing selected + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing selected * items \iterator * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items + * @tparam NumSelectedIteratorT + * **[inferred]** Output iterator type for recording the number of items * selected \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[in] d_flags + * @param[in] d_flags * Pointer to the input sequence of selection flags * - * @param[out] d_out + * @param[out] d_out * Pointer to the output sequence of selected data items * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected + * @param[out] d_num_selected_out + * Pointer to the output total number of items selected * (i.e., length of `d_out`) * - * @param[in] num_items + * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for input, + * // Declare, allocate, and initialize device-accessible pointers for input, * // flags, and output * int num_items; // e.g., 8 * int *d_data; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] @@ -278,7 +281,7 @@ struct DeviceSelect * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSelect::Flagged( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_flags, d_num_selected_out, num_items); * * // Allocate temporary storage @@ -286,7 +289,7 @@ struct DeviceSelect * * // Run selection * cub::DeviceSelect::Flagged( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_flags, d_num_selected_out, num_items); * * // d_data <-- [1, 4, 6, 7] @@ -294,40 +297,40 @@ struct DeviceSelect * * @endcode * - * @tparam IteratorT - * **[inferred]** Random-access iterator type for reading and writing + * @tparam IteratorT + * **[inferred]** Random-access iterator type for reading and writing * selected items \iterator * - * @tparam FlagIterator - * **[inferred]** Random-access input iterator type for reading selection + * @tparam FlagIterator + * **[inferred]** Random-access input iterator type for reading selection * flags \iterator * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items + * @tparam NumSelectedIteratorT + * **[inferred]** Output iterator type for recording the number of items * selected \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_data * Pointer to the sequence of data items * - * @param[in] d_flags + * @param[in] d_flags * Pointer to the input sequence of selection flags * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected + * @param[out] d_num_selected_out + * Pointer to the output total number of items selected * - * @param[in] num_items + * @param[in] num_items * Total number of input items (i.e., length of `d_data`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template ::Dispatch(d_temp_storage, temp_storage_bytes, d_data, // in @@ -395,20 +398,20 @@ struct DeviceSelect } /** - * @brief Uses the `select_op` functor to selectively copy items from `d_in` - * into `d_out`. The total number of items selected is written to + * @brief Uses the `select_op` functor to selectively copy items from `d_in` + * into `d_out`. The total number of items selected is written to * `d_num_selected_out`. ![](select_logo.png) * * @par - * - Copies of the selected items are compacted into `d_out` and maintain + * - Copies of the selected items are compacted into `d_out` and maintain * their original relative ordering. - * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap + * - The range `[d_out, d_out + *d_num_selected_out)` shall not overlap * `[d_in, d_in + num_items)` nor `d_num_selected_out` in any way. * - @devicestorage * * @par Performance - * The following charts illustrate saturated select-if performance across - * different CUDA architectures for `int32` and `int64` items, respectively. + * The following charts illustrate saturated select-if performance across + * different CUDA architectures for `int32` and `int64` items, respectively. * Items are selected with 50% probability. * * @image html select_if_int32_50_percent.png @@ -421,7 +424,7 @@ struct DeviceSelect * @image html select_if_int64_5_percent.png * * @par Snippet - * The code snippet below illustrates the compaction of items selected from + * The code snippet below illustrates the compaction of items selected from * an `int` device vector. * @par * @code @@ -441,7 +444,7 @@ struct DeviceSelect * } * }; * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 8 * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] @@ -454,7 +457,7 @@ struct DeviceSelect * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSelect::If( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, d_num_selected_out, num_items, select_op); * * // Allocate temporary storage @@ -462,55 +465,55 @@ struct DeviceSelect * * // Run selection * cub::DeviceSelect::If( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, d_num_selected_out, num_items, select_op); * * // d_out <-- [0, 2, 3, 5, 2] * // d_num_selected_out <-- [5] * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input * items \iterator * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing selected + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing selected * items \iterator * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items + * @tparam NumSelectedIteratorT + * **[inferred]** Output iterator type for recording the number of items * selected \iterator * - * @tparam SelectOp - * **[inferred]** Selection operator type having member + * @tparam SelectOp + * **[inferred]** Selection operator type having member * `bool operator()(const T &a)` * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output sequence of selected data items * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected + * @param[out] d_num_selected_out + * Pointer to the output total number of items selected * (i.e., length of `d_out`) * - * @param[in] num_items + * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * - * @param[in] select_op + * @param[in] select_op * Unary selection operator * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template 0. */ template // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 8 * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] @@ -785,7 +788,7 @@ struct DeviceSelect * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSelect::Unique( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, d_num_selected_out, num_items); * * // Allocate temporary storage @@ -793,48 +796,48 @@ struct DeviceSelect * * // Run selection * cub::DeviceSelect::Unique( - * d_temp_storage, temp_storage_bytes, + * d_temp_storage, temp_storage_bytes, * d_in, d_out, d_num_selected_out, num_items); * * // d_out <-- [0, 2, 9, 5, 8] * // d_num_selected_out <-- [5] * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input * items \iterator * - * @tparam OutputIteratorT - * **[inferred]** Random-access output iterator type for writing selected + * @tparam OutputIteratorT + * **[inferred]** Random-access output iterator type for writing selected * items \iterator * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items + * @tparam NumSelectedIteratorT + * **[inferred]** Output iterator type for recording the number of items * selected \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output sequence of selected data items * - * @param[out] d_num_selected_out - * Pointer to the output total number of items selected + * @param[out] d_num_selected_out + * Pointer to the output total number of items selected * (i.e., length of `d_out`) * - * @param[in] num_items + * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_items; // e.g., 8 * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] @@ -941,8 +944,8 @@ struct DeviceSelect * void *d_temp_storage = NULL; * size_t temp_storage_bytes = 0; * cub::DeviceSelect::UniqueByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, + * d_temp_storage, temp_storage_bytes, + * d_keys_in, d_values_in, * d_keys_out, d_values_out, d_num_selected_out, num_items); * * // Allocate temporary storage @@ -950,8 +953,8 @@ struct DeviceSelect * * // Run selection * cub::DeviceSelect::UniqueByKey( - * d_temp_storage, temp_storage_bytes, - * d_keys_in, d_values_in, + * d_temp_storage, temp_storage_bytes, + * d_keys_in, d_values_in, * d_keys_out, d_values_out, d_num_selected_out, num_items); * * // d_keys_out <-- [0, 2, 9, 5, 8] @@ -959,56 +962,56 @@ struct DeviceSelect * // d_num_selected_out <-- [5] * @endcode * - * @tparam KeyInputIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam KeyInputIteratorT + * **[inferred]** Random-access input iterator type for reading input * keys \iterator * - * @tparam ValueInputIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam ValueInputIteratorT + * **[inferred]** Random-access input iterator type for reading input * values \iterator * - * @tparam KeyOutputIteratorT - * **[inferred]** Random-access output iterator type for writing selected + * @tparam KeyOutputIteratorT + * **[inferred]** Random-access output iterator type for writing selected * keys \iterator * - * @tparam ValueOutputIteratorT - * **[inferred]** Random-access output iterator type for writing selected + * @tparam ValueOutputIteratorT + * **[inferred]** Random-access output iterator type for writing selected * values \iterator * - * @tparam NumSelectedIteratorT - * **[inferred]** Output iterator type for recording the number of items + * @tparam NumSelectedIteratorT + * **[inferred]** Output iterator type for recording the number of items * selected \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_keys_in + * @param[in] d_keys_in * Pointer to the input sequence of keys * - * @param[in] d_values_in + * @param[in] d_values_in * Pointer to the input sequence of values * - * @param[out] d_keys_out + * @param[out] d_keys_out * Pointer to the output sequence of selected keys * - * @param[out] d_values_out + * @param[out] d_values_out * Pointer to the output sequence of selected values * - * @param[out] d_num_selected_out - * Pointer to the total number of items selected (i.e., length of + * @param[out] d_num_selected_out + * Pointer to the total number of items selected (i.e., length of * `d_keys_out` or `d_values_out`) * - * @param[in] num_items - * Total number of input items (i.e., length of `d_keys_in` or + * @param[in] num_items + * Total number of input items (i.e., length of `d_keys_in` or * `d_values_in`) * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template #include #include diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index 6328e7d787b..be0be052a54 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -27,8 +27,11 @@ #pragma once +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include -#include #include #include #include @@ -67,10 +70,10 @@ DeviceAdjacentDifferenceDifferenceKernel(InputIteratorT input, DifferenceOpT difference_op, OffsetT num_items) { - using ActivePolicyT = + using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AdjacentDifferencePolicy; - // It is OK to introspect the return type or parameter types of the + // It is OK to introspect the return type or parameter types of the // `operator()` function of `__device__` extended lambda within device code. using OutputT = detail::invoke_result_t; @@ -94,7 +97,7 @@ DeviceAdjacentDifferenceDifferenceKernel(InputIteratorT input, num_items); int tile_idx = static_cast(blockIdx.x); - OffsetT tile_base = static_cast(tile_idx) + OffsetT tile_base = static_cast(tile_idx) * ActivePolicyT::ITEMS_PER_TILE; agent.Process(tile_idx, tile_base); @@ -313,7 +316,7 @@ struct DispatchAdjacentDifference : public SelectedPolicy num_items); error = CubDebug(detail::DebugSyncStream(stream)); - + if (cudaSuccess != error) { break; diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 04384ae0451..61b4232bf15 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -33,9 +33,12 @@ #pragma once +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include -#include #include #include #include diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 5c9aab47a85..921eeef7258 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -35,8 +35,11 @@ #pragma once +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include -#include #include #include #include @@ -567,8 +570,8 @@ struct dispatch_histogram * @tparam OffsetT * Signed integer type for global offsets * - * @tparam SelectedPolicy - * Implementation detail, do not specify directly, requirements on the + * @tparam SelectedPolicy + * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template #include #include diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index b57ccaace88..f64f16d6bb7 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -33,13 +33,16 @@ #pragma once +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include #include #include #include -#include #include #include #include @@ -75,7 +78,7 @@ template < bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low typename KeyT, ///< Key type typename OffsetT, ///< Signed integer type for global offsets - typename DecomposerT = detail::identity_decomposer_t> + typename DecomposerT = detail::identity_decomposer_t> __launch_bounds__ (int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS) : int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))) @@ -316,7 +319,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortSingleTileKernel( ValueT values[ITEMS_PER_THREAD]; // Get default (min/max) value for out-of-bounds keys - bit_ordered_type default_key_bits = IS_DESCENDING + bit_ordered_type default_key_bits = IS_DESCENDING ? traits::min_raw_binary_key(decomposer) : traits::max_raw_binary_key(decomposer); @@ -540,7 +543,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedRadixSortKernel( * Onesweep kernels ******************************************************************************/ -/** +/** * Kernel for computing multiple histograms */ @@ -552,7 +555,7 @@ template -CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) +CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) void DeviceRadixSortHistogramKernel(OffsetT *d_bins_out, const KeyT *d_keys_in, OffsetT num_items, @@ -594,7 +597,7 @@ DeviceRadixSortOnesweepKernel } -/** +/** * Exclusive sum kernel */ template < @@ -722,10 +725,10 @@ struct DeviceRadixSortPolicy // Histogram policy typedef AgentRadixSortHistogramPolicy <256, 8, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; - + // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; - + // Onesweep policy typedef AgentRadixSortOnesweepPolicy <256, 21, DominantT, 1, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, @@ -778,10 +781,10 @@ struct DeviceRadixSortPolicy // Histogram policy typedef AgentRadixSortHistogramPolicy <256, 8, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; - + // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; - + // Onesweep policy typedef AgentRadixSortOnesweepPolicy <256, 21, DominantT, 1, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, @@ -821,10 +824,10 @@ struct DeviceRadixSortPolicy // Histogram policy typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; - + // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; - + // Onesweep policy typedef AgentRadixSortOnesweepPolicy <256, OFFSET_64BIT ? 29 : 30, DominantT, 2, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, @@ -864,10 +867,10 @@ struct DeviceRadixSortPolicy // Histogram policy typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; - + // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; - + // Onesweep policy typedef AgentRadixSortOnesweepPolicy <256, 30, DominantT, 2, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, @@ -905,10 +908,10 @@ struct DeviceRadixSortPolicy // Histogram policy typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; - + // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; - + // Onesweep policy typedef AgentRadixSortOnesweepPolicy <256, 30, DominantT, 2, RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, @@ -948,10 +951,10 @@ struct DeviceRadixSortPolicy // Histogram policy typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; - + // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; - + // Onesweep policy typedef AgentRadixSortOnesweepPolicy <256, sizeof(KeyT) == 4 && sizeof(ValueT) == 4 ? 46 : 23, DominantT, 4, @@ -993,10 +996,10 @@ struct DeviceRadixSortPolicy // Histogram policy typedef AgentRadixSortHistogramPolicy <128, 16, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; - + // Exclusive sum policy typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; - + // Onesweep policy typedef AgentRadixSortOnesweepPolicy <384, OFFSET_64BIT && sizeof(KeyT) == 4 && !KEYS_ONLY ? 17 : 21, DominantT, 1, @@ -1162,8 +1165,8 @@ struct DeviceRadixSortPolicy * @tparam OffsetT * Signed integer type for global offsets * - * @tparam DecomposerT - * Implementation detail, do not specify directly, requirements on the + * @tparam DecomposerT + * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template (upsweep_kernel)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } error = CubDebug(scan_config.Init(scan_kernel)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } error = CubDebug(downsweep_config.Init(downsweep_kernel)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -1629,7 +1632,7 @@ struct DispatchRadixSort : SelectedPolicy histogram_kernel, HISTO_BLOCK_THREADS, 0)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -1713,7 +1716,7 @@ struct DispatchRadixSort : SelectedPolicy 0, num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), stream)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -1760,7 +1763,7 @@ struct DispatchRadixSort : SelectedPolicy { break; } - + // use the temporary buffers if no overwrite is allowed if (!is_overwrite_okay && pass == 0) { @@ -1775,7 +1778,7 @@ struct DispatchRadixSort : SelectedPolicy d_values.selector ^= 1; } } while (0); - + return error; } @@ -1799,7 +1802,7 @@ struct DispatchRadixSort : SelectedPolicy // Get device ordinal int device_ordinal; error = CubDebug(cudaGetDevice(&device_ordinal)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -1808,7 +1811,7 @@ struct DispatchRadixSort : SelectedPolicy int sm_count; error = CubDebug( cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -1899,7 +1902,7 @@ struct DispatchRadixSort : SelectedPolicy spine_length, current_bit, (current_bit < alt_end_bit) ? alt_pass_config : pass_config)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -1917,7 +1920,7 @@ struct DispatchRadixSort : SelectedPolicy current_bit, (current_bit < alt_end_bit) ? alt_pass_config : pass_config)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -1956,7 +1959,7 @@ struct DispatchRadixSort : SelectedPolicy DeviceRadixSortUpsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, OffsetT, DecomposerT>, RadixSortScanBinsKernel< MaxPolicyT, OffsetT>, DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetT, DecomposerT>, - DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT, DecomposerT>); + DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT, DecomposerT>); } template @@ -1977,7 +1980,7 @@ struct DispatchRadixSort : SelectedPolicy temp_storage_bytes = 1; return cudaSuccess; } - + // Copy keys #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking async copy of %lld keys on stream %lld\n", (long long)num_items, @@ -2103,7 +2106,7 @@ struct DispatchRadixSort : SelectedPolicy int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -2397,7 +2400,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -2437,7 +2440,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy d_values_remaining_passes.Current(), current_bit, (current_bit < alt_end_bit) ? alt_pass_config : pass_config)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -2452,7 +2455,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], current_bit, (current_bit < alt_end_bit) ? alt_pass_config : pass_config)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -2533,7 +2536,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy int ptx_version = 0; error = CubDebug(PtxVersion(ptx_version)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -2548,7 +2551,7 @@ struct DispatchSegmentedRadixSort : SelectedPolicy // Dispatch to chained policy error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 39a777c8e3d..e6b6ab0d452 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -34,8 +34,11 @@ #pragma once +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include -#include #include #include #include diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index a758fa60992..7e4deb4c3d6 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -33,8 +33,11 @@ #pragma once +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include -#include #include #include #include @@ -205,8 +208,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH * @tparam OffsetT * Signed integer type for global offsets * - * @tparam SelectedPolicy - * Implementation detail, do not specify directly, requirements on the + * @tparam SelectedPolicy + * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template -#include #include #include #include @@ -172,8 +175,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREA * @tparam OffsetT * Signed integer type for global offsets * - * @tparam SelectedPolicy - * Implementation detail, do not specify directly, requirements on the + * @tparam SelectedPolicy + * Implementation detail, do not specify directly, requirements on the * content of this type are subject to breaking change. */ template -#include #include #include #include @@ -211,7 +214,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) * Random-access output iterator type for writing scan outputs \iterator * * @tparam ScanOpT - * Binary scan functor type having member + * Binary scan functor type having member * `auto operator()(const T &a, const U &b)` * * @tparam InitValueT @@ -226,9 +229,9 @@ template ::value, cub::detail::value_t, diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index 62df5c6b913..13a4f354f00 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -12,9 +12,9 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; @@ -26,15 +26,18 @@ ******************************************************************************/ /** - * @file DeviceScan provides device-wide, parallel operations for computing a - * prefix scan across a sequence of data items residing within + * @file DeviceScan provides device-wide, parallel operations for computing a + * prefix scan across a sequence of data items residing within * device-accessible memory. */ #pragma once +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include -#include #include #include #include @@ -465,7 +468,7 @@ struct DispatchScanByKey : SelectedPolicy } // Sync the stream if specified to flush runtime errors - + error = CubDebug(detail::DebugSyncStream(stream)); if (cudaSuccess != error) { diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index e2d5da09669..68db255b044 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -27,6 +27,10 @@ #pragma once +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index bb9fe685768..8b84113bb2b 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -28,14 +28,17 @@ /** * @file - * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences + * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences * of data items residing within device-accessible memory. */ #pragma once +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include -#include #include #include #include @@ -64,39 +67,39 @@ CUB_NAMESPACE_BEGIN * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType * Otherwise performs discontinuity selection (keep unique) * - * @tparam InputIteratorT + * @tparam InputIteratorT * Random-access input iterator type for reading input items * - * @tparam FlagsInputIteratorT - * Random-access input iterator type for reading selection flags (NullType* if a selection functor + * @tparam FlagsInputIteratorT + * Random-access input iterator type for reading selection flags (NullType* if a selection functor * or discontinuity flagging is to be used for selection) * - * @tparam SelectedOutputIteratorT + * @tparam SelectedOutputIteratorT * Random-access output iterator type for writing selected items * - * @tparam NumSelectedIteratorT + * @tparam NumSelectedIteratorT * Output iterator type for recording the number of items selected * - * @tparam ScanTileStateT + * @tparam ScanTileStateT * Tile status interface type * - * @tparam SelectOpT - * Selection operator type (NullType if selection flags or discontinuity flagging is + * @tparam SelectOpT + * Selection operator type (NullType if selection flags or discontinuity flagging is * to be used for selection) * - * @tparam EqualityOpT - * Equality operator type (NullType if selection functor or selection flags is + * @tparam EqualityOpT + * Equality operator type (NullType if selection functor or selection flags is * to be used for selection) * - * @tparam OffsetT + * @tparam OffsetT * Signed integer type for global offsets * - * @tparam KEEP_REJECTS + * @tparam KEEP_REJECTS * Whether or not we push rejected items to the back of the output * * @param[in] d_in * Pointer to the input sequence of data items - * + * * @param[in] d_flags * Pointer to the input sequence of selection flags (if applicable) * @@ -106,18 +109,18 @@ CUB_NAMESPACE_BEGIN * @param[out] d_num_selected_out * Pointer to the total number of items selected (i.e., length of \p d_selected_out) * - * @param[in] tile_status + * @param[in] tile_status * Tile status interface * * @param[in] select_op * Selection operator - * + * * @param[in] equality_op * Equality operator - * + * * @param[in] num_items * Total number of input items (i.e., length of \p d_in) - * + * * @param[in] num_tiles * Total number of tiles for the entire problem */ @@ -176,7 +179,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SelectIfPolicyT::BLOCK_THREA * Random-access input iterator type for reading input items * * @tparam FlagsInputIteratorT - * Random-access input iterator type for reading selection flags + * Random-access input iterator type for reading selection flags * (NullType* if a selection functor or discontinuity flagging is to be used for selection) * * @tparam SelectedOutputIteratorT @@ -186,11 +189,11 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SelectIfPolicyT::BLOCK_THREA * Output iterator type for recording the number of items selected * * @tparam SelectOpT - * Selection operator type (NullType if selection flags or discontinuity flagging is + * Selection operator type (NullType if selection flags or discontinuity flagging is * to be used for selection) * * @tparam EqualityOpT - * Equality operator type (NullType if selection functor or selection flags is to + * Equality operator type (NullType if selection functor or selection flags is to * be used for selection) * * @tparam OffsetT @@ -225,13 +228,13 @@ struct DispatchSelectIf : SelectedPolicy static constexpr int INIT_KERNEL_THREADS = 128; - /// Device-accessible allocation of temporary storage. - /// When `nullptr`, the required allocation size is written to `temp_storage_bytes` + /// Device-accessible allocation of temporary storage. + /// When `nullptr`, the required allocation size is written to `temp_storage_bytes` /// and no work is done. void* d_temp_storage; /// Reference to size in bytes of `d_temp_storage` allocation - size_t& temp_storage_bytes; + size_t& temp_storage_bytes; /// Pointer to the input sequence of data items InputIteratorT d_in; @@ -261,11 +264,11 @@ struct DispatchSelectIf : SelectedPolicy /** * @param d_temp_storage - * Device-accessible allocation of temporary storage. - * When `nullptr`, the required allocation size is written to `temp_storage_bytes` + * Device-accessible allocation of temporary storage. + * When `nullptr`, the required allocation size is written to `temp_storage_bytes` * and no work is done. - * - * @param temp_storage_bytes + * + * @param temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param d_in @@ -339,7 +342,7 @@ struct DispatchSelectIf : SelectedPolicy // Get device ordinal int device_ordinal; error = CubDebug(cudaGetDevice(&device_ordinal)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -352,7 +355,7 @@ struct DispatchSelectIf : SelectedPolicy // bytes needed for tile status descriptors error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -361,7 +364,7 @@ struct DispatchSelectIf : SelectedPolicy void* allocations[1] = {}; error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -375,7 +378,7 @@ struct DispatchSelectIf : SelectedPolicy // Construct the tile status interface ScanTileStateT tile_status; error = CubDebug(tile_status.Init(num_tiles, allocations[0], allocation_sizes[0])); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -418,7 +421,7 @@ struct DispatchSelectIf : SelectedPolicy // Get max x-dimension of grid int max_dim_x; error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -509,11 +512,11 @@ struct DispatchSelectIf : SelectedPolicy * Internal dispatch routine * * @param d_temp_storage - * Device-accessible allocation of temporary storage. - * When `nullptr`, the required allocation size is written to `temp_storage_bytes` + * Device-accessible allocation of temporary storage. + * When `nullptr`, the required allocation size is written to `temp_storage_bytes` * and no work is done. - * - * @param temp_storage_bytes + * + * @param temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param d_in @@ -555,7 +558,7 @@ struct DispatchSelectIf : SelectedPolicy using MaxPolicyT = typename SelectedPolicy::MaxPolicy; int ptx_version = 0; - if (cudaError_t error = CubDebug(PtxVersion(ptx_version))) + if (cudaError_t error = CubDebug(PtxVersion(ptx_version))) { return error; } @@ -571,23 +574,23 @@ struct DispatchSelectIf : SelectedPolicy num_items, stream, ptx_version); - + return CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( - void* d_temp_storage, - size_t& temp_storage_bytes, - InputIteratorT d_in, - FlagsInputIteratorT d_flags, - SelectedOutputIteratorT d_selected_out, - NumSelectedIteratorT d_num_selected_out, - SelectOpT select_op, - EqualityOpT equality_op, - OffsetT num_items, - cudaStream_t stream, + void* d_temp_storage, + size_t& temp_storage_bytes, + InputIteratorT d_in, + FlagsInputIteratorT d_flags, + SelectedOutputIteratorT d_selected_out, + NumSelectedIteratorT d_num_selected_out, + SelectOpT select_op, + EqualityOpT equality_op, + OffsetT num_items, + cudaStream_t stream, bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index 8f2a68f967c..89376f617df 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -34,10 +34,13 @@ #pragma once +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include -#include #include #include #include @@ -196,9 +199,9 @@ DeviceSpmvEmptyMatrixKernel(SpmvParams spmv_params) { ValueT result = 0.0; - CUB_IF_CONSTEXPR(HAS_BETA) + CUB_IF_CONSTEXPR(HAS_BETA) { - result += spmv_params.beta * spmv_params.d_vector_y[row]; + result += spmv_params.beta * spmv_params.d_vector_y[row]; } spmv_params.d_vector_y[row] = result; @@ -838,7 +841,7 @@ struct DispatchSpmv constexpr bool has_beta = false; if (CubDebug(error = Dispatch( - d_temp_storage, temp_storage_bytes, spmv_params, stream, + d_temp_storage, temp_storage_bytes, spmv_params, stream, DeviceSpmv1ColKernel, DeviceSpmvSearchKernel, DeviceSpmvKernel, diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index 8b1849fa24c..3e36e336dfc 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -27,8 +27,11 @@ #pragma once +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include -#include #include #include #include diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index c924e71ef7e..36ab55be61c 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -34,8 +34,8 @@ #include #include #include +#include #include -#include #include #include diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index 2b595e91e79..9cdc7173f62 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -27,9 +27,12 @@ #pragma once +#include "../../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include -#include #include #include diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index e06ddb5019e..1ea37100a07 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -27,12 +27,15 @@ #pragma once +#include "../../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include #include #include -#include #include #include diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index d7ad21c808f..cfad085825f 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -27,13 +27,16 @@ #pragma once +#include "../../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include #include #include #include -#include #include #include @@ -157,7 +160,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template struct sm90_tuning { @@ -259,7 +262,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template struct sm80_tuning { @@ -287,7 +290,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template struct sm90_tuning { @@ -412,7 +415,7 @@ struct sm80_tuning static constexpr int items = CUB_MIN(nominal_4b_items_per_thread, CUB_MAX(1, (nominal_4b_items_per_thread * 4 / sizeof(KeyT)))); - + static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; static constexpr bool store_with_time_slicing = true; @@ -476,7 +479,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template struct sm80_tuning { diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index eff4f29fcd2..f26a1655f9c 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -27,12 +27,15 @@ #pragma once +#include "../../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include #include #include -#include #include #include @@ -102,7 +105,7 @@ template struct sm90_tuning struct sm90_tuning : tuning<128, 24, 688, 1140> {}; template <> struct sm90_tuning : tuning<224, 24, 576, 1215> {}; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template <> struct sm90_tuning< __int128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : tuning<576, 21, 860, 630> {}; template <> struct sm90_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_size::_16> : tuning<576, 21, 860, 630> {}; #endif @@ -229,7 +232,7 @@ struct sm80_tuning<__uint128_t, primitive_op::yes, primitive_accum::no, accum_si } // namespace detail -template +template struct DeviceScanPolicy { // For large values, use timesliced loads/stores to fit shared memory. diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh index 92e4931fe81..9f7e679c45a 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh @@ -27,12 +27,15 @@ #pragma once +#include "../../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include #include #include -#include #include #include #include @@ -169,7 +172,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template struct sm90_tuning { @@ -255,7 +258,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template struct sm90_tuning { @@ -341,7 +344,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template struct sm90_tuning { @@ -427,7 +430,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template struct sm90_tuning { @@ -513,7 +516,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template struct sm90_tuning { @@ -627,7 +630,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template struct sm80_tuning { @@ -713,7 +716,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template struct sm80_tuning { @@ -799,7 +802,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template struct sm80_tuning { @@ -885,7 +888,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template struct sm80_tuning { @@ -971,7 +974,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template struct sm80_tuning { diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index 41e5b34f5ae..1d1dc6eb587 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -27,20 +27,23 @@ #pragma once +#include "../../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include #include -#include #include #include CUB_NAMESPACE_BEGIN -namespace detail +namespace detail { -namespace select +namespace select { enum class flagged { no, yes }; @@ -152,7 +155,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template <> struct sm90_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -221,7 +224,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template <> struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -290,7 +293,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template <> struct sm90_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { @@ -359,7 +362,7 @@ struct sm90_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template <> struct sm90_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { @@ -450,7 +453,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template <> struct sm80_tuning<__int128_t, flagged::no, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -519,7 +522,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template <> struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::no, offset_size::_4, primitive::no, input_size::_16> { @@ -588,7 +591,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template <> struct sm80_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { @@ -657,7 +660,7 @@ struct sm80_tuning; }; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template <> struct sm80_tuning<__int128_t, flagged::yes, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16> { diff --git a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh index c4b09d8495f..a73e8eed295 100644 --- a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh @@ -27,11 +27,14 @@ #pragma once +#include "../../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include #include -#include #include #include #include diff --git a/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh index d16f770965b..3fffeb055a2 100644 --- a/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh @@ -27,11 +27,14 @@ #pragma once +#include "../../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include #include -#include #include #include #include @@ -716,7 +719,7 @@ struct DeviceUniqueByKeyPolicy detail::default_delay_constructor_t>; }; - struct DefaultTuning + struct DefaultTuning { static constexpr int INPUT_SIZE = sizeof(KeyT); enum diff --git a/cub/cub/grid/grid_barrier.cuh b/cub/cub/grid/grid_barrier.cuh index 063a2c395cd..c91d8c8624d 100644 --- a/cub/cub/grid/grid_barrier.cuh +++ b/cub/cub/grid/grid_barrier.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,8 +33,11 @@ #pragma once -#include "../util_debug.cuh" #include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + +#include "../util_debug.cuh" #include "../thread/thread_load.cuh" CUB_NAMESPACE_BEGIN @@ -195,13 +198,13 @@ public: // Allocate and initialize to zero retval = CubDebug(cudaMalloc((void**) &d_sync, sync_bytes)); - if (cudaSuccess != retval) + if (cudaSuccess != retval) { break; } retval = CubDebug(cudaMemset(d_sync, 0, new_sync_bytes)); - if (cudaSuccess != retval) + if (cudaSuccess != retval) { break; } diff --git a/cub/cub/grid/grid_even_share.cuh b/cub/cub/grid/grid_even_share.cuh index d2150511321..56e396e4c87 100644 --- a/cub/cub/grid/grid_even_share.cuh +++ b/cub/cub/grid/grid_even_share.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -35,8 +35,9 @@ #pragma once #include "../config.cuh" -#include "../util_namespace.cuh" -#include "../util_macro.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../util_math.cuh" #include "../util_type.cuh" #include "grid_mapping.cuh" diff --git a/cub/cub/grid/grid_mapping.cuh b/cub/cub/grid/grid_mapping.cuh index b57f193deb8..cf69555a9a0 100644 --- a/cub/cub/grid/grid_mapping.cuh +++ b/cub/cub/grid/grid_mapping.cuh @@ -35,6 +35,9 @@ #include "../config.cuh" +_CCCL_IMPLICIT_SYSTEM_HEADER + + CUB_NAMESPACE_BEGIN diff --git a/cub/cub/grid/grid_queue.cuh b/cub/cub/grid/grid_queue.cuh index e1933e3d381..6dba6fe55d2 100644 --- a/cub/cub/grid/grid_queue.cuh +++ b/cub/cub/grid/grid_queue.cuh @@ -33,7 +33,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include diff --git a/cub/cub/host/mutex.cuh b/cub/cub/host/mutex.cuh index 4ee40288452..dbd6435db00 100644 --- a/cub/cub/host/mutex.cuh +++ b/cub/cub/host/mutex.cuh @@ -33,9 +33,12 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include -#include #include @@ -43,8 +46,8 @@ CUB_NAMESPACE_BEGIN /** - * Wraps std::mutex - * @deprecated [Since CUB 2.1.0] The `cub::Mutex` is deprecated and will be removed + * Wraps std::mutex + * @deprecated [Since CUB 2.1.0] The `cub::Mutex` is deprecated and will be removed * in a future release. Use `std::mutex` instead. */ struct CUB_DEPRECATED Mutex diff --git a/cub/cub/iterator/arg_index_input_iterator.cuh b/cub/cub/iterator/arg_index_input_iterator.cuh index 7ea860981e6..eb94aace2a2 100644 --- a/cub/cub/iterator/arg_index_input_iterator.cuh +++ b/cub/cub/iterator/arg_index_input_iterator.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,10 +33,13 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include -#include "../config.cuh" #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" diff --git a/cub/cub/iterator/cache_modified_input_iterator.cuh b/cub/cub/iterator/cache_modified_input_iterator.cuh index 9a5936d5ac6..dae5c0bb63f 100644 --- a/cub/cub/iterator/cache_modified_input_iterator.cuh +++ b/cub/cub/iterator/cache_modified_input_iterator.cuh @@ -33,10 +33,13 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include -#include "../config.cuh" #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" diff --git a/cub/cub/iterator/cache_modified_output_iterator.cuh b/cub/cub/iterator/cache_modified_output_iterator.cuh index 91d4fc91a7e..daf3b5e905e 100644 --- a/cub/cub/iterator/cache_modified_output_iterator.cuh +++ b/cub/cub/iterator/cache_modified_output_iterator.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,12 +33,15 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" -#include "../config.cuh" #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer diff --git a/cub/cub/iterator/constant_input_iterator.cuh b/cub/cub/iterator/constant_input_iterator.cuh index 3de5123df34..c9128d5101c 100644 --- a/cub/cub/iterator/constant_input_iterator.cuh +++ b/cub/cub/iterator/constant_input_iterator.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,12 +33,15 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" -#include "../config.cuh" #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer diff --git a/cub/cub/iterator/counting_input_iterator.cuh b/cub/cub/iterator/counting_input_iterator.cuh index 700455f420c..ed6d254ef47 100644 --- a/cub/cub/iterator/counting_input_iterator.cuh +++ b/cub/cub/iterator/counting_input_iterator.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,12 +33,15 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" -#include "../config.cuh" #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer diff --git a/cub/cub/iterator/discard_output_iterator.cuh b/cub/cub/iterator/discard_output_iterator.cuh index ac47a3ff344..29a30060123 100644 --- a/cub/cub/iterator/discard_output_iterator.cuh +++ b/cub/cub/iterator/discard_output_iterator.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,11 +33,13 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include -#include "../config.cuh" - #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer #include diff --git a/cub/cub/iterator/tex_obj_input_iterator.cuh b/cub/cub/iterator/tex_obj_input_iterator.cuh index cd3d015aab8..7e7e3fc0ea2 100644 --- a/cub/cub/iterator/tex_obj_input_iterator.cuh +++ b/cub/cub/iterator/tex_obj_input_iterator.cuh @@ -33,7 +33,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include diff --git a/cub/cub/iterator/tex_ref_input_iterator.cuh b/cub/cub/iterator/tex_ref_input_iterator.cuh index 0d877e1db49..cc29c69e2f3 100644 --- a/cub/cub/iterator/tex_ref_input_iterator.cuh +++ b/cub/cub/iterator/tex_ref_input_iterator.cuh @@ -33,7 +33,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include diff --git a/cub/cub/iterator/transform_input_iterator.cuh b/cub/cub/iterator/transform_input_iterator.cuh index 0b3350e88a4..99979655a15 100644 --- a/cub/cub/iterator/transform_input_iterator.cuh +++ b/cub/cub/iterator/transform_input_iterator.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,12 +33,15 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" -#include "../config.cuh" #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer diff --git a/cub/cub/thread/thread_load.cuh b/cub/cub/thread/thread_load.cuh index 0a8456898ec..0127e899d81 100644 --- a/cub/cub/thread/thread_load.cuh +++ b/cub/cub/thread/thread_load.cuh @@ -33,9 +33,12 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include -#include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" diff --git a/cub/cub/thread/thread_operators.cuh b/cub/cub/thread/thread_operators.cuh index c6b097ea354..f1f9ba460a1 100644 --- a/cub/cub/thread/thread_operators.cuh +++ b/cub/cub/thread/thread_operators.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,10 +12,10 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; @@ -37,7 +37,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include @@ -164,8 +167,8 @@ struct ArgMax const KeyValuePair &b) const { // Mooch BUG (device reduce argmax gk110 3.2 million random fp32) - // return ((b.value > a.value) || - // ((a.value == b.value) && (b.key < a.key))) + // return ((b.value > a.value) || + // ((a.value == b.value) && (b.key < a.key))) // ? b : a; if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) @@ -282,14 +285,14 @@ public: /** * @brief Reduce-by-segment functor. * - * Given two cub::KeyValuePair inputs `a` and `b` and a binary associative - * combining operator `f(const T &x, const T &y)`, an instance of this functor - * returns a cub::KeyValuePair whose `key` field is `a.key + b.key`, and whose - * `value` field is either `b.value` if `b.key` is non-zero, or + * Given two cub::KeyValuePair inputs `a` and `b` and a binary associative + * combining operator `f(const T &x, const T &y)`, an instance of this functor + * returns a cub::KeyValuePair whose `key` field is `a.key + b.key`, and whose + * `value` field is either `b.value` if `b.key` is non-zero, or * `f(a.value, b.value)` otherwise. * - * ReduceBySegmentOp is an associative, non-commutative binary combining - * operator for input sequences of cub::KeyValuePair pairings. Such sequences + * ReduceBySegmentOp is an associative, non-commutative binary combining + * operator for input sequences of cub::KeyValuePair pairings. Such sequences * are typically used to represent a segmented set of values to be reduced * and a corresponding set of {0,1}-valued integer "head flags" demarcating the * first value of each segment. @@ -348,7 +351,7 @@ struct ReduceBySegmentOp // else { // The second partial reduction does not span a reset, so accumulate both // into the running aggregate - // } + // } retval.value = (second.key) ? second.value : op(first.value, second.value); #endif return retval; diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index 82042a17667..55b15db8143 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,9 +33,12 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../thread/thread_operators.cuh" #include "../detail/type_traits.cuh" -#include "../config.cuh" CUB_NAMESPACE_BEGIN @@ -50,7 +53,7 @@ template < typename T, typename ReductionOp, typename PrefixT, - typename AccumT = detail::accumulator_t> + typename AccumT = detail::accumulator_t> __device__ __forceinline__ AccumT ThreadReduce( T* input, ///< [in] Input array ReductionOp reduction_op, ///< [in] Binary reduction operator @@ -79,7 +82,7 @@ template < typename T, typename ReductionOp, typename PrefixT, - typename AccumT = detail::accumulator_t> + typename AccumT = detail::accumulator_t> __device__ __forceinline__ AccumT ThreadReduce( T* input, ///< [in] Input array ReductionOp reduction_op, ///< [in] Binary reduction operator @@ -121,7 +124,7 @@ template < typename T, typename ReductionOp, typename PrefixT, - typename AccumT = detail::accumulator_t> + typename AccumT = detail::accumulator_t> __device__ __forceinline__ AccumT ThreadReduce( T (&input)[LENGTH], ///< [in] Input array ReductionOp reduction_op, ///< [in] Binary reduction operator diff --git a/cub/cub/thread/thread_scan.cuh b/cub/cub/thread/thread_scan.cuh index b5e42710fcd..7b051d88e6c 100644 --- a/cub/cub/thread/thread_scan.cuh +++ b/cub/cub/thread/thread_scan.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -34,6 +34,9 @@ #pragma once #include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../thread/thread_operators.cuh" CUB_NAMESPACE_BEGIN diff --git a/cub/cub/thread/thread_search.cuh b/cub/cub/thread/thread_search.cuh index 62b3cdb4e5d..86bdb4a59ec 100644 --- a/cub/cub/thread/thread_search.cuh +++ b/cub/cub/thread/thread_search.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,10 +33,13 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include -#include #include diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh index 5d486789684..68280db8dc6 100644 --- a/cub/cub/thread/thread_sort.cuh +++ b/cub/cub/thread/thread_sort.cuh @@ -28,6 +28,9 @@ #pragma once #include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../util_ptx.cuh" #include "../util_type.cuh" diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh index d1f055df158..6ccbb9d8819 100644 --- a/cub/cub/thread/thread_store.cuh +++ b/cub/cub/thread/thread_store.cuh @@ -33,7 +33,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh index 7dc12fb920e..b0f3d22ec6c 100644 --- a/cub/cub/util_allocator.cuh +++ b/cub/cub/util_allocator.cuh @@ -33,6 +33,10 @@ #pragma once +#include "config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "util_namespace.cuh" #include "util_debug.cuh" @@ -453,13 +457,13 @@ struct CachingDeviceAllocator if (device != entrypoint_device) { error = CubDebug(cudaGetDevice(&entrypoint_device)); - if (cudaSuccess != error) + if (cudaSuccess != error) { return error; } error = CubDebug(cudaSetDevice(device)); - if (cudaSuccess != error) + if (cudaSuccess != error) { return error; } @@ -491,13 +495,13 @@ struct CachingDeviceAllocator // Free device memory and destroy stream event. error = CubDebug(cudaFree(block_itr->d_ptr)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } error = CubDebug(cudaEventDestroy(block_itr->ready_event)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -519,7 +523,7 @@ struct CachingDeviceAllocator // Try to allocate again error = CubDebug(cudaMalloc(&search_key.d_ptr, search_key.bytes)); - if (cudaSuccess != error) + if (cudaSuccess != error) { return error; } @@ -547,7 +551,7 @@ struct CachingDeviceAllocator if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { error = CubDebug(cudaSetDevice(entrypoint_device)); - if (cudaSuccess != error) + if (cudaSuccess != error) { return error; } @@ -639,13 +643,13 @@ struct CachingDeviceAllocator if (device != entrypoint_device) { error = CubDebug(cudaGetDevice(&entrypoint_device)); - if (cudaSuccess != error) + if (cudaSuccess != error) { return error; } error = CubDebug(cudaSetDevice(device)); - if (cudaSuccess != error) + if (cudaSuccess != error) { return error; } @@ -655,7 +659,7 @@ struct CachingDeviceAllocator { // Insert the ready event in the associated stream (must have current device set properly) error = CubDebug(cudaEventRecord(search_key.ready_event, search_key.associated_stream)); - if (cudaSuccess != error) + if (cudaSuccess != error) { return error; } @@ -665,13 +669,13 @@ struct CachingDeviceAllocator { // Free the allocation from the runtime and cleanup the event. error = CubDebug(cudaFree(d_ptr)); - if (cudaSuccess != error) + if (cudaSuccess != error) { return error; } error = CubDebug(cudaEventDestroy(search_key.ready_event)); - if (cudaSuccess != error) + if (cudaSuccess != error) { return error; } @@ -684,7 +688,7 @@ struct CachingDeviceAllocator if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { error = CubDebug(cudaSetDevice(entrypoint_device)); - if (cudaSuccess != error) + if (cudaSuccess != error) { return error; } @@ -728,7 +732,7 @@ struct CachingDeviceAllocator if (entrypoint_device == INVALID_DEVICE_ORDINAL) { error = CubDebug(cudaGetDevice(&entrypoint_device)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -738,7 +742,7 @@ struct CachingDeviceAllocator if (begin->device != current_device) { error = CubDebug(cudaSetDevice(begin->device)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -747,13 +751,13 @@ struct CachingDeviceAllocator // Free device memory error = CubDebug(cudaFree(begin->d_ptr)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } error = CubDebug(cudaEventDestroy(begin->ready_event)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } diff --git a/cub/cub/util_arch.cuh b/cub/cub/util_arch.cuh index d2506e93cfc..4d8608e8e88 100644 --- a/cub/cub/util_arch.cuh +++ b/cub/cub/util_arch.cuh @@ -33,9 +33,13 @@ #pragma once -#include -#include -#include +#include "config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + +#include "util_cpp_dialect.cuh" +#include "util_namespace.cuh" +#include "util_macro.cuh" // Legacy include; this functionality used to be defined in here. #include @@ -44,7 +48,7 @@ CUB_NAMESPACE_BEGIN #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document -// \deprecated [Since 2.1.0] +// \deprecated [Since 2.1.0] #define CUB_USE_COOPERATIVE_GROUPS /// In device code, CUB_PTX_ARCH expands to the PTX version for which we are diff --git a/cub/cub/util_compiler.cuh b/cub/cub/util_compiler.cuh index 7cda3c44012..4acf6ba83b5 100644 --- a/cub/cub/util_compiler.cuh +++ b/cub/cub/util_compiler.cuh @@ -32,6 +32,11 @@ #pragma once +// For `_CCCL_IMPLICIT_SYSTEM_HEADER` +#include + +_CCCL_IMPLICIT_SYSTEM_HEADER + // enumerate host compilers we know about #define CUB_HOST_COMPILER_UNKNOWN 0 #define CUB_HOST_COMPILER_MSVC 1 diff --git a/cub/cub/util_cpp_dialect.cuh b/cub/cub/util_cpp_dialect.cuh index 23adf8e8dc7..1b1afb53cf8 100644 --- a/cub/cub/util_cpp_dialect.cuh +++ b/cub/cub/util_cpp_dialect.cuh @@ -31,6 +31,10 @@ #pragma once +#include + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "util_compiler.cuh" // Deprecation warnings may be silenced by defining the following macros. These diff --git a/cub/cub/util_debug.cuh b/cub/cub/util_debug.cuh index 5023524dd6e..6584afb5172 100644 --- a/cub/cub/util_debug.cuh +++ b/cub/cub/util_debug.cuh @@ -36,8 +36,9 @@ #pragma once -#include -#include +#include "config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER #include @@ -58,8 +59,8 @@ CUB_NAMESPACE_BEGIN /** * @def CUB_DEBUG_SYNC * - * Causes synchronization of the stream after every kernel launch to check - * for errors. Also causes kernel launch configurations to be printed to the + * Causes synchronization of the stream after every kernel launch to check + * for errors. Also causes kernel launch configurations to be printed to the * console. */ #define CUB_DEBUG_SYNC @@ -67,7 +68,7 @@ CUB_NAMESPACE_BEGIN /** * @def CUB_DEBUG_HOST_ASSERTIONS * - * Extends `CUB_DEBUG_SYNC` effects by checking host-side precondition + * Extends `CUB_DEBUG_SYNC` effects by checking host-side precondition * assertions. */ #define CUB_DEBUG_HOST_ASSERTIONS @@ -75,7 +76,7 @@ CUB_NAMESPACE_BEGIN /** * @def CUB_DEBUG_DEVICE_ASSERTIONS * - * Extends `CUB_DEBUG_HOST_ASSERTIONS` effects by checking device-side + * Extends `CUB_DEBUG_HOST_ASSERTIONS` effects by checking device-side * precondition assertions. */ #define CUB_DEBUG_DEVICE_ASSERTIONS @@ -83,14 +84,14 @@ CUB_NAMESPACE_BEGIN /** * @def CUB_DEBUG_ALL * - * Causes host and device-side precondition assertions to be checked. Apart - * from that, causes synchronization of the stream after every kernel launch to - * check for errors. Also causes kernel launch configurations to be printed to + * Causes host and device-side precondition assertions to be checked. Apart + * from that, causes synchronization of the stream after every kernel launch to + * check for errors. Also causes kernel launch configurations to be printed to * the console. */ #define CUB_DEBUG_ALL -#endif // DOXYGEN_SHOULD_SKIP_THIS +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * \addtogroup UtilMgmt @@ -132,7 +133,7 @@ CUB_NAMESPACE_BEGIN // All #ifdef CUB_DEBUG_ALL -#define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_ALL +#define CUB_DETAIL_DEBUG_LEVEL CUB_DETAIL_DEBUG_LEVEL_ALL #endif // Default case, no extra debugging: @@ -196,11 +197,11 @@ cudaError_t Debug(cudaError_t error, const char *filename, int line) cudaError_t last_error = cudaSuccess; NV_IF_TARGET( - NV_IS_HOST, + NV_IS_HOST, (last_error = cudaGetLastError();), (CUB_TEMP_DEVICE_CODE;) ); - + #undef CUB_TEMP_DEVICE_CODE // clang-format on diff --git a/cub/cub/util_deprecated.cuh b/cub/cub/util_deprecated.cuh index a988c9fca90..842f3560146 100644 --- a/cub/cub/util_deprecated.cuh +++ b/cub/cub/util_deprecated.cuh @@ -32,13 +32,13 @@ #pragma once +#include "config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER #include -#include -#include #include - #if defined(THRUST_IGNORE_DEPRECATED_API) && !defined(CUB_IGNORE_DEPRECATED_API) # define CUB_IGNORE_DEPRECATED_API #endif diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index c7e15cafe06..dd7bfaace9f 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -37,14 +37,14 @@ #pragma once +#include "config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include -#include -#include #include -#include -#include #include // for backward compatibility #include @@ -412,7 +412,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int d { int major = 0, minor = 0; error = CubDebug(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device)); - if (cudaSuccess != error) + if (cudaSuccess != error) { break; } @@ -544,7 +544,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva) cudaError_t error = cudaSuccess; int device = -1; error = CubDebug(cudaGetDevice(&device)); - if (cudaSuccess != error) + if (cudaSuccess != error) { return error; } diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index d8f46f09075..720d4918606 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -32,6 +32,10 @@ #pragma once +#include "version.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include @@ -125,7 +129,7 @@ constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, */ #if !defined(CUB_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION) _LIBCUDACXX_GCC_DIAGNOSTIC_IGNORED("-Wattributes") -_LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") +_LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") #endif /** @} */ // end group UtilModule diff --git a/cub/cub/util_math.cuh b/cub/cub/util_math.cuh index d69fc2ee2d5..8d27c26003e 100644 --- a/cub/cub/util_math.cuh +++ b/cub/cub/util_math.cuh @@ -32,10 +32,11 @@ #pragma once -#include +#include "config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER -#include "util_namespace.cuh" -#include "util_macro.cuh" +#include CUB_NAMESPACE_BEGIN diff --git a/cub/cub/util_namespace.cuh b/cub/cub/util_namespace.cuh index 27ff12dbbaa..7289ebe02b4 100644 --- a/cub/cub/util_namespace.cuh +++ b/cub/cub/util_namespace.cuh @@ -38,7 +38,10 @@ // This is not used by this file; this is a hack so that we can detect the // CUB version from Thrust on older versions of CUB that did not have // version.cuh. -#include +#include "version.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include // Prior to 1.13.1, only the PREFIX/POSTFIX macros were used. Notify users diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index ff6fdb07f50..425eec2e7a0 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -34,14 +34,14 @@ #pragma once -#include "util_type.cuh" -#include "util_arch.cuh" -#include "util_namespace.cuh" -#include "util_debug.cuh" +#include "config.cuh" +_CCCL_IMPLICIT_SYSTEM_HEADER -CUB_NAMESPACE_BEGIN +#include "util_debug.cuh" +#include "util_type.cuh" +CUB_NAMESPACE_BEGIN /** * \addtogroup UtilPtx @@ -163,7 +163,7 @@ __device__ __forceinline__ unsigned int BFE( return (source >> bit_start) & MASK; } -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED /** * Bitfield-extract for 128-bit types. */ @@ -328,7 +328,7 @@ __device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_m /** * Warp synchronous shfl_up */ -__device__ __forceinline__ +__device__ __forceinline__ unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) { asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;" @@ -339,7 +339,7 @@ unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned /** * Warp synchronous shfl_down */ -__device__ __forceinline__ +__device__ __forceinline__ unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) { asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;" @@ -350,7 +350,7 @@ unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsign /** * Warp synchronous shfl_idx */ -__device__ __forceinline__ +__device__ __forceinline__ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask) { asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" @@ -361,7 +361,7 @@ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned /** * Warp synchronous shfl_idx */ -__device__ __forceinline__ +__device__ __forceinline__ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, unsigned int member_mask) { return __shfl_sync(member_mask, word, src_lane); @@ -395,7 +395,7 @@ __device__ __forceinline__ float FFMA_RZ(float a, float b, float c) */ __device__ __forceinline__ void ThreadExit() { asm volatile("exit;"); -} +} /** @@ -561,7 +561,7 @@ __device__ __forceinline__ T ShuffleUp( typedef typename UnitWord::ShuffleWord ShuffleWord; constexpr int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); - + T output; ShuffleWord *output_alias = reinterpret_cast(&output); ShuffleWord *input_alias = reinterpret_cast(&input); @@ -728,26 +728,26 @@ __device__ __forceinline__ T ShuffleIndex( #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document -namespace detail +namespace detail { -/** - * Implementation detail for `MatchAny`. It provides specializations for full and partial warps. - * For partial warps, inactive threads must be masked out. This is done in the partial warp - * specialization below. +/** + * Implementation detail for `MatchAny`. It provides specializations for full and partial warps. + * For partial warps, inactive threads must be masked out. This is done in the partial warp + * specialization below. * Usage: * ``` - * // returns a mask of threads with the same 4 least-significant bits of `label` + * // returns a mask of threads with the same 4 least-significant bits of `label` * // in a warp with 16 active threads - * warp_matcher_t<4, 16>::match_any(label); + * warp_matcher_t<4, 16>::match_any(label); * - * // returns a mask of threads with the same 4 least-significant bits of `label` + * // returns a mask of threads with the same 4 least-significant bits of `label` * // in a warp with 32 active threads (no extra work is done) - * warp_matcher_t<4, 32>::match_any(label); + * warp_matcher_t<4, 32>::match_any(label); * ``` */ template -struct warp_matcher_t +struct warp_matcher_t { static __device__ unsigned int match_any(unsigned int label) @@ -758,7 +758,7 @@ struct warp_matcher_t }; template -struct warp_matcher_t +struct warp_matcher_t { // match.any.sync.b32 is slower when matching a few bits diff --git a/cub/cub/util_temporary_storage.cuh b/cub/cub/util_temporary_storage.cuh index 588c554a32f..c5695ff56f9 100644 --- a/cub/cub/util_temporary_storage.cuh +++ b/cub/cub/util_temporary_storage.cuh @@ -33,6 +33,10 @@ #pragma once +#include "config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include diff --git a/cub/cub/util_type.cuh b/cub/cub/util_type.cuh index 2beecbc892c..bb207a4972f 100644 --- a/cub/cub/util_type.cuh +++ b/cub/cub/util_type.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -33,6 +33,10 @@ #pragma once +#include "config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include @@ -48,11 +52,6 @@ #endif #include -#include -#include -#include -#include -#include #include @@ -1220,7 +1219,7 @@ template <> struct NumericTraits : BaseTraits struct NumericTraits : BaseTraits {}; -#if CUB_IS_INT128_ENABLED +#if CUB_IS_INT128_ENABLED template <> struct NumericTraits<__uint128_t> { diff --git a/cub/cub/version.cuh b/cub/cub/version.cuh index d51023912fa..2ad82bb4468 100644 --- a/cub/cub/version.cuh +++ b/cub/cub/version.cuh @@ -35,6 +35,11 @@ #pragma once +// For `_CCCL_IMPLICIT_SYSTEM_HEADER` +#include + +_CCCL_IMPLICIT_SYSTEM_HEADER + /*! \def CUB_VERSION * \brief The preprocessor macro \p CUB_VERSION encodes the version * number of the CUB library. diff --git a/cub/cub/warp/specializations/warp_exchange_shfl.cuh b/cub/cub/warp/specializations/warp_exchange_shfl.cuh index fa73509b319..f7d214f8b2f 100644 --- a/cub/cub/warp/specializations/warp_exchange_shfl.cuh +++ b/cub/cub/warp/specializations/warp_exchange_shfl.cuh @@ -27,7 +27,10 @@ #pragma once -#include +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include @@ -51,7 +54,7 @@ class WarpExchangeShfl static constexpr bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); // concrete recursion class - template + template class CompileTimeArray : protected CompileTimeArray { protected: @@ -239,7 +242,7 @@ class WarpExchangeShfl }; // terminating partial specialization - template + template class CompileTimeArray { protected: diff --git a/cub/cub/warp/specializations/warp_exchange_smem.cuh b/cub/cub/warp/specializations/warp_exchange_smem.cuh index f1c2edce60f..372e1def2b0 100644 --- a/cub/cub/warp/specializations/warp_exchange_smem.cuh +++ b/cub/cub/warp/specializations/warp_exchange_smem.cuh @@ -33,7 +33,10 @@ #pragma once -#include +#include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh index ad6d2512f92..fabea446673 100644 --- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh @@ -34,6 +34,9 @@ #pragma once #include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../../thread/thread_operators.cuh" #include "../../util_ptx.cuh" #include "../../util_type.cuh" @@ -46,31 +49,31 @@ CUB_NAMESPACE_BEGIN -namespace detail +namespace detail { template -struct reduce_add_exists : ::cuda::std::false_type +struct reduce_add_exists : ::cuda::std::false_type {}; template -struct reduce_add_exists : ::cuda::std::true_type +struct reduce_add_exists : ::cuda::std::true_type {}; template -struct reduce_min_exists : ::cuda::std::false_type +struct reduce_min_exists : ::cuda::std::false_type {}; template -struct reduce_min_exists : ::cuda::std::true_type +struct reduce_min_exists : ::cuda::std::true_type {}; template -struct reduce_max_exists : ::cuda::std::false_type +struct reduce_max_exists : ::cuda::std::false_type {}; template -struct reduce_max_exists : ::cuda::std::true_type +struct reduce_max_exists : ::cuda::std::true_type {}; } @@ -419,7 +422,7 @@ struct WarpReduceShfl //--------------------------------------------------------------------- template __device__ __forceinline__ T ReduceImpl( - Int2Type<0> /* all_lanes_valid */, + Int2Type<0> /* all_lanes_valid */, T input, ///< [in] Calling thread's input int valid_items, ///< [in] Total number of valid items across the logical warp ReductionOp reduction_op) ///< [in] Binary reduction operator @@ -436,7 +439,7 @@ struct WarpReduceShfl template __device__ __forceinline__ T ReduceImpl( - Int2Type<1> /* all_lanes_valid */, + Int2Type<1> /* all_lanes_valid */, T input, ///< [in] Calling thread's input int /* valid_items */, ///< [in] Total number of valid items across the logical warp ReductionOp reduction_op) ///< [in] Binary reduction operator @@ -452,7 +455,7 @@ struct WarpReduceShfl } template - __device__ __forceinline__ + __device__ __forceinline__ typename std::enable_if< (std::is_same::value || std::is_same::value) && detail::reduce_add_exists<>::value, T>::type @@ -474,7 +477,7 @@ struct WarpReduceShfl } template - __device__ __forceinline__ + __device__ __forceinline__ typename std::enable_if< (std::is_same::value || std::is_same::value) && detail::reduce_min_exists<>::value, T>::type @@ -496,7 +499,7 @@ struct WarpReduceShfl } template - __device__ __forceinline__ + __device__ __forceinline__ typename std::enable_if< (std::is_same::value || std::is_same::value) && detail::reduce_max_exists<>::value, T>::type diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index 7ffb73a3c27..fc86802cb53 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -34,6 +34,9 @@ #pragma once #include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../../thread/thread_operators.cuh" #include "../../thread/thread_load.cuh" #include "../../thread/thread_store.cuh" diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh index c2988711c8d..cb452c40967 100644 --- a/cub/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -34,6 +34,9 @@ #pragma once #include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../../thread/thread_operators.cuh" #include "../../util_type.cuh" #include "../../util_ptx.cuh" diff --git a/cub/cub/warp/specializations/warp_scan_smem.cuh b/cub/cub/warp/specializations/warp_scan_smem.cuh index f5290e979a4..a7768233aa4 100644 --- a/cub/cub/warp/specializations/warp_scan_smem.cuh +++ b/cub/cub/warp/specializations/warp_scan_smem.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -34,6 +34,9 @@ #pragma once #include "../../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include "../../thread/thread_operators.cuh" #include "../../thread/thread_load.cuh" #include "../../thread/thread_store.cuh" diff --git a/cub/cub/warp/warp_exchange.cuh b/cub/cub/warp/warp_exchange.cuh index e863f67b103..798b3bbacba 100644 --- a/cub/cub/warp/warp_exchange.cuh +++ b/cub/cub/warp/warp_exchange.cuh @@ -33,7 +33,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include diff --git a/cub/cub/warp/warp_load.cuh b/cub/cub/warp/warp_load.cuh index 424145588c2..0d917f953f8 100644 --- a/cub/cub/warp/warp_load.cuh +++ b/cub/cub/warp/warp_load.cuh @@ -29,8 +29,11 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include -#include #include #include #include diff --git a/cub/cub/warp/warp_merge_sort.cuh b/cub/cub/warp/warp_merge_sort.cuh index 3ad5dccd9c4..7f7beb7796b 100644 --- a/cub/cub/warp/warp_merge_sort.cuh +++ b/cub/cub/warp/warp_merge_sort.cuh @@ -27,8 +27,11 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include -#include #include #include diff --git a/cub/cub/warp/warp_reduce.cuh b/cub/cub/warp/warp_reduce.cuh index 2901f56bbcc..7b636fa3c46 100644 --- a/cub/cub/warp/warp_reduce.cuh +++ b/cub/cub/warp/warp_reduce.cuh @@ -34,7 +34,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include diff --git a/cub/cub/warp/warp_scan.cuh b/cub/cub/warp/warp_scan.cuh index feff7ffe69d..0d4bb31ff4e 100644 --- a/cub/cub/warp/warp_scan.cuh +++ b/cub/cub/warp/warp_scan.cuh @@ -34,7 +34,10 @@ #pragma once -#include +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include #include #include diff --git a/cub/cub/warp/warp_store.cuh b/cub/cub/warp/warp_store.cuh index d40e63e65ee..cbb426aa68f 100644 --- a/cub/cub/warp/warp_store.cuh +++ b/cub/cub/warp/warp_store.cuh @@ -29,8 +29,11 @@ #pragma once +#include "../config.cuh" + +_CCCL_IMPLICIT_SYSTEM_HEADER + #include -#include #include #include #include diff --git a/thrust/thrust/system/cuda/detail/util.h b/thrust/thrust/system/cuda/detail/util.h index 439e25e2de0..34289f21810 100644 --- a/thrust/thrust/system/cuda/detail/util.h +++ b/thrust/thrust/system/cuda/detail/util.h @@ -38,7 +38,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER #include #include -#include +#include #include #include