Skip to content

Commit

Permalink
Add _CCCL_IMPLICIT_SYSTEM_HEADER to cub headers
Browse files Browse the repository at this point in the history
  • Loading branch information
miscco committed Oct 10, 2023
1 parent 445f844 commit 3c815cd
Show file tree
Hide file tree
Showing 138 changed files with 3,138 additions and 2,882 deletions.
3 changes: 2 additions & 1 deletion cub/cub/agent/agent_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

#include <thrust/system/cuda/detail/core/util.h>

_CCCL_IMPLICIT_SYSTEM_HEADER

CUB_NAMESPACE_BEGIN

Expand Down Expand Up @@ -159,7 +160,7 @@ struct AgentDifference
}
else
{
InputT tile_prev_input = MayAlias
InputT tile_prev_input = MayAlias
? first_tile_previous[tile_idx]
: *(input_it + tile_base - 1);

Expand Down
2 changes: 2 additions & 0 deletions cub/cub/agent/agent_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@

#include <cstdint>

_CCCL_IMPLICIT_SYSTEM_HEADER

CUB_NAMESPACE_BEGIN

namespace detail
Expand Down
8 changes: 5 additions & 3 deletions cub/cub/agent/agent_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

CUB_NAMESPACE_BEGIN


Expand Down Expand Up @@ -301,8 +303,8 @@ struct AgentHistogram
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
{
int channel_bins = num_privatized_bins[CHANNEL];
for (int privatized_bin = threadIdx.x;
privatized_bin < channel_bins;
for (int privatized_bin = threadIdx.x;
privatized_bin < channel_bins;
privatized_bin += BLOCK_THREADS)
{
int output_bin = -1;
Expand Down Expand Up @@ -631,7 +633,7 @@ struct AgentHistogram
// Consume a partially-full tile at the end of the row
OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
}
}
else
{
// Consume full tile
Expand Down
2 changes: 2 additions & 0 deletions cub/cub/agent/agent_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@

#include <thrust/system/cuda/detail/core/util.h>

_CCCL_IMPLICIT_SYSTEM_HEADER

CUB_NAMESPACE_BEGIN


Expand Down
12 changes: 7 additions & 5 deletions cub/cub/agent/agent_radix_sort_downsweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@
#include <cub/util_type.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>

_CCCL_IMPLICIT_SYSTEM_HEADER

CUB_NAMESPACE_BEGIN


Expand Down Expand Up @@ -135,7 +137,7 @@ struct AgentRadixSortDownsweep
using ValuesItr = CacheModifiedInputIterator<LOAD_MODIFIER, ValueT, OffsetT>;

// Radix ranking type to use
using BlockRadixRankT =
using BlockRadixRankT =
cub::detail::block_radix_rank_t<
RANK_ALGORITHM, BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, SCAN_ALGORITHM>;

Expand Down Expand Up @@ -202,7 +204,7 @@ struct AgentRadixSortDownsweep
// The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
OffsetT bin_offset[BINS_TRACKED_PER_THREAD];

std::uint32_t current_bit;
std::uint32_t current_bit;
std::uint32_t num_bits;

// Whether to short-cirucit
Expand Down Expand Up @@ -488,15 +490,15 @@ struct AgentRadixSortDownsweep
OffsetT relative_bin_offsets[ITEMS_PER_THREAD];

// Assign default (min/max) value to all keys
bit_ordered_type default_key = IS_DESCENDING
? traits::min_raw_binary_key(decomposer)
bit_ordered_type default_key = IS_DESCENDING
? traits::min_raw_binary_key(decomposer)
: traits::max_raw_binary_key(decomposer);

// Load tile of keys
LoadKeys(
keys,
block_offset,
valid_items,
valid_items,
default_key,
Int2Type<FULL_TILE>(),
Int2Type<LOAD_WARP_STRIPED>());
Expand Down
7 changes: 4 additions & 3 deletions cub/cub/agent/agent_radix_sort_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
#include "../util_math.cuh"
#include "../util_type.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

CUB_NAMESPACE_BEGIN

Expand Down Expand Up @@ -120,7 +121,7 @@ struct AgentRadixSortHistogram
// thread fields
// shared memory storage
_TempStorage& s;

// bins for the histogram
OffsetT* d_bins_out;

Expand Down Expand Up @@ -175,7 +176,7 @@ struct AgentRadixSortHistogram
}

__device__ __forceinline__
void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD])
void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD])
{
// tile_offset < num_items always, hence the line below works
bool full_tile = num_items - tile_offset >= TILE_ITEMS;
Expand Down Expand Up @@ -264,7 +265,7 @@ struct AgentRadixSortHistogram
AccumulateSharedHistograms(tile_offset, keys);
}
CTA_SYNC();

// Accumulate the result in global memory.
AccumulateGlobalHistograms();
CTA_SYNC();
Expand Down
26 changes: 14 additions & 12 deletions cub/cub/agent/agent_radix_sort_onesweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
#include "../util_ptx.cuh"
#include "../util_type.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

CUB_NAMESPACE_BEGIN

/** \brief cub::RadixSortStoreAlgorithm enumerates different algorithms to write
Expand All @@ -64,7 +66,7 @@ template <
int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
/** \brief Number of private histograms to use in the ranker;
/** \brief Number of private histograms to use in the ranker;
ignored if the ranking algorithm is not one of RADIX_RANK_MATCH_EARLY_COUNTS_* */
int _RANK_NUM_PARTS,
/** \brief Ranking algorithm used in the onesweep kernel. Only algorithms that
Expand Down Expand Up @@ -106,7 +108,7 @@ struct AgentRadixSortOnesweep
RANK_NUM_PARTS = AgentRadixSortOnesweepPolicy::RANK_NUM_PARTS,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
RADIX_BITS = AgentRadixSortOnesweepPolicy::RADIX_BITS,
RADIX_DIGITS = 1 << RADIX_BITS,
RADIX_DIGITS = 1 << RADIX_BITS,
BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS,
FULL_BINS = BINS_PER_THREAD * BLOCK_THREADS == RADIX_DIGITS,
WARP_THREADS = CUB_PTX_WARP_THREADS,
Expand All @@ -127,7 +129,7 @@ struct AgentRadixSortOnesweep
typename traits::template digit_extractor_t<fundamental_digit_extractor_t, DecomposerT>;

typedef PortionOffsetT AtomicOffsetT;

static constexpr RadixRankAlgorithm RANK_ALGORITHM =
AgentRadixSortOnesweepPolicy::RANK_ALGORITHM;
static constexpr BlockScanAlgorithm SCAN_ALGORITHM =
Expand Down Expand Up @@ -224,7 +226,7 @@ struct AgentRadixSortOnesweep
__device__ __forceinline__ void LookbackPartial(int (&bins)[BINS_PER_THREAD])
{
#pragma unroll
for (int u = 0; u < BINS_PER_THREAD; ++u)
for (int u = 0; u < BINS_PER_THREAD; ++u)
{
int bin = ThreadBin(u);
if (FULL_BINS || bin < RADIX_DIGITS)
Expand Down Expand Up @@ -260,7 +262,7 @@ struct AgentRadixSortOnesweep
agent.TryShortCircuit(keys, bins);
}
};

__device__ __forceinline__ void LookbackGlobal(int (&bins)[BINS_PER_THREAD])
{
#pragma unroll
Expand Down Expand Up @@ -452,7 +454,7 @@ struct AgentRadixSortOnesweep
{
s.global_offsets[bin] = d_bins_in[bin] - offsets[u];
}
}
}
}

__device__ __forceinline__ void UpdateBinsGlobal(int (&bins)[BINS_PER_THREAD],
Expand Down Expand Up @@ -512,7 +514,7 @@ struct AgentRadixSortOnesweep
constexpr int ITEMS_PER_WARP = TILE_ITEMS / BLOCK_WARPS;
constexpr int ALIGN = 8;
constexpr auto CACHE_MODIFIER = STORE_CG;

int warp_start = warp * ITEMS_PER_WARP;
int warp_end = (warp + 1) * ITEMS_PER_WARP;
int warp_offset = warp_start;
Expand Down Expand Up @@ -596,19 +598,19 @@ struct AgentRadixSortOnesweep
// compute digits corresponding to the keys
int digits[ITEMS_PER_THREAD];
ComputeKeyDigits(digits);

// load values
ValueT values[ITEMS_PER_THREAD];
LoadValues(block_idx * TILE_ITEMS, values);

// scatter values
CTA_SYNC();
ScatterValuesShared(values, ranks);

CTA_SYNC();
ScatterValuesGlobal(digits);
}


__device__ __forceinline__ void GatherScatterValues(
int (&ranks)[ITEMS_PER_THREAD], Int2Type<true> keys_only) {}
Expand All @@ -628,7 +630,7 @@ struct AgentRadixSortOnesweep
BlockRadixRankT(s.rank_temp_storage).RankKeys(
keys, ranks, digit_extractor(), exclusive_digit_prefix,
CountsCallback(*this, bins, keys));

// scatter keys in shared memory
CTA_SYNC();
ScatterKeysShared(keys, ranks);
Expand All @@ -637,7 +639,7 @@ struct AgentRadixSortOnesweep
LoadBinsToOffsetsGlobal(exclusive_digit_prefix);
LookbackGlobal(bins);
UpdateBinsGlobal(bins, exclusive_digit_prefix);

// scatter keys in global memory
CTA_SYNC();
ScatterKeysGlobal();
Expand Down
6 changes: 4 additions & 2 deletions cub/cub/agent/agent_radix_sort_upsweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down Expand Up @@ -321,7 +323,7 @@ struct AgentRadixSortUpsweep
const OffsetT &block_end)
{
// Process partial tile if necessary using single loads
for (OffsetT offset = threadIdx.x; offset < block_end - block_offset; offset += BLOCK_THREADS)
for (OffsetT offset = threadIdx.x; offset < block_end - block_offset; offset += BLOCK_THREADS)
{
// Load and bucket key
bit_ordered_type key = d_keys_in[block_offset + offset];
Expand All @@ -346,7 +348,7 @@ struct AgentRadixSortUpsweep
:
temp_storage(temp_storage.Alias()),
d_keys_in(reinterpret_cast<const bit_ordered_type*>(d_keys_in)),
current_bit(current_bit),
current_bit(current_bit),
num_bits(num_bits),
decomposer(decomposer)
{}
Expand Down
14 changes: 8 additions & 6 deletions cub/cub/agent/agent_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
Expand All @@ -27,7 +27,7 @@
******************************************************************************/

/**
* @file cub::AgentReduce implements a stateful abstraction of CUDA thread
* @file cub::AgentReduce implements a stateful abstraction of CUDA thread
* blocks for participating in device-wide reduction.
*/

Expand All @@ -44,6 +44,8 @@
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/util_type.cuh>

_CCCL_IMPLICIT_SYSTEM_HEADER

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down Expand Up @@ -368,7 +370,7 @@ struct AgentReduce
.Reduce(thread_aggregate, reduction_op, valid_items);
}

// Extracting this into a function saves 8% of generated kernel size by allowing to reuse
// Extracting this into a function saves 8% of generated kernel size by allowing to reuse
// the block reduction below. This also workaround hang in nvcc.
ConsumeFullTileRange(thread_aggregate, even_share, can_vectorize);

Expand Down Expand Up @@ -439,7 +441,7 @@ private:

even_share.block_offset += even_share.block_stride;

// Consume subsequent full tiles of input, at least one full tile was processed, so
// Consume subsequent full tiles of input, at least one full tile was processed, so
// `even_share.block_end >= TILE_ITEMS`
while (even_share.block_offset <= even_share.block_end - TILE_ITEMS)
{
Expand Down
8 changes: 5 additions & 3 deletions cub/cub/agent/agent_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@

#include <iterator>

_CCCL_IMPLICIT_SYSTEM_HEADER

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand All @@ -68,8 +70,8 @@ CUB_NAMESPACE_BEGIN
* @tparam _SCAN_ALGORITHM
* The BlockScan algorithm to use
*
* @tparam DelayConstructorT
* Implementation detail, do not specify directly, requirements on the
* @tparam DelayConstructorT
* Implementation detail, do not specify directly, requirements on the
* content of this type are subject to breaking change.
*/
template <int _BLOCK_THREADS,
Expand All @@ -95,7 +97,7 @@ struct AgentReduceByKeyPolicy
///< The BlockScan algorithm to use
static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;

struct detail
struct detail
{
using delay_constructor_t = DelayConstructorT;
};
Expand Down
Loading

0 comments on commit 3c815cd

Please sign in to comment.