Add _CCCL_IMPLICIT_SYSTEM_HEADER to cub headers
miscco committed Oct 11, 2023
1 parent b283432 commit b7e6717
Showing 142 changed files with 3,402 additions and 2,988 deletions.
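Every file in this commit follows the same pattern: the header's include of "../config.cuh" (or <cub/config.cuh>) is hoisted to the top, immediately after #pragma once, and _CCCL_IMPLICIT_SYSTEM_HEADER is invoked right after it, before any other includes. The macro itself is not defined in this diff; it presumably comes from config.cuh (directly or via the headers it pulls in), which is why that include must now appear first. As a rough sketch of what such a macro typically does (an assumption, not the actual CCCL definition), it expands to a compiler-specific pragma that makes the rest of the header behave like a system header, so user builds with strict warning flags do not see warnings from CUB internals:

// Illustrative sketch only -- the real definition lives in the CCCL/CUB
// configuration headers and is not shown in this commit.
// GCC and Clang honor _Pragma("GCC system_header"); other compilers would
// simply get an empty expansion here.
#if defined(__GNUC__) || defined(__clang__)
#  define _CCCL_IMPLICIT_SYSTEM_HEADER _Pragma("GCC system_header")
#else
#  define _CCCL_IMPLICIT_SYSTEM_HEADER
#endif

Under that assumption, the new prologue shown in each file below marks everything that follows as a system header on GCC/Clang and is a no-op elsewhere.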
5 changes: 4 additions & 1 deletion cub/cub/agent/agent_adjacent_difference.cuh
@@ -28,6 +28,9 @@
#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include "../util_type.cuh"
#include "../util_namespace.cuh"
#include "../block/block_load.cuh"
@@ -159,7 +162,7 @@ struct AgentDifference
}
else
{
InputT tile_prev_input = MayAlias
InputT tile_prev_input = MayAlias
? first_tile_previous[tile_idx]
: *(input_it + tile_base - 1);

4 changes: 4 additions & 0 deletions cub/cub/agent/agent_batch_memcpy.cuh
@@ -33,6 +33,10 @@

#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include <cub/agent/single_pass_scan_operators.cuh>
#include <cub/block/block_exchange.cuh>
#include <cub/block/block_load.cuh>
11 changes: 7 additions & 4 deletions cub/cub/agent/agent_histogram.cuh
@@ -33,11 +33,14 @@

#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include <iterator>

#include "../util_type.cuh"
#include "../block/block_load.cuh"
#include "../config.cuh"
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"

@@ -301,8 +304,8 @@ struct AgentHistogram
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
{
int channel_bins = num_privatized_bins[CHANNEL];
for (int privatized_bin = threadIdx.x;
privatized_bin < channel_bins;
for (int privatized_bin = threadIdx.x;
privatized_bin < channel_bins;
privatized_bin += BLOCK_THREADS)
{
int output_bin = -1;
@@ -631,7 +634,7 @@ struct AgentHistogram
// Consume a partially-full tile at the end of the row
OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
}
}
else
{
// Consume full tile
3 changes: 3 additions & 0 deletions cub/cub/agent/agent_merge_sort.cuh
@@ -28,6 +28,9 @@
#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include "../util_type.cuh"
#include "../util_namespace.cuh"
#include "../block/block_load.cuh"
15 changes: 9 additions & 6 deletions cub/cub/agent/agent_radix_sort_downsweep.cuh
@@ -35,6 +35,10 @@

#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include <stdint.h>
#include <type_traits>

@@ -44,7 +48,6 @@
#include <cub/block/block_radix_rank.cuh>
#include <cub/block/block_exchange.cuh>
#include <cub/block/radix_rank_sort_operations.cuh>
#include <cub/config.cuh>
#include <cub/util_type.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>

@@ -135,7 +138,7 @@ struct AgentRadixSortDownsweep
using ValuesItr = CacheModifiedInputIterator<LOAD_MODIFIER, ValueT, OffsetT>;

// Radix ranking type to use
using BlockRadixRankT =
using BlockRadixRankT =
cub::detail::block_radix_rank_t<
RANK_ALGORITHM, BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, SCAN_ALGORITHM>;

@@ -202,7 +205,7 @@ struct AgentRadixSortDownsweep
// The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
OffsetT bin_offset[BINS_TRACKED_PER_THREAD];

std::uint32_t current_bit;
std::uint32_t current_bit;
std::uint32_t num_bits;

// Whether to short-circuit
@@ -488,15 +491,15 @@ struct AgentRadixSortDownsweep
OffsetT relative_bin_offsets[ITEMS_PER_THREAD];

// Assign default (min/max) value to all keys
bit_ordered_type default_key = IS_DESCENDING
? traits::min_raw_binary_key(decomposer)
bit_ordered_type default_key = IS_DESCENDING
? traits::min_raw_binary_key(decomposer)
: traits::max_raw_binary_key(decomposer);

// Load tile of keys
LoadKeys(
keys,
block_offset,
valid_items,
valid_items,
default_key,
Int2Type<FULL_TILE>(),
Int2Type<LOAD_WARP_STRIPED>());
11 changes: 7 additions & 4 deletions cub/cub/agent/agent_radix_sort_histogram.cuh
@@ -34,9 +34,12 @@

#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include "../block/block_load.cuh"
#include "../block/radix_rank_sort_operations.cuh"
#include "../config.cuh"
#include "../thread/thread_reduce.cuh"
#include "../util_math.cuh"
#include "../util_type.cuh"
@@ -120,7 +123,7 @@ struct AgentRadixSortHistogram
// thread fields
// shared memory storage
_TempStorage& s;

// bins for the histogram
OffsetT* d_bins_out;

@@ -175,7 +178,7 @@ struct AgentRadixSortHistogram
}

__device__ __forceinline__
void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD])
void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD])
{
// tile_offset < num_items always, hence the line below works
bool full_tile = num_items - tile_offset >= TILE_ITEMS;
@@ -264,7 +267,7 @@ struct AgentRadixSortHistogram
AccumulateSharedHistograms(tile_offset, keys);
}
CTA_SYNC();

// Accumulate the result in global memory.
AccumulateGlobalHistograms();
CTA_SYNC();
29 changes: 16 additions & 13 deletions cub/cub/agent/agent_radix_sort_onesweep.cuh
@@ -33,10 +33,13 @@

#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include "../block/block_radix_rank.cuh"
#include "../block/radix_rank_sort_operations.cuh"
#include "../block/block_store.cuh"
#include "../config.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"

@@ -64,7 +67,7 @@ template <
int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
/** \brief Number of private histograms to use in the ranker;
/** \brief Number of private histograms to use in the ranker;
ignored if the ranking algorithm is not one of RADIX_RANK_MATCH_EARLY_COUNTS_* */
int _RANK_NUM_PARTS,
/** \brief Ranking algorithm used in the onesweep kernel. Only algorithms that
@@ -106,7 +109,7 @@ struct AgentRadixSortOnesweep
RANK_NUM_PARTS = AgentRadixSortOnesweepPolicy::RANK_NUM_PARTS,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
RADIX_BITS = AgentRadixSortOnesweepPolicy::RADIX_BITS,
RADIX_DIGITS = 1 << RADIX_BITS,
RADIX_DIGITS = 1 << RADIX_BITS,
BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS,
FULL_BINS = BINS_PER_THREAD * BLOCK_THREADS == RADIX_DIGITS,
WARP_THREADS = CUB_PTX_WARP_THREADS,
@@ -127,7 +130,7 @@ struct AgentRadixSortOnesweep
typename traits::template digit_extractor_t<fundamental_digit_extractor_t, DecomposerT>;

typedef PortionOffsetT AtomicOffsetT;

static constexpr RadixRankAlgorithm RANK_ALGORITHM =
AgentRadixSortOnesweepPolicy::RANK_ALGORITHM;
static constexpr BlockScanAlgorithm SCAN_ALGORITHM =
@@ -224,7 +227,7 @@ struct AgentRadixSortOnesweep
__device__ __forceinline__ void LookbackPartial(int (&bins)[BINS_PER_THREAD])
{
#pragma unroll
for (int u = 0; u < BINS_PER_THREAD; ++u)
for (int u = 0; u < BINS_PER_THREAD; ++u)
{
int bin = ThreadBin(u);
if (FULL_BINS || bin < RADIX_DIGITS)
@@ -260,7 +263,7 @@ struct AgentRadixSortOnesweep
agent.TryShortCircuit(keys, bins);
}
};

__device__ __forceinline__ void LookbackGlobal(int (&bins)[BINS_PER_THREAD])
{
#pragma unroll
@@ -452,7 +455,7 @@ struct AgentRadixSortOnesweep
{
s.global_offsets[bin] = d_bins_in[bin] - offsets[u];
}
}
}
}

__device__ __forceinline__ void UpdateBinsGlobal(int (&bins)[BINS_PER_THREAD],
@@ -512,7 +515,7 @@ struct AgentRadixSortOnesweep
constexpr int ITEMS_PER_WARP = TILE_ITEMS / BLOCK_WARPS;
constexpr int ALIGN = 8;
constexpr auto CACHE_MODIFIER = STORE_CG;

int warp_start = warp * ITEMS_PER_WARP;
int warp_end = (warp + 1) * ITEMS_PER_WARP;
int warp_offset = warp_start;
@@ -596,19 +599,19 @@ struct AgentRadixSortOnesweep
// compute digits corresponding to the keys
int digits[ITEMS_PER_THREAD];
ComputeKeyDigits(digits);

// load values
ValueT values[ITEMS_PER_THREAD];
LoadValues(block_idx * TILE_ITEMS, values);

// scatter values
CTA_SYNC();
ScatterValuesShared(values, ranks);

CTA_SYNC();
ScatterValuesGlobal(digits);
}


__device__ __forceinline__ void GatherScatterValues(
int (&ranks)[ITEMS_PER_THREAD], Int2Type<true> keys_only) {}
@@ -628,7 +631,7 @@
BlockRadixRankT(s.rank_temp_storage).RankKeys(
keys, ranks, digit_extractor(), exclusive_digit_prefix,
CountsCallback(*this, bins, keys));

// scatter keys in shared memory
CTA_SYNC();
ScatterKeysShared(keys, ranks);
@@ -637,7 +640,7 @@
LoadBinsToOffsetsGlobal(exclusive_digit_prefix);
LookbackGlobal(bins);
UpdateBinsGlobal(bins, exclusive_digit_prefix);

// scatter keys in global memory
CTA_SYNC();
ScatterKeysGlobal();
9 changes: 6 additions & 3 deletions cub/cub/agent/agent_radix_sort_upsweep.cuh
@@ -33,12 +33,15 @@

#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include "../thread/thread_reduce.cuh"
#include "../thread/thread_load.cuh"
#include "../warp/warp_reduce.cuh"
#include "../block/block_load.cuh"
#include "../block/radix_rank_sort_operations.cuh"
#include "../config.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"

@@ -321,7 +324,7 @@ struct AgentRadixSortUpsweep
const OffsetT &block_end)
{
// Process partial tile if necessary using single loads
for (OffsetT offset = threadIdx.x; offset < block_end - block_offset; offset += BLOCK_THREADS)
for (OffsetT offset = threadIdx.x; offset < block_end - block_offset; offset += BLOCK_THREADS)
{
// Load and bucket key
bit_ordered_type key = d_keys_in[block_offset + offset];
@@ -346,7 +349,7 @@
:
temp_storage(temp_storage.Alias()),
d_keys_in(reinterpret_cast<const bit_ordered_type*>(d_keys_in)),
current_bit(current_bit),
current_bit(current_bit),
num_bits(num_bits),
decomposer(decomposer)
{}
17 changes: 10 additions & 7 deletions cub/cub/agent/agent_reduce.cuh
@@ -13,9 +13,9 @@
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
@@ -27,17 +27,20 @@
******************************************************************************/

/**
* @file cub::AgentReduce implements a stateful abstraction of CUDA thread
* @file cub::AgentReduce implements a stateful abstraction of CUDA thread
* blocks for participating in device-wide reduction.
*/

#pragma once

#include "../config.cuh"

_CCCL_IMPLICIT_SYSTEM_HEADER

#include <iterator>

#include <cub/block/block_load.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/config.cuh>
#include <cub/detail/type_traits.cuh>
#include <cub/grid/grid_even_share.cuh>
#include <cub/grid/grid_mapping.cuh>
@@ -368,7 +371,7 @@ struct AgentReduce
.Reduce(thread_aggregate, reduction_op, valid_items);
}

// Extracting this into a function saves 8% of generated kernel size by allowing to reuse
// Extracting this into a function saves 8% of generated kernel size by allowing to reuse
// the block reduction below. This also works around a hang in nvcc.
ConsumeFullTileRange(thread_aggregate, even_share, can_vectorize);

@@ -439,7 +442,7 @@ private:

even_share.block_offset += even_share.block_stride;

// Consume subsequent full tiles of input, at least one full tile was processed, so
// Consume subsequent full tiles of input, at least one full tile was processed, so
// `even_share.block_end >= TILE_ITEMS`
while (even_share.block_offset <= even_share.block_end - TILE_ITEMS)
{