Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Always treat CCCL as system headers #531

Merged
merged 6 commits into from
Oct 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
3 changes: 3 additions & 0 deletions cub/cmake/CubBuildCompilerTargets.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ function(cub_build_compiler_targets)
set(cxx_compile_options)
set(cuda_compile_options)

# Ensure that we build our tests without treating ourself as system header
list(APPEND cxx_compile_definitions "_CCCL_NO_SYSTEM_HEADER")

if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
list(APPEND cxx_compile_definitions _ENABLE_EXTENDED_ALIGNED_STORAGE)
list(APPEND cuda_compile_options "--use-local-env")
Expand Down
9 changes: 8 additions & 1 deletion cub/cub/agent/agent_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@
#pragma once

#include "../config.cuh"

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC

#include "../util_type.cuh"
#include "../util_namespace.cuh"
#include "../block/block_load.cuh"
Expand Down Expand Up @@ -159,7 +166,7 @@ struct AgentDifference
}
else
{
InputT tile_prev_input = MayAlias
InputT tile_prev_input = MayAlias
? first_tile_previous[tile_idx]
: *(input_it + tile_base - 1);

Expand Down
8 changes: 8 additions & 0 deletions cub/cub/agent/agent_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@

#pragma once

#include "../config.cuh"

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC

#include <cub/agent/single_pass_scan_operators.cuh>
#include <cub/block/block_exchange.cuh>
#include <cub/block/block_load.cuh>
Expand Down
15 changes: 11 additions & 4 deletions cub/cub/agent/agent_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,18 @@

#pragma once

#include "../config.cuh"

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC

#include <iterator>

#include "../util_type.cuh"
#include "../block/block_load.cuh"
#include "../config.cuh"
#include "../grid/grid_queue.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"

Expand Down Expand Up @@ -301,8 +308,8 @@ struct AgentHistogram
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
{
int channel_bins = num_privatized_bins[CHANNEL];
for (int privatized_bin = threadIdx.x;
privatized_bin < channel_bins;
for (int privatized_bin = threadIdx.x;
privatized_bin < channel_bins;
privatized_bin += BLOCK_THREADS)
{
int output_bin = -1;
Expand Down Expand Up @@ -631,7 +638,7 @@ struct AgentHistogram
// Consume a partially-full tile at the end of the row
OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
}
}
else
{
// Consume full tile
Expand Down
7 changes: 7 additions & 0 deletions cub/cub/agent/agent_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@
#pragma once

#include "../config.cuh"

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC

#include "../util_type.cuh"
#include "../util_namespace.cuh"
#include "../block/block_load.cuh"
Expand Down
19 changes: 13 additions & 6 deletions cub/cub/agent/agent_radix_sort_downsweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@

#pragma once

#include "../config.cuh"

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC

#include <stdint.h>
#include <type_traits>

Expand All @@ -44,7 +52,6 @@
#include <cub/block/block_radix_rank.cuh>
#include <cub/block/block_exchange.cuh>
#include <cub/block/radix_rank_sort_operations.cuh>
#include <cub/config.cuh>
#include <cub/util_type.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>

Expand Down Expand Up @@ -135,7 +142,7 @@ struct AgentRadixSortDownsweep
using ValuesItr = CacheModifiedInputIterator<LOAD_MODIFIER, ValueT, OffsetT>;

// Radix ranking type to use
using BlockRadixRankT =
using BlockRadixRankT =
cub::detail::block_radix_rank_t<
RANK_ALGORITHM, BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, SCAN_ALGORITHM>;

Expand Down Expand Up @@ -202,7 +209,7 @@ struct AgentRadixSortDownsweep
// The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
OffsetT bin_offset[BINS_TRACKED_PER_THREAD];

std::uint32_t current_bit;
std::uint32_t current_bit;
std::uint32_t num_bits;

// Whether to short-cirucit
Expand Down Expand Up @@ -488,15 +495,15 @@ struct AgentRadixSortDownsweep
OffsetT relative_bin_offsets[ITEMS_PER_THREAD];

// Assign default (min/max) value to all keys
bit_ordered_type default_key = IS_DESCENDING
? traits::min_raw_binary_key(decomposer)
bit_ordered_type default_key = IS_DESCENDING
? traits::min_raw_binary_key(decomposer)
: traits::max_raw_binary_key(decomposer);

// Load tile of keys
LoadKeys(
keys,
block_offset,
valid_items,
valid_items,
default_key,
Int2Type<FULL_TILE>(),
Int2Type<LOAD_WARP_STRIPED>());
Expand Down
15 changes: 11 additions & 4 deletions cub/cub/agent/agent_radix_sort_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,16 @@

#pragma once

#include "../config.cuh"

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC

#include "../block/block_load.cuh"
#include "../block/radix_rank_sort_operations.cuh"
#include "../config.cuh"
#include "../thread/thread_reduce.cuh"
#include "../util_math.cuh"
#include "../util_type.cuh"
Expand Down Expand Up @@ -120,7 +127,7 @@ struct AgentRadixSortHistogram
// thread fields
// shared memory storage
_TempStorage& s;

// bins for the histogram
OffsetT* d_bins_out;

Expand Down Expand Up @@ -175,7 +182,7 @@ struct AgentRadixSortHistogram
}

__device__ __forceinline__
void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD])
void LoadTileKeys(OffsetT tile_offset, bit_ordered_type (&keys)[ITEMS_PER_THREAD])
{
// tile_offset < num_items always, hence the line below works
bool full_tile = num_items - tile_offset >= TILE_ITEMS;
Expand Down Expand Up @@ -264,7 +271,7 @@ struct AgentRadixSortHistogram
AccumulateSharedHistograms(tile_offset, keys);
}
CTA_SYNC();

// Accumulate the result in global memory.
AccumulateGlobalHistograms();
CTA_SYNC();
Expand Down
33 changes: 20 additions & 13 deletions cub/cub/agent/agent_radix_sort_onesweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,17 @@

#pragma once

#include "../config.cuh"

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC

#include "../block/block_radix_rank.cuh"
#include "../block/radix_rank_sort_operations.cuh"
#include "../block/block_store.cuh"
#include "../config.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"

Expand Down Expand Up @@ -64,7 +71,7 @@ template <
int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
/** \brief Number of private histograms to use in the ranker;
/** \brief Number of private histograms to use in the ranker;
ignored if the ranking algorithm is not one of RADIX_RANK_MATCH_EARLY_COUNTS_* */
int _RANK_NUM_PARTS,
/** \brief Ranking algorithm used in the onesweep kernel. Only algorithms that
Expand Down Expand Up @@ -106,7 +113,7 @@ struct AgentRadixSortOnesweep
RANK_NUM_PARTS = AgentRadixSortOnesweepPolicy::RANK_NUM_PARTS,
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
RADIX_BITS = AgentRadixSortOnesweepPolicy::RADIX_BITS,
RADIX_DIGITS = 1 << RADIX_BITS,
RADIX_DIGITS = 1 << RADIX_BITS,
BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS,
FULL_BINS = BINS_PER_THREAD * BLOCK_THREADS == RADIX_DIGITS,
WARP_THREADS = CUB_PTX_WARP_THREADS,
Expand All @@ -127,7 +134,7 @@ struct AgentRadixSortOnesweep
typename traits::template digit_extractor_t<fundamental_digit_extractor_t, DecomposerT>;

typedef PortionOffsetT AtomicOffsetT;

static constexpr RadixRankAlgorithm RANK_ALGORITHM =
AgentRadixSortOnesweepPolicy::RANK_ALGORITHM;
static constexpr BlockScanAlgorithm SCAN_ALGORITHM =
Expand Down Expand Up @@ -224,7 +231,7 @@ struct AgentRadixSortOnesweep
__device__ __forceinline__ void LookbackPartial(int (&bins)[BINS_PER_THREAD])
{
#pragma unroll
for (int u = 0; u < BINS_PER_THREAD; ++u)
for (int u = 0; u < BINS_PER_THREAD; ++u)
{
int bin = ThreadBin(u);
if (FULL_BINS || bin < RADIX_DIGITS)
Expand Down Expand Up @@ -260,7 +267,7 @@ struct AgentRadixSortOnesweep
agent.TryShortCircuit(keys, bins);
}
};

__device__ __forceinline__ void LookbackGlobal(int (&bins)[BINS_PER_THREAD])
{
#pragma unroll
Expand Down Expand Up @@ -452,7 +459,7 @@ struct AgentRadixSortOnesweep
{
s.global_offsets[bin] = d_bins_in[bin] - offsets[u];
}
}
}
}

__device__ __forceinline__ void UpdateBinsGlobal(int (&bins)[BINS_PER_THREAD],
Expand Down Expand Up @@ -512,7 +519,7 @@ struct AgentRadixSortOnesweep
constexpr int ITEMS_PER_WARP = TILE_ITEMS / BLOCK_WARPS;
constexpr int ALIGN = 8;
constexpr auto CACHE_MODIFIER = STORE_CG;

int warp_start = warp * ITEMS_PER_WARP;
int warp_end = (warp + 1) * ITEMS_PER_WARP;
int warp_offset = warp_start;
Expand Down Expand Up @@ -596,19 +603,19 @@ struct AgentRadixSortOnesweep
// compute digits corresponding to the keys
int digits[ITEMS_PER_THREAD];
ComputeKeyDigits(digits);

// load values
ValueT values[ITEMS_PER_THREAD];
LoadValues(block_idx * TILE_ITEMS, values);

// scatter values
CTA_SYNC();
ScatterValuesShared(values, ranks);

CTA_SYNC();
ScatterValuesGlobal(digits);
}


__device__ __forceinline__ void GatherScatterValues(
int (&ranks)[ITEMS_PER_THREAD], Int2Type<true> keys_only) {}
Expand All @@ -628,7 +635,7 @@ struct AgentRadixSortOnesweep
BlockRadixRankT(s.rank_temp_storage).RankKeys(
keys, ranks, digit_extractor(), exclusive_digit_prefix,
CountsCallback(*this, bins, keys));

// scatter keys in shared memory
CTA_SYNC();
ScatterKeysShared(keys, ranks);
Expand All @@ -637,7 +644,7 @@ struct AgentRadixSortOnesweep
LoadBinsToOffsetsGlobal(exclusive_digit_prefix);
LookbackGlobal(bins);
UpdateBinsGlobal(bins, exclusive_digit_prefix);

// scatter keys in global memory
CTA_SYNC();
ScatterKeysGlobal();
Expand Down
13 changes: 10 additions & 3 deletions cub/cub/agent/agent_radix_sort_upsweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,19 @@

#pragma once

#include "../config.cuh"

#if defined(_CCCL_COMPILER_NVHPC) && defined(_CCCL_USE_IMPLICIT_SYSTEM_DEADER)
#pragma GCC system_header
#else // ^^^ _CCCL_COMPILER_NVHPC ^^^ / vvv !_CCCL_COMPILER_NVHPC vvv
_CCCL_IMPLICIT_SYSTEM_HEADER
#endif // !_CCCL_COMPILER_NVHPC

#include "../thread/thread_reduce.cuh"
#include "../thread/thread_load.cuh"
#include "../warp/warp_reduce.cuh"
#include "../block/block_load.cuh"
#include "../block/radix_rank_sort_operations.cuh"
#include "../config.cuh"
#include "../util_type.cuh"
#include "../iterator/cache_modified_input_iterator.cuh"

Expand Down Expand Up @@ -321,7 +328,7 @@ struct AgentRadixSortUpsweep
const OffsetT &block_end)
{
// Process partial tile if necessary using single loads
for (OffsetT offset = threadIdx.x; offset < block_end - block_offset; offset += BLOCK_THREADS)
for (OffsetT offset = threadIdx.x; offset < block_end - block_offset; offset += BLOCK_THREADS)
{
// Load and bucket key
bit_ordered_type key = d_keys_in[block_offset + offset];
Expand All @@ -346,7 +353,7 @@ struct AgentRadixSortUpsweep
:
temp_storage(temp_storage.Alias()),
d_keys_in(reinterpret_cast<const bit_ordered_type*>(d_keys_in)),
current_bit(current_bit),
current_bit(current_bit),
num_bits(num_bits),
decomposer(decomposer)
{}
Expand Down
Loading