Skip to content

Commit

Permalink
Merge pull request #443 from senior-zero/fix-main/github/visibility
Browse files Browse the repository at this point in the history
Fix Thrust/CUB Linkage Issues
  • Loading branch information
gevtushenko authored Sep 21, 2023
2 parents dc77fea + dd32e8d commit 06f5d99
Show file tree
Hide file tree
Showing 25 changed files with 434 additions and 303 deletions.
2 changes: 1 addition & 1 deletion ci-overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ The syntax of the build and test scripts is the same:
./ci/test_thrust.sh <host compiler> <c++ standard> <gpu architectures>

#examples
./ci/build_thrust.sh g++ 17 70;80;86
./ci/build_thrust.sh g++ 17 "70;80;86"
```

In summary, the heart of our build and test jobs is the corresponding build or test script.
Expand Down
11 changes: 1 addition & 10 deletions cub/cub/detail/detect_cuda_runtime.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,14 @@
******************************************************************************/

/**
* \file
* @file
* Utilities for CUDA dynamic parallelism.
*/

#pragma once

#include <cub/util_namespace.cuh>

#include <cuda_runtime_api.h>

CUB_NAMESPACE_BEGIN
namespace detail
{

#ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes:

/**
Expand Down Expand Up @@ -111,6 +105,3 @@ namespace detail
#endif

#endif // Do not document

} // namespace detail
CUB_NAMESPACE_END
22 changes: 7 additions & 15 deletions cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -42,22 +42,14 @@

CUB_NAMESPACE_BEGIN


template <typename AgentDifferenceInitT,
typename InputIteratorT,
typename InputT,
typename OffsetT>
void __global__ DeviceAdjacentDifferenceInitKernel(InputIteratorT first,
InputT *result,
OffsetT num_tiles,
int items_per_tile)
template <typename AgentDifferenceInitT, typename InputIteratorT, typename InputT, typename OffsetT>
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceInitKernel(InputIteratorT first,
InputT *result,
OffsetT num_tiles,
int items_per_tile)
{
const int tile_idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
AgentDifferenceInitT::Process(tile_idx,
first,
result,
num_tiles,
items_per_tile);
AgentDifferenceInitT::Process(tile_idx, first, result, num_tiles, items_per_tile);
}

template <typename ChainedPolicyT,
Expand All @@ -68,7 +60,7 @@ template <typename ChainedPolicyT,
typename InputT,
bool MayAlias,
bool ReadLeft>
void __global__
CUB_DETAIL_KERNEL_ATTRIBUTES void
DeviceAdjacentDifferenceDifferenceKernel(InputIteratorT input,
InputT *first_tile_previous,
OutputIteratorT result,
Expand Down
41 changes: 22 additions & 19 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,10 @@ struct AgentBatchMemcpyLargeBuffersPolicy
template <typename BufferOffsetScanTileStateT,
typename BlockOffsetScanTileStateT,
typename TileOffsetT>
__global__ void InitTileStateKernel(BufferOffsetScanTileStateT buffer_offset_scan_tile_state,
BlockOffsetScanTileStateT block_offset_scan_tile_state,
TileOffsetT num_tiles)
CUB_DETAIL_KERNEL_ATTRIBUTES void
InitTileStateKernel(BufferOffsetScanTileStateT buffer_offset_scan_tile_state,
BlockOffsetScanTileStateT block_offset_scan_tile_state,
TileOffsetT num_tiles)
{
// Initialize tile status
buffer_offset_scan_tile_state.InitializeStatus(num_tiles);
Expand All @@ -93,12 +94,13 @@ template <typename ChainedPolicyT,
typename TileOffsetT,
bool IsMemcpy>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLOCK_THREADS))
__global__ void MultiBlockBatchMemcpyKernel(InputBufferIt input_buffer_it,
OutputBufferIt output_buffer_it,
BufferSizeIteratorT buffer_sizes,
BufferTileOffsetItT buffer_tile_offsets,
TileT buffer_offset_tile,
TileOffsetT last_tile_offset)
CUB_DETAIL_KERNEL_ATTRIBUTES
void MultiBlockBatchMemcpyKernel(InputBufferIt input_buffer_it,
OutputBufferIt output_buffer_it,
BufferSizeIteratorT buffer_sizes,
BufferTileOffsetItT buffer_tile_offsets,
TileT buffer_offset_tile,
TileOffsetT last_tile_offset)
{
using StatusWord = typename TileT::StatusWord;
using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT;
Expand Down Expand Up @@ -219,16 +221,17 @@ template <typename ChainedPolicyT,
typename BLevBlockOffsetTileState,
bool IsMemcpy>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLOCK_THREADS))
__global__ void BatchMemcpyKernel(InputBufferIt input_buffer_it,
OutputBufferIt output_buffer_it,
BufferSizeIteratorT buffer_sizes,
BufferOffsetT num_buffers,
BlevBufferSrcsOutItT blev_buffer_srcs,
BlevBufferDstsOutItT blev_buffer_dsts,
BlevBufferSizesOutItT blev_buffer_sizes,
BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
BLevBufferOffsetTileState blev_buffer_scan_state,
BLevBlockOffsetTileState blev_block_scan_state)
CUB_DETAIL_KERNEL_ATTRIBUTES
void BatchMemcpyKernel(InputBufferIt input_buffer_it,
OutputBufferIt output_buffer_it,
BufferSizeIteratorT buffer_sizes,
BufferOffsetT num_buffers,
BlevBufferSrcsOutItT blev_buffer_srcs,
BlevBufferDstsOutItT blev_buffer_dsts,
BlevBufferSizesOutItT blev_buffer_sizes,
BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
BLevBufferOffsetTileState blev_buffer_scan_state,
BLevBlockOffsetTileState blev_block_scan_state)
{
// Internal type used for storing a buffer's size
using BufferSizeT = cub::detail::value_t<BufferSizeIteratorT>;
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ CUB_NAMESPACE_BEGIN
* Drain queue descriptor for dynamically mapping tile data onto thread blocks
*/
template <int NUM_ACTIVE_CHANNELS, typename CounterT, typename OffsetT>
__global__ void
CUB_DETAIL_KERNEL_ATTRIBUTES void
DeviceHistogramInitKernel(ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper,
ArrayWrapper<CounterT *, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper,
GridQueue<int> tile_queue)
Expand Down Expand Up @@ -193,8 +193,8 @@ template <typename ChainedPolicyT,
typename PrivatizedDecodeOpT,
typename OutputDecodeOpT,
typename OffsetT>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK_THREADS)) __global__
void DeviceHistogramSweepKernel(
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK_THREADS))
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceHistogramSweepKernel(
SampleIteratorT d_samples,
ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper,
ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper,
Expand Down
68 changes: 32 additions & 36 deletions cub/cub/device/dispatch/dispatch_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@

CUB_NAMESPACE_BEGIN


template <bool UseVShmem,
typename ChainedPolicyT,
typename KeyInputIteratorT,
Expand All @@ -49,17 +48,17 @@ template <bool UseVShmem,
typename CompareOpT,
typename KeyT,
typename ValueT>
void __global__ __launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS)
DeviceMergeSortBlockSortKernel(bool ping,
KeyInputIteratorT keys_in,
ValueInputIteratorT items_in,
KeyIteratorT keys_out,
ValueIteratorT items_out,
OffsetT keys_count,
KeyT *tmp_keys_out,
ValueT *tmp_items_out,
CompareOpT compare_op,
char *vshmem)
__launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS)
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortBlockSortKernel(bool ping,
KeyInputIteratorT keys_in,
ValueInputIteratorT items_in,
KeyIteratorT keys_out,
ValueIteratorT items_out,
OffsetT keys_count,
KeyT *tmp_keys_out,
ValueT *tmp_items_out,
CompareOpT compare_op,
char *vshmem)
{
extern __shared__ char shmem[];
using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::MergeSortPolicy;
Expand Down Expand Up @@ -95,19 +94,16 @@ DeviceMergeSortBlockSortKernel(bool ping,
agent.Process();
}

template <typename KeyIteratorT,
typename OffsetT,
typename CompareOpT,
typename KeyT>
__global__ void DeviceMergeSortPartitionKernel(bool ping,
KeyIteratorT keys_ping,
KeyT *keys_pong,
OffsetT keys_count,
OffsetT num_partitions,
OffsetT *merge_partitions,
CompareOpT compare_op,
OffsetT target_merged_tiles_number,
int items_per_tile)
template <typename KeyIteratorT, typename OffsetT, typename CompareOpT, typename KeyT>
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel(bool ping,
KeyIteratorT keys_ping,
KeyT *keys_pong,
OffsetT keys_count,
OffsetT num_partitions,
OffsetT *merge_partitions,
CompareOpT compare_op,
OffsetT target_merged_tiles_number,
int items_per_tile)
{
OffsetT partition_idx = blockDim.x * blockIdx.x + threadIdx.x;

Expand Down Expand Up @@ -136,17 +132,17 @@ template <bool UseVShmem,
typename CompareOpT,
typename KeyT,
typename ValueT>
void __global__ __launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS)
DeviceMergeSortMergeKernel(bool ping,
KeyIteratorT keys_ping,
ValueIteratorT items_ping,
OffsetT keys_count,
KeyT *keys_pong,
ValueT *items_pong,
CompareOpT compare_op,
OffsetT *merge_partitions,
OffsetT target_merged_tiles_number,
char *vshmem)
__launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS)
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortMergeKernel(bool ping,
KeyIteratorT keys_ping,
ValueIteratorT items_ping,
OffsetT keys_count,
KeyT *keys_pong,
ValueT *items_pong,
CompareOpT compare_op,
OffsetT *merge_partitions,
OffsetT target_merged_tiles_number,
char *vshmem)
{
extern __shared__ char shmem[];

Expand Down
22 changes: 11 additions & 11 deletions cub/cub/device/dispatch/dispatch_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@

#pragma once

#include <stdio.h>
#include <iterator>

#include <cub/agent/agent_radix_sort_downsweep.cuh>
#include <cub/agent/agent_radix_sort_histogram.cuh>
#include <cub/agent/agent_radix_sort_onesweep.cuh>
Expand All @@ -52,6 +49,9 @@

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <iterator>
#include <stdio.h>

// suppress warnings triggered by #pragma unroll:
// "warning: loop not unrolled: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]"
#if defined(__clang__)
Expand Down Expand Up @@ -79,7 +79,7 @@ template <
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS) :
int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS)))
__global__ void DeviceRadixSortUpsweepKernel(
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortUpsweepKernel(
const KeyT *d_keys, ///< [in] Input keys buffer
OffsetT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
OffsetT /*num_items*/, ///< [in] Total number of input data items
Expand Down Expand Up @@ -138,7 +138,7 @@ template <
typename ChainedPolicyT, ///< Chained tuning policy
typename OffsetT> ///< Signed integer type for global offsets
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
__global__ void RadixSortScanBinsKernel(
CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(
OffsetT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
int num_counts) ///< [in] Total number of bin-counts
{
Expand Down Expand Up @@ -191,7 +191,7 @@ template <
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
int(ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS) :
int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS)))
__global__ void DeviceRadixSortDownsweepKernel(
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortDownsweepKernel(
const KeyT *d_keys_in, ///< [in] Input keys buffer
KeyT *d_keys_out, ///< [in] Output keys buffer
const ValueT *d_values_in, ///< [in] Input values buffer
Expand Down Expand Up @@ -255,7 +255,7 @@ template <
typename OffsetT, ///< Signed integer type for global offsets
typename DecomposerT = detail::identity_decomposer_t>
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
__global__ void DeviceRadixSortSingleTileKernel(
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortSingleTileKernel(
const KeyT *d_keys_in, ///< [in] Input keys buffer
KeyT *d_keys_out, ///< [in] Output keys buffer
const ValueT *d_values_in, ///< [in] Input values buffer
Expand Down Expand Up @@ -380,7 +380,7 @@ template <
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS :
ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS))
__global__ void DeviceSegmentedRadixSortKernel(
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedRadixSortKernel(
const KeyT *d_keys_in, ///< [in] Input keys buffer
KeyT *d_keys_out, ///< [in] Output keys buffer
const ValueT *d_values_in, ///< [in] Input values buffer
Expand Down Expand Up @@ -552,7 +552,7 @@ template <typename ChainedPolicyT,
typename KeyT,
typename OffsetT,
typename DecomposerT = detail::identity_decomposer_t>
__global__ __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS)
CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS)
void DeviceRadixSortHistogramKernel(OffsetT *d_bins_out,
const KeyT *d_keys_in,
OffsetT num_items,
Expand All @@ -576,7 +576,7 @@ template <
typename PortionOffsetT,
typename AtomicOffsetT = PortionOffsetT,
typename DecomposerT = detail::identity_decomposer_t>
__global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS)
CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS)
DeviceRadixSortOnesweepKernel
(AtomicOffsetT* d_lookback, AtomicOffsetT* d_ctrs, OffsetT* d_bins_out,
const OffsetT* d_bins_in, KeyT* d_keys_out, const KeyT* d_keys_in, ValueT* d_values_out,
Expand All @@ -600,7 +600,7 @@ DeviceRadixSortOnesweepKernel
template <
typename ChainedPolicyT,
typename OffsetT>
__global__ void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins)
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins)
{
typedef typename ChainedPolicyT::ActivePolicy::ExclusiveSumPolicy ExclusiveSumPolicyT;
const int RADIX_BITS = ExclusiveSumPolicyT::RADIX_BITS;
Expand Down
Loading

0 comments on commit 06f5d99

Please sign in to comment.