Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Thrust/CUB Linkage Issues #443

Merged
merged 17 commits into from
Sep 21, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ci-overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ The syntax of the build and test scripts is the same:
./ci/test_thrust.sh <host compiler> <c++ standard> <gpu architectures>

#examples
./ci/build_thrust.sh g++ 17 70;80;86
./ci/build_thrust.sh g++ 17 "70;80;86"
```

In summary, the heart of our build and test jobs is the corresponding build or test script.
Expand Down
22 changes: 7 additions & 15 deletions cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -42,22 +42,14 @@

CUB_NAMESPACE_BEGIN


template <typename AgentDifferenceInitT,
typename InputIteratorT,
typename InputT,
typename OffsetT>
void __global__ DeviceAdjacentDifferenceInitKernel(InputIteratorT first,
InputT *result,
OffsetT num_tiles,
int items_per_tile)
template <typename AgentDifferenceInitT, typename InputIteratorT, typename InputT, typename OffsetT>
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceInitKernel(InputIteratorT first,
InputT *result,
OffsetT num_tiles,
int items_per_tile)
{
const int tile_idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
AgentDifferenceInitT::Process(tile_idx,
first,
result,
num_tiles,
items_per_tile);
AgentDifferenceInitT::Process(tile_idx, first, result, num_tiles, items_per_tile);
}

template <typename ChainedPolicyT,
Expand All @@ -68,7 +60,7 @@ template <typename ChainedPolicyT,
typename InputT,
bool MayAlias,
bool ReadLeft>
void __global__
CUB_DETAIL_KERNEL_ATTRIBUTES void
DeviceAdjacentDifferenceDifferenceKernel(InputIteratorT input,
InputT *first_tile_previous,
OutputIteratorT result,
Expand Down
41 changes: 22 additions & 19 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,10 @@ struct AgentBatchMemcpyLargeBuffersPolicy
template <typename BufferOffsetScanTileStateT,
typename BlockOffsetScanTileStateT,
typename TileOffsetT>
__global__ void InitTileStateKernel(BufferOffsetScanTileStateT buffer_offset_scan_tile_state,
BlockOffsetScanTileStateT block_offset_scan_tile_state,
TileOffsetT num_tiles)
CUB_DETAIL_KERNEL_ATTRIBUTES void
InitTileStateKernel(BufferOffsetScanTileStateT buffer_offset_scan_tile_state,
BlockOffsetScanTileStateT block_offset_scan_tile_state,
TileOffsetT num_tiles)
{
// Initialize tile status
buffer_offset_scan_tile_state.InitializeStatus(num_tiles);
Expand All @@ -93,12 +94,13 @@ template <typename ChainedPolicyT,
typename TileOffsetT,
bool IsMemcpy>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLOCK_THREADS))
__global__ void MultiBlockBatchMemcpyKernel(InputBufferIt input_buffer_it,
OutputBufferIt output_buffer_it,
BufferSizeIteratorT buffer_sizes,
BufferTileOffsetItT buffer_tile_offsets,
TileT buffer_offset_tile,
TileOffsetT last_tile_offset)
CUB_DETAIL_KERNEL_ATTRIBUTES
void MultiBlockBatchMemcpyKernel(InputBufferIt input_buffer_it,
OutputBufferIt output_buffer_it,
BufferSizeIteratorT buffer_sizes,
BufferTileOffsetItT buffer_tile_offsets,
TileT buffer_offset_tile,
TileOffsetT last_tile_offset)
{
using StatusWord = typename TileT::StatusWord;
using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT;
Expand Down Expand Up @@ -219,16 +221,17 @@ template <typename ChainedPolicyT,
typename BLevBlockOffsetTileState,
bool IsMemcpy>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLOCK_THREADS))
__global__ void BatchMemcpyKernel(InputBufferIt input_buffer_it,
OutputBufferIt output_buffer_it,
BufferSizeIteratorT buffer_sizes,
BufferOffsetT num_buffers,
BlevBufferSrcsOutItT blev_buffer_srcs,
BlevBufferDstsOutItT blev_buffer_dsts,
BlevBufferSizesOutItT blev_buffer_sizes,
BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
BLevBufferOffsetTileState blev_buffer_scan_state,
BLevBlockOffsetTileState blev_block_scan_state)
CUB_DETAIL_KERNEL_ATTRIBUTES
void BatchMemcpyKernel(InputBufferIt input_buffer_it,
OutputBufferIt output_buffer_it,
BufferSizeIteratorT buffer_sizes,
BufferOffsetT num_buffers,
BlevBufferSrcsOutItT blev_buffer_srcs,
BlevBufferDstsOutItT blev_buffer_dsts,
BlevBufferSizesOutItT blev_buffer_sizes,
BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
BLevBufferOffsetTileState blev_buffer_scan_state,
BLevBlockOffsetTileState blev_block_scan_state)
{
// Internal type used for storing a buffer's size
using BufferSizeT = cub::detail::value_t<BufferSizeIteratorT>;
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ CUB_NAMESPACE_BEGIN
* Drain queue descriptor for dynamically mapping tile data onto thread blocks
*/
template <int NUM_ACTIVE_CHANNELS, typename CounterT, typename OffsetT>
__global__ void
CUB_DETAIL_KERNEL_ATTRIBUTES void
DeviceHistogramInitKernel(ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper,
ArrayWrapper<CounterT *, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper,
GridQueue<int> tile_queue)
Expand Down Expand Up @@ -193,8 +193,8 @@ template <typename ChainedPolicyT,
typename PrivatizedDecodeOpT,
typename OutputDecodeOpT,
typename OffsetT>
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK_THREADS)) __global__
void DeviceHistogramSweepKernel(
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK_THREADS))
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceHistogramSweepKernel(
SampleIteratorT d_samples,
ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper,
ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper,
Expand Down
68 changes: 32 additions & 36 deletions cub/cub/device/dispatch/dispatch_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@

CUB_NAMESPACE_BEGIN


template <bool UseVShmem,
typename ChainedPolicyT,
typename KeyInputIteratorT,
Expand All @@ -49,17 +48,17 @@ template <bool UseVShmem,
typename CompareOpT,
typename KeyT,
typename ValueT>
void __global__ __launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS)
DeviceMergeSortBlockSortKernel(bool ping,
KeyInputIteratorT keys_in,
ValueInputIteratorT items_in,
KeyIteratorT keys_out,
ValueIteratorT items_out,
OffsetT keys_count,
KeyT *tmp_keys_out,
ValueT *tmp_items_out,
CompareOpT compare_op,
char *vshmem)
__launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS)
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortBlockSortKernel(bool ping,
KeyInputIteratorT keys_in,
ValueInputIteratorT items_in,
KeyIteratorT keys_out,
ValueIteratorT items_out,
OffsetT keys_count,
KeyT *tmp_keys_out,
ValueT *tmp_items_out,
CompareOpT compare_op,
char *vshmem)
{
extern __shared__ char shmem[];
using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::MergeSortPolicy;
Expand Down Expand Up @@ -95,19 +94,16 @@ DeviceMergeSortBlockSortKernel(bool ping,
agent.Process();
}

template <typename KeyIteratorT,
typename OffsetT,
typename CompareOpT,
typename KeyT>
__global__ void DeviceMergeSortPartitionKernel(bool ping,
KeyIteratorT keys_ping,
KeyT *keys_pong,
OffsetT keys_count,
OffsetT num_partitions,
OffsetT *merge_partitions,
CompareOpT compare_op,
OffsetT target_merged_tiles_number,
int items_per_tile)
template <typename KeyIteratorT, typename OffsetT, typename CompareOpT, typename KeyT>
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel(bool ping,
KeyIteratorT keys_ping,
KeyT *keys_pong,
OffsetT keys_count,
OffsetT num_partitions,
OffsetT *merge_partitions,
CompareOpT compare_op,
OffsetT target_merged_tiles_number,
int items_per_tile)
{
OffsetT partition_idx = blockDim.x * blockIdx.x + threadIdx.x;

Expand Down Expand Up @@ -136,17 +132,17 @@ template <bool UseVShmem,
typename CompareOpT,
typename KeyT,
typename ValueT>
void __global__ __launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS)
DeviceMergeSortMergeKernel(bool ping,
KeyIteratorT keys_ping,
ValueIteratorT items_ping,
OffsetT keys_count,
KeyT *keys_pong,
ValueT *items_pong,
CompareOpT compare_op,
OffsetT *merge_partitions,
OffsetT target_merged_tiles_number,
char *vshmem)
__launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS)
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortMergeKernel(bool ping,
KeyIteratorT keys_ping,
ValueIteratorT items_ping,
OffsetT keys_count,
KeyT *keys_pong,
ValueT *items_pong,
CompareOpT compare_op,
OffsetT *merge_partitions,
OffsetT target_merged_tiles_number,
char *vshmem)
{
extern __shared__ char shmem[];

Expand Down
22 changes: 11 additions & 11 deletions cub/cub/device/dispatch/dispatch_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@

#pragma once

#include <stdio.h>
#include <iterator>

#include <cub/agent/agent_radix_sort_downsweep.cuh>
#include <cub/agent/agent_radix_sort_histogram.cuh>
#include <cub/agent/agent_radix_sort_onesweep.cuh>
Expand All @@ -52,6 +49,9 @@

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <iterator>
#include <stdio.h>

// suppress warnings triggered by #pragma unroll:
// "warning: loop not unrolled: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]"
#if defined(__clang__)
Expand Down Expand Up @@ -79,7 +79,7 @@ template <
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS) :
int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS)))
__global__ void DeviceRadixSortUpsweepKernel(
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortUpsweepKernel(
const KeyT *d_keys, ///< [in] Input keys buffer
OffsetT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
OffsetT /*num_items*/, ///< [in] Total number of input data items
Expand Down Expand Up @@ -138,7 +138,7 @@ template <
typename ChainedPolicyT, ///< Chained tuning policy
typename OffsetT> ///< Signed integer type for global offsets
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
__global__ void RadixSortScanBinsKernel(
CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel(
OffsetT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
int num_counts) ///< [in] Total number of bin-counts
{
Expand Down Expand Up @@ -191,7 +191,7 @@ template <
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
int(ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS) :
int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS)))
__global__ void DeviceRadixSortDownsweepKernel(
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortDownsweepKernel(
const KeyT *d_keys_in, ///< [in] Input keys buffer
KeyT *d_keys_out, ///< [in] Output keys buffer
const ValueT *d_values_in, ///< [in] Input values buffer
Expand Down Expand Up @@ -255,7 +255,7 @@ template <
typename OffsetT, ///< Signed integer type for global offsets
typename DecomposerT = detail::identity_decomposer_t>
__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
__global__ void DeviceRadixSortSingleTileKernel(
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortSingleTileKernel(
const KeyT *d_keys_in, ///< [in] Input keys buffer
KeyT *d_keys_out, ///< [in] Output keys buffer
const ValueT *d_values_in, ///< [in] Input values buffer
Expand Down Expand Up @@ -380,7 +380,7 @@ template <
__launch_bounds__ (int((ALT_DIGIT_BITS) ?
ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS :
ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS))
__global__ void DeviceSegmentedRadixSortKernel(
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedRadixSortKernel(
const KeyT *d_keys_in, ///< [in] Input keys buffer
KeyT *d_keys_out, ///< [in] Output keys buffer
const ValueT *d_values_in, ///< [in] Input values buffer
Expand Down Expand Up @@ -552,7 +552,7 @@ template <typename ChainedPolicyT,
typename KeyT,
typename OffsetT,
typename DecomposerT = detail::identity_decomposer_t>
__global__ __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS)
CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS)
void DeviceRadixSortHistogramKernel(OffsetT *d_bins_out,
const KeyT *d_keys_in,
OffsetT num_items,
Expand All @@ -576,7 +576,7 @@ template <
typename PortionOffsetT,
typename AtomicOffsetT = PortionOffsetT,
typename DecomposerT = detail::identity_decomposer_t>
__global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS)
CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS)
DeviceRadixSortOnesweepKernel
(AtomicOffsetT* d_lookback, AtomicOffsetT* d_ctrs, OffsetT* d_bins_out,
const OffsetT* d_bins_in, KeyT* d_keys_out, const KeyT* d_keys_in, ValueT* d_values_out,
Expand All @@ -600,7 +600,7 @@ DeviceRadixSortOnesweepKernel
template <
typename ChainedPolicyT,
typename OffsetT>
__global__ void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins)
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins)
{
typedef typename ChainedPolicyT::ActivePolicy::ExclusiveSumPolicy ExclusiveSumPolicyT;
const int RADIX_BITS = ExclusiveSumPolicyT::RADIX_BITS;
Expand Down
Loading