Skip to content

Commit

Permalink
Refactors thrust::unique_by_key to use `cub::DeviceSelect::UniqueBy…
Browse files Browse the repository at this point in the history
…Key` (NVIDIA#1245)

* adds copy assignment for hugetype to enable std algorithm

* adds vsmem option to unique-by-key

* move huge data type to c2h utilities

* ports unique_by_key implementation to cub

* adds tests for large problem counts to thrust

* adds tests for custom equality op

* adds sfinae to equality_op overload to avoid ambiguity

* prevent loop unrolling to alleviate perf degradation for u32

* fixes signedness of comparison in tests

* addresses review comments
  • Loading branch information
elstehle authored Jan 5, 2024
1 parent b4d490b commit 50ce545
Show file tree
Hide file tree
Showing 6 changed files with 676 additions and 892 deletions.
7 changes: 6 additions & 1 deletion cub/cub/agent/agent_unique_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,9 @@ struct AgentUniqueByKey

CTA_SYNC();

// Preventing loop unrolling helps avoid perf degradation when switching from signed to unsigned 32-bit offset
// types
#pragma unroll(1)
for (int item = threadIdx.x;
item < num_tile_selections;
item += BLOCK_THREADS)
Expand Down Expand Up @@ -626,7 +629,9 @@ struct AgentUniqueByKey
{
// Blocks are launched in increasing order, so just assign one tile per block
int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index
OffsetT tile_offset = tile_idx * ITEMS_PER_TILE; // Global offset for the current tile

// Global offset for the current tile
OffsetT tile_offset = static_cast<OffsetT>(tile_idx) * static_cast<OffsetT>(ITEMS_PER_TILE);

if (tile_idx < num_tiles - 1)
{
Expand Down
268 changes: 213 additions & 55 deletions cub/cub/device/device_select.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,14 @@
# pragma system_header
#endif // no system header

#include <iterator>
#include <stdio.h>

#include <cub/detail/choose_offset.cuh>
#include <cub/device/dispatch/dispatch_select_if.cuh>
#include <cub/device/dispatch/dispatch_unique_by_key.cuh>
#include <cub/util_deprecated.cuh>

#include <iterator>
#include <stdio.h>

CUB_NAMESPACE_BEGIN


Expand Down Expand Up @@ -837,6 +838,165 @@ struct DeviceSelect
stream);
}

//! @rst
//! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive
//! equal-valued keys, only the first key and its value from each run is selectively copied
//! to ``d_keys_out`` and ``d_values_out``.
//! The total number of items selected is written to ``d_num_selected_out``.
//!
//! - The user-provided equality operator, `equality_op`, is used to determine whether keys are equivalent
//! - Copies of the selected items are compacted into ``d_out`` and maintain
//! their original relative ordering.
//! - In-place operations are not supported. There must be no overlap between
//! any of the provided ranges:
//!
//! - ``[d_keys_in, d_keys_in + num_items)``
//! - ``[d_keys_out, d_keys_out + *d_num_selected_out)``
//! - ``[d_values_in, d_values_in + num_items)``
//! - ``[d_values_out, d_values_out + *d_num_selected_out)``
//! - ``[d_num_selected_out, d_num_selected_out + 1)``
//!
//! - @devicestorage
//!
//! Snippet
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
//! The code snippet below illustrates the compaction of items selected from an ``int`` device vector.
//!
//! .. code-block:: c++
//!
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh>
//!
//! // Declare, allocate, and initialize device-accessible pointers
//! // for input and output
//! int num_items; // e.g., 8
//! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
//! int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
//! int *d_keys_out; // e.g., [ , , , , , , , ]
//! int *d_values_out; // e.g., [ , , , , , , , ]
//! int *d_num_selected_out; // e.g., [ ]
//! ...
//!
//! // Determine temporary device storage requirements
//! void *d_temp_storage = NULL;
//! size_t temp_storage_bytes = 0;
//! cub::DeviceSelect::UniqueByKey(
//! d_temp_storage, temp_storage_bytes,
//! d_keys_in, d_values_in,
//! d_keys_out, d_values_out, d_num_selected_out, num_items);
//!
//! // Allocate temporary storage
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
//!
//! // Run selection
//! cub::DeviceSelect::UniqueByKey(
//! d_temp_storage, temp_storage_bytes,
//! d_keys_in, d_values_in,
//! d_keys_out, d_values_out, d_num_selected_out, num_items);
//!
//! // d_keys_out <-- [0, 2, 9, 5, 8]
//! // d_values_out <-- [1, 2, 4, 5, 8]
//! // d_num_selected_out <-- [5]
//!
//! @endrst
//!
//! @tparam KeyInputIteratorT
//! **[inferred]** Random-access input iterator type for reading input keys @iterator
//!
//! @tparam ValueInputIteratorT
//! **[inferred]** Random-access input iterator type for reading input values @iterator
//!
//! @tparam KeyOutputIteratorT
//! **[inferred]** Random-access output iterator type for writing selected keys @iterator
//!
//! @tparam ValueOutputIteratorT
//! **[inferred]** Random-access output iterator type for writing selected values @iterator
//!
//! @tparam NumSelectedIteratorT
//! **[inferred]** Output iterator type for recording the number of items selected @iterator
//!
//! @tparam NumItemsT
//! **[inferred]** Type of num_items
//!
//! @tparam EqualityOpT
//! **[inferred]** Type of equality_op
//!
//! @param[in] d_temp_storage
//! Device-accessible allocation of temporary storage. When `nullptr`, the
//! required allocation size is written to `temp_storage_bytes` and no work is done.
//!
//! @param[in,out] temp_storage_bytes
//! Reference to size in bytes of `d_temp_storage` allocation
//!
//! @param[in] d_keys_in
//! Pointer to the input sequence of keys
//!
//! @param[in] d_values_in
//! Pointer to the input sequence of values
//!
//! @param[out] d_keys_out
//! Pointer to the output sequence of selected keys
//!
//! @param[out] d_values_out
//! Pointer to the output sequence of selected values
//!
//! @param[out] d_num_selected_out
//! Pointer to the total number of items selected (i.e., length of `d_keys_out` or `d_values_out`)
//!
//! @param[in] num_items
//! Total number of input items (i.e., length of `d_keys_in` or `d_values_in`)
//!
//! @param[in] equality_op
//! Binary predicate to determine equality
//!
//! @param[in] stream
//! @rst
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
//! @endrst
template <typename KeyInputIteratorT,
typename ValueInputIteratorT,
typename KeyOutputIteratorT,
typename ValueOutputIteratorT,
typename NumSelectedIteratorT,
typename NumItemsT,
typename EqualityOpT>
CUB_RUNTIME_FUNCTION __forceinline__ static //
typename ::cuda::std::enable_if< //
!::cuda::std::is_convertible<EqualityOpT, cudaStream_t>::value, //
cudaError_t>::type
UniqueByKey(
void* d_temp_storage,
size_t& temp_storage_bytes,
KeyInputIteratorT d_keys_in,
ValueInputIteratorT d_values_in,
KeyOutputIteratorT d_keys_out,
ValueOutputIteratorT d_values_out,
NumSelectedIteratorT d_num_selected_out,
NumItemsT num_items,
EqualityOpT equality_op,
cudaStream_t stream = 0)
{
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

return DispatchUniqueByKey<
KeyInputIteratorT,
ValueInputIteratorT,
KeyOutputIteratorT,
ValueOutputIteratorT,
NumSelectedIteratorT,
EqualityOpT,
OffsetT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_values_in,
d_keys_out,
d_values_out,
d_num_selected_out,
equality_op,
static_cast<OffsetT>(num_items),
stream);
}

//! @rst
//! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive
//! equal-valued keys, only the first key and its value from each run is selectively copied
Expand Down Expand Up @@ -914,6 +1074,9 @@ struct DeviceSelect
//! @tparam NumSelectedIteratorT
//! **[inferred]** Output iterator type for recording the number of items selected @iterator
//!
//! @tparam NumItemsT
//! **[inferred]** Type of num_items
//!
//! @param[in] d_temp_storage
//! Device-accessible allocation of temporary storage. When `nullptr`, the
//! required allocation size is written to `temp_storage_bytes` and no work is done.
Expand Down Expand Up @@ -947,72 +1110,67 @@ struct DeviceSelect
typename ValueInputIteratorT,
typename KeyOutputIteratorT,
typename ValueOutputIteratorT,
typename NumSelectedIteratorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
UniqueByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
KeyInputIteratorT d_keys_in,
ValueInputIteratorT d_values_in,
KeyOutputIteratorT d_keys_out,
ValueOutputIteratorT d_values_out,
NumSelectedIteratorT d_num_selected_out,
int num_items,
cudaStream_t stream = 0)
typename NumSelectedIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey(
void* d_temp_storage,
size_t& temp_storage_bytes,
KeyInputIteratorT d_keys_in,
ValueInputIteratorT d_values_in,
KeyOutputIteratorT d_keys_out,
ValueOutputIteratorT d_values_out,
NumSelectedIteratorT d_num_selected_out,
NumItemsT num_items,
cudaStream_t stream = 0)
{
using OffsetT = int;
using EqualityOp = Equality;

return DispatchUniqueByKey<KeyInputIteratorT,
ValueInputIteratorT,
KeyOutputIteratorT,
ValueOutputIteratorT,
NumSelectedIteratorT,
EqualityOp,
OffsetT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_values_in,
d_keys_out,
d_values_out,
d_num_selected_out,
EqualityOp(),
num_items,
stream);
return UniqueByKey(
d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_values_in,
d_keys_out,
d_values_out,
d_num_selected_out,
num_items,
Equality{},
stream);
}

template <typename KeyInputIteratorT,
typename ValueInputIteratorT,
typename KeyOutputIteratorT,
typename ValueOutputIteratorT,
typename NumSelectedIteratorT>
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
UniqueByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
KeyInputIteratorT d_keys_in,
ValueInputIteratorT d_values_in,
KeyOutputIteratorT d_keys_out,
ValueOutputIteratorT d_values_out,
NumSelectedIteratorT d_num_selected_out,
int num_items,
cudaStream_t stream,
bool debug_synchronous)
typename NumSelectedIteratorT,
typename NumItemsT>
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey(
void* d_temp_storage,
size_t& temp_storage_bytes,
KeyInputIteratorT d_keys_in,
ValueInputIteratorT d_values_in,
KeyOutputIteratorT d_keys_out,
ValueOutputIteratorT d_values_out,
NumSelectedIteratorT d_num_selected_out,
NumItemsT num_items,
cudaStream_t stream,
bool debug_synchronous)
{
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

return UniqueByKey<KeyInputIteratorT,
ValueInputIteratorT,
KeyOutputIteratorT,
ValueOutputIteratorT,
NumSelectedIteratorT>(d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_values_in,
d_keys_out,
d_values_out,
d_num_selected_out,
num_items,
stream);
NumSelectedIteratorT,
NumItemsT>(
d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_values_in,
d_keys_out,
d_values_out,
d_num_selected_out,
num_items,
stream);
}
};

Expand Down
Loading

0 comments on commit 50ce545

Please sign in to comment.