Skip to content

Commit

Permalink
Deprecate cub::{min, max} and replace internal uses with those from…
Browse files Browse the repository at this point in the history
… libcu++ (#3419)

* Deprecate `cub::{min, max}` and replace internal uses with those from libcu++

Fixes #3404
  • Loading branch information
miscco authored Jan 18, 2025
1 parent fdf3efb commit cac3738
Show file tree
Hide file tree
Showing 23 changed files with 101 additions and 66 deletions.
4 changes: 3 additions & 1 deletion cub/benchmarks/bench/radix_sort/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <cub/device/device_radix_sort.cuh>
#include <cub/util_arch.cuh>

#include <cuda/std/functional>
#include <cuda/std/type_traits>

#include <nvbench_helper.cuh>
Expand Down Expand Up @@ -109,7 +110,8 @@ constexpr std::size_t max_onesweep_temp_storage_size()
using hist_policy = typename policy_hub_t<KeyT, ValueT, OffsetT>::policy_t::HistogramPolicy;
using hist_agent = cub::AgentRadixSortHistogram<hist_policy, is_descending, KeyT, OffsetT>;

return cub::max(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage));
return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage),
sizeof(typename hist_agent::TempStorage));
}

template <typename KeyT, typename ValueT, typename OffsetT>
Expand Down
4 changes: 3 additions & 1 deletion cub/benchmarks/bench/radix_sort/pairs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <cub/device/device_radix_sort.cuh>
#include <cub/util_arch.cuh>

#include <cuda/std/functional>
#include <cuda/std/type_traits>

#include <nvbench_helper.cuh>
Expand Down Expand Up @@ -107,7 +108,8 @@ constexpr std::size_t max_onesweep_temp_storage_size()
using hist_policy = typename policy_hub_t<KeyT, ValueT, OffsetT>::policy_t::HistogramPolicy;
using hist_agent = cub::AgentRadixSortHistogram<hist_policy, is_descending, KeyT, OffsetT>;

return cub::max(sizeof(typename agent_radix_sort_onesweep_t::TempStorage), sizeof(typename hist_agent::TempStorage));
return (::cuda::std::max)(sizeof(typename agent_radix_sort_onesweep_t::TempStorage),
sizeof(typename hist_agent::TempStorage));
}

template <typename KeyT, typename ValueT, typename OffsetT>
Expand Down
9 changes: 5 additions & 4 deletions cub/cub/agent/agent_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@

#include <thrust/system/cuda/detail/core/util.h>

#include <cuda/std/__cccl/dialect.h>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h>

CUB_NAMESPACE_BEGIN
namespace detail
Expand Down Expand Up @@ -116,7 +117,7 @@ struct agent_t
const Offset partition_end = merge_partitions[tile_idx + 1];

const Offset diag0 = items_per_tile * tile_idx;
const Offset diag1 = (cub::min)(keys1_count + keys2_count, diag0 + items_per_tile);
const Offset diag1 = (::cuda::std::min)(keys1_count + keys2_count, diag0 + items_per_tile);

// compute bounding box for keys1 & keys2
const Offset keys1_beg = partition_beg;
Expand All @@ -136,7 +137,7 @@ struct agent_t

// use binary search in shared memory to find merge path for each of thread.
// we can use int type here, because the number of items in shared memory is limited
const int diag0_loc = min<int>(num_keys1 + num_keys2, items_per_thread * threadIdx.x);
const int diag0_loc = (::cuda::std::min)(num_keys1 + num_keys2, static_cast<int>(items_per_thread * threadIdx.x));

const int keys1_beg_loc =
MergePath(&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_loc, compare_op);
Expand Down Expand Up @@ -215,7 +216,7 @@ struct agent_t
const Offset tile_base = tile_idx * items_per_tile;
// TODO(bgruber): random mixing of int and Offset
const int items_in_tile =
static_cast<int>(cub::min(static_cast<Offset>(items_per_tile), keys1_count + keys2_count - tile_base));
static_cast<int>((::cuda::std::min)(static_cast<Offset>(items_per_tile), keys1_count + keys2_count - tile_base));
if (items_in_tile == items_per_tile)
{
consume_tile<true>(tile_idx, tile_base, items_per_tile); // full tile
Expand Down
27 changes: 15 additions & 12 deletions cub/cub/agent/agent_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@

#include <thrust/system/cuda/detail/core/util.h>

#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h>

CUB_NAMESPACE_BEGIN

template <int _BLOCK_THREADS,
Expand Down Expand Up @@ -156,7 +159,7 @@ struct AgentBlockSort
auto tile_idx = static_cast<OffsetT>(blockIdx.x);
auto num_tiles = static_cast<OffsetT>(gridDim.x);
auto tile_base = tile_idx * ITEMS_PER_TILE;
int items_in_tile = (cub::min)(keys_count - tile_base, int{ITEMS_PER_TILE});
int items_in_tile = (::cuda::std::min)(static_cast<int>(keys_count - tile_base), int{ITEMS_PER_TILE});

if (tile_idx < num_tiles - 1)
{
Expand Down Expand Up @@ -335,10 +338,10 @@ struct AgentPartition
// partition_idx / target_merged_tiles_number
const OffsetT local_tile_idx = mask & partition_idx;

const OffsetT keys1_beg = (cub::min)(keys_count, start);
const OffsetT keys1_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(start, size));
const OffsetT keys1_beg = (::cuda::std::min)(keys_count, start);
const OffsetT keys1_end = (::cuda::std::min)(keys_count, detail::safe_add_bound_to_max(start, size));
const OffsetT keys2_beg = keys1_end;
const OffsetT keys2_end = (cub::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size));
const OffsetT keys2_end = (::cuda::std::min)(keys_count, detail::safe_add_bound_to_max(keys2_beg, size));

_CCCL_PDL_GRID_DEPENDENCY_SYNC();

Expand All @@ -349,7 +352,7 @@ struct AgentPartition
}
else
{
const OffsetT partition_at = (cub::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx);
const OffsetT partition_at = (::cuda::std::min)(keys2_end - keys1_beg, items_per_tile * local_tile_idx);

OffsetT partition_diag =
ping
Expand Down Expand Up @@ -526,15 +529,15 @@ struct AgentMerge
// diag >= keys1_beg, because diag is the distance of the total merge path so far (keys1 + keys2)
// diag+ITEMS_PER_TILE >= keys1_end, because diag+ITEMS_PER_TILE is the distance of the merge path for the next tile
// and keys1_end is key1's component of that path
const OffsetT keys2_beg = (cub::min)(max_keys2, diag - keys1_beg);
OffsetT keys2_end =
(cub::min)(max_keys2, detail::safe_add_bound_to_max(diag, static_cast<OffsetT>(ITEMS_PER_TILE)) - keys1_end);
const OffsetT keys2_beg = (::cuda::std::min)(max_keys2, diag - keys1_beg);
OffsetT keys2_end = (::cuda::std::min)(
max_keys2, detail::safe_add_bound_to_max(diag, static_cast<OffsetT>(ITEMS_PER_TILE)) - keys1_end);

// Check if it's the last tile in the tile group being merged
if (mask == (mask & tile_idx))
{
keys1_end = (cub::min)(keys_count - start, size);
keys2_end = (cub::min)(max_keys2, size);
keys1_end = (::cuda::std::min)(keys_count - start, size);
keys2_end = (::cuda::std::min)(max_keys2, size);
}

// number of keys per tile
Expand Down Expand Up @@ -591,7 +594,7 @@ struct AgentMerge
// we can use int type here, because the number of
// items in shared memory is limited
//
const int diag0_local = (cub::min)(num_keys1 + num_keys2, ITEMS_PER_THREAD * tid);
const int diag0_local = (::cuda::std::min)(num_keys1 + num_keys2, ITEMS_PER_THREAD * tid);

const int keys1_beg_local = MergePath(
&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_local, compare_op);
Expand Down Expand Up @@ -731,7 +734,7 @@ struct AgentMerge
const OffsetT tile_base = OffsetT(tile_idx) * ITEMS_PER_TILE;
const int tid = static_cast<int>(threadIdx.x);
const int items_in_tile =
static_cast<int>((cub::min)(static_cast<OffsetT>(ITEMS_PER_TILE), keys_count - tile_base));
static_cast<int>((::cuda::std::min)(static_cast<OffsetT>(ITEMS_PER_TILE), keys_count - tile_base));

if (tile_idx < num_tiles - 1)
{
Expand Down
10 changes: 6 additions & 4 deletions cub/cub/agent/agent_spmv_orig.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@
#include <cub/thread/thread_search.cuh>
#include <cub/util_type.cuh>

#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h>
#include <cuda/std/type_traits>

#include <iterator>
Expand Down Expand Up @@ -376,8 +378,8 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv
// Gather the row end-offsets for the merge tile into shared memory
for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS)
{
const OffsetT offset =
(cub::min)(static_cast<OffsetT>(tile_start_coord.x + item), static_cast<OffsetT>(spmv_params.num_rows - 1));
const OffsetT offset = (::cuda::std::min)(
static_cast<OffsetT>(tile_start_coord.x + item), static_cast<OffsetT>(spmv_params.num_rows - 1));
s_tile_row_end_offsets[item] = wd_row_end_offsets[offset];
}

Expand Down Expand Up @@ -557,8 +559,8 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv
#pragma unroll 1
for (int item = threadIdx.x; item < tile_num_rows + ITEMS_PER_THREAD; item += BLOCK_THREADS)
{
const OffsetT offset =
(cub::min)(static_cast<OffsetT>(tile_start_coord.x + item), static_cast<OffsetT>(spmv_params.num_rows - 1));
const OffsetT offset = (::cuda::std::min)(
static_cast<OffsetT>(tile_start_coord.x + item), static_cast<OffsetT>(spmv_params.num_rows - 1));
s_tile_row_end_offsets[item] = wd_row_end_offsets[offset];
}

Expand Down
12 changes: 7 additions & 5 deletions cub/cub/block/block_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h>
#include <cuda/std/type_traits>

CUB_NAMESPACE_BEGIN
Expand All @@ -58,7 +60,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE OffsetT
MergePath(KeyIt1 keys1, KeyIt2 keys2, OffsetT keys1_count, OffsetT keys2_count, OffsetT diag, BinaryPred binary_pred)
{
OffsetT keys1_begin = diag < keys2_count ? 0 : diag - keys2_count;
OffsetT keys1_end = (cub::min)(diag, keys1_count);
OffsetT keys1_end = (::cuda::std::min)(diag, keys1_count);

while (keys1_begin < keys1_end)
{
Expand Down Expand Up @@ -425,12 +427,12 @@ public:

int thread_idx_in_thread_group_being_merged = mask & linear_tid;

int diag = (cub::min)(valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged);
int diag = (::cuda::std::min)(valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged);

int keys1_beg = (cub::min)(valid_items, start);
int keys1_end = (cub::min)(valid_items, keys1_beg + size);
int keys1_beg = (::cuda::std::min)(valid_items, start);
int keys1_end = (::cuda::std::min)(valid_items, keys1_beg + size);
int keys2_beg = keys1_end;
int keys2_end = (cub::min)(valid_items, keys2_beg + size);
int keys2_end = (::cuda::std::min)(valid_items, keys2_beg + size);

int keys1_count = keys1_end - keys1_beg;
int keys2_count = keys2_end - keys2_beg;
Expand Down
5 changes: 4 additions & 1 deletion cub/cub/block/block_run_length_decode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h>

#include <limits>
#include <type_traits>

Expand Down Expand Up @@ -284,7 +287,7 @@ private:
for (int i = 0; i <= Log2<MAX_NUM_ITEMS>::VALUE; i++)
{
OffsetT mid = cub::MidPoint<OffsetT>(lower_bound, upper_bound);
mid = (cub::min)(mid, num_items - 1);
mid = (::cuda::std::min)(mid, num_items - 1);

if (val < input[mid])
{
Expand Down
4 changes: 3 additions & 1 deletion cub/cub/block/radix_rank_sort_operations.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@

#include <thrust/type_traits/integer_sequence.h>

#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h>
#include <cuda/std/cstdint>
#include <cuda/std/tuple>
#include <cuda/std/type_traits>
Expand Down Expand Up @@ -437,7 +439,7 @@ struct digit_f
using traits = traits_t<typename ::cuda::std::remove_cv<T>::type>;
using bit_ordered_type = typename traits::bit_ordered_type;

const ::cuda::std::uint32_t bits_to_copy = min(src_size - src_bit_start, num_bits);
const ::cuda::std::uint32_t bits_to_copy = (::cuda::std::min)(src_size - src_bit_start, num_bits);

if (bits_to_copy)
{
Expand Down
4 changes: 3 additions & 1 deletion cub/cub/detail/temporary_storage.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#include <cub/util_namespace.cuh>
#include <cub/util_temporary_storage.cuh>

#include <cuda/std/__algorithm/max.h>

CUB_NAMESPACE_BEGIN

namespace detail
Expand Down Expand Up @@ -96,7 +98,7 @@ public:
private:
_CCCL_HOST_DEVICE void set_bytes_required(std::size_t new_size)
{
m_size = (max) (m_size, new_size);
m_size = (::cuda::std::max)(m_size, new_size);
}

_CCCL_HOST_DEVICE std::size_t get_bytes_required() const
Expand Down
4 changes: 3 additions & 1 deletion cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h>
#include <cuda/std/type_traits>

#include <cstdint>
Expand Down Expand Up @@ -173,7 +175,7 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLO
copy_items<IsMemcpy, BLOCK_THREADS, InputBufferT, OutputBufferT, BufferSizeT>(
input_buffer_it[buffer_id],
output_buffer_it[buffer_id],
(cub::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE),
(::cuda::std::min)(buffer_sizes[buffer_id] - tile_offset_within_buffer, TILE_SIZE),
tile_offset_within_buffer);
}

Expand Down
5 changes: 4 additions & 1 deletion cub/cub/device/dispatch/dispatch_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h>

CUB_NAMESPACE_BEGIN
namespace detail
{
Expand Down Expand Up @@ -80,7 +83,7 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel(
const Offset partition_idx = blockDim.x * blockIdx.x + threadIdx.x;
if (partition_idx < num_partitions)
{
const Offset partition_at = (cub::min)(partition_idx * items_per_tile, keys1_count + keys2_count);
const Offset partition_at = (::cuda::std::min)(partition_idx * items_per_tile, keys1_count + keys2_count);
merge_partitions[partition_idx] = cub::MergePath(keys1, keys2, keys1_count, keys2_count, partition_at, compare_op);
}
}
Expand Down
9 changes: 6 additions & 3 deletions cub/cub/device/dispatch/dispatch_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@
#include <thrust/detail/integer_math.h>
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h>
#include <cuda/std/type_traits>

CUB_NAMESPACE_BEGIN
Expand Down Expand Up @@ -125,9 +127,10 @@ private:
// Use fallback if either (a) the default block sort or (b) the block merge agent exceed the maximum shared memory
// available per block and both (1) the fallback block sort and (2) the fallback merge agent would not exceed the
// available shared memory
static constexpr auto max_default_size = (cub::max)(block_sort_helper_t::default_size, merge_helper_t::default_size);
static constexpr auto max_default_size =
(::cuda::std::max)(block_sort_helper_t::default_size, merge_helper_t::default_size);
static constexpr auto max_fallback_size =
(cub::max)(block_sort_helper_t::fallback_size, merge_helper_t::fallback_size);
(::cuda::std::max)(block_sort_helper_t::fallback_size, merge_helper_t::fallback_size);
static constexpr bool uses_fallback_policy =
(max_default_size > max_smem_per_block) && (max_fallback_size <= max_smem_per_block);

Expand Down Expand Up @@ -445,7 +448,7 @@ struct DispatchMergeSort
*/
const std::size_t block_sort_smem_size = num_tiles * BlockSortVSmemHelperT::vsmem_per_block;
const std::size_t merge_smem_size = num_tiles * MergeAgentVSmemHelperT::vsmem_per_block;
const std::size_t virtual_shared_memory_size = (cub::max)(block_sort_smem_size, merge_smem_size);
const std::size_t virtual_shared_memory_size = (::cuda::std::max)(block_sort_smem_size, merge_smem_size);

void* allocations[4] = {nullptr, nullptr, nullptr, nullptr};
std::size_t allocation_sizes[4] = {
Expand Down
12 changes: 7 additions & 5 deletions cub/cub/device/dispatch/dispatch_segmented_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <cuda/cmath>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h>
#include <cuda/std/type_traits>

#include <type_traits>
Expand Down Expand Up @@ -232,7 +234,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD
{
// Sort by a CTA with multiple reads from global memory
int current_bit = begin_bit;
int pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit));
int pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit));

d_keys_double_buffer = cub::detail::device_double_buffer<KeyT>(
d_keys_double_buffer.current() + segment_begin, d_keys_double_buffer.alternate() + segment_begin);
Expand All @@ -255,7 +257,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD
#pragma unroll 1
while (current_bit < end_bit)
{
pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit));
pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit));

__syncthreads();
agent.ProcessIterative(
Expand Down Expand Up @@ -510,7 +512,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD
{
// Sort reading global memory multiple times
int current_bit = begin_bit;
int pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit));
int pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit));

d_keys_double_buffer = cub::detail::device_double_buffer<KeyT>(
d_keys_double_buffer.current() + segment_begin, d_keys_double_buffer.alternate() + segment_begin);
Expand All @@ -533,7 +535,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD
#pragma unroll 1
while (current_bit < end_bit)
{
pass_bits = (cub::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit));
pass_bits = (::cuda::std::min)(int{LargeSegmentPolicyT::RADIX_BITS}, (end_bit - current_bit));

__syncthreads();
agent.ProcessIterative(
Expand Down Expand Up @@ -958,7 +960,7 @@ struct DispatchSegmentedSort
constexpr auto num_segments_per_invocation_limit =
static_cast<global_segment_offset_t>(::cuda::std::numeric_limits<int>::max());
auto const max_num_segments_per_invocation = static_cast<global_segment_offset_t>(
::cuda::std::min(static_cast<global_segment_offset_t>(num_segments), num_segments_per_invocation_limit));
(::cuda::std::min)(static_cast<global_segment_offset_t>(num_segments), num_segments_per_invocation_limit));

large_and_medium_segments_indices.grow(max_num_segments_per_invocation);
small_segments_indices.grow(max_num_segments_per_invocation);
Expand Down
Loading

0 comments on commit cac3738

Please sign in to comment.