Skip to content

Commit

Permalink
Work around issues with cccl main (#15552)
Browse files Browse the repository at this point in the history
This gets cuDF to build against cccl main on CUDA 12.3.

There is one issue with the cuco tuple helpers, but that will be fixed on
the cuco side.

---------

Co-authored-by: Bernhard Manfred Gruber <[email protected]>
Co-authored-by: Bradley Dice <[email protected]>
Co-authored-by: Nghia Truong <[email protected]>
Co-authored-by: ptaylor <[email protected]>
Co-authored-by: Paul Taylor <[email protected]>
Co-authored-by: Yunsong Wang <[email protected]>
  • Loading branch information
7 people authored May 28, 2024
1 parent 8a40567 commit 29429f7
Show file tree
Hide file tree
Showing 15 changed files with 209 additions and 27 deletions.
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@ repos:
- id: trailing-whitespace
exclude: |
(?x)^(
^cpp/cmake/thirdparty/patches/.*|
^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
)
- id: end-of-file-fixer
exclude: |
(?x)^(
^cpp/cmake/thirdparty/patches/.*|
^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
)
- repo: https://github.com/PyCQA/isort
Expand Down
20 changes: 20 additions & 0 deletions cpp/cmake/thirdparty/patches/cccl_override.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
"issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/revert_pr_211_cccl_2.5.0.diff",
"issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
"fixed_in" : ""
},
{
"file": "cccl/kernel_pointer_hiding.diff",
"issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]",
Expand All @@ -28,15 +33,30 @@
"issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/thrust_disable_64bit_dispatching_cccl_2.5.0.diff",
"issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff",
"issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/thrust_faster_sort_compile_times_cccl_2.5.0.diff",
"issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff",
"issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/thrust_faster_scan_compile_times_cccl_2.5.0.diff",
"issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]",
"fixed_in" : ""
}
]
}
Expand Down
47 changes: 47 additions & 0 deletions cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
index 046eb83c0..8047c9701 100644
--- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
+++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
@@ -53,41 +53,15 @@ namespace cuda_cub

namespace __copy
{
-template <class Derived, class InputIt, class OutputIt>
-OutputIt THRUST_RUNTIME_FUNCTION device_to_device(
- execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::true_type)
-{
- typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
- const auto n = thrust::distance(first, last);
- if (n > 0)
- {
- cudaError status;
- status = trivial_copy_device_to_device(
- policy,
- reinterpret_cast<InputTy*>(thrust::raw_pointer_cast(&*result)),
- reinterpret_cast<InputTy const*>(thrust::raw_pointer_cast(&*first)),
- n);
- cuda_cub::throw_on_error(status, "__copy:: D->D: failed");
- }
-
- return result + n;
-}

template <class Derived, class InputIt, class OutputIt>
OutputIt THRUST_RUNTIME_FUNCTION device_to_device(
- execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::false_type)
+ execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result)
{
typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
return cuda_cub::transform(policy, first, last, result, thrust::identity<InputTy>());
}

-template <class Derived, class InputIt, class OutputIt>
-OutputIt THRUST_RUNTIME_FUNCTION
-device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result)
-{
- return device_to_device(
- policy, first, last, result, typename is_indirectly_trivially_relocatable_to<InputIt, OutputIt>::type());
-}
} // namespace __copy

} // namespace cuda_cub
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h
index 2a3cc4e33..8fb337b26 100644
--- a/thrust/thrust/system/cuda/detail/dispatch.h
+++ b/thrust/thrust/system/cuda/detail/dispatch.h
@@ -44,8 +44,7 @@
} \
else \
{ \
- auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
- status = call arguments; \
+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
}

/**
@@ -66,9 +65,7 @@
} \
else \
{ \
- auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int64_t>(count1); \
- auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2); \
- status = call arguments; \
+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
}
/**
* Dispatch between 32-bit and 64-bit index based versions of the same algorithm
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
index 0606485bb..dbb99ff13 100644
--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy
};

/// SM60 (GP100)
- struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
{
enum
{
diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
index f39613adb..75bd16ff9 100644
--- a/cub/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -488,7 +488,7 @@ struct DeviceReducePolicy
};

/// SM60
- struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
{
static constexpr int threads_per_block = 256;
static constexpr int items_per_thread = 16;
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index 419908c4e..6ab0840e1 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -339,7 +339,7 @@ struct DeviceScanPolicy
/// SM600
struct Policy600
: DefaultTuning
- , ChainedPolicy<600, Policy600, Policy520>
+ , ChainedPolicy<600, Policy600, Policy600>
{};

/// SM800
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh
index eb76ebb0b..c6c529a50 100644
--- a/cub/cub/block/block_merge_sort.cuh
+++ b/cub/cub/block/block_merge_sort.cuh
@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge(
KeyT key1 = keys_shared[keys1_beg];
KeyT key2 = keys_shared[keys2_beg];

-#pragma unroll
+#pragma unroll 1
for (int item = 0; item < ITEMS_PER_THREAD; ++item)
{
bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1));
@@ -376,7 +376,7 @@ public:
//
KeyT max_key = oob_default;

-#pragma unroll
+#pragma unroll 1
for (int item = 1; item < ITEMS_PER_THREAD; ++item)
{
if (ITEMS_PER_THREAD * linear_tid + item < valid_items)
diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh
index 7d9e8622f..da5627306 100644
--- a/cub/cub/thread/thread_sort.cuh
+++ b/cub/cub/thread/thread_sort.cuh
@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE
{
constexpr bool KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value;

-#pragma unroll
+#pragma unroll 1
for (int i = 0; i < ITEMS_PER_THREAD; ++i)
{
-#pragma unroll
+#pragma unroll 1
for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
{
if (compare_op(keys[j + 1], keys[j]))
9 changes: 5 additions & 4 deletions cpp/src/io/comp/statistics.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <rmm/exec_policy.hpp>

#include <cuda/functional>
#include <thrust/transform_reduce.h>

namespace cudf::io {
Expand All @@ -32,9 +33,9 @@ writer_compression_statistics collect_compression_statistics(
rmm::exec_policy(stream),
results.begin(),
results.end(),
[] __device__(auto& res) {
cuda::proclaim_return_type<size_t>([] __device__(compression_result const& res) {
return res.status == compression_status::SUCCESS ? res.bytes_written : 0;
},
}),
0ul,
thrust::plus<size_t>());

Expand All @@ -47,9 +48,9 @@ writer_compression_statistics collect_compression_statistics(
rmm::exec_policy(stream),
zipped_begin,
zipped_end,
[status] __device__(auto tup) {
cuda::proclaim_return_type<size_t>([status] __device__(auto tup) {
return thrust::get<1>(tup).status == status ? thrust::get<0>(tup).size() : 0;
},
}),
0ul,
thrust::plus<size_t>());
};
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/io/orc/reader_impl_decode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -692,8 +692,7 @@ std::vector<range> find_table_splits(table_view const& input,
d_sizes = d_segmented_sizes->view().begin<size_type>()] __device__(auto const segment_idx) {
// Since the number of rows may not divisible by segment_length,
// the last segment may be shorter than the others.
auto const current_length =
cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
auto const current_length = min(segment_length, num_rows - segment_length * segment_idx);
auto const size = d_sizes[segment_idx] / CHAR_BIT; // divide by CHAR_BIT to get size in bytes
return cumulative_size{static_cast<std::size_t>(current_length),
static_cast<std::size_t>(size)};
Expand Down
22 changes: 14 additions & 8 deletions cpp/src/io/orc/stripe_init.cu
Original file line number Diff line number Diff line change
Expand Up @@ -561,20 +561,26 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info,
uint32_t log2maxcr,
rmm::cuda_stream_view stream)
{
dim3 dim_block(128, 1);
dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block
gpuParseCompressedStripeData<<<dim_grid, dim_block, 0, stream.value()>>>(
strm_info, num_streams, compression_block_size, log2maxcr);
auto const num_blocks = (num_streams + 3) >> 2; // 1 stream per warp, 4 warps per block
if (num_blocks > 0) {
dim3 dim_block(128, 1);
dim3 dim_grid(num_blocks, 1);
gpuParseCompressedStripeData<<<dim_grid, dim_block, 0, stream.value()>>>(
strm_info, num_streams, compression_block_size, log2maxcr);
}
}

void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info,
int32_t num_streams,
rmm::cuda_stream_view stream)
{
dim3 dim_block(128, 1);
dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block
gpuPostDecompressionReassemble<<<dim_grid, dim_block, 0, stream.value()>>>(strm_info,
num_streams);
auto const num_blocks = (num_streams + 3) >> 2; // 1 stream per warp, 4 warps per block
if (num_blocks > 0) {
dim3 dim_block(128, 1);
dim3 dim_grid(num_blocks, 1);
gpuPostDecompressionReassemble<<<dim_grid, dim_block, 0, stream.value()>>>(strm_info,
num_streams);
}
}

void __host__ ParseRowGroupIndex(RowGroup* row_groups,
Expand Down
13 changes: 8 additions & 5 deletions cpp/src/io/parquet/page_string_decode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1197,14 +1197,17 @@ void ComputePageStringSizes(cudf::detail::hostdevice_span<PageInfo> pages,
cudf::detail::join_streams(streams, stream);

// check for needed temp space for DELTA_BYTE_ARRAY
auto const need_sizes = thrust::any_of(
rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), [] __device__(auto& page) {
return page.temp_string_size != 0;
});
auto const need_sizes =
thrust::any_of(rmm::exec_policy(stream),
pages.device_begin(),
pages.device_end(),
cuda::proclaim_return_type<bool>(
[] __device__(auto& page) { return page.temp_string_size != 0; }));

if (need_sizes) {
// sum up all of the temp_string_sizes
auto const page_sizes = [] __device__(PageInfo const& page) { return page.temp_string_size; };
auto const page_sizes = cuda::proclaim_return_type<int64_t>(
[] __device__(PageInfo const& page) { return page.temp_string_size; });
auto const total_size = thrust::transform_reduce(rmm::exec_policy(stream),
pages.device_begin(),
pages.device_end(),
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/parquet/reader_impl_preprocess.cu
Original file line number Diff line number Diff line change
Expand Up @@ -452,9 +452,9 @@ std::string encoding_to_string(Encoding encoding)
[[nodiscard]] std::string list_unsupported_encodings(device_span<PageInfo const> pages,
rmm::cuda_stream_view stream)
{
auto const to_mask = [] __device__(auto const& page) {
auto const to_mask = cuda::proclaim_return_type<uint32_t>([] __device__(auto const& page) {
return is_supported_encoding(page.encoding) ? 0U : encoding_to_mask(page.encoding);
};
});
uint32_t const unsupported = thrust::transform_reduce(
rmm::exec_policy(stream), pages.begin(), pages.end(), to_mask, 0U, thrust::bit_or<uint32_t>());
return encoding_bitmask_to_str(unsupported);
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/io/utilities/data_casting.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <rmm/resource_ref.hpp>

#include <cub/cub.cuh>
#include <cuda/functional>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
Expand Down Expand Up @@ -783,7 +784,8 @@ template <typename SymbolT>
struct to_string_view_pair {
SymbolT const* data;
to_string_view_pair(SymbolT const* _data) : data(_data) {}
__device__ auto operator()(thrust::tuple<size_type, size_type> ip)
__device__ thrust::pair<char const*, std::size_t> operator()(
thrust::tuple<size_type, size_type> ip)
{
return thrust::pair<char const*, std::size_t>{data + thrust::get<0>(ip),
static_cast<std::size_t>(thrust::get<1>(ip))};
Expand All @@ -805,7 +807,7 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
rmm::exec_policy(stream),
str_tuples,
str_tuples + col_size,
[] __device__(auto t) { return t.second; },
cuda::proclaim_return_type<std::size_t>([] __device__(auto t) { return t.second; }),
size_type{0},
thrust::maximum<size_type>{});

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/join/distinct_hash_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
thrust::make_transform_output_iterator(probe_indices->begin(), output_fn{});

auto const [probe_indices_end, _] = this->_hash_table.retrieve(
iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, stream.value());
iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, {stream.value()});

auto const actual_size = std::distance(probe_indices_begin, probe_indices_end);
build_indices->resize(actual_size, stream);
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/strings/split/split_re.cu
Original file line number Diff line number Diff line change
Expand Up @@ -219,9 +219,9 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings_count),
[d_offsets] __device__(auto const idx) -> size_type {
cuda::proclaim_return_type<size_type>([d_offsets] __device__(auto const idx) -> size_type {
return static_cast<size_type>(d_offsets[idx + 1] - d_offsets[idx]);
},
}),
0,
thrust::maximum<size_type>{});

Expand Down
1 change: 0 additions & 1 deletion cpp/tests/hash_map/map_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ struct InsertTest : public cudf::test::BaseFixture {

using TestTypes = ::testing::Types<key_value_types<int32_t, int32_t>,
key_value_types<int64_t, int64_t>,
key_value_types<int16_t, int16_t>,
key_value_types<int32_t, float>,
key_value_types<int64_t, double>>;

Expand Down

0 comments on commit 29429f7

Please sign in to comment.