From 29429f7e4c871758c0de930026347e6e3b0a5a9a Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 28 May 2024 05:47:58 -0700 Subject: [PATCH] Work around issues with cccl main (#15552) This gets cuDF build cccl main on 12.3. There is one issue with the cuco tuple helpers but that will be fixed on the cuco side --------- Co-authored-by: Bernhard Manfred Gruber Co-authored-by: Bradley Dice Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Co-authored-by: ptaylor Co-authored-by: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Co-authored-by: Yunsong Wang --- .pre-commit-config.yaml | 2 + .../thirdparty/patches/cccl_override.json | 20 ++++++++ .../patches/revert_pr_211_cccl_2.5.0.diff | 47 +++++++++++++++++++ ..._disable_64bit_dispatching_cccl_2.5.0.diff | 25 ++++++++++ ..._faster_scan_compile_times_cccl_2.5.0.diff | 39 +++++++++++++++ ..._faster_sort_compile_times_cccl_2.5.0.diff | 39 +++++++++++++++ cpp/src/io/comp/statistics.cu | 9 ++-- cpp/src/io/orc/reader_impl_decode.cu | 3 +- cpp/src/io/orc/stripe_init.cu | 22 +++++---- cpp/src/io/parquet/page_string_decode.cu | 13 +++-- cpp/src/io/parquet/reader_impl_preprocess.cu | 4 +- cpp/src/io/utilities/data_casting.cu | 6 ++- cpp/src/join/distinct_hash_join.cu | 2 +- cpp/src/strings/split/split_re.cu | 4 +- cpp/tests/hash_map/map_test.cu | 1 - 15 files changed, 209 insertions(+), 27 deletions(-) create mode 100644 cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff create mode 100644 cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff create mode 100644 cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff create mode 100644 cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5a8d9f54673..2d3ffc287e9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,11 +7,13 @@ repos: - id: trailing-whitespace exclude: | (?x)^( + ^cpp/cmake/thirdparty/patches/.*| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - id: end-of-file-fixer exclude: | (?x)^( + ^cpp/cmake/thirdparty/patches/.*| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - repo: https://github.com/PyCQA/isort diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index b33f17f3e4a..059f713e7a5 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -18,6 +18,11 @@ "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", "fixed_in" : "" }, + { + "file" : "${current_json_dir}/revert_pr_211_cccl_2.5.0.diff", + "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", + "fixed_in" : "" + }, { "file": "cccl/kernel_pointer_hiding.diff", "issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]", @@ -28,15 +33,30 @@ "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", "fixed_in" : "" }, + { + "file" : "${current_json_dir}/thrust_disable_64bit_dispatching_cccl_2.5.0.diff", + "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", + "fixed_in" : "" + }, { "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", "fixed_in" : "" }, + { + "file" : "${current_json_dir}/thrust_faster_sort_compile_times_cccl_2.5.0.diff", + "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", + "fixed_in" : "" + }, { "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff", "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", "fixed_in" : "" + }, + { + "file" : "${current_json_dir}/thrust_faster_scan_compile_times_cccl_2.5.0.diff", + "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", + "fixed_in" : "" } ] } diff --git a/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff new file mode 100644 index 00000000000..27ff16744f5 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff @@ -0,0 +1,47 @@ +diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h +index 046eb83c0..8047c9701 100644 +--- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h ++++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h +@@ -53,41 +53,15 @@ namespace cuda_cub + + namespace __copy + { +-template +-OutputIt THRUST_RUNTIME_FUNCTION device_to_device( +- execution_policy& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::true_type) +-{ +- typedef typename thrust::iterator_traits::value_type InputTy; +- const auto n = thrust::distance(first, last); +- if (n > 0) +- { +- cudaError status; +- status = trivial_copy_device_to_device( +- policy, +- reinterpret_cast(thrust::raw_pointer_cast(&*result)), +- reinterpret_cast(thrust::raw_pointer_cast(&*first)), +- n); +- cuda_cub::throw_on_error(status, "__copy:: D->D: failed"); +- } +- +- return result + n; +-} + + template + OutputIt THRUST_RUNTIME_FUNCTION device_to_device( +- execution_policy& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::false_type) ++ execution_policy& policy, InputIt first, InputIt last, OutputIt result) + { + typedef typename thrust::iterator_traits::value_type InputTy; + return cuda_cub::transform(policy, first, last, result, thrust::identity()); + } + +-template +-OutputIt THRUST_RUNTIME_FUNCTION +-device_to_device(execution_policy& policy, InputIt first, InputIt last, OutputIt result) +-{ +- return device_to_device( +- policy, first, last, result, typename is_indirectly_trivially_relocatable_to::type()); +-} + } // namespace __copy + + } // namespace cuda_cub diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff new file mode 100644 index 00000000000..6ae1e1c917b --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff @@ -0,0 +1,25 @@ +diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h +index 2a3cc4e33..8fb337b26 100644 +--- a/thrust/thrust/system/cuda/detail/dispatch.h ++++ b/thrust/thrust/system/cuda/detail/dispatch.h +@@ -44,8 +44,7 @@ + } \ + else \ + { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ ++ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } + + /** +@@ -66,9 +65,7 @@ + } \ + else \ + { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ ++ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } + /** + * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff new file mode 100644 index 00000000000..fee46046194 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff @@ -0,0 +1,39 @@ +diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh +index 0606485bb..dbb99ff13 100644 +--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh ++++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh +@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy + }; + + /// SM60 (GP100) +- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + enum + { +diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh +index f39613adb..75bd16ff9 100644 +--- a/cub/cub/device/dispatch/dispatch_reduce.cuh ++++ b/cub/cub/device/dispatch/dispatch_reduce.cuh +@@ -488,7 +488,7 @@ struct DeviceReducePolicy + }; + + /// SM60 +- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + static constexpr int threads_per_block = 256; + static constexpr int items_per_thread = 16; +diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh +index 419908c4e..6ab0840e1 100644 +--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh ++++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh +@@ -339,7 +339,7 @@ struct DeviceScanPolicy + /// SM600 + struct Policy600 + : DefaultTuning +- , ChainedPolicy<600, Policy600, Policy520> ++ , ChainedPolicy<600, Policy600, Policy600> + {}; + + /// SM800 diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff new file mode 100644 index 00000000000..cb0cc55f4d2 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff @@ -0,0 +1,39 @@ +diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh +index eb76ebb0b..c6c529a50 100644 +--- a/cub/cub/block/block_merge_sort.cuh ++++ b/cub/cub/block/block_merge_sort.cuh +@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( + KeyT key1 = keys_shared[keys1_beg]; + KeyT key2 = keys_shared[keys2_beg]; + +-#pragma unroll ++#pragma unroll 1 + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); +@@ -376,7 +376,7 @@ public: + // + KeyT max_key = oob_default; + +-#pragma unroll ++#pragma unroll 1 + for (int item = 1; item < ITEMS_PER_THREAD; ++item) + { + if (ITEMS_PER_THREAD * linear_tid + item < valid_items) +diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh +index 7d9e8622f..da5627306 100644 +--- a/cub/cub/thread/thread_sort.cuh ++++ b/cub/cub/thread/thread_sort.cuh +@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE + { + constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; + +-#pragma unroll ++#pragma unroll 1 + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { +-#pragma unroll ++#pragma unroll 1 + for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) + { + if (compare_op(keys[j + 1], keys[j])) diff --git a/cpp/src/io/comp/statistics.cu b/cpp/src/io/comp/statistics.cu index 2a9eb782800..faf967041bc 100644 --- a/cpp/src/io/comp/statistics.cu +++ b/cpp/src/io/comp/statistics.cu @@ -18,6 +18,7 @@ #include +#include #include namespace cudf::io { @@ -32,9 +33,9 @@ writer_compression_statistics collect_compression_statistics( rmm::exec_policy(stream), results.begin(), results.end(), - [] __device__(auto& res) { + cuda::proclaim_return_type([] __device__(compression_result const& res) { return res.status == compression_status::SUCCESS ? res.bytes_written : 0; - }, + }), 0ul, thrust::plus()); @@ -47,9 +48,9 @@ writer_compression_statistics collect_compression_statistics( rmm::exec_policy(stream), zipped_begin, zipped_end, - [status] __device__(auto tup) { + cuda::proclaim_return_type([status] __device__(auto tup) { return thrust::get<1>(tup).status == status ? thrust::get<0>(tup).size() : 0; - }, + }), 0ul, thrust::plus()); }; diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index ec936b85761..da9fb802a0a 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -692,8 +692,7 @@ std::vector find_table_splits(table_view const& input, d_sizes = d_segmented_sizes->view().begin()] __device__(auto const segment_idx) { // Since the number of rows may not divisible by segment_length, // the last segment may be shorter than the others. - auto const current_length = - cuda::std::min(segment_length, num_rows - segment_length * segment_idx); + auto const current_length = min(segment_length, num_rows - segment_length * segment_idx); auto const size = d_sizes[segment_idx] / CHAR_BIT; // divide by CHAR_BIT to get size in bytes return cumulative_size{static_cast(current_length), static_cast(size)}; diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index dd44b779402..89dbbcb796c 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -561,20 +561,26 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info, uint32_t log2maxcr, rmm::cuda_stream_view stream) { - dim3 dim_block(128, 1); - dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block - gpuParseCompressedStripeData<<>>( - strm_info, num_streams, compression_block_size, log2maxcr); + auto const num_blocks = (num_streams + 3) >> 2; // 1 stream per warp, 4 warps per block + if (num_blocks > 0) { + dim3 dim_block(128, 1); + dim3 dim_grid(num_blocks, 1); + gpuParseCompressedStripeData<<>>( + strm_info, num_streams, compression_block_size, log2maxcr); + } } void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info, int32_t num_streams, rmm::cuda_stream_view stream) { - dim3 dim_block(128, 1); - dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block - gpuPostDecompressionReassemble<<>>(strm_info, - num_streams); + auto const num_blocks = (num_streams + 3) >> 2; // 1 stream per warp, 4 warps per block + if (num_blocks > 0) { + dim3 dim_block(128, 1); + dim3 dim_grid(num_blocks, 1); + gpuPostDecompressionReassemble<<>>(strm_info, + num_streams); + } } void __host__ ParseRowGroupIndex(RowGroup* row_groups, diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index cf1dc58b06a..ba3d35b9586 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -1197,14 +1197,17 @@ void ComputePageStringSizes(cudf::detail::hostdevice_span pages, cudf::detail::join_streams(streams, stream); // check for needed temp space for DELTA_BYTE_ARRAY - auto const need_sizes = thrust::any_of( - rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), [] __device__(auto& page) { - return page.temp_string_size != 0; - }); + auto const need_sizes = + thrust::any_of(rmm::exec_policy(stream), + pages.device_begin(), + pages.device_end(), + cuda::proclaim_return_type( + [] __device__(auto& page) { return page.temp_string_size != 0; })); if (need_sizes) { // sum up all of the temp_string_sizes - auto const page_sizes = [] __device__(PageInfo const& page) { return page.temp_string_size; }; + auto const page_sizes = cuda::proclaim_return_type( + [] __device__(PageInfo const& page) { return page.temp_string_size; }); auto const total_size = thrust::transform_reduce(rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index f533f04e427..7cb982f103d 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -452,9 +452,9 @@ std::string encoding_to_string(Encoding encoding) [[nodiscard]] std::string list_unsupported_encodings(device_span pages, rmm::cuda_stream_view stream) { - auto const to_mask = [] __device__(auto const& page) { + auto const to_mask = cuda::proclaim_return_type([] __device__(auto const& page) { return is_supported_encoding(page.encoding) ? 0U : encoding_to_mask(page.encoding); - }; + }); uint32_t const unsupported = thrust::transform_reduce( rmm::exec_policy(stream), pages.begin(), pages.end(), to_mask, 0U, thrust::bit_or()); return encoding_bitmask_to_str(unsupported); diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index c9e507925ec..60cbfbc0dae 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -783,7 +784,8 @@ template struct to_string_view_pair { SymbolT const* data; to_string_view_pair(SymbolT const* _data) : data(_data) {} - __device__ auto operator()(thrust::tuple ip) + __device__ thrust::pair operator()( + thrust::tuple ip) { return thrust::pair{data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; @@ -805,7 +807,7 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, rmm::exec_policy(stream), str_tuples, str_tuples + col_size, - [] __device__(auto t) { return t.second; }, + cuda::proclaim_return_type([] __device__(auto t) { return t.second; }), size_type{0}, thrust::maximum{}); diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index ad401bdccba..5048da25e86 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -182,7 +182,7 @@ distinct_hash_join::inner_join(rmm::cuda_stream_view stream, thrust::make_transform_output_iterator(probe_indices->begin(), output_fn{}); auto const [probe_indices_end, _] = this->_hash_table.retrieve( - iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, stream.value()); + iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, {stream.value()}); auto const actual_size = std::distance(probe_indices_begin, probe_indices_end); build_indices->resize(actual_size, stream); diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 6785ab9c893..d72ec1085b5 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -219,9 +219,9 @@ std::unique_ptr split_re(strings_column_view const& input, rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), - [d_offsets] __device__(auto const idx) -> size_type { + cuda::proclaim_return_type([d_offsets] __device__(auto const idx) -> size_type { return static_cast(d_offsets[idx + 1] - d_offsets[idx]); - }, + }), 0, thrust::maximum{}); diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu index 4b10716706b..be2e33538b9 100644 --- a/cpp/tests/hash_map/map_test.cu +++ b/cpp/tests/hash_map/map_test.cu @@ -69,7 +69,6 @@ struct InsertTest : public cudf::test::BaseFixture { using TestTypes = ::testing::Types, key_value_types, - key_value_types, key_value_types, key_value_types>;