diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index 059f713e7a5..e61102dffac 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,60 +3,25 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "cccl/bug_fixes.diff", - "issue" : "CCCL installs header-search.cmake files in nondeterministic order and has a typo in checking target creation that leads to duplicates", - "fixed_in" : "2.3" - }, - { - "file" : "cccl/hide_kernels.diff", - "issue" : "Mark all cub and thrust kernels with hidden visibility [https://github.com/nvidia/cccl/pulls/443]", - "fixed_in" : "2.3" - }, { "file" : "cccl/revert_pr_211.diff", "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", "fixed_in" : "" }, - { - "file" : "${current_json_dir}/revert_pr_211_cccl_2.5.0.diff", - "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", - "fixed_in" : "" - }, - { - "file": "cccl/kernel_pointer_hiding.diff", - "issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]", - "fixed_in": "2.4" - }, { "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", "fixed_in" : "" }, - { - "file" : "${current_json_dir}/thrust_disable_64bit_dispatching_cccl_2.5.0.diff", - "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", "fixed_in" : "" }, - { - "file" : "${current_json_dir}/thrust_faster_sort_compile_times_cccl_2.5.0.diff", - "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff", "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", "fixed_in" : "" - }, - { - "file" : "${current_json_dir}/thrust_faster_scan_compile_times_cccl_2.5.0.diff", - "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", - "fixed_in" : "" } ] } diff --git a/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff deleted file mode 100644 index 27ff16744f5..00000000000 --- a/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff +++ /dev/null @@ -1,47 +0,0 @@ -diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h -index 046eb83c0..8047c9701 100644 ---- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h -+++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h -@@ -53,41 +53,15 @@ namespace cuda_cub - - namespace __copy - { --template --OutputIt THRUST_RUNTIME_FUNCTION device_to_device( -- execution_policy& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::true_type) --{ -- typedef typename thrust::iterator_traits::value_type InputTy; -- const auto n = thrust::distance(first, last); -- if (n > 0) -- { -- cudaError status; -- status = trivial_copy_device_to_device( -- policy, -- reinterpret_cast(thrust::raw_pointer_cast(&*result)), -- reinterpret_cast(thrust::raw_pointer_cast(&*first)), -- n); -- cuda_cub::throw_on_error(status, "__copy:: D->D: failed"); -- } -- -- return result + n; --} - - template - OutputIt THRUST_RUNTIME_FUNCTION device_to_device( -- execution_policy& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::false_type) -+ execution_policy& policy, InputIt first, InputIt last, OutputIt result) - { - typedef typename thrust::iterator_traits::value_type InputTy; - return cuda_cub::transform(policy, first, last, result, thrust::identity()); - } - --template --OutputIt THRUST_RUNTIME_FUNCTION --device_to_device(execution_policy& policy, InputIt first, InputIt last, OutputIt result) --{ -- return device_to_device( -- policy, first, last, result, typename is_indirectly_trivially_relocatable_to::type()); --} - } // namespace __copy - - } // namespace cuda_cub diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff index d3f1a26781f..6ae1e1c917b 100644 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff @@ -1,25 +1,25 @@ diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index d0e3f94ec..5c32a9c60 100644 +index 2a3cc4e33..8fb337b26 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -32,8 +32,7 @@ - status = call arguments; \ - } \ - else { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - +@@ -44,8 +44,7 @@ + } \ + else \ + { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ ++ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } + /** -@@ -52,9 +51,7 @@ - status = call arguments; \ - } \ - else { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } +@@ -66,9 +65,7 @@ + } \ + else \ + { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ ++ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } /** * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff deleted file mode 100644 index 6ae1e1c917b..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff +++ /dev/null @@ -1,25 +0,0 @@ -diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 2a3cc4e33..8fb337b26 100644 ---- a/thrust/thrust/system/cuda/detail/dispatch.h -+++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -44,8 +44,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - - /** -@@ -66,9 +65,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff index a606e21b92d..fee46046194 100644 --- a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff +++ b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff @@ -1,23 +1,23 @@ diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -index 84b6ccffd..25a237f93 100644 +index 0606485bb..dbb99ff13 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -@@ -808,7 +808,7 @@ struct DeviceRadixSortPolicy - - - /// SM60 (GP100) -- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> +@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy + }; + + /// SM60 (GP100) +- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + enum { - enum { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh -index 994adc095..d3e6719a7 100644 +index f39613adb..75bd16ff9 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh -@@ -479,7 +479,7 @@ struct DeviceReducePolicy +@@ -488,7 +488,7 @@ struct DeviceReducePolicy }; - + /// SM60 - struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + struct Policy600 : ChainedPolicy<600, Policy600, Policy600> @@ -25,15 +25,15 @@ index 994adc095..d3e6719a7 100644 static constexpr int threads_per_block = 256; static constexpr int items_per_thread = 16; diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -index 0ea5c41ad..1bcd8a111 100644 +index 419908c4e..6ab0840e1 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -@@ -303,7 +303,7 @@ struct DeviceScanPolicy +@@ -339,7 +339,7 @@ struct DeviceScanPolicy /// SM600 struct Policy600 : DefaultTuning - , ChainedPolicy<600, Policy600, Policy520> + , ChainedPolicy<600, Policy600, Policy600> {}; - + /// SM800 diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff deleted file mode 100644 index fee46046194..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff +++ /dev/null @@ -1,39 +0,0 @@ -diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -index 0606485bb..dbb99ff13 100644 ---- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh -+++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy - }; - - /// SM60 (GP100) -- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - enum - { -diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh -index f39613adb..75bd16ff9 100644 ---- a/cub/cub/device/dispatch/dispatch_reduce.cuh -+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh -@@ -488,7 +488,7 @@ struct DeviceReducePolicy - }; - - /// SM60 -- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - static constexpr int threads_per_block = 256; - static constexpr int items_per_thread = 16; -diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -index 419908c4e..6ab0840e1 100644 ---- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh -+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -@@ -339,7 +339,7 @@ struct DeviceScanPolicy - /// SM600 - struct Policy600 - : DefaultTuning -- , ChainedPolicy<600, Policy600, Policy520> -+ , ChainedPolicy<600, Policy600, Policy600> - {}; - - /// SM800 diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff index c34b6433d10..cb0cc55f4d2 100644 --- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff +++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff @@ -1,39 +1,39 @@ diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh -index dc07ef6c2..a066c14da 100644 +index eb76ebb0b..c6c529a50 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh -@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, +@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; - + -#pragma unroll +#pragma unroll 1 for (int item = 0; item < ITEMS_PER_THREAD; ++item) { - bool p = (keys2_beg < keys2_end) && -@@ -383,7 +383,7 @@ public: + bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); +@@ -376,7 +376,7 @@ public: // KeyT max_key = oob_default; - -- #pragma unroll -+ #pragma unroll 1 + +-#pragma unroll ++#pragma unroll 1 for (int item = 1; item < ITEMS_PER_THREAD; ++item) { if (ITEMS_PER_THREAD * linear_tid + item < valid_items) diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh -index 5d4867896..b42fb5f00 100644 +index 7d9e8622f..da5627306 100644 --- a/cub/cub/thread/thread_sort.cuh +++ b/cub/cub/thread/thread_sort.cuh -@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], +@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE { - constexpr bool KEYS_ONLY = std::is_same::value; - -- #pragma unroll -+ #pragma unroll 1 + constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; + +-#pragma unroll ++#pragma unroll 1 for (int i = 0; i < ITEMS_PER_THREAD; ++i) { -- #pragma unroll -+ #pragma unroll 1 +-#pragma unroll ++#pragma unroll 1 for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) { if (compare_op(keys[j + 1], keys[j])) diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff deleted file mode 100644 index cb0cc55f4d2..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff +++ /dev/null @@ -1,39 +0,0 @@ -diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh -index eb76ebb0b..c6c529a50 100644 ---- a/cub/cub/block/block_merge_sort.cuh -+++ b/cub/cub/block/block_merge_sort.cuh -@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( - KeyT key1 = keys_shared[keys1_beg]; - KeyT key2 = keys_shared[keys2_beg]; - --#pragma unroll -+#pragma unroll 1 - for (int item = 0; item < ITEMS_PER_THREAD; ++item) - { - bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); -@@ -376,7 +376,7 @@ public: - // - KeyT max_key = oob_default; - --#pragma unroll -+#pragma unroll 1 - for (int item = 1; item < ITEMS_PER_THREAD; ++item) - { - if (ITEMS_PER_THREAD * linear_tid + item < valid_items) -diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh -index 7d9e8622f..da5627306 100644 ---- a/cub/cub/thread/thread_sort.cuh -+++ b/cub/cub/thread/thread_sort.cuh -@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE - { - constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; - --#pragma unroll -+#pragma unroll 1 - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { --#pragma unroll -+#pragma unroll 1 - for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) - { - if (compare_op(keys[j + 1], keys[j]))