From 0df617810ce60724633cf9ef41160e6830c4ffbe Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 10 Aug 2022 21:25:06 -0500 Subject: [PATCH] Update to Thrust 1.17.0 (#11437) Updates the bundled version of Thrust to 1.17.0. I will run benchmarks and include results in a comment below. Depends on #11457. Supersedes #10489, #10577, #10586. Closes #10841. **This should be merged concurrently with https://github.com/rapidsai/rapids-cmake/pull/231.** Authors: - Bradley Dice (https://github.com/bdice) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/11437 --- cpp/cmake/thirdparty/get_thrust.cmake | 2 +- cpp/cmake/thrust.patch | 141 ++++++++++++++++---------- 2 files changed, 88 insertions(+), 55 deletions(-) diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 927186d3f49..cbdaf5520ff 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -80,6 +80,6 @@ function(find_and_configure_thrust VERSION) endif() endfunction() -set(CUDF_MIN_VERSION_Thrust 1.15.0) +set(CUDF_MIN_VERSION_Thrust 1.17.0) find_and_configure_thrust(${CUDF_MIN_VERSION_Thrust}) diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch index 2f9201d8ab4..ae1962e4738 100644 --- a/cpp/cmake/thrust.patch +++ b/cpp/cmake/thrust.patch @@ -1,83 +1,116 @@ -diff --git a/thrust/system/cuda/detail/sort.h b/thrust/system/cuda/detail/sort.h -index 1ffeef0..5e80800 100644 ---- a/thrust/system/cuda/detail/sort.h -+++ b/thrust/system/cuda/detail/sort.h -@@ -108,7 +108,7 @@ namespace __merge_sort { - key_type key2 = keys_shared[keys2_beg]; - - +diff --git a/cub/block/block_merge_sort.cuh b/cub/block/block_merge_sort.cuh +index 4769df36..d86d6342 100644 +--- a/cub/block/block_merge_sort.cuh ++++ b/cub/block/block_merge_sort.cuh +@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, + KeyT key1 = keys_shared[keys1_beg]; + KeyT key2 = keys_shared[keys2_beg]; + -#pragma unroll +#pragma unroll 1 - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - bool p = (keys2_beg < keys2_end) && -@@ -311,10 +311,10 @@ namespace __merge_sort { - void stable_odd_even_sort(key_type (&keys)[ITEMS_PER_THREAD], - item_type (&items)[ITEMS_PER_THREAD]) + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + bool p = (keys2_beg < keys2_end) && +@@ -383,7 +383,7 @@ public: + // + KeyT max_key = oob_default; + +- #pragma unroll ++ #pragma unroll 1 + for (int item = 1; item < ITEMS_PER_THREAD; ++item) { --#pragma unroll -+#pragma unroll 1 - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { --#pragma unroll -+#pragma unroll 1 - for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) - { - if (compare_op(keys[j + 1], keys[j])) -@@ -350,7 +350,7 @@ namespace __merge_sort { - // each thread has sorted keys_loc - // merge sort keys_loc in shared memory - // --#pragma unroll -+#pragma unroll 1 - for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2) - { - sync_threadblock(); -@@ -479,7 +479,7 @@ namespace __merge_sort { - // and fill the remainig keys with it - // - key_type max_key = keys_loc[0]; --#pragma unroll -+#pragma unroll 1 - for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (ITEMS_PER_THREAD * tid + ITEM < num_remaining) -diff a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh -index 41eb1d2..f2893b4 100644 + if (ITEMS_PER_THREAD * linear_tid + item < valid_items) +@@ -407,7 +407,7 @@ public: + // each thread has sorted keys + // merge sort keys in shared memory + // +- #pragma unroll ++ #pragma unroll 1 + for (int target_merged_threads_number = 2; + target_merged_threads_number <= NUM_THREADS; + target_merged_threads_number *= 2) +diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh +index b188c75f..3f36656f 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh -@@ -723,7 +723,7 @@ struct DeviceRadixSortPolicy - - +@@ -736,7 +736,7 @@ struct DeviceRadixSortPolicy + + /// SM60 (GP100) - struct Policy600 : ChainedPolicy<600, Policy600, Policy500> + struct Policy600 : ChainedPolicy<600, Policy600, Policy600> { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) -diff a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh -index f6aee45..dd64301 100644 +diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh +index e0470ccb..6a0c2ed6 100644 --- a/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/device/dispatch/dispatch_reduce.cuh -@@ -284,7 +284,7 @@ struct DeviceReducePolicy +@@ -280,7 +280,7 @@ struct DeviceReducePolicy }; - + /// SM60 - struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + struct Policy600 : ChainedPolicy<600, Policy600, Policy600> { // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) typedef AgentReducePolicy< -diff a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh -index c0c6d59..937ee31 100644 +diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh +index c2d04588..ac2d10e0 100644 --- a/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/device/dispatch/dispatch_scan.cuh -@@ -178,7 +178,7 @@ struct DeviceScanPolicy +@@ -177,7 +177,7 @@ struct DeviceScanPolicy }; - + /// SM600 - struct Policy600 : ChainedPolicy<600, Policy600, Policy520> + struct Policy600 : ChainedPolicy<600, Policy600, Policy600> { typedef AgentScanPolicy< 128, 15, ///< Threads per block, items per thread +diff --git a/cub/thread/thread_sort.cuh b/cub/thread/thread_sort.cuh +index 5d486789..b42fb5f0 100644 +--- a/cub/thread/thread_sort.cuh ++++ b/cub/thread/thread_sort.cuh +@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], + { + constexpr bool KEYS_ONLY = std::is_same::value; + +- #pragma unroll ++ #pragma unroll 1 + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { +- #pragma unroll ++ #pragma unroll 1 + for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) + { + if (compare_op(keys[j + 1], keys[j])) +diff --git a/thrust/system/cuda/detail/dispatch.h b/thrust/system/cuda/detail/dispatch.h +index d0e3f94..76774b0 100644 +--- a/thrust/system/cuda/detail/dispatch.h ++++ b/thrust/system/cuda/detail/dispatch.h +@@ -32,9 +32,8 @@ + status = call arguments; \ + } \ + else { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ +- } ++ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ ++ } + + /** + * Dispatch between 32-bit and 64-bit index based versions of the same algorithm +@@ -52,10 +51,8 @@ + status = call arguments; \ + } \ + else { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ +- } ++ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ ++ } + /** + * Dispatch between 32-bit and 64-bit index based versions of the same algorithm + * implementation. This version allows using different token sequences for callables