diff --git a/.gitignore b/.gitignore index 0d63c76bf9f..aaac92ff643 100644 --- a/.gitignore +++ b/.gitignore @@ -70,7 +70,6 @@ junit-cudf.xml test-results ## Patching -*.diff *.orig *.rej diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 379b1521bf0..25a4c9dd3ba 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -13,73 +13,34 @@ # ============================================================================= # This function finds thrust and sets any additional necessary environment variables. -function(find_and_configure_thrust VERSION) - # We only want to set `UPDATE_DISCONNECTED` while the GIT tag hasn't moved from the last time we - # cloned - set(cpm_thrust_disconnect_update "UPDATE_DISCONNECTED TRUE") - set(CPM_THRUST_CURRENT_VERSION - ${VERSION} - CACHE STRING "version of thrust we checked out" - ) - if(NOT VERSION VERSION_EQUAL CPM_THRUST_CURRENT_VERSION) - set(CPM_THRUST_CURRENT_VERSION - ${VERSION} - CACHE STRING "version of thrust we checked out" FORCE - ) - set(cpm_thrust_disconnect_update "") - endif() +function(find_and_configure_thrust) - # We currently require cuDF to always build with a custom version of thrust. This is needed so - # that build times of of cudf are kept reasonable, without this CI builds of cudf will be killed - # as some source file can take over 45 minutes to build - # - set(CPM_DOWNLOAD_ALL TRUE) - rapids_cpm_find( - Thrust ${VERSION} - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports - CPM_ARGS - GIT_REPOSITORY https://github.com/NVIDIA/thrust.git - GIT_TAG ${VERSION} - GIT_SHALLOW TRUE ${cpm_thrust_disconnect_update} - PATCH_COMMAND patch --reject-file=- -p1 -N < ${CUDF_SOURCE_DIR}/cmake/thrust.patch || true - OPTIONS "THRUST_INSTALL TRUE" - ) + include(${rapids-cmake-dir}/cpm/thrust.cmake) + include(${rapids-cmake-dir}/cpm/package_override.cmake) - if(NOT TARGET cudf::Thrust) - thrust_create_target(cudf::Thrust FROM_OPTIONS) - endif() + set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") + rapids_cpm_package_override("${cudf_patch_dir}/thrust_override.json") - if(Thrust_SOURCE_DIR) # only install thrust when we have an in-source version - include(GNUInstallDirs) - install( - DIRECTORY "${Thrust_SOURCE_DIR}/thrust" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/" - FILES_MATCHING - REGEX "\\.(h|inl)$" - ) - install( - DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/dependencies/" - FILES_MATCHING - PATTERN "*.cuh" - ) + # Make sure we install thrust into the `include/libcudf` subdirectory instead of the default + include(GNUInstallDirs) + set(CMAKE_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/libcudf") + set(CMAKE_INSTALL_LIBDIR "${CMAKE_INSTALL_INCLUDEDIR}/lib") - install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust/cmake" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/thrust/" - ) - install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub/cmake" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/dependencies/cub/" - ) + # Find or install Thrust with our custom set of patches + rapids_cpm_thrust( + NAMESPACE cudf + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports + ) + if(Thrust_SOURCE_DIR) # Store where CMake can find our custom Thrust install include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( - INSTALL Thrust [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/Thrust/]=] cudf-exports + INSTALL Thrust [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/cmake/thrust]=] + cudf-exports ) endif() endfunction() -set(CUDF_MIN_VERSION_Thrust 1.17.2) - -find_and_configure_thrust(${CUDF_MIN_VERSION_Thrust}) +find_and_configure_thrust() diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff new file mode 100644 index 00000000000..382f7dca1b0 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff @@ -0,0 +1,29 @@ +diff --git a/thrust/system/cuda/detail/dispatch.h b/thrust/system/cuda/detail/dispatch.h +index d0e3f94..76774b0 100644 +--- a/thrust/system/cuda/detail/dispatch.h ++++ b/thrust/system/cuda/detail/dispatch.h +@@ -32,9 +32,8 @@ + status = call arguments; \ + } \ + else { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ +- } ++ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ ++ } + + /** + * Dispatch between 32-bit and 64-bit index based versions of the same algorithm +@@ -52,10 +51,8 @@ + status = call arguments; \ + } \ + else { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ +- } ++ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ ++ } + /** + * Dispatch between 32-bit and 64-bit index based versions of the same algorithm + * implementation. This version allows using different token sequences for callables diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff new file mode 100644 index 00000000000..6bf165805cc --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff @@ -0,0 +1,39 @@ +diff --git a/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh +index b188c75f..3f36656f 100644 +--- a/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh ++++ b/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh +@@ -736,7 +736,7 @@ struct DeviceRadixSortPolicy + + + /// SM60 (GP100) +- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) +diff --git a/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh b/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh +index e0470ccb..6a0c2ed6 100644 +--- a/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh ++++ b/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh +@@ -280,7 +280,7 @@ struct DeviceReducePolicy + }; + + /// SM60 +- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) + typedef AgentReducePolicy< +diff --git a/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh b/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh +index c2d04588..ac2d10e0 100644 +--- a/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh ++++ b/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh +@@ -177,7 +177,7 @@ struct DeviceScanPolicy + }; + + /// SM600 +- struct Policy600 : ChainedPolicy<600, Policy600, Policy520> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + typedef AgentScanPolicy< + 128, 15, ///< Threads per block, items per thread diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff new file mode 100644 index 00000000000..e5d62e87ca4 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff @@ -0,0 +1,49 @@ +diff --git a/dependencies/cub/cub/block/block_merge_sort.cuh b/dependencies/cub/cub/block/block_merge_sort.cuh +index 4769df36..d86d6342 100644 +--- a/dependencies/cub/cub/block/block_merge_sort.cuh ++++ b/dependencies/cub/cub/block/block_merge_sort.cuh +@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, + KeyT key1 = keys_shared[keys1_beg]; + KeyT key2 = keys_shared[keys2_beg]; + +-#pragma unroll ++#pragma unroll 1 + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + bool p = (keys2_beg < keys2_end) && +@@ -383,7 +383,7 @@ public: + // + KeyT max_key = oob_default; + +- #pragma unroll ++ #pragma unroll 1 + for (int item = 1; item < ITEMS_PER_THREAD; ++item) + { + if (ITEMS_PER_THREAD * linear_tid + item < valid_items) +@@ -407,7 +407,7 @@ public: + // each thread has sorted keys + // merge sort keys in shared memory + // +- #pragma unroll ++ #pragma unroll 1 + for (int target_merged_threads_number = 2; + target_merged_threads_number <= NUM_THREADS; + target_merged_threads_number *= 2) +diff --git a/dependencies/cub/cub/thread/thread_sort.cuh b/dependencies/cub/cub/thread/thread_sort.cuh +index 5d486789..b42fb5f0 100644 +--- a/dependencies/cub/cub/thread/thread_sort.cuh ++++ b/dependencies/cub/cub/thread/thread_sort.cuh +@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], + { + constexpr bool KEYS_ONLY = std::is_same::value; + +- #pragma unroll ++ #pragma unroll 1 + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { +- #pragma unroll ++ #pragma unroll 1 + for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) + { + if (compare_op(keys[j + 1], keys[j])) + diff --git a/cpp/cmake/thirdparty/patches/thrust_override.json b/cpp/cmake/thirdparty/patches/thrust_override.json new file mode 100644 index 00000000000..f1908a64719 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_override.json @@ -0,0 +1,34 @@ + +{ + "packages" : { + "Thrust" : { + "patches" : [ + { + "file" : "Thrust/install_rules.diff", + "issue" : "Thrust 1.X installs incorrect files [https://github.com/NVIDIA/thrust/issues/1790]", + "fixed_in" : "2.0.0" + }, + { + "file" : "${current_json_dir}/thrust_transform_iter_with_reduce_by_key.diff", + "issue" : "Support transform_output_iterator as output of reduce by key [https://github.com/NVIDIA/thrust/pull/1805]", + "fixed_in" : "2.1" + }, + { + "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", + "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", + "fixed_in" : "" + }, + { + "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", + "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", + "fixed_in" : "" + }, + { + "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff", + "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", + "fixed_in" : "" + } + ] + } + } +} diff --git a/cpp/cmake/thirdparty/patches/thrust_transform_iter_with_reduce_by_key.diff b/cpp/cmake/thirdparty/patches/thrust_transform_iter_with_reduce_by_key.diff new file mode 100644 index 00000000000..035da3ef385 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_transform_iter_with_reduce_by_key.diff @@ -0,0 +1,26 @@ +diff --git a/thrust/iterator/transform_input_output_iterator.h b/thrust/iterator/transform_input_output_iterator.h +index f512a36..a5f725d 100644 +--- a/thrust/iterator/transform_input_output_iterator.h ++++ b/thrust/iterator/transform_input_output_iterator.h +@@ -102,6 +102,8 @@ template + /*! \endcond + */ + ++ transform_input_output_iterator() = default; ++ + /*! This constructor takes as argument a \c Iterator an \c InputFunction and an + * \c OutputFunction and copies them to a new \p transform_input_output_iterator + * +diff --git a/thrust/iterator/transform_output_iterator.h b/thrust/iterator/transform_output_iterator.h +index 66fb46a..4a68cb5 100644 +--- a/thrust/iterator/transform_output_iterator.h ++++ b/thrust/iterator/transform_output_iterator.h +@@ -104,6 +104,8 @@ template + /*! \endcond + */ + ++ transform_output_iterator() = default; ++ + /*! This constructor takes as argument an \c OutputIterator and an \c + * UnaryFunction and copies them to a new \p transform_output_iterator + * diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch deleted file mode 100644 index 0dd9854d4aa..00000000000 --- a/cpp/cmake/thrust.patch +++ /dev/null @@ -1,142 +0,0 @@ -diff --git a/cub/block/block_merge_sort.cuh b/cub/block/block_merge_sort.cuh -index 4769df36..d86d6342 100644 ---- a/cub/block/block_merge_sort.cuh -+++ b/cub/block/block_merge_sort.cuh -@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, - KeyT key1 = keys_shared[keys1_beg]; - KeyT key2 = keys_shared[keys2_beg]; - --#pragma unroll -+#pragma unroll 1 - for (int item = 0; item < ITEMS_PER_THREAD; ++item) - { - bool p = (keys2_beg < keys2_end) && -@@ -383,7 +383,7 @@ public: - // - KeyT max_key = oob_default; - -- #pragma unroll -+ #pragma unroll 1 - for (int item = 1; item < ITEMS_PER_THREAD; ++item) - { - if (ITEMS_PER_THREAD * linear_tid + item < valid_items) -@@ -407,7 +407,7 @@ public: - // each thread has sorted keys - // merge sort keys in shared memory - // -- #pragma unroll -+ #pragma unroll 1 - for (int target_merged_threads_number = 2; - target_merged_threads_number <= NUM_THREADS; - target_merged_threads_number *= 2) -diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh -index b188c75f..3f36656f 100644 ---- a/cub/device/dispatch/dispatch_radix_sort.cuh -+++ b/cub/device/dispatch/dispatch_radix_sort.cuh -@@ -736,7 +736,7 @@ struct DeviceRadixSortPolicy - - - /// SM60 (GP100) -- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - enum { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) -diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh -index e0470ccb..6a0c2ed6 100644 ---- a/cub/device/dispatch/dispatch_reduce.cuh -+++ b/cub/device/dispatch/dispatch_reduce.cuh -@@ -280,7 +280,7 @@ struct DeviceReducePolicy - }; - - /// SM60 -- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) - typedef AgentReducePolicy< -diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh -index c2d04588..ac2d10e0 100644 ---- a/cub/device/dispatch/dispatch_scan.cuh -+++ b/cub/device/dispatch/dispatch_scan.cuh -@@ -177,7 +177,7 @@ struct DeviceScanPolicy - }; - - /// SM600 -- struct Policy600 : ChainedPolicy<600, Policy600, Policy520> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - typedef AgentScanPolicy< - 128, 15, ///< Threads per block, items per thread -diff --git a/cub/thread/thread_sort.cuh b/cub/thread/thread_sort.cuh -index 5d486789..b42fb5f0 100644 ---- a/cub/thread/thread_sort.cuh -+++ b/cub/thread/thread_sort.cuh -@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], - { - constexpr bool KEYS_ONLY = std::is_same::value; - -- #pragma unroll -+ #pragma unroll 1 - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { -- #pragma unroll -+ #pragma unroll 1 - for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) - { - if (compare_op(keys[j + 1], keys[j])) -diff --git a/thrust/system/cuda/detail/dispatch.h b/thrust/system/cuda/detail/dispatch.h -index d0e3f94..76774b0 100644 ---- a/thrust/system/cuda/detail/dispatch.h -+++ b/thrust/system/cuda/detail/dispatch.h -@@ -32,9 +32,8 @@ - status = call arguments; \ - } \ - else { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -- } -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ -+ } - - /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm -@@ -52,10 +51,8 @@ - status = call arguments; \ - } \ - else { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -- } -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ -+ } - /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm - * implementation. This version allows using different token sequences for callables -diff --git a/thrust/iterator/transform_input_output_iterator.h b/thrust/iterator/transform_input_output_iterator.h -index f512a36..a5f725d 100644 ---- a/thrust/iterator/transform_input_output_iterator.h -+++ b/thrust/iterator/transform_input_output_iterator.h -@@ -102,6 +102,8 @@ template - /*! \endcond - */ - -+ transform_input_output_iterator() = default; -+ - /*! This constructor takes as argument a \c Iterator an \c InputFunction and an - * \c OutputFunction and copies them to a new \p transform_input_output_iterator - * -diff --git a/thrust/iterator/transform_output_iterator.h b/thrust/iterator/transform_output_iterator.h -index 66fb46a..4a68cb5 100644 ---- a/thrust/iterator/transform_output_iterator.h -+++ b/thrust/iterator/transform_output_iterator.h -@@ -104,6 +104,8 @@ template - /*! \endcond - */ - -+ transform_output_iterator() = default; -+ - /*! This constructor takes as argument an \c OutputIterator and an \c - * UnaryFunction and copies them to a new \p transform_output_iterator - *