From 6b134dded4b3df7d71f103fb48589eff68c839c7 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 19 Dec 2023 11:26:52 -0600 Subject: [PATCH] Update to CCCL 2.2.0. (#14576) This PR updates cuDF to CCCL 2.2.0. Do not merge until all of RAPIDS is ready to update. Depends on #14577. Replaces #13222. Authors: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ray Douglass (https://github.com/raydouglass) - Vyas Ramasubramani (https://github.com/vyasr) --- ci/test_cpp_common.sh | 1 + cpp/CMakeLists.txt | 21 ++----- .../{get_thrust.cmake => get_cccl.cmake} | 28 ++++----- cpp/cmake/thirdparty/get_libcudacxx.cmake | 35 ----------- ...hrust_override.json => cccl_override.json} | 19 +++--- .../cub_segmented_sort_with_bool_key.diff | 14 ----- .../thrust_disable_64bit_dispatching.diff | 24 ++++---- .../thrust_faster_scan_compile_times.diff | 58 +++++++++---------- .../thrust_faster_sort_compile_times.diff | 25 +++----- ...ust_transform_iter_with_reduce_by_key.diff | 26 --------- 10 files changed, 71 insertions(+), 180 deletions(-) rename cpp/cmake/thirdparty/{get_thrust.cmake => get_cccl.cmake} (59%) delete mode 100644 cpp/cmake/thirdparty/get_libcudacxx.cmake rename cpp/cmake/thirdparty/patches/{thrust_override.json => cccl_override.json} (57%) delete mode 100644 cpp/cmake/thirdparty/patches/cub_segmented_sort_with_bool_key.diff delete mode 100644 cpp/cmake/thirdparty/patches/thrust_transform_iter_with_reduce_by_key.diff diff --git a/ci/test_cpp_common.sh b/ci/test_cpp_common.sh index 37aba751a09..163d381c1d4 100644 --- a/ci/test_cpp_common.sh +++ b/ci/test_cpp_common.sh @@ -22,6 +22,7 @@ conda activate test set -u CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) + RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index dbd67a1e6ac..4163275744e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -183,17 +183,15 @@ rapids_cpm_init() include(cmake/thirdparty/get_jitify.cmake) # find nvCOMP include(cmake/thirdparty/get_nvcomp.cmake) -# find thrust/cub -include(cmake/thirdparty/get_thrust.cmake) +# find CCCL before rmm so that we get cudf's patched version of CCCL +include(cmake/thirdparty/get_cccl.cmake) # find rmm include(cmake/thirdparty/get_rmm.cmake) # find arrow include(cmake/thirdparty/get_arrow.cmake) # find dlpack include(cmake/thirdparty/get_dlpack.cmake) -# find libcu++ -include(cmake/thirdparty/get_libcudacxx.cmake) -# find cuCollections Should come after including thrust and libcudacxx +# find cuCollections, should come after including CCCL include(cmake/thirdparty/get_cucollections.cmake) # find or install GoogleTest if(CUDF_BUILD_TESTUTIL) @@ -758,7 +756,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC ${ARROW_LIBRARIES} libcudacxx::libcudacxx cudf::Thrust rmm::rmm + PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm PRIVATE cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio $ ) @@ -1009,14 +1007,6 @@ following IMPORTED GLOBAL targets: ]=] ) -set(common_code_string - [=[ -if(NOT TARGET cudf::Thrust) - thrust_create_target(cudf::Thrust FROM_OPTIONS) -endif() -]=] -) - if(CUDF_ENABLE_ARROW_PARQUET) string( APPEND @@ -1040,7 +1030,6 @@ if(testing IN_LIST cudf_FIND_COMPONENTS) endif() ]=] ) -string(APPEND install_code_string "${common_code_string}") rapids_export( INSTALL cudf @@ -1064,8 +1053,6 @@ endif() ]=] ) -string(APPEND build_code_string "${common_code_string}") - rapids_export( BUILD cudf EXPORT_SET cudf-exports ${_components_export_string} diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_cccl.cmake similarity index 59% rename from cpp/cmake/thirdparty/get_thrust.cmake rename to cpp/cmake/thirdparty/get_cccl.cmake index 67ed4287d7b..799bdd7f178 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_cccl.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,34 +12,30 @@ # the License. # ============================================================================= -# This function finds thrust and sets any additional necessary environment variables. -function(find_and_configure_thrust) +# This function finds cccl and sets any additional necessary environment variables. +function(find_and_configure_cccl) - include(${rapids-cmake-dir}/cpm/thrust.cmake) + include(${rapids-cmake-dir}/cpm/cccl.cmake) include(${rapids-cmake-dir}/cpm/package_override.cmake) set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") - rapids_cpm_package_override("${cudf_patch_dir}/thrust_override.json") + rapids_cpm_package_override("${cudf_patch_dir}/cccl_override.json") - # Make sure we install thrust into the `include/libcudf` subdirectory instead of the default + # Make sure we install cccl into the `include/libcudf` subdirectory instead of the default include(GNUInstallDirs) set(CMAKE_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/libcudf") set(CMAKE_INSTALL_LIBDIR "${CMAKE_INSTALL_INCLUDEDIR}/lib") - # Find or install Thrust with our custom set of patches - rapids_cpm_thrust( - NAMESPACE cudf - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports - ) + # Find or install CCCL with our custom set of patches + rapids_cpm_cccl(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) - # Store where CMake can find our custom Thrust install + # Store where CMake can find our custom CCCL install include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( - INSTALL Thrust [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/thrust]=] + INSTALL CCCL [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/cccl]=] EXPORT_SET cudf-exports - CONDITION Thrust_SOURCE_DIR + CONDITION CCCL_SOURCE_DIR ) endfunction() -find_and_configure_thrust() +find_and_configure_cccl() diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake deleted file mode 100644 index 285d66287f3..00000000000 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ /dev/null @@ -1,35 +0,0 @@ -# ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -# This function finds thrust and sets any additional necessary environment variables. -function(find_and_configure_libcudacxx) - # Make sure we install libcudacxx beside our patched version of thrust - include(GNUInstallDirs) - set(CMAKE_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/libcudf") - set(CMAKE_INSTALL_LIBDIR "${CMAKE_INSTALL_INCLUDEDIR}/lib") - - include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - rapids_cpm_libcudacxx(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) - - # Store where CMake can find our custom Thrust install - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - INSTALL libcudacxx - [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/libcudacxx]=] - EXPORT_SET cudf-exports - CONDITION libcudacxx_SOURCE_DIR - ) -endfunction() - -find_and_configure_libcudacxx() diff --git a/cpp/cmake/thirdparty/patches/thrust_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json similarity index 57% rename from cpp/cmake/thirdparty/patches/thrust_override.json rename to cpp/cmake/thirdparty/patches/cccl_override.json index ded2b90eeba..fa82bfb5421 100644 --- a/cpp/cmake/thirdparty/patches/thrust_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -1,17 +1,17 @@ { "packages" : { - "Thrust" : { + "CCCL" : { "patches" : [ { - "file" : "Thrust/install_rules.diff", - "issue" : "Thrust 1.X installs incorrect files [https://github.com/NVIDIA/thrust/issues/1790]", - "fixed_in" : "2.0.0" + "file" : "cccl/bug_fixes.diff", + "issue" : "CCCL installs header-search.cmake files in nondeterministic order and has a typo in checking target creation that leads to duplicates", + "fixed_in" : "2.3" }, { - "file" : "${current_json_dir}/thrust_transform_iter_with_reduce_by_key.diff", - "issue" : "Support transform_output_iterator as output of reduce by key [https://github.com/NVIDIA/thrust/pull/1805]", - "fixed_in" : "2.1" + "file" : "cccl/revert_pr_211.diff", + "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", + "fixed_in" : "" }, { "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", @@ -27,11 +27,6 @@ "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff", "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", "fixed_in" : "" - }, - { - "file" : "${current_json_dir}/cub_segmented_sort_with_bool_key.diff", - "issue" : "Fix an error in CUB DeviceSegmentedSort when the keys are bool type [https://github.com/NVIDIA/cub/issues/594]", - "fixed_in" : "2.1" } ] } diff --git a/cpp/cmake/thirdparty/patches/cub_segmented_sort_with_bool_key.diff b/cpp/cmake/thirdparty/patches/cub_segmented_sort_with_bool_key.diff deleted file mode 100644 index 7c40fd4287d..00000000000 --- a/cpp/cmake/thirdparty/patches/cub_segmented_sort_with_bool_key.diff +++ /dev/null @@ -1,14 +0,0 @@ -diff --git a/dependencies/cub/cub/agent/agent_sub_warp_merge_sort.cuh b/dependencies/cub/cub/agent/agent_sub_warp_merge_sort.cuh -index ad65f2a3..ad45a21e 100644 ---- a/dependencies/cub/cub/agent/agent_sub_warp_merge_sort.cuh -+++ b/dependencies/cub/cub/agent/agent_sub_warp_merge_sort.cuh -@@ -221,7 +221,8 @@ public: - using UnsignedBitsT = typename Traits::UnsignedBits; - UnsignedBitsT default_key_bits = IS_DESCENDING ? Traits::LOWEST_KEY - : Traits::MAX_KEY; -- KeyT oob_default = reinterpret_cast(default_key_bits); -+ KeyT oob_default = std::is_same_v ? !IS_DESCENDING -+ : reinterpret_cast(default_key_bits); - - WarpLoadKeysT(storage.load_keys) - .Load(keys_input, keys, segment_size, oob_default); diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff index 382f7dca1b0..d3f1a26781f 100644 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff @@ -1,29 +1,25 @@ -diff --git a/thrust/system/cuda/detail/dispatch.h b/thrust/system/cuda/detail/dispatch.h -index d0e3f94..76774b0 100644 ---- a/thrust/system/cuda/detail/dispatch.h -+++ b/thrust/system/cuda/detail/dispatch.h -@@ -32,9 +32,8 @@ +diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h +index d0e3f94ec..5c32a9c60 100644 +--- a/thrust/thrust/system/cuda/detail/dispatch.h ++++ b/thrust/thrust/system/cuda/detail/dispatch.h +@@ -32,8 +32,7 @@ status = call arguments; \ } \ else { \ - auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ - status = call arguments; \ -- } -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ -+ } ++ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm -@@ -52,10 +51,8 @@ +@@ -52,9 +51,7 @@ status = call arguments; \ } \ else { \ - auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ - auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ - status = call arguments; \ -- } -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ -+ } ++ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } /** * Dispatch between 32-bit and 64-bit index based versions of the same algorithm - * implementation. This version allows using different token sequences for callables diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff index 6bf165805cc..a606e21b92d 100644 --- a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff +++ b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff @@ -1,8 +1,8 @@ -diff --git a/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh -index b188c75f..3f36656f 100644 ---- a/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh -+++ b/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh -@@ -736,7 +736,7 @@ struct DeviceRadixSortPolicy +diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh +index 84b6ccffd..25a237f93 100644 +--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh ++++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh +@@ -808,7 +808,7 @@ struct DeviceRadixSortPolicy /// SM60 (GP100) @@ -11,29 +11,29 @@ index b188c75f..3f36656f 100644 { enum { PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) -diff --git a/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh b/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh -index e0470ccb..6a0c2ed6 100644 ---- a/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh -+++ b/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh -@@ -280,7 +280,7 @@ struct DeviceReducePolicy - }; +diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh +index 994adc095..d3e6719a7 100644 +--- a/cub/cub/device/dispatch/dispatch_reduce.cuh ++++ b/cub/cub/device/dispatch/dispatch_reduce.cuh +@@ -479,7 +479,7 @@ struct DeviceReducePolicy + }; - /// SM60 -- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) - typedef AgentReducePolicy< -diff --git a/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh b/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh -index c2d04588..ac2d10e0 100644 ---- a/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh -+++ b/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh -@@ -177,7 +177,7 @@ struct DeviceScanPolicy - }; + /// SM60 +- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + static constexpr int threads_per_block = 256; + static constexpr int items_per_thread = 16; +diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh +index 0ea5c41ad..1bcd8a111 100644 +--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh ++++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh +@@ -303,7 +303,7 @@ struct DeviceScanPolicy + /// SM600 + struct Policy600 + : DefaultTuning +- , ChainedPolicy<600, Policy600, Policy520> ++ , ChainedPolicy<600, Policy600, Policy600> + {}; - /// SM600 -- struct Policy600 : ChainedPolicy<600, Policy600, Policy520> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - typedef AgentScanPolicy< - 128, 15, ///< Threads per block, items per thread + /// SM800 diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff index 864c89d4504..c34b6433d10 100644 --- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff +++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff @@ -1,7 +1,7 @@ -diff --git a/dependencies/cub/cub/block/block_merge_sort.cuh b/dependencies/cub/cub/block/block_merge_sort.cuh -index 4769df36..d86d6342 100644 ---- a/dependencies/cub/cub/block/block_merge_sort.cuh -+++ b/dependencies/cub/cub/block/block_merge_sort.cuh +diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh +index dc07ef6c2..a066c14da 100644 +--- a/cub/cub/block/block_merge_sort.cuh ++++ b/cub/cub/block/block_merge_sort.cuh @@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; @@ -20,19 +20,10 @@ index 4769df36..d86d6342 100644 for (int item = 1; item < ITEMS_PER_THREAD; ++item) { if (ITEMS_PER_THREAD * linear_tid + item < valid_items) -@@ -407,7 +407,7 @@ public: - // each thread has sorted keys - // merge sort keys in shared memory - // -- #pragma unroll -+ #pragma unroll 1 - for (int target_merged_threads_number = 2; - target_merged_threads_number <= NUM_THREADS; - target_merged_threads_number *= 2) -diff --git a/dependencies/cub/cub/thread/thread_sort.cuh b/dependencies/cub/cub/thread/thread_sort.cuh -index 5d486789..b42fb5f0 100644 ---- a/dependencies/cub/cub/thread/thread_sort.cuh -+++ b/dependencies/cub/cub/thread/thread_sort.cuh +diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh +index 5d4867896..b42fb5f00 100644 +--- a/cub/cub/thread/thread_sort.cuh ++++ b/cub/cub/thread/thread_sort.cuh @@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], { constexpr bool KEYS_ONLY = std::is_same::value; diff --git a/cpp/cmake/thirdparty/patches/thrust_transform_iter_with_reduce_by_key.diff b/cpp/cmake/thirdparty/patches/thrust_transform_iter_with_reduce_by_key.diff deleted file mode 100644 index 6a56af90d0d..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_transform_iter_with_reduce_by_key.diff +++ /dev/null @@ -1,26 +0,0 @@ -diff --git a/thrust/iterator/transform_input_output_iterator.h b/thrust/iterator/transform_input_output_iterator.h -index f512a36..a5f725d 100644 ---- a/thrust/iterator/transform_input_output_iterator.h -+++ b/thrust/iterator/transform_input_output_iterator.h -@@ -102,6 +102,8 @@ template - /*! \endcond - */ - -+ transform_input_output_iterator() = default; -+ - /*! This constructor takes as argument a \c Iterator an \c InputFunction and an - * \c OutputFunction and copies them to a new \p transform_input_output_iterator - * -diff --git a/thrust/iterator/transform_output_iterator.h b/thrust/iterator/transform_output_iterator.h -index 66fb46a..4a68cb5 100644 ---- a/thrust/iterator/transform_output_iterator.h -+++ b/thrust/iterator/transform_output_iterator.h -@@ -104,6 +104,8 @@ template - /*! \endcond - */ - -+ transform_output_iterator() = default; -+ - /*! This constructor takes as argument an \c OutputIterator and an \c - * UnaryFunction and copies them to a new \p transform_output_iterator - *