From 8994dc477a99a3d4c1c1ba78dd42e5357ef31947 Mon Sep 17 00:00:00 2001 From: Allison Piper Date: Tue, 19 Nov 2024 14:07:40 -0500 Subject: [PATCH] Add `thrust_create_target` `DISPATCH` option. (#2844) * Remove the THRUST_DISPATCH_TYPE header tests. These will end up specifying conflicting flags when `THRUST_DISPATCH_TYPE` is set to something other than `Dynamic`. * Add `DISPATCH` option to `thrust_create_target`. ``` thrust_create_target(TargetName DISPATCH [Dynamic|Force32bit|Force64bit] ``` * Skip 64-bit offset tests when forcing 32-bit dispatch. * Add 32/64-bit dispatch jobs to nightly CI. * Add 32-bit dispatch to pull_request workflow. --- ci/matrix.yaml | 6 ++- lib/cmake/thrust/README.md | 45 +++++++++++++---- lib/cmake/thrust/thrust-config.cmake | 50 ++++++++++++++++--- thrust/CMakeLists.txt | 2 +- thrust/cmake/ThrustBuildCompilerTargets.cmake | 6 --- thrust/cmake/ThrustBuildTargetList.cmake | 1 + thrust/cmake/ThrustHeaderTesting.cmake | 7 --- thrust/testing/copy.cu | 4 ++ thrust/testing/count.cu | 2 + thrust/testing/cuda/adjacent_difference.cu | 2 + thrust/testing/cuda/partition.cu | 3 ++ thrust/testing/cuda/reduce_by_key.cu | 2 + thrust/testing/cuda/sort.cu | 7 ++- thrust/testing/inner_product.cu | 2 + thrust/testing/max_element.cu | 2 + thrust/testing/min_element.cu | 2 + thrust/testing/minmax_element.cu | 2 + thrust/testing/reduce.cu | 2 + thrust/testing/scan.cu | 4 ++ thrust/testing/set_difference.cu | 3 ++ thrust/testing/set_intersection.cu | 2 + thrust/testing/unique_by_key.cu | 4 ++ 22 files changed, 125 insertions(+), 35 deletions(-) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 2586fab09e6..ae5cbfee083 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -28,6 +28,8 @@ workflows: - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'} - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'} - {jobs: ['build'], std: 'all', cxx: ['gcc'], sm: '90a'} + # Test Thrust 32-bit-only dispatch here, since it's most likely to break. 
64-bit-only is tested in nightly. + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} # default_projects: clang-cuda - {jobs: ['build'], std: 'all', cudacxx: 'clang', cxx: 'clang'} - {jobs: ['build'], project: 'libcudacxx', std: 'all', cudacxx: 'clang', cxx: 'clang', sm: '90'} @@ -58,13 +60,15 @@ workflows: - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']} nightly: - # Increased test coverage compared to nightlies: + # Increased test coverage compared to pull_request: - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']} - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']} - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']} - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18']} # Edge-case jobs - {jobs: ['limited'], project: 'cub', std: 17} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit'} # # These are waiting on the NVKS nodes: # - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11]} diff --git a/lib/cmake/thrust/README.md b/lib/cmake/thrust/README.md index 09cbc6debb5..165be15662c 100644 --- a/lib/cmake/thrust/README.md +++ b/lib/cmake/thrust/README.md @@ -38,6 +38,22 @@ If using Thrust from the CCCL sources, this would be $ cmake . -DThrust_DIR=/thrust/thrust/cmake/ ``` +#### Large Array (64-bit offset) Handling: `DISPATCH` + +The `DISPATCH` option allows users to select the tradeoff of compile-time / binary-size +vs. performance vs. scalability when given large inputs that require 64-bit offset types. +This currently only applies when DEVICE=CUDA. + +- `Dynamic` may compile each kernel twice, once for 32-bit offsets and again for 64-bit + offsets, and choose dynamically using the input size at runtime. 
+ This significantly increases compile-time and binary-size, but provides optimal performance + for small input sizes while also supporting 64-bit indexed workloads. +- `Force32bit` forces Thrust to use a 32-bit offset type. This improves compile time and + binary size but limits the input size. +- `Force64bit` forces Thrust to use a 64-bit offset type. This improves compile time and + binary size and allows large input sizes. However, it may degrade runtime performance + for 32-bit indexed workloads. + #### TBB / OpenMP To explicitly specify host/device systems, `HOST` and `DEVICE` arguments can be @@ -56,17 +72,21 @@ host system, but will find and use TBB or OpenMP for the device system. To allow a Thrust target to be configurable easily via `cmake-gui` or `ccmake`, pass the `FROM_OPTIONS` flag to `thrust_create_target`. This will add -`THRUST_HOST_SYSTEM` and `THRUST_DEVICE_SYSTEM` options to the CMake cache that -allow selection from the systems supported by this version of Thrust. +`THRUST_HOST_SYSTEM`, `THRUST_DEVICE_SYSTEM`, and `THRUST_DISPATCH_TYPE` options +to the CMake cache that allow selection from the systems supported by this version +of Thrust. ```cmake thrust_create_target(Thrust FROM_OPTIONS [HOST_OPTION