diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 2586fab09e6..ae5cbfee083 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -28,6 +28,8 @@ workflows: - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'} - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'} - {jobs: ['build'], std: 'all', cxx: ['gcc'], sm: '90a'} + # Test Thrust 32-bit-only dispatch here, since it's most likely to break. 64-bit-only is tested in nightly. + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} # default_projects: clang-cuda - {jobs: ['build'], std: 'all', cudacxx: 'clang', cxx: 'clang'} - {jobs: ['build'], project: 'libcudacxx', std: 'all', cudacxx: 'clang', cxx: 'clang', sm: '90'} @@ -58,13 +60,15 @@ workflows: - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc', 'clang']} nightly: - # Increased test coverage compared to nightlies: + # Increased test coverage compared to pull_request: - {jobs: ['test'], std: 'all', cxx: ['gcc13', 'clang18', 'msvc2022']} - {jobs: ['test'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc12']} - {jobs: ['test'], project: 'cudax', ctk: ['12.0' ], std: 'all', cxx: ['clang14']} - {jobs: ['test'], project: 'cudax', ctk: [ 'curr'], std: 'all', cxx: ['clang18']} # Edge-case jobs - {jobs: ['limited'], project: 'cub', std: 17} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit'} + - {jobs: ['test_gpu'], project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force64bit'} # # These are waiting on the NVKS nodes: # - {jobs: ['test'], ctk: '11.1', gpu: 'v100', sm: 'gpu', cxx: 'gcc6', std: [11]} diff --git a/lib/cmake/thrust/README.md b/lib/cmake/thrust/README.md index 09cbc6debb5..165be15662c 100644 --- a/lib/cmake/thrust/README.md +++ b/lib/cmake/thrust/README.md @@ -38,6 +38,22 @@ If using Thrust from the CCCL sources, this would be $ cmake . 
-DThrust_DIR=/thrust/thrust/cmake/ ``` +#### Large Array (64-bit offset) Handling: `DISPATCH` + +The `DISPATCH` option allows users to select the tradeoff of compile-time / binary-size +vs. performance vs. scalability when given large inputs that require 64-bit offset types. +This currently only applies when DEVICE=CUDA. + +- `Dynamic` may compile each kernel twice, once for 32-bit offsets and again for 64-bit + offsets, and choose dynamically using the input size at runtime. + This significantly increases compile-time and binary-size, but provides optimal performance + for small input sizes while also supporting 64-bit indexed workloads. +- `Force32bit` forces Thrust to use a 32-bit offset type. This improves compile time and + binary size but limits the input size. +- `Force64bit` forces Thrust to use a 64-bit offset type. This improves compile time and + binary size and allows large input sizes. However, it may degrade runtime performance + for 32-bit indexed workloads. + #### TBB / OpenMP To explicitly specify host/device systems, `HOST` and `DEVICE` arguments can be @@ -56,17 +72,21 @@ host system, but will find and use TBB or OpenMP for the device system. To allow a Thrust target to be configurable easily via `cmake-gui` or `ccmake`, pass the `FROM_OPTIONS` flag to `thrust_create_target`. This will add -`THRUST_HOST_SYSTEM` and `THRUST_DEVICE_SYSTEM` options to the CMake cache that -allow selection from the systems supported by this version of Thrust. +`THRUST_HOST_SYSTEM`, `THRUST_DEVICE_SYSTEM`, and `THRUST_DISPATCH_TYPE` options +to the CMake cache that allow selection from the systems supported by this version +of Thrust. ```cmake thrust_create_target(Thrust FROM_OPTIONS [HOST_OPTION