From 8424d0a0baae9d28e48000c6b1d785271482998a Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Mon, 31 Jul 2023 09:43:58 -0700 Subject: [PATCH 1/8] move to most recent Alpaka version --- setup.sh | 12 ++++++------ setup_hpg.sh | 9 ++++----- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/setup.sh b/setup.sh index 15973ebd9..45c83f89e 100644 --- a/setup.sh +++ b/setup.sh @@ -6,9 +6,8 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source $DIR/code/rooutil/thisrooutil.sh -export SCRAM_ARCH=el8_amd64_gcc10 -export CMSSW_VERSION=CMSSW_13_0_0_pre2 -export CUDA_HOME=/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/cuda/11.5.2-c927b7e765e06433950d8a7eab9eddb4/ +export SCRAM_ARCH=el8_amd64_gcc11 +export CMSSW_VERSION=CMSSW_13_0_0_pre4 source /cvmfs/cms.cern.ch/cmsset_default.sh cd /cvmfs/cms.cern.ch/$SCRAM_ARCH/cms/cmssw/$CMSSW_VERSION/src @@ -42,7 +41,8 @@ fi export LATEST_CPU_BENCHMARK_EFF_MUONGUN="/data2/segmentlinking/muonGun_cpu_efficiencies.root" export LATEST_CPU_BENCHMARK_EFF_PU200="/data2/segmentlinking/pu200_cpu_efficiencies.root" -source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f/etc/profile.d/init.sh -export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a" -export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f" +# Alpaka, Boost, and CUDA dependencies +export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc11/external/boost/1.80.0-5305613b2f750cf1a05dcadf0d672647" +export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc11/external/alpaka/develop-20230621-9e2225ac6c979464a40749ef9d1e0331" +export CUDA_HOME=/cvmfs/cms.cern.ch/el8_amd64_gcc11/external/cuda/11.8.0-9f0af0f4206be7b705fe550319c49a11/ #eof diff --git a/setup_hpg.sh b/setup_hpg.sh index c5ee03c90..823d9c3cc 100644 --- a/setup_hpg.sh +++ b/setup_hpg.sh @@ -11,8 +11,8 @@ module load cuda/11.4.3 git DIR="$( cd "$( 
dirname "${BASH_SOURCE[0]}" )" && pwd )" source $DIR/code/rooutil/thisrooutil.sh -export SCRAM_ARCH=el8_amd64_gcc10 -export CMSSW_VERSION=CMSSW_13_0_0_pre2 +export SCRAM_ARCH=el8_amd64_gcc11 +export CMSSW_VERSION=CMSSW_13_0_0_pre4 export CUDA_HOME=${HPC_CUDA_DIR} source /cvmfs/cms.cern.ch/cmsset_default.sh @@ -41,7 +41,6 @@ export LSTPERFORMANCEWEBDIR=/home/users/phchang/public_html/LSTPerformanceWeb export LATEST_CPU_BENCHMARK_EFF_MUONGUN= export LATEST_CPU_BENCHMARK_EFF_PU200= -source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f/etc/profile.d/init.sh -export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a" -export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f" +export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc11/external/boost/1.80.0-5305613b2f750cf1a05dcadf0d672647" +export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc11/external/alpaka/develop-20230621-9e2225ac6c979464a40749ef9d1e0331" #eof From 4ebde568dcd11f7ef0b83e74ba63b54937e5aade Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Mon, 31 Jul 2023 09:51:37 -0700 Subject: [PATCH 2/8] remove temporary hyperbolic functions --- SDL/Hit.h | 18 ++---------------- SDL/Segment.h | 2 +- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/SDL/Hit.h b/SDL/Hit.h index c297b353e..c74c4c3b6 100644 --- a/SDL/Hit.h +++ b/SDL/Hit.h @@ -118,26 +118,12 @@ namespace SDL return alpaka::math::log(acc, val) / ln10; }; - // Hyperbolic functions were just merged into Alpaka early 2023, - // so we have to make use of temporary functions for now. 
- template - ALPAKA_FN_ACC ALPAKA_FN_INLINE float temp_acosh(TAcc const & acc, float val) - { - return alpaka::math::log(acc, val + alpaka::math::sqrt(acc, val * val - 1)); - }; - - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE float temp_sinh(TAcc const & acc, float val) - { - return 0.5 * (alpaka::math::exp(acc, val) - alpaka::math::exp(acc, -val)); - }; - template ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float eta(TAcc const & acc, float x, float y, float z) { float r3 = alpaka::math::sqrt(acc, x*x + y*y + z*z ); float rt = alpaka::math::sqrt(acc, x*x + y*y ); - float eta = ((z > 0) - ( z < 0)) * temp_acosh(acc, r3 / rt ); + float eta = ((z > 0) - ( z < 0)) * alpaka::math::acosh(acc, r3 / rt ); return eta; }; @@ -282,7 +268,7 @@ namespace SDL hitsInGPU.rts[ihit] = alpaka::math::sqrt(acc, ihit_x*ihit_x + ihit_y*ihit_y); hitsInGPU.phis[ihit] = SDL::phi(acc, ihit_x,ihit_y); // Acosh has no supported implementation in Alpaka right now. - hitsInGPU.etas[ihit] = ((ihit_z>0)-(ihit_z<0)) * SDL::temp_acosh(acc, alpaka::math::sqrt(acc, ihit_x*ihit_x+ihit_y*ihit_y+ihit_z*ihit_z)/hitsInGPU.rts[ihit]); + hitsInGPU.etas[ihit] = ((ihit_z>0)-(ihit_z<0)) * alpaka::math::acosh(acc, alpaka::math::sqrt(acc, ihit_x*ihit_x+ihit_y*ihit_y+ihit_z*ihit_z)/hitsInGPU.rts[ihit]); int found_index = binary_search(modulesInGPU.mapdetId, iDetId, nModules); uint16_t lastModuleIndex = modulesInGPU.mapIdx[found_index]; diff --git a/SDL/Segment.h b/SDL/Segment.h index 597133260..a6ea8ef7b 100644 --- a/SDL/Segment.h +++ b/SDL/Segment.h @@ -812,7 +812,7 @@ namespace SDL addMDToMemory(acc, mdsInGPU, hitsInGPU, modulesInGPU, hitIndices2[tid], hitIndices3[tid], pixelModuleIndex, 0,0,0,0,0,0,0,0,0,outerMDIndex); //in outer hits - pt, eta, phi - float slope = SDL::temp_sinh(acc, hitsInGPU.ys[mdsInGPU.outerHitIndices[innerMDIndex]]); + float slope = alpaka::math::sinh(acc, hitsInGPU.ys[mdsInGPU.outerHitIndices[innerMDIndex]]); float intercept = hitsInGPU.zs[mdsInGPU.anchorHitIndices[innerMDIndex]] - slope * 
hitsInGPU.rts[mdsInGPU.anchorHitIndices[innerMDIndex]]; float score_lsq=(hitsInGPU.rts[mdsInGPU.anchorHitIndices[outerMDIndex]] * slope + intercept) - (hitsInGPU.zs[mdsInGPU.anchorHitIndices[outerMDIndex]]); score_lsq = score_lsq * score_lsq; From 76eb86d5a9a90772b3517dc98ca156418ca48964 Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Mon, 31 Jul 2023 10:34:34 -0700 Subject: [PATCH 3/8] move to most recent caching allocator version --- code/alpaka_interface/AllocatorPolicy.h | 4 +- code/alpaka_interface/CachedBufAlloc.h | 10 +- code/alpaka_interface/CachingAllocator.h | 2 +- code/alpaka_interface/CopyToDevice.h | 35 +++ code/alpaka_interface/CopyToHost.h | 36 +++ code/alpaka_interface/ScopedContextFwd.h | 14 +- code/alpaka_interface/devices.h | 5 +- .../getDeviceCachingAllocator.h | 6 +- .../getHostCachingAllocator.h | 4 +- code/alpaka_interface/memory.h | 51 ++-- code/alpaka_interface/thread_safety_macros.h | 6 +- code/alpaka_interface/traits.h | 38 --- code/alpaka_interface/workdivision.h | 219 +++++++++++------- 13 files changed, 256 insertions(+), 174 deletions(-) create mode 100644 code/alpaka_interface/CopyToDevice.h create mode 100644 code/alpaka_interface/CopyToHost.h diff --git a/code/alpaka_interface/AllocatorPolicy.h b/code/alpaka_interface/AllocatorPolicy.h index 61b888791..5a36b1c61 100644 --- a/code/alpaka_interface/AllocatorPolicy.h +++ b/code/alpaka_interface/AllocatorPolicy.h @@ -13,7 +13,7 @@ namespace lst::alpakatools { // - Caching: (device and host) caching allocator enum class AllocatorPolicy { Synchronous = 0, Asynchronous = 1, Caching = 2 }; - template >> + template >> constexpr inline AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous; #if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED @@ -43,6 +43,8 @@ namespace lst::alpakatools { constexpr inline AllocatorPolicy allocator_policy = #if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR AllocatorPolicy::Caching; +#elif HIP_VERSION >= 
50400000 && !defined ALPAKA_DISABLE_ASYNC_ALLOCATOR + AllocatorPolicy::Asynchronous; #else AllocatorPolicy::Synchronous; #endif diff --git a/code/alpaka_interface/CachedBufAlloc.h b/code/alpaka_interface/CachedBufAlloc.h index 2fd6d8f63..bb0fce838 100644 --- a/code/alpaka_interface/CachedBufAlloc.h +++ b/code/alpaka_interface/CachedBufAlloc.h @@ -18,7 +18,7 @@ namespace lst::alpakatools { typename TDev, typename TQueue, typename = void, - typename = std::enable_if_t and lst::alpakatools::is_queue_v>> + typename = std::enable_if_t and alpaka::isQueue>> struct CachedBufAlloc { static_assert(alpaka::meta::DependentFalseType::value, "This device does not support a caching allocator"); }; @@ -159,11 +159,7 @@ namespace lst::alpakatools { }; //! The caching memory allocator implementation for the ROCm/HIP device - template >> + template struct CachedBufAlloc { template ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev, TQueue queue, TExtent const& extent) @@ -197,7 +193,7 @@ namespace lst::alpakatools { typename TExtent, typename TQueue, typename TDev, - typename = std::enable_if_t and lst::alpakatools::is_queue_v>> + typename = std::enable_if_t and alpaka::isQueue>> ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) { return traits::CachedBufAlloc, TIdx, TDev, TQueue>::allocCachedBuf(dev, queue, extent); } diff --git a/code/alpaka_interface/CachingAllocator.h b/code/alpaka_interface/CachingAllocator.h index 364edbe57..b689c544b 100644 --- a/code/alpaka_interface/CachingAllocator.h +++ b/code/alpaka_interface/CachingAllocator.h @@ -84,7 +84,7 @@ namespace lst::alpakatools { template and lst::alpakatools::is_queue_v>> + typename = std::enable_if_t and alpaka::isQueue>> class CachingAllocator { public: #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED diff --git a/code/alpaka_interface/CopyToDevice.h b/code/alpaka_interface/CopyToDevice.h new file mode 100644 index 000000000..2619e7201 --- /dev/null +++ 
b/code/alpaka_interface/CopyToDevice.h @@ -0,0 +1,35 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_CopyToDevice_h +#define HeterogeneousCore_AlpakaInterface_interface_CopyToDevice_h + +// TODO: this utility class is specific to CMSSW, but needs to be in a +// package that is suitable as DataFormat dependence + +namespace lst::alpakatools { + /** + * This class template needs to be specialized for each host-side + * EventSetup data product that should be implicitly copied to the + * device memory. The specialization is expected to define static + * copyAsync() function as in the following example + * + * \code + * template <> + * struct CopyToDevice { + * template + * static auto copyAsync(TQueue& queue, ExampleHostProduct const& hostData) { + * // construct ExampleDeviceProduct corresponding to the device of the TQueue + * // asynchronous copy hostData to the ExampleDeviceProduct object + * // return ExampleDeviceProduct object by value + * } + * }; + * \endcode + * + * The copyAsync() function should not explicitly synchronize the + * queue. The ExampleHostProduct and ExampleDeviceProduct can be the + * same type, if they internally are able to handle the memory + * allocation difference between host and device.
+ */ + template + struct CopyToDevice; +} // namespace lst::alpakatools + +#endif diff --git a/code/alpaka_interface/CopyToHost.h b/code/alpaka_interface/CopyToHost.h new file mode 100644 index 000000000..2d5ed914e --- /dev/null +++ b/code/alpaka_interface/CopyToHost.h @@ -0,0 +1,36 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_CopyToHost_h +#define HeterogeneousCore_AlpakaInterface_interface_CopyToHost_h + +// TODO: this utility class is specific to CMSSW, but needs to be in a +// package that is suitable as DataFormat dependence + +namespace lst::alpakatools { + /** + * This class template needs to be specialized for each device-side + * Event data product so that the framework can implicitly copy the + * device-side data product to the host memory. The specialization + * is expected to define static copyAsync() function as in the + * following example + * + * \code + * template <> + * struct CopyToHost { + * template + * static ExampleHostProduct copyAsync(TQueue& queue, ExampleDeviceProduct const& deviceData) { + * // construct ExampleHostProduct + * // asynchronous copy deviceData to the ExampleHostProduct object + * // return ExampleHostProduct object by value + * } + * }; + * \endcode + * + * The copyAsync() function should not explicitly synchronize the + * queue. The ExampleDeviceProduct and ExampleHostProduct can be the + * same type, if they internally are able to handle the memory + * allocation difference between host and device. 
+ */ + template + struct CopyToHost; +} // namespace lst::alpakatools + +#endif diff --git a/code/alpaka_interface/ScopedContextFwd.h b/code/alpaka_interface/ScopedContextFwd.h index 0e154a630..271856707 100644 --- a/code/alpaka_interface/ScopedContextFwd.h +++ b/code/alpaka_interface/ScopedContextFwd.h @@ -1,7 +1,7 @@ #ifndef HeterogeneousCore_AlpakaInterface_interface_ScopedContextFwd_h #define HeterogeneousCore_AlpakaInterface_interface_ScopedContextFwd_h -#include "traits.h" +#include // Forward declaration of the alpaka framework Context classes // @@ -11,23 +11,23 @@ namespace lst::alpakatools { namespace impl { - template >> + template >> class ScopedContextBase; - template >> + template >> class ScopedContextGetterBase; } // namespace impl - template >> + template >> class ScopedContextAcquire; - template >> + template >> class ScopedContextProduce; - template >> + template >> class ScopedContextTask; - template >> + template >> class ScopedContextAnalyze; } // namespace lst::alpakatools diff --git a/code/alpaka_interface/devices.h b/code/alpaka_interface/devices.h index 3b342c84f..2445ba87e 100644 --- a/code/alpaka_interface/devices.h +++ b/code/alpaka_interface/devices.h @@ -7,13 +7,12 @@ #include #include "config.h" -#include "traits.h" namespace lst::alpakatools { namespace detail { - template >> + template >> inline std::vector> enumerate_devices() { using Platform = TPlatform; using Device = alpaka::Dev; @@ -32,7 +31,7 @@ namespace lst::alpakatools { } // namespace detail // return the alpaka accelerator devices for the given platform - template >> + template >> inline std::vector> const& devices() { static const auto devices = detail::enumerate_devices(); return devices; diff --git a/code/alpaka_interface/getDeviceCachingAllocator.h b/code/alpaka_interface/getDeviceCachingAllocator.h index 89b75767b..19ed9b01d 100644 --- a/code/alpaka_interface/getDeviceCachingAllocator.h +++ b/code/alpaka_interface/getDeviceCachingAllocator.h @@ -4,6 +4,8 @@ 
#include #include +#include + #include "thread_safety_macros.h" #include "AllocatorConfig.h" #include "CachingAllocator.h" @@ -16,7 +18,7 @@ namespace lst::alpakatools { template and lst::alpakatools::is_queue_v>> + typename = std::enable_if_t and alpaka::isQueue>> auto allocate_device_allocators() { using Allocator = CachingAllocator; auto const& devices = lst::alpakatools::devices>(); @@ -72,7 +74,7 @@ namespace lst::alpakatools { template and lst::alpakatools::is_queue_v>> + typename = std::enable_if_t and alpaka::isQueue>> inline CachingAllocator& getDeviceCachingAllocator(TDev const& device) { // initialise all allocators, one per device CMS_THREAD_SAFE static auto allocators = detail::allocate_device_allocators(); diff --git a/code/alpaka_interface/getHostCachingAllocator.h b/code/alpaka_interface/getHostCachingAllocator.h index 9fa5321c4..a0ad20ff6 100644 --- a/code/alpaka_interface/getHostCachingAllocator.h +++ b/code/alpaka_interface/getHostCachingAllocator.h @@ -1,6 +1,8 @@ #ifndef HeterogeneousCore_AlpakaInterface_interface_getHostCachingAllocator_h #define HeterogeneousCore_AlpakaInterface_interface_getHostCachingAllocator_h +#include + #include "thread_safety_macros.h" #include "AllocatorConfig.h" #include "CachingAllocator.h" @@ -10,7 +12,7 @@ namespace lst::alpakatools { - template >> + template >> inline CachingAllocator& getHostCachingAllocator() { // thread safe initialisation of the host allocator CMS_THREAD_SAFE static CachingAllocator allocator( diff --git a/code/alpaka_interface/memory.h b/code/alpaka_interface/memory.h index f6155104f..0cccd8375 100644 --- a/code/alpaka_interface/memory.h +++ b/code/alpaka_interface/memory.h @@ -18,7 +18,7 @@ namespace lst::alpakatools { // type deduction helpers namespace detail { - template >> + template >> struct buffer_type { using type = alpaka::Buf; }; @@ -33,7 +33,7 @@ namespace lst::alpakatools { using type = alpaka::Buf; }; - template >> + template >> struct view_type { using type = 
alpaka::ViewPlainPtr; }; @@ -66,13 +66,13 @@ namespace lst::alpakatools { } template - std::enable_if_t and not std::is_array_v>, host_buffer> + std::enable_if_t and not std::is_array_v>, host_buffer> make_host_buffer(Extent extent) { return alpaka::allocBuf, Idx>(host(), Vec1D{extent}); } template - std::enable_if_t and not std::is_array_v>, host_buffer> + std::enable_if_t and not std::is_array_v>, host_buffer> make_host_buffer() { return alpaka::allocBuf, Idx>(host(), Vec1D{std::extent_v}); } @@ -86,13 +86,13 @@ namespace lst::alpakatools { } template - std::enable_if_t and not std::is_array_v>, host_buffer> + std::enable_if_t and not std::is_array_v>, host_buffer> make_host_buffer(Extent extent) { return alpaka::allocMappedBuf, Idx>(host(), Vec1D{extent}); } template - std::enable_if_t and not std::is_array_v>, host_buffer> + std::enable_if_t and not std::is_array_v>, host_buffer> make_host_buffer() { return alpaka::allocMappedBuf, Idx>(host(), Vec1D{std::extent_v}); } @@ -101,7 +101,8 @@ namespace lst::alpakatools { // the memory is pinned according to the device associated to the queue template - std::enable_if_t and not std::is_array_v, host_buffer> make_host_buffer(TQueue const& queue) { + std::enable_if_t and not std::is_array_v, host_buffer> make_host_buffer( + TQueue const& queue) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf(host(), queue, Scalar{}); } else { @@ -110,7 +111,8 @@ namespace lst::alpakatools { } template - std::enable_if_t and cms::is_unbounded_array_v and not std::is_array_v>, + std::enable_if_t and lst::is_unbounded_array_v and + not std::is_array_v>, host_buffer> make_host_buffer(TQueue const& queue, Extent extent) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { @@ -122,7 +124,8 @@ namespace lst::alpakatools { } template - std::enable_if_t and cms::is_bounded_array_v and not std::is_array_v>, + std::enable_if_t and lst::is_bounded_array_v and + not std::is_array_v>, host_buffer> 
make_host_buffer(TQueue const& queue) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { @@ -149,42 +152,44 @@ namespace lst::alpakatools { } template - std::enable_if_t and not std::is_array_v>, host_view> + std::enable_if_t and not std::is_array_v>, host_view> make_host_view(T& data, Extent extent) { return alpaka::ViewPlainPtr, Dim1D, Idx>(data, host(), Vec1D{extent}); } template - std::enable_if_t and not std::is_array_v>, host_view> + std::enable_if_t and not std::is_array_v>, host_view> make_host_view(T& data) { return alpaka::ViewPlainPtr, Dim1D, Idx>(data, host(), Vec1D{std::extent_v}); } // scalar and 1-dimensional device buffers - template >> + template >> using device_buffer = typename detail::buffer_type::type; - template >> + template >> using const_device_buffer = alpaka::ViewConst>; // non-cached, scalar and 1-dimensional device buffers template - std::enable_if_t and not std::is_array_v, device_buffer> make_device_buffer( + std::enable_if_t and not std::is_array_v, device_buffer> make_device_buffer( TDev const& device) { return alpaka::allocBuf(device, Scalar{}); } template - std::enable_if_t and cms::is_unbounded_array_v and not std::is_array_v>, + std::enable_if_t and lst::is_unbounded_array_v and + not std::is_array_v>, device_buffer> make_device_buffer(TDev const& device, Extent extent) { return alpaka::allocBuf, Idx>(device, Vec1D{extent}); } template - std::enable_if_t and cms::is_bounded_array_v and not std::is_array_v>, + std::enable_if_t and lst::is_bounded_array_v and + not std::is_array_v>, device_buffer> make_device_buffer(TDev const& device) { return alpaka::allocBuf, Idx>(device, Vec1D{std::extent_v}); @@ -193,7 +198,7 @@ namespace lst::alpakatools { // potentially-cached, scalar and 1-dimensional device buffers with queue-ordered semantic template - std::enable_if_t and not std::is_array_v, device_buffer, T>> + std::enable_if_t and not std::is_array_v, device_buffer, T>> make_device_buffer(TQueue const& queue) { if 
constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf(alpaka::getDev(queue), queue, Scalar{}); @@ -207,7 +212,8 @@ namespace lst::alpakatools { } template - std::enable_if_t and cms::is_unbounded_array_v and not std::is_array_v>, + std::enable_if_t and lst::is_unbounded_array_v and + not std::is_array_v>, device_buffer, T>> make_device_buffer(TQueue const& queue, Extent extent) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { @@ -222,7 +228,8 @@ namespace lst::alpakatools { } template - std::enable_if_t and cms::is_bounded_array_v and not std::is_array_v>, + std::enable_if_t and lst::is_bounded_array_v and + not std::is_array_v>, device_buffer, T>> make_device_buffer(TQueue const& queue) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { @@ -238,7 +245,7 @@ namespace lst::alpakatools { // scalar and 1-dimensional device views - template >> + template >> using device_view = typename detail::view_type::type; template @@ -252,13 +259,13 @@ namespace lst::alpakatools { } template - std::enable_if_t and not std::is_array_v>, device_view> + std::enable_if_t and not std::is_array_v>, device_view> make_device_view(TDev const& device, T& data, Extent extent) { return alpaka::ViewPlainPtr, Dim1D, Idx>(data, device, Vec1D{extent}); } template - std::enable_if_t and not std::is_array_v>, device_view> + std::enable_if_t and not std::is_array_v>, device_view> make_device_view(TDev const& device, T& data) { return alpaka::ViewPlainPtr, Dim1D, Idx>(data, device, Vec1D{std::extent_v}); } diff --git a/code/alpaka_interface/thread_safety_macros.h b/code/alpaka_interface/thread_safety_macros.h index 3abbe0b9e..be2299334 100644 --- a/code/alpaka_interface/thread_safety_macros.h +++ b/code/alpaka_interface/thread_safety_macros.h @@ -1,9 +1,9 @@ #ifndef FWCore_Utilites_thread_safe_macros_h #define FWCore_Utilites_thread_safe_macros_h #if !defined __CLING__ && !defined __INTEL_COMPILER && !defined __NVCC__ -#define CMS_THREAD_SAFE 
[[cms::thread_safe]] -#define CMS_SA_ALLOW [[cms::sa_allow]] -#define CMS_THREAD_GUARD(_var_) [[cms::thread_guard(#_var_)]] +#define CMS_THREAD_SAFE [[lst::thread_safe]] +#define CMS_SA_ALLOW [[lst::sa_allow]] +#define CMS_THREAD_GUARD(_var_) [[lst::thread_guard(#_var_)]] #else #define CMS_THREAD_SAFE #define CMS_SA_ALLOW diff --git a/code/alpaka_interface/traits.h b/code/alpaka_interface/traits.h index c7d2b20ef..c469daf60 100644 --- a/code/alpaka_interface/traits.h +++ b/code/alpaka_interface/traits.h @@ -25,42 +25,4 @@ namespace cms { inline constexpr bool is_unbounded_array_v = is_unbounded_array::value; } // namespace cms -#include - -namespace lst::alpakatools { - - // is_platform - - template - using is_platform = alpaka::concepts::ImplementsConcept; - - template - inline constexpr bool is_platform_v = is_platform::value; - - // is_device - - template - using is_device = alpaka::concepts::ImplementsConcept; - - template - inline constexpr bool is_device_v = is_device::value; - - // is_accelerator - - template - using is_accelerator = alpaka::concepts::ImplementsConcept; - - template - inline constexpr bool is_accelerator_v = is_accelerator::value; - - // is_queue - - template - using is_queue = alpaka::concepts::ImplementsConcept; - - template - inline constexpr bool is_queue_v = is_queue::value; - -} // namespace lst::alpakatools - #endif // HeterogeneousCore_AlpakaInterface_interface_traits_h diff --git a/code/alpaka_interface/workdivision.h b/code/alpaka_interface/workdivision.h index 6153af74f..4b358f0c9 100644 --- a/code/alpaka_interface/workdivision.h +++ b/code/alpaka_interface/workdivision.h @@ -7,7 +7,6 @@ #include "config.h" #include "traits.h" -#include "vec.h" namespace lst::alpakatools { @@ -19,27 +18,33 @@ namespace lst::alpakatools { // Return the integer division of the first argument by the second argument, rounded up to the next integer inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; } - 
// Create an accelerator-dependent work division for 1-dimensional kernels - template and alpaka::Dim::value == 1>> - inline WorkDiv make_workdiv(Idx blocks, Idx elements) { + // Trait describing whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped + template >> + struct requires_single_thread_per_block : public std::true_type {}; + #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - if constexpr (std::is_same_v>) { - // On GPU backends, each thread is looking at a single element: - // - the number of threads per block is "elements"; - // - the number of elements per thread is always 1. - return WorkDiv(blocks, elements, Idx{1}); - } else + template + struct requires_single_thread_per_block> : public std::false_type {}; #endif // ALPAKA_ACC_GPU_CUDA_ENABLED -#if ALPAKA_ACC_GPU_HIP_ENABLED - if constexpr (std::is_same_v>) { + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + template + struct requires_single_thread_per_block> : public std::false_type {}; +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + + // Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped + template >> + inline constexpr bool requires_single_thread_per_block_v = requires_single_thread_per_block::value; + + // Create an accelerator-dependent work division for 1-dimensional kernels + template and alpaka::Dim::value == 1>> + inline WorkDiv make_workdiv(Idx blocks, Idx elements) { + if constexpr (not requires_single_thread_per_block_v) { // On GPU backends, each thread is looking at a single element: // - the number of threads per block is "elements"; // - the number of elements per thread is always 1. return WorkDiv(blocks, elements, Idx{1}); - } else -#endif // ALPAKA_ACC_GPU_HIP_ENABLED - { + } else { // On CPU backends, run serially with a single thread per block: // - the number of threads per block is always 1; // - the number of elements per thread is "elements". 
@@ -48,27 +53,16 @@ namespace lst::alpakatools { } // Create the accelerator-dependent workdiv for N-dimensional kernels - template >> + template >> inline WorkDiv> make_workdiv(const Vec>& blocks, const Vec>& elements) { using Dim = alpaka::Dim; -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - if constexpr (std::is_same_v>) { + if constexpr (not requires_single_thread_per_block_v) { // On GPU backends, each thread is looking at a single element: // - the number of threads per block is "elements"; // - the number of elements per thread is always 1. return WorkDiv(blocks, elements, Vec::ones()); - } else -#endif // ALPAKA_ACC_GPU_CUDA_ENABLED -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED - if constexpr (std::is_same_v>) { - // On GPU backends, each thread is looking at a single element: - // - the number of threads per block is "elements"; - // - the number of elements per thread is always 1. - return WorkDiv(blocks, elements, Vec::ones()); - } else -#endif // ALPAKA_ACC_GPU_HIP_ENABLED - { + } else { // On CPU backends, run serially with a single thread per block: // - the number of threads per block is always 1; // - the number of elements per thread is "elements". 
@@ -76,8 +70,7 @@ namespace lst::alpakatools { } } - template and alpaka::Dim::value == 1>> + template and alpaka::Dim::value == 1>> class elements_with_stride { public: ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc) @@ -108,13 +101,15 @@ namespace lst::alpakatools { // pre-increment the iterator ALPAKA_FN_ACC inline iterator& operator++() { - // increment the index along the elements processed by the current thread - ++index_; - if (index_ < last_) - return *this; + if constexpr (requires_single_thread_per_block_v) { + // increment the index along the elements processed by the current thread + ++index_; + if (index_ < last_) + return *this; + } // increment the thread index with the grid stride - first_ += stride_ * elements_; + first_ += stride_; index_ = first_; last_ = std::min(first_ + elements_, extent_); if (index_ < extent_) @@ -162,8 +157,7 @@ namespace lst::alpakatools { const Idx extent_; }; - template and (alpaka::Dim::value > 0)>> + template and (alpaka::Dim::value > 0)>> class elements_with_stride_nd { public: using Dim = alpaka::Dim; @@ -183,76 +177,123 @@ namespace lst::alpakatools { class iterator { friend class elements_with_stride_nd; - constexpr static const auto last_dimension = Dim::value - 1; - - ALPAKA_FN_ACC inline iterator(Vec elements, Vec stride, Vec extent, Vec first) - : elements_{elements}, - stride_{stride}, - extent_{extent}, - first_{alpaka::elementwise_min(first, extent)}, - index_{first_}, - last_{std::min(first[last_dimension] + elements[last_dimension], extent[last_dimension])} {} public: ALPAKA_FN_ACC inline Vec operator*() const { return index_; } // pre-increment the iterator - ALPAKA_FN_ACC inline iterator& operator++() { - // increment the index along the elements processed by the current thread - ++index_[last_dimension]; - if (index_[last_dimension] < last_) - return *this; - - // increment the thread index along with the last dimension with the grid stride - first_[last_dimension] += stride_[last_dimension] 
* elements_[last_dimension]; - index_[last_dimension] = first_[last_dimension]; - last_ = std::min(first_[last_dimension] + elements_[last_dimension], extent_[last_dimension]); - if (index_[last_dimension] < extent_[last_dimension]) - return *this; - - // increment the thread index along the outer dimensions with the grid stride - if constexpr (last_dimension > 0) - for (auto dimension = last_dimension - 1; dimension >= 0; --dimension) { - first_[dimension] += stride_[dimension]; - index_[dimension] = first_[dimension]; - if (index_[dimension] < extent_[dimension]) - return *this; - } - - // the iterator has reached or passed the end of the extent, clamp it to the extent - first_ = extent_; - index_ = extent_; - last_ = extent_[last_dimension]; + ALPAKA_FN_ACC constexpr inline iterator operator++() { + increment(); return *this; } // post-increment the iterator - ALPAKA_FN_ACC inline iterator operator++(int) { + ALPAKA_FN_ACC constexpr inline iterator operator++(int) { iterator old = *this; - ++(*this); + increment(); return old; } - ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { - return (index_ == other.index_) and (first_ == other.first_); - } + ALPAKA_FN_ACC constexpr inline bool operator==(iterator const& other) const { return (index_ == other.index_); } - ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); } + ALPAKA_FN_ACC constexpr inline bool operator!=(iterator const& other) const { return not(*this == other); } private: - // non-const to support iterator copy and assignment - Vec elements_; - Vec stride_; - Vec extent_; + // private, explicit constructor + ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, Vec first) + : loop_{loop}, + thread_{alpaka::elementwise_min(first, loop->extent_)}, + range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)}, + index_{thread_} {} + + template + ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() { + bool overflow = 
false; + ++index_[I]; + if (index_[I] >= range_[I]) { + index_[I] = thread_[I]; + overflow = true; + } + return overflow; + } + + template + ALPAKA_FN_ACC inline constexpr bool do_elements_loops() { + if constexpr (N == 0) { + // overflow + return true; + } else { + if (not nth_elements_loop()) { + return false; + } else { + return do_elements_loops(); + } + } + } + + template + ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() { + bool overflow = false; + thread_[I] += loop_->stride_[I]; + if (thread_[I] >= loop_->extent_[I]) { + thread_[I] = loop_->first_[I]; + overflow = true; + } + index_[I] = thread_[I]; + range_[I] = std::min(thread_[I] + loop_->elements_[I], loop_->extent_[I]); + return overflow; + } + + template + ALPAKA_FN_ACC inline constexpr bool do_strided_loops() { + if constexpr (N == 0) { + // overflow + return true; + } else { + if (not nth_strided_loop()) { + return false; + } else { + return do_strided_loops(); + } + } + } + + // increment the iterator + ALPAKA_FN_ACC inline constexpr void increment() { + if constexpr (requires_single_thread_per_block_v) { + // linear N-dimensional loops over the elements associated to the thread; + // do_elements_loops<>() returns true if any of those loops overflows + if (not do_elements_loops()) { + // the elements loops did not overflow, return the next index + return; + } + } + + // strided N-dimensional loop over the threads in the kernel launch grid; + // do_strided_loops<>() returns true if any of those loops overflows + if (not do_strided_loops()) { + // the strided loops did not overflow, return the next index + return; + } + + // the iterator has reached or passed the end of the extent, clamp it to the extent + thread_ = loop_->extent_; + range_ = loop_->extent_; + index_ = loop_->extent_; + } + + // const pointer to the elements_with_stride_nd that the iterator refers to + const elements_with_stride_nd* loop_; + // modified by the pre/post-increment operator - Vec first_; - Vec index_; - Idx 
last_; + Vec thread_; // first element processed by this thread + Vec range_; // last element processed by this thread + Vec index_; // current element processed by this thread }; - ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); } + ALPAKA_FN_ACC inline iterator begin() const { return iterator{this, first_}; } - ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); } + ALPAKA_FN_ACC inline iterator end() const { return iterator{this, extent_}; } private: const Vec elements_; From f935655b1f678e4f81f0a5ffd5905341d1dde378 Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Mon, 31 Jul 2023 11:00:51 -0700 Subject: [PATCH 4/8] add last commit from CMSSW --- code/alpaka_interface/AlpakaServiceFwd.h | 4 ++-- code/alpaka_interface/CachingAllocator.h | 2 +- code/alpaka_interface/config.h | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/code/alpaka_interface/AlpakaServiceFwd.h b/code/alpaka_interface/AlpakaServiceFwd.h index 4345f3f34..f0f88287b 100644 --- a/code/alpaka_interface/AlpakaServiceFwd.h +++ b/code/alpaka_interface/AlpakaServiceFwd.h @@ -13,9 +13,9 @@ namespace alpaka_cuda_async { #endif // ALPAKA_ACC_GPU_CUDA_ENABLED #ifdef ALPAKA_ACC_GPU_HIP_ENABLED -namespace alpaka_hip_async { +namespace alpaka_rocm_async { class AlpakaService; -} // namespace alpaka_hip_async +} // namespace alpaka_rocm_async #endif // ALPAKA_ACC_GPU_HIP_ENABLED #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED diff --git a/code/alpaka_interface/CachingAllocator.h b/code/alpaka_interface/CachingAllocator.h index b689c544b..226a30cb3 100644 --- a/code/alpaka_interface/CachingAllocator.h +++ b/code/alpaka_interface/CachingAllocator.h @@ -91,7 +91,7 @@ namespace lst::alpakatools { friend class alpaka_cuda_async::AlpakaService; #endif #ifdef ALPAKA_ACC_GPU_HIP_ENABLED - friend class alpaka_hip_async::AlpakaService; + friend class alpaka_rocm_async::AlpakaService; #endif #ifdef 
ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED friend class alpaka_serial_sync::AlpakaService; diff --git a/code/alpaka_interface/config.h b/code/alpaka_interface/config.h index 354a93b91..a99e5d4d9 100644 --- a/code/alpaka_interface/config.h +++ b/code/alpaka_interface/config.h @@ -65,7 +65,7 @@ namespace alpaka_cuda_async { #endif // ALPAKA_ACC_GPU_CUDA_ENABLED #ifdef ALPAKA_ACC_GPU_HIP_ENABLED -namespace alpaka_hip_async { +namespace alpaka_rocm_async { using namespace alpaka_common; using Platform = alpaka::PltfHipRt; @@ -79,13 +79,13 @@ namespace alpaka_hip_async { using Acc2D = Acc; using Acc3D = Acc; -} // namespace alpaka_hip_async +} // namespace alpaka_rocm_async #ifdef ALPAKA_ACCELERATOR_NAMESPACE #define ALPAKA_DUPLICATE_NAMESPACE #else -#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_hip_async -#define ALPAKA_TYPE_SUFFIX HipAsync +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_rocm_async +#define ALPAKA_TYPE_SUFFIX ROCmAsync #endif #endif // ALPAKA_ACC_GPU_HIP_ENABLED From 5da452f66c255a37f4a48ccaf56a9ee0a50b1ff2 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Sun, 6 Aug 2023 15:24:10 -0400 Subject: [PATCH 5/8] remove old comment --- SDL/Hit.h | 1 - 1 file changed, 1 deletion(-) diff --git a/SDL/Hit.h b/SDL/Hit.h index c74c4c3b6..6ae2415c9 100644 --- a/SDL/Hit.h +++ b/SDL/Hit.h @@ -267,7 +267,6 @@ namespace SDL hitsInGPU.rts[ihit] = alpaka::math::sqrt(acc, ihit_x*ihit_x + ihit_y*ihit_y); hitsInGPU.phis[ihit] = SDL::phi(acc, ihit_x,ihit_y); - // Acosh has no supported implementation in Alpaka right now. 
hitsInGPU.etas[ihit] = ((ihit_z>0)-(ihit_z<0)) * alpaka::math::acosh(acc, alpaka::math::sqrt(acc, ihit_x*ihit_x+ihit_y*ihit_y+ihit_z*ihit_z)/hitsInGPU.rts[ihit]); int found_index = binary_search(modulesInGPU.mapdetId, iDetId, nModules); uint16_t lastModuleIndex = modulesInGPU.mapIdx[found_index]; From 6ea9524531a1cee0c263e7b17336915d2744af31 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Mon, 7 Aug 2023 13:46:55 -0400 Subject: [PATCH 6/8] add Andrea Bocci's latest commit --- code/alpaka_interface/CachingAllocator.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/code/alpaka_interface/CachingAllocator.h b/code/alpaka_interface/CachingAllocator.h index 226a30cb3..89a62076d 100644 --- a/code/alpaka_interface/CachingAllocator.h +++ b/code/alpaka_interface/CachingAllocator.h @@ -82,9 +82,7 @@ namespace lst::alpakatools { * - the `Queue` type can be either `Sync` _or_ `Async` on any allocation. */ - template and alpaka::isQueue>> + template class CachingAllocator { public: #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED @@ -106,6 +104,8 @@ namespace lst::alpakatools { using Buffer = alpaka::Buf, size_t>; // The "memory device" type can either be the same as the "synchronisation device" type, or be the host CPU. 
+ static_assert(alpaka::isDevice, "TDev should be an alpaka Device type."); + static_assert(alpaka::isQueue, "TQueue should be an alpaka Queue type."); static_assert(std::is_same_v> or std::is_same_v, "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be the " "host CPU."); From 7f33a42bf0323b4b7b849214a71abd31a1f00e15 Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Mon, 28 Aug 2023 14:23:13 -0400 Subject: [PATCH 7/8] Update README.md for Alpaka CMSSW Integration --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3aac7ca1d..566881e94 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,7 @@ cd CMSSW_13_0_0_pre4/src cmsenv git cms-init git remote add SegLink git@github.com:SegmentLinking/cmssw.git -git fetch SegLink CMSSW_13_0_0_pre4_LST_X +git fetch SegLink CMSSW_13_0_0_pre4_LST_X_alpaka git cms-addpkg RecoTracker Configuration git checkout CMSSW_13_0_0_pre4_LST_X cat <lst.xml From 80047ad8cc61a0cbb2d43911f9bcb1fca8eaa579 Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Mon, 28 Aug 2023 14:24:27 -0400 Subject: [PATCH 8/8] Update README Correction for Alpaka --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 566881e94..bf203488f 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ git cms-init git remote add SegLink git@github.com:SegmentLinking/cmssw.git git fetch SegLink CMSSW_13_0_0_pre4_LST_X_alpaka git cms-addpkg RecoTracker Configuration -git checkout CMSSW_13_0_0_pre4_LST_X +git checkout CMSSW_13_0_0_pre4_LST_X_alpaka cat <lst.xml