Merge pull request #314 from SegmentLinking/alpaka_upgrade

Move to Most Recent Alpaka Version + Newest Caching Allocator
SegmentLinking · Aug 29, 2023 · 8517040 · 8517040
2 parents db11848 + 80047ad
commit 8517040
Show file tree

Hide file tree

Showing 20 changed files with 280 additions and 214 deletions.
diff --git a/README.md b/README.md
@@ -122,9 +122,9 @@ cd CMSSW_13_0_0_pre4/src
 cmsenv
 git cms-init
 git remote add SegLink git@github.com:SegmentLinking/cmssw.git
-git fetch SegLink CMSSW_13_0_0_pre4_LST_X
+git fetch SegLink CMSSW_13_0_0_pre4_LST_X_alpaka
 git cms-addpkg RecoTracker Configuration
-git checkout CMSSW_13_0_0_pre4_LST_X
+git checkout CMSSW_13_0_0_pre4_LST_X_alpaka
 cat <<EOF >lst.xml
 <tool name="lst" version="1.0">
   <client>

diff --git a/SDL/Hit.h b/SDL/Hit.h
@@ -118,26 +118,12 @@ namespace SDL
         return alpaka::math::log(acc, val) / ln10;
     };
 
-    // Hyperbolic functions were just merged into Alpaka early 2023,
-    // so we have to make use of temporary functions for now.
-    template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE float temp_acosh(TAcc const & acc, float val)
-    {
-        return alpaka::math::log(acc, val + alpaka::math::sqrt(acc, val * val - 1));
-    };
-
-    template<typename TAcc>
-    ALPAKA_FN_ACC ALPAKA_FN_INLINE float temp_sinh(TAcc const & acc, float val)
-    {
-        return 0.5 * (alpaka::math::exp(acc, val) - alpaka::math::exp(acc, -val));
-    };
-
     template<typename TAcc>
     ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float eta(TAcc const & acc, float x, float y, float z)
     {
         float r3 = alpaka::math::sqrt(acc, x*x + y*y + z*z );
         float rt = alpaka::math::sqrt(acc, x*x + y*y );
-        float eta = ((z > 0) - ( z < 0)) * temp_acosh(acc, r3 / rt );
+        float eta = ((z > 0) - ( z < 0)) * alpaka::math::acosh(acc, r3 / rt );
         return eta;
     };
 
@@ -281,8 +267,7 @@ namespace SDL
 
                 hitsInGPU.rts[ihit] = alpaka::math::sqrt(acc, ihit_x*ihit_x + ihit_y*ihit_y);
                 hitsInGPU.phis[ihit] = SDL::phi(acc, ihit_x,ihit_y);
-                // Acosh has no supported implementation in Alpaka right now.
-                hitsInGPU.etas[ihit] = ((ihit_z>0)-(ihit_z<0)) * SDL::temp_acosh(acc, alpaka::math::sqrt(acc, ihit_x*ihit_x+ihit_y*ihit_y+ihit_z*ihit_z)/hitsInGPU.rts[ihit]);
+                hitsInGPU.etas[ihit] = ((ihit_z>0)-(ihit_z<0)) * alpaka::math::acosh(acc, alpaka::math::sqrt(acc, ihit_x*ihit_x+ihit_y*ihit_y+ihit_z*ihit_z)/hitsInGPU.rts[ihit]);
                 int found_index = binary_search(modulesInGPU.mapdetId, iDetId, nModules);
                 uint16_t lastModuleIndex = modulesInGPU.mapIdx[found_index];
 

diff --git a/SDL/Segment.h b/SDL/Segment.h
@@ -812,7 +812,7 @@ namespace SDL
                 addMDToMemory(acc, mdsInGPU, hitsInGPU, modulesInGPU, hitIndices2[tid], hitIndices3[tid], pixelModuleIndex, 0,0,0,0,0,0,0,0,0,outerMDIndex);
 
                 //in outer hits - pt, eta, phi
-                float slope = SDL::temp_sinh(acc, hitsInGPU.ys[mdsInGPU.outerHitIndices[innerMDIndex]]);
+                float slope = alpaka::math::sinh(acc, hitsInGPU.ys[mdsInGPU.outerHitIndices[innerMDIndex]]);
                 float intercept = hitsInGPU.zs[mdsInGPU.anchorHitIndices[innerMDIndex]] - slope * hitsInGPU.rts[mdsInGPU.anchorHitIndices[innerMDIndex]];
                 float score_lsq=(hitsInGPU.rts[mdsInGPU.anchorHitIndices[outerMDIndex]] * slope + intercept) - (hitsInGPU.zs[mdsInGPU.anchorHitIndices[outerMDIndex]]);
                 score_lsq = score_lsq * score_lsq;

diff --git a/code/alpaka_interface/AllocatorPolicy.h b/code/alpaka_interface/AllocatorPolicy.h
@@ -13,7 +13,7 @@ namespace lst::alpakatools {
   //   - Caching:       (device and host) caching allocator
   enum class AllocatorPolicy { Synchronous = 0, Asynchronous = 1, Caching = 2 };
 
-  template <typename TDev, typename = std::enable_if_t<lst::alpakatools::is_device_v<TDev>>>
+  template <typename TDev, typename = std::enable_if_t<alpaka::isDevice<TDev>>>
   constexpr inline AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous;
 
 #if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
@@ -43,6 +43,8 @@ namespace lst::alpakatools {
   constexpr inline AllocatorPolicy allocator_policy<alpaka::DevHipRt> =
 #if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR
       AllocatorPolicy::Caching;
+#elif HIP_VERSION >= 50400000 && !defined ALPAKA_DISABLE_ASYNC_ALLOCATOR
+      AllocatorPolicy::Asynchronous;
 #else
       AllocatorPolicy::Synchronous;
 #endif

diff --git a/code/alpaka_interface/AlpakaServiceFwd.h b/code/alpaka_interface/AlpakaServiceFwd.h
@@ -13,9 +13,9 @@ namespace alpaka_cuda_async {
 #endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
 
 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-namespace alpaka_hip_async {
+namespace alpaka_rocm_async {
   class AlpakaService;
-}  // namespace alpaka_hip_async
+}  // namespace alpaka_rocm_async
 #endif  // ALPAKA_ACC_GPU_HIP_ENABLED
 
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED

diff --git a/code/alpaka_interface/CachedBufAlloc.h b/code/alpaka_interface/CachedBufAlloc.h
@@ -18,7 +18,7 @@ namespace lst::alpakatools {
               typename TDev,
               typename TQueue,
               typename = void,
-              typename = std::enable_if_t<lst::alpakatools::is_device_v<TDev> and lst::alpakatools::is_queue_v<TQueue>>>
+              typename = std::enable_if_t<alpaka::isDevice<TDev> and alpaka::isQueue<TQueue>>>
     struct CachedBufAlloc {
       static_assert(alpaka::meta::DependentFalseType<TDev>::value, "This device does not support a caching allocator");
     };
@@ -159,11 +159,7 @@ namespace lst::alpakatools {
     };
 
     //! The caching memory allocator implementation for the ROCm/HIP device
-    template <typename TElem,
-              typename TDim,
-              typename TIdx,
-              typename TQueue,
-              typename = std::enable_if_t<lst::alpakatools::is_queue_v<TQueue>>>
+    template <typename TElem, typename TDim, typename TIdx, typename TQueue>
     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevHipRt, TQueue, void> {
       template <typename TExtent>
       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev, TQueue queue, TExtent const& extent)
@@ -197,7 +193,7 @@ namespace lst::alpakatools {
             typename TExtent,
             typename TQueue,
             typename TDev,
-            typename = std::enable_if_t<lst::alpakatools::is_device_v<TDev> and lst::alpakatools::is_queue_v<TQueue>>>
+            typename = std::enable_if_t<alpaka::isDevice<TDev> and alpaka::isQueue<TQueue>>>
   ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) {
     return traits::CachedBufAlloc<TElem, alpaka::Dim<TExtent>, TIdx, TDev, TQueue>::allocCachedBuf(dev, queue, extent);
   }

diff --git a/code/alpaka_interface/CachingAllocator.h b/code/alpaka_interface/CachingAllocator.h
@@ -82,16 +82,14 @@ namespace lst::alpakatools {
    *    - the `Queue` type can be either `Sync` _or_ `Async` on any allocation.
    */
 
-  template <typename TDev,
-            typename TQueue,
-            typename = std::enable_if_t<lst::alpakatools::is_device_v<TDev> and lst::alpakatools::is_queue_v<TQueue>>>
+  template <typename TDev, typename TQueue>
   class CachingAllocator {
   public:
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
     friend class alpaka_cuda_async::AlpakaService;
 #endif
 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-    friend class alpaka_hip_async::AlpakaService;
+    friend class alpaka_rocm_async::AlpakaService;
 #endif
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
     friend class alpaka_serial_sync::AlpakaService;
@@ -106,6 +104,8 @@ namespace lst::alpakatools {
     using Buffer = alpaka::Buf<Device, std::byte, alpaka::DimInt<1u>, size_t>;
 
     // The "memory device" type can either be the same as the "synchronisation device" type, or be the host CPU.
+    static_assert(alpaka::isDevice<Device>, "TDev should be an alpaka Device type.");
+    static_assert(alpaka::isQueue<Queue>, "TQueue should be an alpaka Queue type.");
     static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
                   "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be the "
                   "host CPU.");

diff --git a/code/alpaka_interface/CopyToDevice.h b/code/alpaka_interface/CopyToDevice.h
@@ -0,0 +1,35 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_CopyToDevice_h
+#define HeterogeneousCore_AlpakaInterface_interface_CopyToDevice_h
+
+// TODO: this utility class is specific to CMSSW, but needs to be in a
+// package that is suitable as a DataFormat dependency
+
+namespace lst::alpakatools {
+  /**
+   * This class template needs to be specialized for each host-side
+   * EventSetup data product that should be implicitly copied to the
+   * device memory. The specialization is expected to define a static
+   * copyAsync() function, as in the following example:
+   *
+   * \code
+   * template <>
+   * struct CopyToDevice<ExampleHostProduct> {
+   *   template <typename TQueue>
+   *   static auto copyAsync(TQueue& queue, ExampleHostProduct const& hostData) {
+   *     // construct ExampleDeviceProduct corresponding to the device of the TQueue
+   *     // asynchronously copy hostData to the ExampleDeviceProduct object
+   *     // return ExampleDeviceProduct object by value
+   *   }
+   * };
+   * \endcode
+   *
+   * The copyAsync() function should not explicitly synchronize the
+   * queue. The ExampleHostProduct and ExampleDeviceProduct can be the
+   * same type, if they internally are able to handle the memory
+   * allocation difference between host and device.
+   */
+  template <typename THostData>
+  struct CopyToDevice;
+}  // namespace lst::alpakatools
+
+#endif
diff --git a/code/alpaka_interface/CopyToHost.h b/code/alpaka_interface/CopyToHost.h
@@ -0,0 +1,36 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_CopyToHost_h
+#define HeterogeneousCore_AlpakaInterface_interface_CopyToHost_h
+
+// TODO: this utility class is specific to CMSSW, but needs to be in a
+// package that is suitable as a DataFormat dependency
+
+namespace lst::alpakatools {
+  /**
+   * This class template needs to be specialized for each device-side
+   * Event data product so that the framework can implicitly copy the
+   * device-side data product to the host memory. The specialization
+   * is expected to define a static copyAsync() function, as in the
+   * following example:
+   *
+   * \code
+   * template <>
+   * struct CopyToHost<ExampleDeviceProduct> {
+   *   template <typename TQueue>
+   *   static ExampleHostProduct copyAsync(TQueue& queue, ExampleDeviceProduct const& deviceData) {
+   *     // construct ExampleHostProduct
+   *     // asynchronously copy deviceData to the ExampleHostProduct object
+   *     // return ExampleHostProduct object by value
+   *   }
+   * };
+   * \endcode
+   *
+   * The copyAsync() function should not explicitly synchronize the
+   * queue. The ExampleDeviceProduct and ExampleHostProduct can be the
+   * same type, if they internally are able to handle the memory
+   * allocation difference between host and device.
+   */
+  template <typename TDeviceData>
+  struct CopyToHost;
+}  // namespace lst::alpakatools
+
+#endif
diff --git a/code/alpaka_interface/ScopedContextFwd.h b/code/alpaka_interface/ScopedContextFwd.h
@@ -1,7 +1,7 @@
 #ifndef HeterogeneousCore_AlpakaInterface_interface_ScopedContextFwd_h
 #define HeterogeneousCore_AlpakaInterface_interface_ScopedContextFwd_h
 
-#include "traits.h"
+#include <alpaka/alpaka.hpp>
 
 // Forward declaration of the alpaka framework Context classes
 //
@@ -11,23 +11,23 @@
 namespace lst::alpakatools {
 
   namespace impl {
-    template <typename TQueue, typename = std::enable_if_t<lst::alpakatools::is_queue_v<TQueue>>>
+    template <typename TQueue, typename = std::enable_if_t<alpaka::isQueue<TQueue>>>
     class ScopedContextBase;
 
-    template <typename TQueue, typename = std::enable_if_t<lst::alpakatools::is_queue_v<TQueue>>>
+    template <typename TQueue, typename = std::enable_if_t<alpaka::isQueue<TQueue>>>
     class ScopedContextGetterBase;
   }  // namespace impl
 
-  template <typename TQueue, typename = std::enable_if_t<lst::alpakatools::is_queue_v<TQueue>>>
+  template <typename TQueue, typename = std::enable_if_t<alpaka::isQueue<TQueue>>>
   class ScopedContextAcquire;
 
-  template <typename TQueue, typename = std::enable_if_t<lst::alpakatools::is_queue_v<TQueue>>>
+  template <typename TQueue, typename = std::enable_if_t<alpaka::isQueue<TQueue>>>
   class ScopedContextProduce;
 
-  template <typename TQueue, typename = std::enable_if_t<lst::alpakatools::is_queue_v<TQueue>>>
+  template <typename TQueue, typename = std::enable_if_t<alpaka::isQueue<TQueue>>>
   class ScopedContextTask;
 
-  template <typename TQueue, typename = std::enable_if_t<lst::alpakatools::is_queue_v<TQueue>>>
+  template <typename TQueue, typename = std::enable_if_t<alpaka::isQueue<TQueue>>>
   class ScopedContextAnalyze;
 
 }  // namespace lst::alpakatools

diff --git a/code/alpaka_interface/config.h b/code/alpaka_interface/config.h
@@ -65,7 +65,7 @@ namespace alpaka_cuda_async {
 #endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
 
 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
-namespace alpaka_hip_async {
+namespace alpaka_rocm_async {
   using namespace alpaka_common;
 
   using Platform = alpaka::PltfHipRt;
@@ -79,13 +79,13 @@ namespace alpaka_hip_async {
   using Acc2D = Acc<Dim2D>;
   using Acc3D = Acc<Dim3D>;
 
-}  // namespace alpaka_hip_async
+}  // namespace alpaka_rocm_async
 
 #ifdef ALPAKA_ACCELERATOR_NAMESPACE
 #define ALPAKA_DUPLICATE_NAMESPACE
 #else
-#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_hip_async
-#define ALPAKA_TYPE_SUFFIX HipAsync
+#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_rocm_async
+#define ALPAKA_TYPE_SUFFIX ROCmAsync
 #endif
 
 #endif  // ALPAKA_ACC_GPU_HIP_ENABLED

diff --git a/code/alpaka_interface/devices.h b/code/alpaka_interface/devices.h
@@ -7,13 +7,12 @@
 #include <alpaka/alpaka.hpp>
 
 #include "config.h"
-#include "traits.h"
 
 namespace lst::alpakatools {
 
   namespace detail {
 
-    template <typename TPlatform, typename = std::enable_if_t<is_platform_v<TPlatform>>>
+    template <typename TPlatform, typename = std::enable_if_t<alpaka::isPlatform<TPlatform>>>
     inline std::vector<alpaka::Dev<TPlatform>> enumerate_devices() {
       using Platform = TPlatform;
       using Device = alpaka::Dev<Platform>;
@@ -32,7 +31,7 @@ namespace lst::alpakatools {
   }  // namespace detail
 
   // return the alpaka accelerator devices for the given platform
-  template <typename TPlatform, typename = std::enable_if_t<is_platform_v<TPlatform>>>
+  template <typename TPlatform, typename = std::enable_if_t<alpaka::isPlatform<TPlatform>>>
   inline std::vector<alpaka::Dev<TPlatform>> const& devices() {
     static const auto devices = detail::enumerate_devices<TPlatform>();
     return devices;

diff --git a/code/alpaka_interface/getDeviceCachingAllocator.h b/code/alpaka_interface/getDeviceCachingAllocator.h
@@ -4,6 +4,8 @@
 #include <cassert>
 #include <memory>
 
+#include <alpaka/alpaka.hpp>
+
 #include "thread_safety_macros.h"
 #include "AllocatorConfig.h"
 #include "CachingAllocator.h"
@@ -16,7 +18,7 @@ namespace lst::alpakatools {
 
     template <typename TDev,
               typename TQueue,
-              typename = std::enable_if_t<lst::alpakatools::is_device_v<TDev> and lst::alpakatools::is_queue_v<TQueue>>>
+              typename = std::enable_if_t<alpaka::isDevice<TDev> and alpaka::isQueue<TQueue>>>
     auto allocate_device_allocators() {
       using Allocator = CachingAllocator<TDev, TQueue>;
       auto const& devices = lst::alpakatools::devices<alpaka::Pltf<TDev>>();
@@ -72,7 +74,7 @@ namespace lst::alpakatools {
 
   template <typename TDev,
             typename TQueue,
-            typename = std::enable_if_t<lst::alpakatools::is_device_v<TDev> and lst::alpakatools::is_queue_v<TQueue>>>
+            typename = std::enable_if_t<alpaka::isDevice<TDev> and alpaka::isQueue<TQueue>>>
   inline CachingAllocator<TDev, TQueue>& getDeviceCachingAllocator(TDev const& device) {
     // initialise all allocators, one per device
     CMS_THREAD_SAFE static auto allocators = detail::allocate_device_allocators<TDev, TQueue>();

diff --git a/code/alpaka_interface/getHostCachingAllocator.h b/code/alpaka_interface/getHostCachingAllocator.h
@@ -1,6 +1,8 @@
 #ifndef HeterogeneousCore_AlpakaInterface_interface_getHostCachingAllocator_h
 #define HeterogeneousCore_AlpakaInterface_interface_getHostCachingAllocator_h
 
+#include <alpaka/alpaka.hpp>
+
 #include "thread_safety_macros.h"
 #include "AllocatorConfig.h"
 #include "CachingAllocator.h"
@@ -10,7 +12,7 @@
 
 namespace lst::alpakatools {
 
-  template <typename TQueue, typename = std::enable_if_t<lst::alpakatools::is_queue_v<TQueue>>>
+  template <typename TQueue, typename = std::enable_if_t<alpaka::isQueue<TQueue>>>
   inline CachingAllocator<alpaka_common::DevHost, TQueue>& getHostCachingAllocator() {
     // thread safe initialisation of the host allocator
     CMS_THREAD_SAFE static CachingAllocator<alpaka_common::DevHost, TQueue> allocator(