From 8424d0a0baae9d28e48000c6b1d785271482998a Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Mon, 31 Jul 2023 09:43:58 -0700 Subject: [PATCH 1/8] move to most recent Alpaka version --- setup.sh | 12 ++++++------ setup_hpg.sh | 9 ++++----- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/setup.sh b/setup.sh index 15973ebd9..45c83f89e 100644 --- a/setup.sh +++ b/setup.sh @@ -6,9 +6,8 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source $DIR/code/rooutil/thisrooutil.sh -export SCRAM_ARCH=el8_amd64_gcc10 -export CMSSW_VERSION=CMSSW_13_0_0_pre2 -export CUDA_HOME=/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/cuda/11.5.2-c927b7e765e06433950d8a7eab9eddb4/ +export SCRAM_ARCH=el8_amd64_gcc11 +export CMSSW_VERSION=CMSSW_13_0_0_pre4 source /cvmfs/cms.cern.ch/cmsset_default.sh cd /cvmfs/cms.cern.ch/$SCRAM_ARCH/cms/cmssw/$CMSSW_VERSION/src @@ -42,7 +41,8 @@ fi export LATEST_CPU_BENCHMARK_EFF_MUONGUN="/data2/segmentlinking/muonGun_cpu_efficiencies.root" export LATEST_CPU_BENCHMARK_EFF_PU200="/data2/segmentlinking/pu200_cpu_efficiencies.root" -source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f/etc/profile.d/init.sh -export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a" -export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f" +# Alpaka, Boost, and CUDA dependencies +export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc11/external/boost/1.80.0-5305613b2f750cf1a05dcadf0d672647" +export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc11/external/alpaka/develop-20230621-9e2225ac6c979464a40749ef9d1e0331" +export CUDA_HOME=/cvmfs/cms.cern.ch/el8_amd64_gcc11/external/cuda/11.8.0-9f0af0f4206be7b705fe550319c49a11/ #eof diff --git a/setup_hpg.sh b/setup_hpg.sh index c5ee03c90..823d9c3cc 100644 --- a/setup_hpg.sh +++ b/setup_hpg.sh @@ -11,8 +11,8 @@ module load cuda/11.4.3 git DIR="$( cd "$( 
dirname "${BASH_SOURCE[0]}" )" && pwd )" source $DIR/code/rooutil/thisrooutil.sh -export SCRAM_ARCH=el8_amd64_gcc10 -export CMSSW_VERSION=CMSSW_13_0_0_pre2 +export SCRAM_ARCH=el8_amd64_gcc11 +export CMSSW_VERSION=CMSSW_13_0_0_pre4 export CUDA_HOME=${HPC_CUDA_DIR} source /cvmfs/cms.cern.ch/cmsset_default.sh @@ -41,7 +41,6 @@ export LSTPERFORMANCEWEBDIR=/home/users/phchang/public_html/LSTPerformanceWeb export LATEST_CPU_BENCHMARK_EFF_MUONGUN= export LATEST_CPU_BENCHMARK_EFF_PU200= -source /cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f/etc/profile.d/init.sh -export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/boost/1.78.0-12075919175e8d078539685f9234134a" -export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc10/external/alpaka/develop-20220902-e80d13b043e1608b43d2007d06ad7e2f" +export BOOST_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc11/external/boost/1.80.0-5305613b2f750cf1a05dcadf0d672647" +export ALPAKA_ROOT="/cvmfs/cms.cern.ch/el8_amd64_gcc11/external/alpaka/develop-20230621-9e2225ac6c979464a40749ef9d1e0331" #eof From 4ebde568dcd11f7ef0b83e74ba63b54937e5aade Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Mon, 31 Jul 2023 09:51:37 -0700 Subject: [PATCH 2/8] remove temporary hyperbolic functions --- SDL/Hit.h | 18 ++---------------- SDL/Segment.h | 2 +- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/SDL/Hit.h b/SDL/Hit.h index c297b353e..c74c4c3b6 100644 --- a/SDL/Hit.h +++ b/SDL/Hit.h @@ -118,26 +118,12 @@ namespace SDL return alpaka::math::log(acc, val) / ln10; }; - // Hyperbolic functions were just merged into Alpaka early 2023, - // so we have to make use of temporary functions for now. 
- template - ALPAKA_FN_ACC ALPAKA_FN_INLINE float temp_acosh(TAcc const & acc, float val) - { - return alpaka::math::log(acc, val + alpaka::math::sqrt(acc, val * val - 1)); - }; - - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE float temp_sinh(TAcc const & acc, float val) - { - return 0.5 * (alpaka::math::exp(acc, val) - alpaka::math::exp(acc, -val)); - }; - template ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float eta(TAcc const & acc, float x, float y, float z) { float r3 = alpaka::math::sqrt(acc, x*x + y*y + z*z ); float rt = alpaka::math::sqrt(acc, x*x + y*y ); - float eta = ((z > 0) - ( z < 0)) * temp_acosh(acc, r3 / rt ); + float eta = ((z > 0) - ( z < 0)) * alpaka::math::acosh(acc, r3 / rt ); return eta; }; @@ -282,7 +268,7 @@ namespace SDL hitsInGPU.rts[ihit] = alpaka::math::sqrt(acc, ihit_x*ihit_x + ihit_y*ihit_y); hitsInGPU.phis[ihit] = SDL::phi(acc, ihit_x,ihit_y); // Acosh has no supported implementation in Alpaka right now. - hitsInGPU.etas[ihit] = ((ihit_z>0)-(ihit_z<0)) * SDL::temp_acosh(acc, alpaka::math::sqrt(acc, ihit_x*ihit_x+ihit_y*ihit_y+ihit_z*ihit_z)/hitsInGPU.rts[ihit]); + hitsInGPU.etas[ihit] = ((ihit_z>0)-(ihit_z<0)) * alpaka::math::acosh(acc, alpaka::math::sqrt(acc, ihit_x*ihit_x+ihit_y*ihit_y+ihit_z*ihit_z)/hitsInGPU.rts[ihit]); int found_index = binary_search(modulesInGPU.mapdetId, iDetId, nModules); uint16_t lastModuleIndex = modulesInGPU.mapIdx[found_index]; diff --git a/SDL/Segment.h b/SDL/Segment.h index 597133260..a6ea8ef7b 100644 --- a/SDL/Segment.h +++ b/SDL/Segment.h @@ -812,7 +812,7 @@ namespace SDL addMDToMemory(acc, mdsInGPU, hitsInGPU, modulesInGPU, hitIndices2[tid], hitIndices3[tid], pixelModuleIndex, 0,0,0,0,0,0,0,0,0,outerMDIndex); //in outer hits - pt, eta, phi - float slope = SDL::temp_sinh(acc, hitsInGPU.ys[mdsInGPU.outerHitIndices[innerMDIndex]]); + float slope = alpaka::math::sinh(acc, hitsInGPU.ys[mdsInGPU.outerHitIndices[innerMDIndex]]); float intercept = hitsInGPU.zs[mdsInGPU.anchorHitIndices[innerMDIndex]] - slope * 
hitsInGPU.rts[mdsInGPU.anchorHitIndices[innerMDIndex]]; float score_lsq=(hitsInGPU.rts[mdsInGPU.anchorHitIndices[outerMDIndex]] * slope + intercept) - (hitsInGPU.zs[mdsInGPU.anchorHitIndices[outerMDIndex]]); score_lsq = score_lsq * score_lsq; From 76eb86d5a9a90772b3517dc98ca156418ca48964 Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Mon, 31 Jul 2023 10:34:34 -0700 Subject: [PATCH 3/8] move to most recent caching allocator version --- code/alpaka_interface/AllocatorPolicy.h | 4 +- code/alpaka_interface/CachedBufAlloc.h | 10 +- code/alpaka_interface/CachingAllocator.h | 2 +- code/alpaka_interface/CopyToDevice.h | 35 +++ code/alpaka_interface/CopyToHost.h | 36 +++ code/alpaka_interface/ScopedContextFwd.h | 14 +- code/alpaka_interface/devices.h | 5 +- .../getDeviceCachingAllocator.h | 6 +- .../getHostCachingAllocator.h | 4 +- code/alpaka_interface/memory.h | 51 ++-- code/alpaka_interface/thread_safety_macros.h | 6 +- code/alpaka_interface/traits.h | 38 --- code/alpaka_interface/workdivision.h | 219 +++++++++++------- 13 files changed, 256 insertions(+), 174 deletions(-) create mode 100644 code/alpaka_interface/CopyToDevice.h create mode 100644 code/alpaka_interface/CopyToHost.h diff --git a/code/alpaka_interface/AllocatorPolicy.h b/code/alpaka_interface/AllocatorPolicy.h index 61b888791..5a36b1c61 100644 --- a/code/alpaka_interface/AllocatorPolicy.h +++ b/code/alpaka_interface/AllocatorPolicy.h @@ -13,7 +13,7 @@ namespace lst::alpakatools { // - Caching: (device and host) caching allocator enum class AllocatorPolicy { Synchronous = 0, Asynchronous = 1, Caching = 2 }; - template >> + template >> constexpr inline AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous; #if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED @@ -43,6 +43,8 @@ namespace lst::alpakatools { constexpr inline AllocatorPolicy allocator_policy = #if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR AllocatorPolicy::Caching; +#elif HIP_VERSION >= 
50400000 && !defined ALPAKA_DISABLE_ASYNC_ALLOCATOR + AllocatorPolicy::Asynchronous; #else AllocatorPolicy::Synchronous; #endif diff --git a/code/alpaka_interface/CachedBufAlloc.h b/code/alpaka_interface/CachedBufAlloc.h index 2fd6d8f63..bb0fce838 100644 --- a/code/alpaka_interface/CachedBufAlloc.h +++ b/code/alpaka_interface/CachedBufAlloc.h @@ -18,7 +18,7 @@ namespace lst::alpakatools { typename TDev, typename TQueue, typename = void, - typename = std::enable_if_t and lst::alpakatools::is_queue_v>> + typename = std::enable_if_t and alpaka::isQueue>> struct CachedBufAlloc { static_assert(alpaka::meta::DependentFalseType::value, "This device does not support a caching allocator"); }; @@ -159,11 +159,7 @@ namespace lst::alpakatools { }; //! The caching memory allocator implementation for the ROCm/HIP device - template >> + template struct CachedBufAlloc { template ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev, TQueue queue, TExtent const& extent) @@ -197,7 +193,7 @@ namespace lst::alpakatools { typename TExtent, typename TQueue, typename TDev, - typename = std::enable_if_t and lst::alpakatools::is_queue_v>> + typename = std::enable_if_t and alpaka::isQueue>> ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) { return traits::CachedBufAlloc, TIdx, TDev, TQueue>::allocCachedBuf(dev, queue, extent); } diff --git a/code/alpaka_interface/CachingAllocator.h b/code/alpaka_interface/CachingAllocator.h index 364edbe57..b689c544b 100644 --- a/code/alpaka_interface/CachingAllocator.h +++ b/code/alpaka_interface/CachingAllocator.h @@ -84,7 +84,7 @@ namespace lst::alpakatools { template and lst::alpakatools::is_queue_v>> + typename = std::enable_if_t and alpaka::isQueue>> class CachingAllocator { public: #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED diff --git a/code/alpaka_interface/CopyToDevice.h b/code/alpaka_interface/CopyToDevice.h new file mode 100644 index 000000000..2619e7201 --- /dev/null +++ 
b/code/alpaka_interface/CopyToDevice.h @@ -0,0 +1,35 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_CopyToDevice_h +#define HeterogeneousCore_AlpakaInterface_interface_CopyToDevice_h + +// TODO: this utility class is specific to CMSSW, but needs to be in a +// package that is suitable as DataFormat dependence + +namespace lst::alpakatools { + /** + * This class template needs to be specialized for each host-side + * EventSetup data product that should be implicitly copied to the + * device memory. The specialization is expected to define static + * copyAsync() function as in the following example + * + * \code + * template <> + * struct CopyToDevice { + * template + * static auto copyAsync(TQueue& queue, ExampleHostProduct const& hostData) { + * // construct ExampleDeviceProduct corresponding to the device of the TQueue + * // asynchronous copy hostData to the ExampleDeviceProduct object + * // return ExampleDeviceProduct object by value + * } + * }; + * \endcode + * + * The copyAsync() function should not explicitly synchronize the + * queue. The ExampleHostProduct and ExampleDeviceProduct can be the + * same type, if they internally are able to handle the memory + * allocation difference between host and device.
+ */ + template + struct CopyToDevice; +} // namespace lst::alpakatools + +#endif diff --git a/code/alpaka_interface/CopyToHost.h b/code/alpaka_interface/CopyToHost.h new file mode 100644 index 000000000..2d5ed914e --- /dev/null +++ b/code/alpaka_interface/CopyToHost.h @@ -0,0 +1,36 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_CopyToHost_h +#define HeterogeneousCore_AlpakaInterface_interface_CopyToHost_h + +// TODO: this utility class is specific to CMSSW, but needs to be in a +// package that is suitable as DataFormat dependence + +namespace lst::alpakatools { + /** + * This class template needs to be specialized for each device-side + * Event data product so that the framework can implicitly copy the + * device-side data product to the host memory. The specialization + * is expected to define static copyAsync() function as in the + * following example + * + * \code + * template <> + * struct CopyToHost { + * template + * static ExampleHostProduct copyAsync(TQueue& queue, ExampleDeviceProduct const& deviceData) { + * // construct ExampleHostProduct + * // asynchronous copy deviceData to the ExampleHostProduct object + * // return ExampleHostProduct object by value + * } + * }; + * \endcode + * + * The copyAsync() function should not explicitly synchronize the + * queue. The ExampleDeviceProduct and ExampleHostProduct can be the + * same type, if they internally are able to handle the memory + * allocation difference between host and device. 
+ */ + template + struct CopyToHost; +} // namespace lst::alpakatools + +#endif diff --git a/code/alpaka_interface/ScopedContextFwd.h b/code/alpaka_interface/ScopedContextFwd.h index 0e154a630..271856707 100644 --- a/code/alpaka_interface/ScopedContextFwd.h +++ b/code/alpaka_interface/ScopedContextFwd.h @@ -1,7 +1,7 @@ #ifndef HeterogeneousCore_AlpakaInterface_interface_ScopedContextFwd_h #define HeterogeneousCore_AlpakaInterface_interface_ScopedContextFwd_h -#include "traits.h" +#include // Forward declaration of the alpaka framework Context classes // @@ -11,23 +11,23 @@ namespace lst::alpakatools { namespace impl { - template >> + template >> class ScopedContextBase; - template >> + template >> class ScopedContextGetterBase; } // namespace impl - template >> + template >> class ScopedContextAcquire; - template >> + template >> class ScopedContextProduce; - template >> + template >> class ScopedContextTask; - template >> + template >> class ScopedContextAnalyze; } // namespace lst::alpakatools diff --git a/code/alpaka_interface/devices.h b/code/alpaka_interface/devices.h index 3b342c84f..2445ba87e 100644 --- a/code/alpaka_interface/devices.h +++ b/code/alpaka_interface/devices.h @@ -7,13 +7,12 @@ #include #include "config.h" -#include "traits.h" namespace lst::alpakatools { namespace detail { - template >> + template >> inline std::vector> enumerate_devices() { using Platform = TPlatform; using Device = alpaka::Dev; @@ -32,7 +31,7 @@ namespace lst::alpakatools { } // namespace detail // return the alpaka accelerator devices for the given platform - template >> + template >> inline std::vector> const& devices() { static const auto devices = detail::enumerate_devices(); return devices; diff --git a/code/alpaka_interface/getDeviceCachingAllocator.h b/code/alpaka_interface/getDeviceCachingAllocator.h index 89b75767b..19ed9b01d 100644 --- a/code/alpaka_interface/getDeviceCachingAllocator.h +++ b/code/alpaka_interface/getDeviceCachingAllocator.h @@ -4,6 +4,8 @@ 
#include #include +#include + #include "thread_safety_macros.h" #include "AllocatorConfig.h" #include "CachingAllocator.h" @@ -16,7 +18,7 @@ namespace lst::alpakatools { template and lst::alpakatools::is_queue_v>> + typename = std::enable_if_t and alpaka::isQueue>> auto allocate_device_allocators() { using Allocator = CachingAllocator; auto const& devices = lst::alpakatools::devices>(); @@ -72,7 +74,7 @@ namespace lst::alpakatools { template and lst::alpakatools::is_queue_v>> + typename = std::enable_if_t and alpaka::isQueue>> inline CachingAllocator& getDeviceCachingAllocator(TDev const& device) { // initialise all allocators, one per device CMS_THREAD_SAFE static auto allocators = detail::allocate_device_allocators(); diff --git a/code/alpaka_interface/getHostCachingAllocator.h b/code/alpaka_interface/getHostCachingAllocator.h index 9fa5321c4..a0ad20ff6 100644 --- a/code/alpaka_interface/getHostCachingAllocator.h +++ b/code/alpaka_interface/getHostCachingAllocator.h @@ -1,6 +1,8 @@ #ifndef HeterogeneousCore_AlpakaInterface_interface_getHostCachingAllocator_h #define HeterogeneousCore_AlpakaInterface_interface_getHostCachingAllocator_h +#include + #include "thread_safety_macros.h" #include "AllocatorConfig.h" #include "CachingAllocator.h" @@ -10,7 +12,7 @@ namespace lst::alpakatools { - template >> + template >> inline CachingAllocator& getHostCachingAllocator() { // thread safe initialisation of the host allocator CMS_THREAD_SAFE static CachingAllocator allocator( diff --git a/code/alpaka_interface/memory.h b/code/alpaka_interface/memory.h index f6155104f..0cccd8375 100644 --- a/code/alpaka_interface/memory.h +++ b/code/alpaka_interface/memory.h @@ -18,7 +18,7 @@ namespace lst::alpakatools { // type deduction helpers namespace detail { - template >> + template >> struct buffer_type { using type = alpaka::Buf; }; @@ -33,7 +33,7 @@ namespace lst::alpakatools { using type = alpaka::Buf; }; - template >> + template >> struct view_type { using type = 
alpaka::ViewPlainPtr; }; @@ -66,13 +66,13 @@ namespace lst::alpakatools { } template - std::enable_if_t and not std::is_array_v>, host_buffer> + std::enable_if_t and not std::is_array_v>, host_buffer> make_host_buffer(Extent extent) { return alpaka::allocBuf, Idx>(host(), Vec1D{extent}); } template - std::enable_if_t and not std::is_array_v>, host_buffer> + std::enable_if_t and not std::is_array_v>, host_buffer> make_host_buffer() { return alpaka::allocBuf, Idx>(host(), Vec1D{std::extent_v}); } @@ -86,13 +86,13 @@ namespace lst::alpakatools { } template - std::enable_if_t and not std::is_array_v>, host_buffer> + std::enable_if_t and not std::is_array_v>, host_buffer> make_host_buffer(Extent extent) { return alpaka::allocMappedBuf, Idx>(host(), Vec1D{extent}); } template - std::enable_if_t and not std::is_array_v>, host_buffer> + std::enable_if_t and not std::is_array_v>, host_buffer> make_host_buffer() { return alpaka::allocMappedBuf, Idx>(host(), Vec1D{std::extent_v}); } @@ -101,7 +101,8 @@ namespace lst::alpakatools { // the memory is pinned according to the device associated to the queue template - std::enable_if_t and not std::is_array_v, host_buffer> make_host_buffer(TQueue const& queue) { + std::enable_if_t and not std::is_array_v, host_buffer> make_host_buffer( + TQueue const& queue) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf(host(), queue, Scalar{}); } else { @@ -110,7 +111,8 @@ namespace lst::alpakatools { } template - std::enable_if_t and cms::is_unbounded_array_v and not std::is_array_v>, + std::enable_if_t and lst::is_unbounded_array_v and + not std::is_array_v>, host_buffer> make_host_buffer(TQueue const& queue, Extent extent) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { @@ -122,7 +124,8 @@ namespace lst::alpakatools { } template - std::enable_if_t and cms::is_bounded_array_v and not std::is_array_v>, + std::enable_if_t and lst::is_bounded_array_v and + not std::is_array_v>, host_buffer> 
make_host_buffer(TQueue const& queue) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { @@ -149,42 +152,44 @@ namespace lst::alpakatools { } template - std::enable_if_t and not std::is_array_v>, host_view> + std::enable_if_t and not std::is_array_v>, host_view> make_host_view(T& data, Extent extent) { return alpaka::ViewPlainPtr, Dim1D, Idx>(data, host(), Vec1D{extent}); } template - std::enable_if_t and not std::is_array_v>, host_view> + std::enable_if_t and not std::is_array_v>, host_view> make_host_view(T& data) { return alpaka::ViewPlainPtr, Dim1D, Idx>(data, host(), Vec1D{std::extent_v}); } // scalar and 1-dimensional device buffers - template >> + template >> using device_buffer = typename detail::buffer_type::type; - template >> + template >> using const_device_buffer = alpaka::ViewConst>; // non-cached, scalar and 1-dimensional device buffers template - std::enable_if_t and not std::is_array_v, device_buffer> make_device_buffer( + std::enable_if_t and not std::is_array_v, device_buffer> make_device_buffer( TDev const& device) { return alpaka::allocBuf(device, Scalar{}); } template - std::enable_if_t and cms::is_unbounded_array_v and not std::is_array_v>, + std::enable_if_t and lst::is_unbounded_array_v and + not std::is_array_v>, device_buffer> make_device_buffer(TDev const& device, Extent extent) { return alpaka::allocBuf, Idx>(device, Vec1D{extent}); } template - std::enable_if_t and cms::is_bounded_array_v and not std::is_array_v>, + std::enable_if_t and lst::is_bounded_array_v and + not std::is_array_v>, device_buffer> make_device_buffer(TDev const& device) { return alpaka::allocBuf, Idx>(device, Vec1D{std::extent_v}); @@ -193,7 +198,7 @@ namespace lst::alpakatools { // potentially-cached, scalar and 1-dimensional device buffers with queue-ordered semantic template - std::enable_if_t and not std::is_array_v, device_buffer, T>> + std::enable_if_t and not std::is_array_v, device_buffer, T>> make_device_buffer(TQueue const& queue) { if 
constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf(alpaka::getDev(queue), queue, Scalar{}); @@ -207,7 +212,8 @@ namespace lst::alpakatools { } template - std::enable_if_t and cms::is_unbounded_array_v and not std::is_array_v>, + std::enable_if_t and lst::is_unbounded_array_v and + not std::is_array_v>, device_buffer, T>> make_device_buffer(TQueue const& queue, Extent extent) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { @@ -222,7 +228,8 @@ namespace lst::alpakatools { } template - std::enable_if_t and cms::is_bounded_array_v and not std::is_array_v>, + std::enable_if_t and lst::is_bounded_array_v and + not std::is_array_v>, device_buffer, T>> make_device_buffer(TQueue const& queue) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { @@ -238,7 +245,7 @@ namespace lst::alpakatools { // scalar and 1-dimensional device views - template >> + template >> using device_view = typename detail::view_type::type; template @@ -252,13 +259,13 @@ namespace lst::alpakatools { } template - std::enable_if_t and not std::is_array_v>, device_view> + std::enable_if_t and not std::is_array_v>, device_view> make_device_view(TDev const& device, T& data, Extent extent) { return alpaka::ViewPlainPtr, Dim1D, Idx>(data, device, Vec1D{extent}); } template - std::enable_if_t and not std::is_array_v>, device_view> + std::enable_if_t and not std::is_array_v>, device_view> make_device_view(TDev const& device, T& data) { return alpaka::ViewPlainPtr, Dim1D, Idx>(data, device, Vec1D{std::extent_v}); } diff --git a/code/alpaka_interface/thread_safety_macros.h b/code/alpaka_interface/thread_safety_macros.h index 3abbe0b9e..be2299334 100644 --- a/code/alpaka_interface/thread_safety_macros.h +++ b/code/alpaka_interface/thread_safety_macros.h @@ -1,9 +1,9 @@ #ifndef FWCore_Utilites_thread_safe_macros_h #define FWCore_Utilites_thread_safe_macros_h #if !defined __CLING__ && !defined __INTEL_COMPILER && !defined __NVCC__ -#define CMS_THREAD_SAFE 
[[cms::thread_safe]] -#define CMS_SA_ALLOW [[cms::sa_allow]] -#define CMS_THREAD_GUARD(_var_) [[cms::thread_guard(#_var_)]] +#define CMS_THREAD_SAFE [[lst::thread_safe]] +#define CMS_SA_ALLOW [[lst::sa_allow]] +#define CMS_THREAD_GUARD(_var_) [[lst::thread_guard(#_var_)]] #else #define CMS_THREAD_SAFE #define CMS_SA_ALLOW diff --git a/code/alpaka_interface/traits.h b/code/alpaka_interface/traits.h index c7d2b20ef..c469daf60 100644 --- a/code/alpaka_interface/traits.h +++ b/code/alpaka_interface/traits.h @@ -25,42 +25,4 @@ namespace cms { inline constexpr bool is_unbounded_array_v = is_unbounded_array::value; } // namespace cms -#include - -namespace lst::alpakatools { - - // is_platform - - template - using is_platform = alpaka::concepts::ImplementsConcept; - - template - inline constexpr bool is_platform_v = is_platform::value; - - // is_device - - template - using is_device = alpaka::concepts::ImplementsConcept; - - template - inline constexpr bool is_device_v = is_device::value; - - // is_accelerator - - template - using is_accelerator = alpaka::concepts::ImplementsConcept; - - template - inline constexpr bool is_accelerator_v = is_accelerator::value; - - // is_queue - - template - using is_queue = alpaka::concepts::ImplementsConcept; - - template - inline constexpr bool is_queue_v = is_queue::value; - -} // namespace lst::alpakatools - #endif // HeterogeneousCore_AlpakaInterface_interface_traits_h diff --git a/code/alpaka_interface/workdivision.h b/code/alpaka_interface/workdivision.h index 6153af74f..4b358f0c9 100644 --- a/code/alpaka_interface/workdivision.h +++ b/code/alpaka_interface/workdivision.h @@ -7,7 +7,6 @@ #include "config.h" #include "traits.h" -#include "vec.h" namespace lst::alpakatools { @@ -19,27 +18,33 @@ namespace lst::alpakatools { // Return the integer division of the first argument by the second argument, rounded up to the next integer inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; } - 
// Create an accelerator-dependent work division for 1-dimensional kernels - template and alpaka::Dim::value == 1>> - inline WorkDiv make_workdiv(Idx blocks, Idx elements) { + // Trait describing whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped + template >> + struct requires_single_thread_per_block : public std::true_type {}; + #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - if constexpr (std::is_same_v>) { - // On GPU backends, each thread is looking at a single element: - // - the number of threads per block is "elements"; - // - the number of elements per thread is always 1. - return WorkDiv(blocks, elements, Idx{1}); - } else + template + struct requires_single_thread_per_block> : public std::false_type {}; #endif // ALPAKA_ACC_GPU_CUDA_ENABLED -#if ALPAKA_ACC_GPU_HIP_ENABLED - if constexpr (std::is_same_v>) { + +#ifdef ALPAKA_ACC_GPU_HIP_ENABLED + template + struct requires_single_thread_per_block> : public std::false_type {}; +#endif // ALPAKA_ACC_GPU_HIP_ENABLED + + // Whether or not the accelerator expects the threads-per-block and elements-per-thread to be swapped + template >> + inline constexpr bool requires_single_thread_per_block_v = requires_single_thread_per_block::value; + + // Create an accelerator-dependent work division for 1-dimensional kernels + template and alpaka::Dim::value == 1>> + inline WorkDiv make_workdiv(Idx blocks, Idx elements) { + if constexpr (not requires_single_thread_per_block_v) { // On GPU backends, each thread is looking at a single element: // - the number of threads per block is "elements"; // - the number of elements per thread is always 1. return WorkDiv(blocks, elements, Idx{1}); - } else -#endif // ALPAKA_ACC_GPU_HIP_ENABLED - { + } else { // On CPU backends, run serially with a single thread per block: // - the number of threads per block is always 1; // - the number of elements per thread is "elements". 
@@ -48,27 +53,16 @@ namespace lst::alpakatools { } // Create the accelerator-dependent workdiv for N-dimensional kernels - template >> + template >> inline WorkDiv> make_workdiv(const Vec>& blocks, const Vec>& elements) { using Dim = alpaka::Dim; -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - if constexpr (std::is_same_v>) { + if constexpr (not requires_single_thread_per_block_v) { // On GPU backends, each thread is looking at a single element: // - the number of threads per block is "elements"; // - the number of elements per thread is always 1. return WorkDiv(blocks, elements, Vec::ones()); - } else -#endif // ALPAKA_ACC_GPU_CUDA_ENABLED -#ifdef ALPAKA_ACC_GPU_HIP_ENABLED - if constexpr (std::is_same_v>) { - // On GPU backends, each thread is looking at a single element: - // - the number of threads per block is "elements"; - // - the number of elements per thread is always 1. - return WorkDiv(blocks, elements, Vec::ones()); - } else -#endif // ALPAKA_ACC_GPU_HIP_ENABLED - { + } else { // On CPU backends, run serially with a single thread per block: // - the number of threads per block is always 1; // - the number of elements per thread is "elements". 
@@ -76,8 +70,7 @@ namespace lst::alpakatools { } } - template and alpaka::Dim::value == 1>> + template and alpaka::Dim::value == 1>> class elements_with_stride { public: ALPAKA_FN_ACC inline elements_with_stride(TAcc const& acc) @@ -108,13 +101,15 @@ namespace lst::alpakatools { // pre-increment the iterator ALPAKA_FN_ACC inline iterator& operator++() { - // increment the index along the elements processed by the current thread - ++index_; - if (index_ < last_) - return *this; + if constexpr (requires_single_thread_per_block_v) { + // increment the index along the elements processed by the current thread + ++index_; + if (index_ < last_) + return *this; + } // increment the thread index with the grid stride - first_ += stride_ * elements_; + first_ += stride_; index_ = first_; last_ = std::min(first_ + elements_, extent_); if (index_ < extent_) @@ -162,8 +157,7 @@ namespace lst::alpakatools { const Idx extent_; }; - template and (alpaka::Dim::value > 0)>> + template and (alpaka::Dim::value > 0)>> class elements_with_stride_nd { public: using Dim = alpaka::Dim; @@ -183,76 +177,123 @@ namespace lst::alpakatools { class iterator { friend class elements_with_stride_nd; - constexpr static const auto last_dimension = Dim::value - 1; - - ALPAKA_FN_ACC inline iterator(Vec elements, Vec stride, Vec extent, Vec first) - : elements_{elements}, - stride_{stride}, - extent_{extent}, - first_{alpaka::elementwise_min(first, extent)}, - index_{first_}, - last_{std::min(first[last_dimension] + elements[last_dimension], extent[last_dimension])} {} public: ALPAKA_FN_ACC inline Vec operator*() const { return index_; } // pre-increment the iterator - ALPAKA_FN_ACC inline iterator& operator++() { - // increment the index along the elements processed by the current thread - ++index_[last_dimension]; - if (index_[last_dimension] < last_) - return *this; - - // increment the thread index along with the last dimension with the grid stride - first_[last_dimension] += stride_[last_dimension] 
* elements_[last_dimension]; - index_[last_dimension] = first_[last_dimension]; - last_ = std::min(first_[last_dimension] + elements_[last_dimension], extent_[last_dimension]); - if (index_[last_dimension] < extent_[last_dimension]) - return *this; - - // increment the thread index along the outer dimensions with the grid stride - if constexpr (last_dimension > 0) - for (auto dimension = last_dimension - 1; dimension >= 0; --dimension) { - first_[dimension] += stride_[dimension]; - index_[dimension] = first_[dimension]; - if (index_[dimension] < extent_[dimension]) - return *this; - } - - // the iterator has reached or passed the end of the extent, clamp it to the extent - first_ = extent_; - index_ = extent_; - last_ = extent_[last_dimension]; + ALPAKA_FN_ACC constexpr inline iterator operator++() { + increment(); return *this; } // post-increment the iterator - ALPAKA_FN_ACC inline iterator operator++(int) { + ALPAKA_FN_ACC constexpr inline iterator operator++(int) { iterator old = *this; - ++(*this); + increment(); return old; } - ALPAKA_FN_ACC inline bool operator==(iterator const& other) const { - return (index_ == other.index_) and (first_ == other.first_); - } + ALPAKA_FN_ACC constexpr inline bool operator==(iterator const& other) const { return (index_ == other.index_); } - ALPAKA_FN_ACC inline bool operator!=(iterator const& other) const { return not(*this == other); } + ALPAKA_FN_ACC constexpr inline bool operator!=(iterator const& other) const { return not(*this == other); } private: - // non-const to support iterator copy and assignment - Vec elements_; - Vec stride_; - Vec extent_; + // private, explicit constructor + ALPAKA_FN_ACC inline iterator(elements_with_stride_nd const* loop, Vec first) + : loop_{loop}, + thread_{alpaka::elementwise_min(first, loop->extent_)}, + range_{alpaka::elementwise_min(first + loop->elements_, loop->extent_)}, + index_{thread_} {} + + template + ALPAKA_FN_ACC inline constexpr bool nth_elements_loop() { + bool overflow = 
false; + ++index_[I]; + if (index_[I] >= range_[I]) { + index_[I] = thread_[I]; + overflow = true; + } + return overflow; + } + + template + ALPAKA_FN_ACC inline constexpr bool do_elements_loops() { + if constexpr (N == 0) { + // overflow + return true; + } else { + if (not nth_elements_loop()) { + return false; + } else { + return do_elements_loops(); + } + } + } + + template + ALPAKA_FN_ACC inline constexpr bool nth_strided_loop() { + bool overflow = false; + thread_[I] += loop_->stride_[I]; + if (thread_[I] >= loop_->extent_[I]) { + thread_[I] = loop_->first_[I]; + overflow = true; + } + index_[I] = thread_[I]; + range_[I] = std::min(thread_[I] + loop_->elements_[I], loop_->extent_[I]); + return overflow; + } + + template + ALPAKA_FN_ACC inline constexpr bool do_strided_loops() { + if constexpr (N == 0) { + // overflow + return true; + } else { + if (not nth_strided_loop()) { + return false; + } else { + return do_strided_loops(); + } + } + } + + // increment the iterator + ALPAKA_FN_ACC inline constexpr void increment() { + if constexpr (requires_single_thread_per_block_v) { + // linear N-dimensional loops over the elements associated to the thread; + // do_elements_loops<>() returns true if any of those loops overflows + if (not do_elements_loops()) { + // the elements loops did not overflow, return the next index + return; + } + } + + // strided N-dimensional loop over the threads in the kernel launch grid; + // do_strided_loops<>() returns true if any of those loops overflows + if (not do_strided_loops()) { + // the strided loops did not overflow, return the next index + return; + } + + // the iterator has reached or passed the end of the extent, clamp it to the extent + thread_ = loop_->extent_; + range_ = loop_->extent_; + index_ = loop_->extent_; + } + + // const pointer to the elements_with_stride_nd that the iterator refers to + const elements_with_stride_nd* loop_; + // modified by the pre/post-increment operator - Vec first_; - Vec index_; - Idx 
last_; + Vec thread_; // first element processed by this thread + Vec range_; // last element processed by this thread + Vec index_; // current element processed by this thread }; - ALPAKA_FN_ACC inline iterator begin() const { return iterator(elements_, stride_, extent_, first_); } + ALPAKA_FN_ACC inline iterator begin() const { return iterator{this, first_}; } - ALPAKA_FN_ACC inline iterator end() const { return iterator(elements_, stride_, extent_, extent_); } + ALPAKA_FN_ACC inline iterator end() const { return iterator{this, extent_}; } private: const Vec elements_; From f935655b1f678e4f81f0a5ffd5905341d1dde378 Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Mon, 31 Jul 2023 11:00:51 -0700 Subject: [PATCH 4/8] add last commit from CMSSW --- code/alpaka_interface/AlpakaServiceFwd.h | 4 ++-- code/alpaka_interface/CachingAllocator.h | 2 +- code/alpaka_interface/config.h | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/code/alpaka_interface/AlpakaServiceFwd.h b/code/alpaka_interface/AlpakaServiceFwd.h index 4345f3f34..f0f88287b 100644 --- a/code/alpaka_interface/AlpakaServiceFwd.h +++ b/code/alpaka_interface/AlpakaServiceFwd.h @@ -13,9 +13,9 @@ namespace alpaka_cuda_async { #endif // ALPAKA_ACC_GPU_CUDA_ENABLED #ifdef ALPAKA_ACC_GPU_HIP_ENABLED -namespace alpaka_hip_async { +namespace alpaka_rocm_async { class AlpakaService; -} // namespace alpaka_hip_async +} // namespace alpaka_rocm_async #endif // ALPAKA_ACC_GPU_HIP_ENABLED #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED diff --git a/code/alpaka_interface/CachingAllocator.h b/code/alpaka_interface/CachingAllocator.h index b689c544b..226a30cb3 100644 --- a/code/alpaka_interface/CachingAllocator.h +++ b/code/alpaka_interface/CachingAllocator.h @@ -91,7 +91,7 @@ namespace lst::alpakatools { friend class alpaka_cuda_async::AlpakaService; #endif #ifdef ALPAKA_ACC_GPU_HIP_ENABLED - friend class alpaka_hip_async::AlpakaService; + friend class alpaka_rocm_async::AlpakaService; #endif #ifdef 
ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED friend class alpaka_serial_sync::AlpakaService; diff --git a/code/alpaka_interface/config.h b/code/alpaka_interface/config.h index 354a93b91..a99e5d4d9 100644 --- a/code/alpaka_interface/config.h +++ b/code/alpaka_interface/config.h @@ -65,7 +65,7 @@ namespace alpaka_cuda_async { #endif // ALPAKA_ACC_GPU_CUDA_ENABLED #ifdef ALPAKA_ACC_GPU_HIP_ENABLED -namespace alpaka_hip_async { +namespace alpaka_rocm_async { using namespace alpaka_common; using Platform = alpaka::PltfHipRt; @@ -79,13 +79,13 @@ namespace alpaka_hip_async { using Acc2D = Acc; using Acc3D = Acc; -} // namespace alpaka_hip_async +} // namespace alpaka_rocm_async #ifdef ALPAKA_ACCELERATOR_NAMESPACE #define ALPAKA_DUPLICATE_NAMESPACE #else -#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_hip_async -#define ALPAKA_TYPE_SUFFIX HipAsync +#define ALPAKA_ACCELERATOR_NAMESPACE alpaka_rocm_async +#define ALPAKA_TYPE_SUFFIX ROCmAsync #endif #endif // ALPAKA_ACC_GPU_HIP_ENABLED From 5da452f66c255a37f4a48ccaf56a9ee0a50b1ff2 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Sun, 6 Aug 2023 15:24:10 -0400 Subject: [PATCH 5/8] remove old comment --- SDL/Hit.h | 1 - 1 file changed, 1 deletion(-) diff --git a/SDL/Hit.h b/SDL/Hit.h index c74c4c3b6..6ae2415c9 100644 --- a/SDL/Hit.h +++ b/SDL/Hit.h @@ -267,7 +267,6 @@ namespace SDL hitsInGPU.rts[ihit] = alpaka::math::sqrt(acc, ihit_x*ihit_x + ihit_y*ihit_y); hitsInGPU.phis[ihit] = SDL::phi(acc, ihit_x,ihit_y); - // Acosh has no supported implementation in Alpaka right now. 
hitsInGPU.etas[ihit] = ((ihit_z>0)-(ihit_z<0)) * alpaka::math::acosh(acc, alpaka::math::sqrt(acc, ihit_x*ihit_x+ihit_y*ihit_y+ihit_z*ihit_z)/hitsInGPU.rts[ihit]); int found_index = binary_search(modulesInGPU.mapdetId, iDetId, nModules); uint16_t lastModuleIndex = modulesInGPU.mapIdx[found_index]; From 6ea9524531a1cee0c263e7b17336915d2744af31 Mon Sep 17 00:00:00 2001 From: GNiendorf Date: Mon, 7 Aug 2023 13:46:55 -0400 Subject: [PATCH 6/8] add Andrea Bocci's latest commit --- code/alpaka_interface/CachingAllocator.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/code/alpaka_interface/CachingAllocator.h b/code/alpaka_interface/CachingAllocator.h index 226a30cb3..89a62076d 100644 --- a/code/alpaka_interface/CachingAllocator.h +++ b/code/alpaka_interface/CachingAllocator.h @@ -82,9 +82,7 @@ namespace lst::alpakatools { * - the `Queue` type can be either `Sync` _or_ `Async` on any allocation. */ - template and alpaka::isQueue>> + template class CachingAllocator { public: #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED @@ -106,6 +104,8 @@ namespace lst::alpakatools { using Buffer = alpaka::Buf, size_t>; // The "memory device" type can either be the same as the "synchronisation device" type, or be the host CPU. 
+ static_assert(alpaka::isDevice, "TDev should be an alpaka Device type."); + static_assert(alpaka::isQueue, "TQueue should be an alpaka Queue type."); static_assert(std::is_same_v> or std::is_same_v, "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be the " "host CPU."); From 7f33a42bf0323b4b7b849214a71abd31a1f00e15 Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Mon, 28 Aug 2023 14:23:13 -0400 Subject: [PATCH 7/8] Update README.md for Alpaka CMSSW Integration --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3aac7ca1d..566881e94 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,7 @@ cd CMSSW_13_0_0_pre4/src cmsenv git cms-init git remote add SegLink git@github.com:SegmentLinking/cmssw.git -git fetch SegLink CMSSW_13_0_0_pre4_LST_X +git fetch SegLink CMSSW_13_0_0_pre4_LST_X_alpaka git cms-addpkg RecoTracker Configuration git checkout CMSSW_13_0_0_pre4_LST_X cat <lst.xml From 80047ad8cc61a0cbb2d43911f9bcb1fca8eaa579 Mon Sep 17 00:00:00 2001 From: Gavin Niendorf Date: Mon, 28 Aug 2023 14:24:27 -0400 Subject: [PATCH 8/8] Update README Correction for Alpaka --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 566881e94..bf203488f 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ git cms-init git remote add SegLink git@github.com:SegmentLinking/cmssw.git git fetch SegLink CMSSW_13_0_0_pre4_LST_X_alpaka git cms-addpkg RecoTracker Configuration -git checkout CMSSW_13_0_0_pre4_LST_X +git checkout CMSSW_13_0_0_pre4_LST_X_alpaka cat <lst.xml