cms-sw · cmsbuild · Jul 2, 2024 · Jun 28, 2024 · Jun 28, 2024
diff --git a/HeterogeneousCore/AlpakaInterface/interface/AllocatorConfig.h b/HeterogeneousCore/AlpakaInterface/interface/AllocatorConfig.h
@@ -6,26 +6,50 @@
 
 namespace cms::alpakatools {
 
-  namespace config {
+  struct AllocatorConfig {
+    // Bin growth factor (bin_growth in cub::CachingDeviceAllocator)
+    unsigned int binGrowth = 2;
 
-    // bin growth factor (bin_growth in cub::CachingDeviceAllocator)
-    constexpr unsigned int binGrowth = 2;
+    // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CachingDeviceAllocator
+    unsigned int minBin = 8;  // 256 bytes
 
-    // smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CachingDeviceAllocator
-    constexpr unsigned int minBin = 8;  // 256 bytes
+    // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator).
+    // Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
+    unsigned int maxBin = 30;  // 1 GB
 
-    // largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
-    constexpr unsigned int maxBin = 30;  // 1 GB
+    // Total storage for the allocator; 0 means no limit.
+    size_t maxCachedBytes = 0;
 
-    // total storage for the allocator; 0 means no limit.
-    constexpr size_t maxCachedBytes = 0;
+    // Fraction of total device memory taken for the allocator; 0 means no limit.
+    // If both maxCachedBytes and maxCachedFraction are non-zero, the smallest resulting value is used.
+    double maxCachedFraction = 0.8;
 
-    // fraction of total device memory taken for the allocator; 0 means no limit.
-    constexpr double maxCachedFraction = 0.8;
+    // Fill all newly allocated or re-used memory blocks with fillAllocationValue.
+    bool fillAllocations = false;
 
-    // if both maxCachedBytes and maxCachedFraction are non-zero, the smallest resulting value is used.
+    // Fill only the re-used memory blocks with fillReallocationValue.
+    // If both fillAllocations and fillReallocations are true, fillAllocationValue is used for newly allocated blocks and fillReallocationValue is used for re-allocated blocks.
+    bool fillReallocations = false;
 
-  }  // namespace config
+    // Fill memory blocks with fillDeallocationValue before freeing or caching them for re-use
+    bool fillDeallocations = false;
+
+    // Fill memory blocks with fillCacheValue before caching them for re-use.
+    // If both fillDeallocations and fillCaches are true, fillDeallocationValue is used for blocks about to be freed and fillCacheValue is used for blocks about to be cached.
+    bool fillCaches = false;
+
+    // Byte value used to fill all newly allocated or re-used memory blocks
+    uint8_t fillAllocationValue = 0xA5;
+
+    // Byte value used to fill all re-used memory blocks
+    uint8_t fillReallocationValue = 0x69;
+
+    // Byte value used to fill all deallocated or cached memory blocks
+    uint8_t fillDeallocationValue = 0x5A;
+
+    // Byte value used to fill all cached memory blocks
+    uint8_t fillCacheValue = 0x96;
+  };
 
 }  // namespace cms::alpakatools
 

diff --git a/HeterogeneousCore/AlpakaInterface/interface/CachingAllocator.h b/HeterogeneousCore/AlpakaInterface/interface/CachingAllocator.h
@@ -16,7 +16,7 @@
 #include <alpaka/alpaka.hpp>
 
 #include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
-#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/AllocatorConfig.h"
 #include "HeterogeneousCore/AlpakaInterface/interface/AlpakaServiceFwd.h"
 
 // Inspired by cub::CachingDeviceAllocator
@@ -37,18 +37,20 @@ namespace cms::alpakatools {
       return power;
     }
 
-    // format a memory size in B/kB/MB/GB
+    // format a memory size in B/KiB/MiB/GiB/TiB
     inline std::string as_bytes(size_t value) {
       if (value == std::numeric_limits<size_t>::max()) {
         return "unlimited";
-      } else if (value >= (1 << 30) and value % (1 << 30) == 0) {
-        return std::to_string(value >> 30) + " GB";
-      } else if (value >= (1 << 20) and value % (1 << 20) == 0) {
-        return std::to_string(value >> 20) + " MB";
-      } else if (value >= (1 << 10) and value % (1 << 10) == 0) {
-        return std::to_string(value >> 10) + " kB";
+      } else if (value >= (1ul << 40) and value % (1ul << 40) == 0) {
+        return std::to_string(value >> 40) + " TiB";
+      } else if (value >= (1ul << 30) and value % (1ul << 30) == 0) {
+        return std::to_string(value >> 30) + " GiB";
+      } else if (value >= (1ul << 20) and value % (1ul << 20) == 0) {
+        return std::to_string(value >> 20) + " MiB";
+      } else if (value >= (1ul << 10) and value % (1ul << 10) == 0) {
+        return std::to_string(value >> 10) + " KiB";
       } else {
-        return std::to_string(value) + "  B";
+        return std::to_string(value) + "   B";
       }
     }
 
@@ -119,27 +121,27 @@ namespace cms::alpakatools {
 
     explicit CachingAllocator(
         Device const& device,
-        unsigned int binGrowth,          // bin growth factor;
-        unsigned int minBin,             // smallest bin, corresponds to binGrowth^minBin bytes;
-                                         // smaller allocations are rounded to this value;
-        unsigned int maxBin,             // largest bin, corresponds to binGrowth^maxBin bytes;
-                                         // larger allocations will fail;
-        size_t maxCachedBytes,           // total storage for the allocator (0 means no limit);
-        double maxCachedFraction,        // fraction of total device memory taken for the allocator (0 means no limit);
-                                         // if both maxCachedBytes and maxCachedFraction are non-zero,
-                                         // the smallest resulting value is used.
+        AllocatorConfig const& config,
         bool reuseSameQueueAllocations,  // reuse non-ready allocations if they are in the same queue as the new one;
                                          // this is safe only if all memory operations are scheduled in the same queue
-        bool debug)
+        bool debug = false)
         : device_(device),
-          binGrowth_(binGrowth),
-          minBin_(minBin),
-          maxBin_(maxBin),
-          minBinBytes_(detail::power(binGrowth, minBin)),
-          maxBinBytes_(detail::power(binGrowth, maxBin)),
-          maxCachedBytes_(cacheSize(maxCachedBytes, maxCachedFraction)),
+          binGrowth_(config.binGrowth),
+          minBin_(config.minBin),
+          maxBin_(config.maxBin),
+          minBinBytes_(detail::power(binGrowth_, minBin_)),
+          maxBinBytes_(detail::power(binGrowth_, maxBin_)),
+          maxCachedBytes_(cacheSize(config.maxCachedBytes, config.maxCachedFraction)),
           reuseSameQueueAllocations_(reuseSameQueueAllocations),
-          debug_(debug) {
+          debug_(debug),
+          fillAllocations_(config.fillAllocations),
+          fillAllocationValue_(config.fillAllocationValue),
+          fillReallocations_(config.fillReallocations),
+          fillReallocationValue_(config.fillReallocationValue),
+          fillDeallocations_(config.fillDeallocations),
+          fillDeallocationValue_(config.fillDeallocationValue),
+          fillCaches_(config.fillCaches),
+          fillCacheValue_(config.fillCacheValue) {
       if (debug_) {
         std::ostringstream out;
         out << "CachingAllocator settings\n"
@@ -148,7 +150,7 @@ namespace cms::alpakatools {
             << "  max bin    " << maxBin_ << "\n"
             << "  resulting bins:\n";
         for (auto bin = minBin_; bin <= maxBin_; ++bin) {
-          auto binSize = detail::power(binGrowth, bin);
+          auto binSize = detail::power(binGrowth_, bin);
           out << "    " << std::right << std::setw(12) << detail::as_bytes(binSize) << '\n';
         }
         out << "  maximum amount of cached memory: " << detail::as_bytes(maxCachedBytes_);
@@ -182,8 +184,19 @@ namespace cms::alpakatools {
       std::tie(block.bin, block.bytes) = findBin(bytes);
 
       // try to re-use a cached block, or allocate a new buffer
-      if (not tryReuseCachedBlock(block)) {
+      if (tryReuseCachedBlock(block)) {
+        // fill the re-used memory block with a pattern
+        if (fillReallocations_) {
+          alpaka::memset(*block.queue, *block.buffer, fillReallocationValue_);
+        } else if (fillAllocations_) {
+          alpaka::memset(*block.queue, *block.buffer, fillAllocationValue_);
+        }
+      } else {
         allocateNewBlock(block);
+        // fill the newly allocated memory block with a pattern
+        if (fillAllocations_) {
+          alpaka::memset(*block.queue, *block.buffer, fillAllocationValue_);
+        }
       }
 
       return block.buffer->data();
@@ -215,11 +228,18 @@ namespace cms::alpakatools {
         // because of multiple exceptions it is best to ignore these
         // errors.
         try {
+          // fill memory blocks with a pattern before caching them
+          if (fillCaches_) {
+            alpaka::memset(*block.queue, *block.buffer, fillCacheValue_);
+          } else if (fillDeallocations_) {
+            alpaka::memset(*block.queue, *block.buffer, fillDeallocationValue_);
+          }
+          // record in the block a marker associated to the work queue
           alpaka::enqueue(*(block.queue), *(block.event));
         } catch (std::exception& e) {
           if (debug_) {
             std::ostringstream out;
-            out << "CachingAllocator::free() error from alpaka::enqueue(): " << e.what() << "\n";
+            out << "CachingAllocator::free() caught an alpaka error: " << e.what() << "\n";
             out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << block.bytes << " bytes at "
                 << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << ", event "
                 << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size()
@@ -244,6 +264,30 @@ namespace cms::alpakatools {
           std::cout << out.str() << std::endl;
         }
       } else {
+        // If the memset fails, very likely an error has occurred in the
+        // asynchronous processing. In that case the error will show up in all
+        // device API function calls, and the free() will be called by
+        // destructors during stack unwinding. In order to avoid terminate()
+        // being called because of multiple exceptions it is best to ignore
+        // these errors.
+        try {
+          // fill memory blocks with a pattern before freeing them
+          if (fillDeallocations_) {
+            alpaka::memset(*block.queue, *block.buffer, fillDeallocationValue_);
+          }
+        } catch (std::exception& e) {
+          if (debug_) {
+            std::ostringstream out;
+            out << "CachingAllocator::free() caught an alpaka error: " << e.what() << "\n";
+            out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << block.bytes << " bytes at "
+                << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << ", event "
+                << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size()
+                << " available blocks cached (" << cachedBytes_.free << " bytes), " << liveBlocks_.size()
+                << " live blocks (" << cachedBytes_.live << " bytes) outstanding." << std::endl;
+            std::cout << out.str() << std::endl;
+          }
+          return;
+        }
         // if the buffer is not recached, it is automatically freed when block goes out of scope
         if (debug_) {
           std::ostringstream out;
@@ -452,6 +496,15 @@ namespace cms::alpakatools {
 
     const bool reuseSameQueueAllocations_;
     const bool debug_;
+
+    const bool fillAllocations_;
+    const uint8_t fillAllocationValue_;
+    const bool fillReallocations_;
+    const uint8_t fillReallocationValue_;
+    const bool fillDeallocations_;
+    const uint8_t fillDeallocationValue_;
+    const bool fillCaches_;
+    const uint8_t fillCacheValue_;
   };
 
 }  // namespace cms::alpakatools

diff --git a/HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h b/HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h
@@ -19,7 +19,7 @@ namespace cms::alpakatools {
     template <typename TDev,
               typename TQueue,
               typename = std::enable_if_t<alpaka::isDevice<TDev> and alpaka::isQueue<TQueue>>>
-    auto allocate_device_allocators() {
+    auto allocate_device_allocators(AllocatorConfig const& config, bool debug) {
       using Allocator = CachingAllocator<TDev, TQueue>;
       auto const& devices = cms::alpakatools::devices<alpaka::Platform<TDev>>();
       ssize_t const size = devices.size();
@@ -38,13 +38,9 @@ namespace cms::alpakatools {
 #endif
               ptr + index,
               devices[index],
-              config::binGrowth,
-              config::minBin,
-              config::maxBin,
-              config::maxCachedBytes,
-              config::maxCachedFraction,
-              true,    // reuseSameQueueAllocations
-              false);  // debug
+              config,
+              true,  // reuseSameQueueAllocations
+              debug);
         }
       } catch (...) {
         --index;
@@ -60,11 +56,11 @@ namespace cms::alpakatools {
       }
 
       // use a custom deleter to destroy all objects and deallocate the memory
-      auto deleter = [size](Allocator* ptr) {
+      auto deleter = [size](Allocator* allocators) {
         for (size_t i = size; i > 0; --i) {
-          std::destroy_at(ptr + i - 1);
+          std::destroy_at(allocators + i - 1);
         }
-        std::allocator<Allocator>().deallocate(ptr, size);
+        std::allocator<Allocator>().deallocate(allocators, size);
       };
 
       return std::unique_ptr<Allocator[], decltype(deleter)>(ptr, deleter);
@@ -75,9 +71,11 @@ namespace cms::alpakatools {
   template <typename TDev,
             typename TQueue,
             typename = std::enable_if_t<alpaka::isDevice<TDev> and alpaka::isQueue<TQueue>>>
-  inline CachingAllocator<TDev, TQueue>& getDeviceCachingAllocator(TDev const& device) {
+  inline CachingAllocator<TDev, TQueue>& getDeviceCachingAllocator(TDev const& device,
+                                                                   AllocatorConfig const& config = AllocatorConfig{},
+                                                                   bool debug = false) {
     // initialise all allocators, one per device
-    CMS_THREAD_SAFE static auto allocators = detail::allocate_device_allocators<TDev, TQueue>();
+    CMS_THREAD_SAFE static auto allocators = detail::allocate_device_allocators<TDev, TQueue>(config, debug);
 
     size_t const index = alpaka::getNativeHandle(device);
     assert(index < cms::alpakatools::devices<alpaka::Platform<TDev>>().size());

diff --git a/HeterogeneousCore/AlpakaInterface/interface/getHostCachingAllocator.h b/HeterogeneousCore/AlpakaInterface/interface/getHostCachingAllocator.h
@@ -13,17 +13,14 @@
 namespace cms::alpakatools {
 
   template <typename TQueue, typename = std::enable_if_t<alpaka::isQueue<TQueue>>>
-  inline CachingAllocator<alpaka_common::DevHost, TQueue>& getHostCachingAllocator() {
+  inline CachingAllocator<alpaka_common::DevHost, TQueue>& getHostCachingAllocator(
+      AllocatorConfig const& config = AllocatorConfig{}, bool debug = false) {
     // thread safe initialisation of the host allocator
     CMS_THREAD_SAFE static CachingAllocator<alpaka_common::DevHost, TQueue> allocator(
         host(),
-        config::binGrowth,
-        config::minBin,
-        config::maxBin,
-        config::maxCachedBytes,
-        config::maxCachedFraction,
-        false,   // reuseSameQueueAllocations
-        false);  // debug
+        config,
+        false,  // reuseSameQueueAllocations
+        debug);
 
     // the public interface is thread safe
     return allocator;