
Commit

update HIP target
jcosborn committed Sep 18, 2023
1 parent e8c16b6 commit 2cd916a
Showing 6 changed files with 76 additions and 339 deletions.
18 changes: 18 additions & 0 deletions include/targets/hip/atomic_helper.h
@@ -45,6 +45,20 @@ namespace quda
for (int i = 0; i < n; i++) atomic_fetch_add(&(*addr)[i], val[i]);
}

__device__ __host__ inline void atomic_fetch_add(int4 *addr, int4 val)
{
atomic_fetch_add(reinterpret_cast<int *>(addr) + 0, val.x);
atomic_fetch_add(reinterpret_cast<int *>(addr) + 1, val.y);
atomic_fetch_add(reinterpret_cast<int *>(addr) + 2, val.z);
atomic_fetch_add(reinterpret_cast<int *>(addr) + 3, val.w);
}

template <typename T>
__device__ __host__ inline void atomic_add_local(T *addr, T val)
{
atomic_fetch_add(addr, val);
}

template <bool is_device> struct atomic_fetch_abs_max_impl {
template <typename T> inline void operator()(T *addr, T val)
{
@@ -83,4 +97,8 @@ namespace quda
target::dispatch<atomic_fetch_abs_max_impl>(addr, val);
}

struct fetch_add_atomic_t {
template <class T> __device__ __host__ inline void operator()(T *out, T in) { atomic_fetch_add(out, in); }
};
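
The fetch_add_atomic_t functor packages atomic_fetch_add as a callable object, so generic routines can take the atomic update policy as a template parameter. A minimal sketch of that pattern, assuming a hypothetical accumulate helper that is not part of this commit:

// Hypothetical illustration: the update policy is passed as a functor, so
// the same generic routine could be reused with a non-atomic update.
template <typename update_t, typename T>
__device__ __host__ inline void accumulate(update_t update, T *sum, T contribution)
{
  update(sum, contribution); // with fetch_add_atomic_t: atomic_fetch_add(sum, contribution)
}

// usage: accumulate(fetch_add_atomic_t(), &total, local_value);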

} // namespace quda
44 changes: 42 additions & 2 deletions include/targets/hip/block_reduce_helper.h
Expand Up @@ -2,6 +2,7 @@

#include <target_device.h>
#include <reducer.h>
#include <helpers.h>

/**
@file block_reduce_helper.h
@@ -60,13 +61,39 @@ namespace quda
}
};

/**
@brief block_reduce_param is used as a container for passing
non-type parameters to specialize block_reduce through the
target::dispatch
@tparam block_dim Block dimension of the reduction (1, 2 or 3)
@tparam batch_size Batch size of the reduction. Threads will be
ordered such that batch size is the slowest running index. Note
that batch_size > 1 requires block_dim <= 2.
*/
template <int block_dim_, int batch_size_ = 1> struct block_reduce_param {
static constexpr int block_dim = block_dim_;
static constexpr int batch_size = batch_size_;
static_assert(batch_size == 1 || block_dim <= 2, "Batching not possible with 3-d block reduction");
};
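
For illustration (these aliases are not in the diff), the compile-time parameters select the shape of the reduction:

// Illustrative instantiations of block_reduce_param:
using reduce_1d         = block_reduce_param<1>;    // 1-d block reduction, no batching
using reduce_2d_batched = block_reduce_param<2, 4>; // 2-d reduction, batch size 4
// block_reduce_param<3, 2> would trip the static_assert above, since a 3-d
// block reduction leaves no block dimension free for batching.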

// pre-declaration of block_reduce that we wish to specialize
template <bool> struct block_reduce;
template <bool> struct block_reduce_impl;

/**
@brief Dummy generic implementation of block_reduce
*/
template <bool is_device> struct block_reduce_impl {
template <typename T, typename reducer_t, typename param_t>
T operator()(const T &value, bool, int, bool, reducer_t, param_t)
{
return value;
}
};

/**
@brief HIP specialization of block_reduce, building on the warp reduce
*/
template <> struct block_reduce<true> {
template <> struct block_reduce_impl<true> {

template <int width_> struct warp_reduce_param {
static constexpr int width = width_;
@@ -135,6 +162,19 @@
}
};

template <typename T, int block_dim, int batch_size>
struct block_reduce {
template <typename Ops> HostDevice inline block_reduce(Ops &) {};
template <typename ...Arg> static constexpr size_t shared_mem_size(dim3 block) {
return SizeBlockDivWarp::size(block) * sizeof(T);
}
template <typename reducer_t>
HostDevice inline T apply(const T &value, bool async, int batch, bool all, const reducer_t &r)
{
return target::dispatch<block_reduce_impl>(value, async, batch, all, r, block_reduce_param<block_dim, batch_size>());
}
};
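
A hedged sketch of how a kernel might drive this wrapper, assuming a summing reducer such as plus&lt;float&gt; from reducer.h and an Ops object supplied by the kernel framework (the block_sum helper itself is hypothetical):

// Hypothetical device-side usage of the block_reduce wrapper.
template <typename Ops>
__device__ inline float block_sum(Ops &ops, float thread_value)
{
  block_reduce<float, 2, 1> reduce(ops); // 2-d block reduction, batch_size = 1
  // async = false, batch = 0, all = true: every thread receives the sum
  return reduce.apply(thread_value, false, 0, true, plus<float>());
}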

} // namespace quda

#include "../generic/block_reduce_helper.h"
294 changes: 0 additions & 294 deletions include/targets/hip/shared_memory_cache_helper.h
@@ -1,295 +1 @@
#pragma once

#include <target_device.h>
#include <array.h>

/**
@file shared_memory_cache_helper.h
Helper functionality for aiding the use of the shared memory for
sharing data between threads in a thread block.
*/

namespace quda
{

/**
@brief Class which wraps around a shared memory cache for type T,
where each thread in the thread block stores a unique value in
the cache which any other thread can access.
This accessor supports both explicit run-time block size and
compile-time sizing.
* For run-time block size, the constructor should be called
with the desired block size.
* For compile-time block size, no arguments should be passed to
the constructor, and then the second and third template
parameters correspond to the y and z dimensions of the block,
respectively. The x dimension of the block will be set
according to the maximum number of threads possible, given these
dimensions.
*/
template <typename T, int block_size_y_ = 1, int block_size_z_ = 1, bool dynamic_ = true> class SharedMemoryCache
{
public:
using value_type = T;
static constexpr int block_size_y = block_size_y_;
static constexpr int block_size_z = block_size_z_;
static constexpr bool dynamic = dynamic_;

private:
/** maximum number of threads in x given the y and z block sizes */
static constexpr int block_size_x = device::max_block_size<block_size_y, block_size_z>();

using atom_t = std::conditional_t<sizeof(T) % 16 == 0, int4, std::conditional_t<sizeof(T) % 8 == 0, int2, int>>;
static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types");

// The number of elements of type atom_t that we break T into for optimal shared-memory access
static constexpr int n_element = sizeof(T) / sizeof(atom_t);

const dim3 block;
const int stride;
const unsigned int offset = 0; // dynamic offset in bytes

/**
@brief This is a dummy instantiation for the host compiler
*/
template <bool, typename dummy = void> struct cache_dynamic {
atom_t *operator()(unsigned)
{
static atom_t *cache_;
return reinterpret_cast<atom_t *>(cache_);
}
};

/**
@brief This is the handle to the shared memory, dynamic specialization
@return Shared memory pointer
*/
template <typename dummy> struct cache_dynamic<true, dummy> {
__device__ inline atom_t *operator()(unsigned int offset)
{
extern __shared__ int cache_[];
return reinterpret_cast<atom_t *>(reinterpret_cast<char *>(cache_) + offset);
}
};

/**
@brief This is a dummy instantiation for the host compiler
*/
template <bool is_device, typename dummy = void> struct cache_static {
atom_t *operator()()
{
static atom_t *cache_;
return reinterpret_cast<atom_t *>(cache_);
}
};

/**
@brief This is the handle to the shared memory, static specialization
@return Shared memory pointer
*/
template <typename dummy> struct cache_static<true, dummy> {
__device__ inline atom_t *operator()()
{
static __shared__ atom_t cache_[n_element * block_size_x * block_size_y * block_size_z];
return reinterpret_cast<atom_t *>(cache_);
}
};

template <bool dynamic_shared> __device__ __host__ inline std::enable_if_t<dynamic_shared, atom_t *> cache() const
{
return target::dispatch<cache_dynamic>(offset);
}

template <bool dynamic_shared> __device__ __host__ inline std::enable_if_t<!dynamic_shared, atom_t *> cache() const
{
return target::dispatch<cache_static>();
}

__device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const
{
atom_t tmp[n_element];
memcpy(tmp, (void *)&a, sizeof(T));
int j = (z * block.y + y) * block.x + x;
#pragma unroll
for (int i = 0; i < n_element; i++) cache<dynamic>()[i * stride + j] = tmp[i];
}

__device__ __host__ inline T load_detail(int x, int y, int z) const
{
atom_t tmp[n_element];
int j = (z * block.y + y) * block.x + x;
#pragma unroll
for (int i = 0; i < n_element; i++) tmp[i] = cache<dynamic>()[i * stride + j];
T a;
memcpy((void *)&a, tmp, sizeof(T));
return a;
}

/**
@brief Dummy instantiation for the host compiler
*/
template <bool is_device, typename dummy = void> struct sync_impl {
void operator()() { }
};

/**
@brief Synchronize the cache when on the device
*/
template <typename dummy> struct sync_impl<true, dummy> {
__device__ inline void operator()() { __syncthreads(); }
};

public:
/**
@brief Constructor for SharedMemoryCache. If no arguments are
passed, the dimensions are set according to the template
parameters block_size_y and block_size_z, together with the
derived block_size_x. Otherwise the block sizes passed into the
constructor are used.
@param[in] block Block dimensions for the 3-d shared memory object
@param[in] thread_offset "Perceived" per-thread offset from the dynamic
shared memory base pointer (used when we have multiple caches in
scope); it is scaled by the total block size to give the actual offset.
*/
constexpr SharedMemoryCache(dim3 block = dim3(block_size_x, block_size_y, block_size_z),
unsigned int thread_offset = 0) :
block(block), stride(block.x * block.y * block.z), offset(stride * thread_offset)
{
}

/**
@brief Grab the raw base address to shared memory.
*/
__device__ __host__ inline auto data() const { return reinterpret_cast<T *>(cache<dynamic>()); }

/**
@brief Save the value into the 3-d shared memory cache.
@param[in] a The value to store in the shared memory cache
@param[in] x The x index to use
@param[in] y The y index to use
@param[in] z The z index to use
*/
__device__ __host__ inline void save(const T &a, int x = -1, int y = -1, int z = -1) const
{
auto tid = target::thread_idx();
x = (x == -1) ? tid.x : x;
y = (y == -1) ? tid.y : y;
z = (z == -1) ? tid.z : z;
save_detail(a, x, y, z);
}

/**
@brief Save the value into the 3-d shared memory cache.
@param[in] a The value to store in the shared memory cache
@param[in] x The x index to use
*/
__device__ __host__ inline void save_x(const T &a, int x = -1) const
{
auto tid = target::thread_idx();
x = (x == -1) ? tid.x : x;
save_detail(a, x, tid.y, tid.z);
}

/**
@brief Save the value into the 3-d shared memory cache.
@param[in] a The value to store in the shared memory cache
@param[in] y The y index to use
*/
__device__ __host__ inline void save_y(const T &a, int y = -1) const
{
auto tid = target::thread_idx();
y = (y == -1) ? tid.y : y;
save_detail(a, tid.x, y, tid.z);
}

/**
@brief Save the value into the 3-d shared memory cache.
@param[in] a The value to store in the shared memory cache
@param[in] z The z index to use
*/
__device__ __host__ inline void save_z(const T &a, int z = -1) const
{
auto tid = target::thread_idx();
z = (z == -1) ? tid.z : z;
save_detail(a, tid.x, tid.y, z);
}

/**
@brief Load a value from the shared memory cache
@param[in] x The x index to use
@param[in] y The y index to use
@param[in] z The z index to use
@return The value at coordinates (x,y,z)
*/
__device__ __host__ inline T load(int x = -1, int y = -1, int z = -1) const
{
auto tid = target::thread_idx();
x = (x == -1) ? tid.x : x;
y = (y == -1) ? tid.y : y;
z = (z == -1) ? tid.z : z;
return load_detail(x, y, z);
}

/**
@brief Load a value from the shared memory cache
@param[in] x The x index to use
@return The value at coordinates (x, tid.y, tid.z)
*/
__device__ __host__ inline T load_x(int x = -1) const
{
auto tid = target::thread_idx();
x = (x == -1) ? tid.x : x;
return load_detail(x, tid.y, tid.z);
}

/**
@brief Load a value from the shared memory cache
@param[in] y The y index to use
@return The value at coordinates (tid.x, y, tid.z)
*/
__device__ __host__ inline T load_y(int y = -1) const
{
auto tid = target::thread_idx();
y = (y == -1) ? tid.y : y;
return load_detail(tid.x, y, tid.z);
}

/**
@brief Load a value from the shared memory cache
@param[in] z The z index to use
@return The value at coordinates (tid.x, tid.y, z)
*/
__device__ __host__ inline T load_z(int z = -1) const
{
auto tid = target::thread_idx();
z = (z == -1) ? tid.z : z;
return load_detail(tid.x, tid.y, z);
}

/**
@brief Synchronize the cache
*/
__device__ __host__ void sync() const { target::dispatch<sync_impl>(); }

/**
@brief Cast operator to allow cache objects to be used where T
is expected
*/
__device__ __host__ operator T() const { return load(); }

/**
@brief Assignment operator to allow cache objects to be used on
the lhs where T is otherwise expected.
*/
__device__ __host__ void operator=(const T &src) const { save(src); }
};

} // namespace quda

// include overloads
#include "../generic/shared_memory_cache_helper.h"