
Commit

update HIP target
jcosborn committed Sep 18, 2023
1 parent e8c16b6 commit 2cd916a
Showing 6 changed files with 76 additions and 339 deletions.
18 changes: 18 additions & 0 deletions include/targets/hip/atomic_helper.h
@@ -45,6 +45,20 @@ namespace quda
for (int i = 0; i < n; i++) atomic_fetch_add(&(*addr)[i], val[i]);
}

__device__ __host__ inline void atomic_fetch_add(int4 *addr, int4 val)
{
atomic_fetch_add(reinterpret_cast<int *>(addr) + 0, val.x);
atomic_fetch_add(reinterpret_cast<int *>(addr) + 1, val.y);
atomic_fetch_add(reinterpret_cast<int *>(addr) + 2, val.z);
atomic_fetch_add(reinterpret_cast<int *>(addr) + 3, val.w);
}

template <typename T>
__device__ __host__ inline void atomic_add_local(T *addr, T val)
{
atomic_fetch_add(addr, val);
}

template <bool is_device> struct atomic_fetch_abs_max_impl {
template <typename T> inline void operator()(T *addr, T val)
{
@@ -83,4 +97,8 @@ namespace quda
target::dispatch<atomic_fetch_abs_max_impl>(addr, val);
}

struct fetch_add_atomic_t {
template <class T> __device__ __host__ inline void operator()(T *out, T in) { atomic_fetch_add(out, in); }
};
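
The fetch_add_atomic_t functor packages atomic_fetch_add as a callable object, so generic routines can take the atomic update policy as a template parameter. A minimal sketch of that pattern, assuming a hypothetical accumulate helper that is not part of this commit:

// Hypothetical illustration: the update policy is passed as a functor, so
// the same generic routine could be reused with a non-atomic update.
template <typename update_t, typename T>
__device__ __host__ inline void accumulate(update_t update, T *sum, T contribution)
{
  update(sum, contribution); // with fetch_add_atomic_t: atomic_fetch_add(sum, contribution)
}

// usage: accumulate(fetch_add_atomic_t(), &total, local_value);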

} // namespace quda
44 changes: 42 additions & 2 deletions include/targets/hip/block_reduce_helper.h
Expand Up @@ -2,6 +2,7 @@

#include <target_device.h>
#include <reducer.h>
#include <helpers.h>

/**
@file block_reduce_helper.h
@@ -60,13 +61,39 @@ namespace quda
}
};

/**
@brief block_reduce_param is used as a container for passing
non-type parameters to specialize block_reduce through the
target::dispatch
@tparam block_dim Block dimension of the reduction (1, 2 or 3)
@tparam batch_size Batch size of the reduction. Threads will be
ordered such that batch size is the slowest running index. Note
that batch_size > 1 requires block_dim <= 2.
*/
template <int block_dim_, int batch_size_ = 1> struct block_reduce_param {
static constexpr int block_dim = block_dim_;
static constexpr int batch_size = batch_size_;
static_assert(batch_size == 1 || block_dim <= 2, "Batching not possible with 3-d block reduction");
};
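
For illustration (these aliases are not in the diff), the compile-time parameters select the shape of the reduction:

// Illustrative instantiations of block_reduce_param:
using reduce_1d         = block_reduce_param<1>;    // 1-d block reduction, no batching
using reduce_2d_batched = block_reduce_param<2, 4>; // 2-d reduction, batch size 4
// block_reduce_param<3, 2> would trip the static_assert above, since a 3-d
// block reduction leaves no block dimension free for batching.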

// pre-declaration of block_reduce that we wish to specialize
template <bool> struct block_reduce;
template <bool> struct block_reduce_impl;

/**
@brief Dummy generic implementation of block_reduce
*/
template <bool is_device> struct block_reduce_impl {
template <typename T, typename reducer_t, typename param_t>
T operator()(const T &value, bool, int, bool, reducer_t, param_t)
{
return value;
}
};

/**
@brief HIP specialization of block_reduce, building on the warp reduce
*/
template <> struct block_reduce<true> {
template <> struct block_reduce_impl<true> {

template <int width_> struct warp_reduce_param {
static constexpr int width = width_;
@@ -135,6 +162,19 @@
}
};

template <typename T, int block_dim, int batch_size>
struct block_reduce {
template <typename Ops> HostDevice inline block_reduce(Ops &) {};
template <typename ...Arg> static constexpr size_t shared_mem_size(dim3 block) {
return SizeBlockDivWarp::size(block) * sizeof(T);
}
template <typename reducer_t>
HostDevice inline T apply(const T &value, bool async, int batch, bool all, const reducer_t &r)
{
return target::dispatch<block_reduce_impl>(value, async, batch, all, r, block_reduce_param<block_dim, batch_size>());
}
};
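
A hedged sketch of how a kernel might drive this wrapper, assuming a summing reducer such as plus&lt;float&gt; from reducer.h and an Ops object supplied by the kernel framework (the block_sum helper itself is hypothetical):

// Hypothetical device-side usage of the block_reduce wrapper.
template <typename Ops>
__device__ inline float block_sum(Ops &ops, float thread_value)
{
  block_reduce<float, 2, 1> reduce(ops); // 2-d block reduction, batch_size = 1
  // async = false, batch = 0, all = true: every thread receives the sum
  return reduce.apply(thread_value, false, 0, true, plus<float>());
}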

} // namespace quda

#include "../generic/block_reduce_helper.h"
294 changes: 0 additions & 294 deletions include/targets/hip/shared_memory_cache_helper.h
@@ -1,295 +1 @@
#pragma once

#include <target_device.h>
#include <array.h>

/**
@file shared_memory_cache_helper.h
Helper functionality for aiding the use of the shared memory for
sharing data between threads in a thread block.
*/

namespace quda
{

/**
@brief Class which wraps around a shared memory cache for type T,
where each thread in the thread block stores a unique value in
the cache which any other thread can access.
This accessor supports both explicit run-time block size and
compile-time sizing.
* For run-time block size, the constructor should be called
with the desired block size.
* For compile-time block size, no arguments should be passed to
the constructor, and then the second and third template
parameters correspond to the y and z dimensions of the block,
respectively. The x dimension of the block will be set
according to the maximum number of threads possible, given these
dimensions.
*/
template <typename T, int block_size_y_ = 1, int block_size_z_ = 1, bool dynamic_ = true> class SharedMemoryCache
{
public:
using value_type = T;
static constexpr int block_size_y = block_size_y_;
static constexpr int block_size_z = block_size_z_;
static constexpr bool dynamic = dynamic_;

private:
/** maximum number of threads in x given the y and z block sizes */
static constexpr int block_size_x = device::max_block_size<block_size_y, block_size_z>();

using atom_t = std::conditional_t<sizeof(T) % 16 == 0, int4, std::conditional_t<sizeof(T) % 8 == 0, int2, int>>;
static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types");

// The number of elements of type atom_t that we break T into for optimal shared-memory access
static constexpr int n_element = sizeof(T) / sizeof(atom_t);

const dim3 block;
const int stride;
const unsigned int offset = 0; // dynamic offset in bytes

/**
@brief This is a dummy instantiation for the host compiler
*/
template <bool, typename dummy = void> struct cache_dynamic {
atom_t *operator()(unsigned)
{
static atom_t *cache_;
return reinterpret_cast<atom_t *>(cache_);
}
};

/**
@brief This is the handle to the shared memory, dynamic specialization
@return Shared memory pointer
*/
template <typename dummy> struct cache_dynamic<true, dummy> {
__device__ inline atom_t *operator()(unsigned int offset)
{
extern __shared__ int cache_[];
return reinterpret_cast<atom_t *>(reinterpret_cast<char *>(cache_) + offset);
}
};

/**
@brief This is a dummy instantiation for the host compiler
*/
template <bool is_device, typename dummy = void> struct cache_static {
atom_t *operator()()
{
static atom_t *cache_;
return reinterpret_cast<atom_t *>(cache_);
}
};

/**
@brief This is the handle to the shared memory, static specialization
@return Shared memory pointer
*/
template <typename dummy> struct cache_static<true, dummy> {
__device__ inline atom_t *operator()()
{
static __shared__ atom_t cache_[n_element * block_size_x * block_size_y * block_size_z];
return reinterpret_cast<atom_t *>(cache_);
}
};

template <bool dynamic_shared> __device__ __host__ inline std::enable_if_t<dynamic_shared, atom_t *> cache() const
{
return target::dispatch<cache_dynamic>(offset);
}

template <bool dynamic_shared> __device__ __host__ inline std::enable_if_t<!dynamic_shared, atom_t *> cache() const
{
return target::dispatch<cache_static>();
}

__device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const
{
atom_t tmp[n_element];
memcpy(tmp, (void *)&a, sizeof(T));
int j = (z * block.y + y) * block.x + x;
#pragma unroll
for (int i = 0; i < n_element; i++) cache<dynamic>()[i * stride + j] = tmp[i];
}

__device__ __host__ inline T load_detail(int x, int y, int z) const
{
atom_t tmp[n_element];
int j = (z * block.y + y) * block.x + x;
#pragma unroll
for (int i = 0; i < n_element; i++) tmp[i] = cache<dynamic>()[i * stride + j];
T a;
memcpy((void *)&a, tmp, sizeof(T));
return a;
}

/**
@brief Dummy instantiation for the host compiler
*/
template <bool is_device, typename dummy = void> struct sync_impl {
void operator()() { }
};

/**
@brief Synchronize the cache when on the device
*/
template <typename dummy> struct sync_impl<true, dummy> {
__device__ inline void operator()() { __syncthreads(); }
};

public:
/**
@brief Constructor for SharedMemoryCache. If no arguments are
passed, the dimensions are set according to the template
parameters block_size_y and block_size_z, together with the
derived block_size_x. Otherwise the block sizes passed into the
constructor are used.
@param[in] block Block dimensions for the 3-d shared memory object
@param[in] thread_offset "Perceived" per-thread offset from the dynamic
shared memory base pointer (used when we have multiple caches in
scope); it is scaled by the total block size to give the actual offset.
*/
constexpr SharedMemoryCache(dim3 block = dim3(block_size_x, block_size_y, block_size_z),
unsigned int thread_offset = 0) :
block(block), stride(block.x * block.y * block.z), offset(stride * thread_offset)
{
}

/**
@brief Grab the raw base address to shared memory.
*/
__device__ __host__ inline auto data() const { return reinterpret_cast<T *>(cache<dynamic>()); }

/**
@brief Save the value into the 3-d shared memory cache.
@param[in] a The value to store in the shared memory cache
@param[in] x The x index to use
@param[in] y The y index to use
@param[in] z The z index to use
*/
__device__ __host__ inline void save(const T &a, int x = -1, int y = -1, int z = -1) const
{
auto tid = target::thread_idx();
x = (x == -1) ? tid.x : x;
y = (y == -1) ? tid.y : y;
z = (z == -1) ? tid.z : z;
save_detail(a, x, y, z);
}

/**
@brief Save the value into the 3-d shared memory cache.
@param[in] a The value to store in the shared memory cache
@param[in] x The x index to use
*/
__device__ __host__ inline void save_x(const T &a, int x = -1) const
{
auto tid = target::thread_idx();
x = (x == -1) ? tid.x : x;
save_detail(a, x, tid.y, tid.z);
}

/**
@brief Save the value into the 3-d shared memory cache.
@param[in] a The value to store in the shared memory cache
@param[in] y The y index to use
*/
__device__ __host__ inline void save_y(const T &a, int y = -1) const
{
auto tid = target::thread_idx();
y = (y == -1) ? tid.y : y;
save_detail(a, tid.x, y, tid.z);
}

/**
@brief Save the value into the 3-d shared memory cache.
@param[in] a The value to store in the shared memory cache
@param[in] z The z index to use
*/
__device__ __host__ inline void save_z(const T &a, int z = -1) const
{
auto tid = target::thread_idx();
z = (z == -1) ? tid.z : z;
save_detail(a, tid.x, tid.y, z);
}

/**
@brief Load a value from the shared memory cache
@param[in] x The x index to use
@param[in] y The y index to use
@param[in] z The z index to use
@return The value at coordinates (x,y,z)
*/
__device__ __host__ inline T load(int x = -1, int y = -1, int z = -1) const
{
auto tid = target::thread_idx();
x = (x == -1) ? tid.x : x;
y = (y == -1) ? tid.y : y;
z = (z == -1) ? tid.z : z;
return load_detail(x, y, z);
}

/**
@brief Load a value from the shared memory cache
@param[in] x The x index to use
@return The value at coordinates (x, tid.y, tid.z)
*/
__device__ __host__ inline T load_x(int x = -1) const
{
auto tid = target::thread_idx();
x = (x == -1) ? tid.x : x;
return load_detail(x, tid.y, tid.z);
}

/**
@brief Load a value from the shared memory cache
@param[in] y The y index to use
@return The value at coordinates (tid.x, y, tid.z)
*/
__device__ __host__ inline T load_y(int y = -1) const
{
auto tid = target::thread_idx();
y = (y == -1) ? tid.y : y;
return load_detail(tid.x, y, tid.z);
}

/**
@brief Load a value from the shared memory cache
@param[in] z The z index to use
@return The value at coordinates (tid.x, tid.y, z)
*/
__device__ __host__ inline T load_z(int z = -1) const
{
auto tid = target::thread_idx();
z = (z == -1) ? tid.z : z;
return load_detail(tid.x, tid.y, z);
}

/**
@brief Synchronize the cache
*/
__device__ __host__ void sync() const { target::dispatch<sync_impl>(); }

/**
@brief Cast operator to allow cache objects to be used where T
is expected
*/
__device__ __host__ operator T() const { return load(); }

/**
@brief Assignment operator to allow cache objects to be used on
the lhs where T is otherwise expected.
*/
__device__ __host__ void operator=(const T &src) const { save(src); }
};

} // namespace quda

// include overloads
#include "../generic/shared_memory_cache_helper.h"