Skip to content

Commit

Permalink
Enable OMP threading for host kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
maddyscientist committed Jul 8, 2024
1 parent fbbe14c commit d604f80
Show file tree
Hide file tree
Showing 7 changed files with 26 additions and 7 deletions.
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,6 @@ mark_as_advanced(QUDA_RECONSTRUCT)
mark_as_advanced(QUDA_CLOVER_CHOLESKY_PROMOTE)
mark_as_advanced(QUDA_MULTIGRID_DSLASH_PROMOTE)
mark_as_advanced(QUDA_CTEST_SEP_DSLASH_POLICIES)
mark_as_advanced(QUDA_OPENMP)

mark_as_advanced(QUDA_BACKWARDS)

Expand Down
2 changes: 1 addition & 1 deletion include/targets/cuda/atomic_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ namespace quda
// Host-side (non-device) specialisation of atomic max: serialise the whole
// read-modify-write of *addr so concurrent OpenMP threads cannot interleave.
// NOTE(review): the two pragmas below are the before/after halves of a diff —
// only one exists in the real file.  `omp atomic update` accepts only
// restricted statement forms (x = x op expr) and cannot express the
// std::max() call, hence the replacement with a critical section — confirm
// against the OpenMP spec for the toolchain's supported version.
template <bool is_device> struct atomic_fetch_abs_max_impl {
template <typename T> inline void operator()(T *addr, T val)
{
#pragma omp atomic update
#pragma omp critical
*addr = std::max(*addr, val);
}
};
Expand Down
1 change: 1 addition & 0 deletions include/targets/generic/block_reduction_kernel_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ namespace quda
{
// Host fallback for a block-structured kernel: invoke the functor once per
// (grid.y, grid.x) "block", always with a zero thread index.
Functor<Arg> t(arg);
dim3 block(0, 0, 0);
// NOTE(review): `block.y` is a struct member, not a plain integer variable,
// so this loop does not match OpenMP's canonical loop form, and `block`
// itself would be shared (raced on) by the worker threads — confirm this
// hunk compiles and runs correctly with OpenMP enabled; a local int loop
// counter copied into a per-iteration dim3 would be the conforming shape.
#pragma omp parallel for
for (block.y = 0; block.y < arg.grid_dim.y; block.y++) {
for (block.x = 0; block.x < arg.grid_dim.x; block.x++) { t(block, dim3(0, 0, 0)); }
}
Expand Down
3 changes: 3 additions & 0 deletions include/targets/generic/kernel_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@ namespace quda
/// Host (CPU) implementation of a 1-d kernel launch: applies the functor once
/// for every x index in [0, arg.threads.x).  Iterations are distributed over
/// OpenMP threads, so the functor must tolerate concurrent invocation.
template <template <typename> class Functor, typename Arg> void Kernel1D_host(const Arg &arg)
{
Functor<Arg> functor(const_cast<Arg &>(arg));
const int size_x = static_cast<int>(arg.threads.x);
#pragma omp parallel for
for (int tid = 0; tid < size_x; tid++) functor(tid);
}

/// Host (CPU) implementation of a 2-d kernel launch: applies the functor once
/// per (x, y) thread index.
template <template <typename> class Functor, typename Arg> void Kernel2D_host(const Arg &arg)
{
Functor<Arg> f(const_cast<Arg &>(arg));
// Only the outer (x) loop is parallelised (no collapse); `f` is shared
// across the OpenMP threads, so operator() must be safe to call concurrently.
#pragma omp parallel for
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) {
for (int j = 0; j < static_cast<int>(arg.threads.y); j++) { f(i, j); }
}
Expand All @@ -20,6 +22,7 @@ namespace quda
/// Host (CPU) implementation of a 3-d kernel launch: applies the functor once
/// per (x, y, z) thread index.
template <template <typename> class Functor, typename Arg> void Kernel3D_host(const Arg &arg)
{
Functor<Arg> f(const_cast<Arg &>(arg));
// Only the outermost (x) loop is parallelised; `f` is shared across the
// OpenMP threads, so operator() must be safe to call concurrently.
#pragma omp parallel for
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) {
for (int j = 0; j < static_cast<int>(arg.threads.y); j++) {
for (int k = 0; k < static_cast<int>(arg.threads.z); k++) { f(i, j, k); }
Expand Down
16 changes: 12 additions & 4 deletions include/targets/generic/reduction_kernel_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ namespace quda
// Host transform-reduce over the 2-d thread grid: fold t(value, i, j) into
// `value`, starting from the functor's identity element.
Functor<Arg> t(arg);

reduce_t value = t.init();
// NOTE(review): `Functor <Arg>::apply` as the reduction identifier is not a
// plain identifier or built-in operator; standard OpenMP requires a matching
// `declare reduction` (cf. the multi_reduce declaration in
// MultiReduction_host) — confirm this compiles with OpenMP enabled.
#pragma omp parallel for collapse(2) reduction(Functor <Arg>::apply : value)
for (int j = 0; j < static_cast<int>(arg.threads.y); j++) {
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { value = t(value, i, j); }
}
Expand All @@ -21,16 +21,24 @@ namespace quda

/// Host multi-reduction: performs one independent (x, y) reduction per z
/// slice and returns the per-slice results as a vector indexed by z.
template <template <typename> class Functor, typename Arg> auto MultiReduction_host(const Arg &arg)
{
// User-defined OpenMP reduction built from the functor's apply/init so that
// the per-thread partial results of `val` below can be combined.
#pragma omp declare reduction(multi_reduce \
: typename Functor <Arg>::reduce_t \
: omp_out = Functor <Arg>::apply(omp_out, omp_in)) \
initializer(omp_priv = Functor <Arg>::init())

using reduce_t = typename Functor<Arg>::reduce_t;
Functor<Arg> t(arg);

// NOTE(review): this span interleaves the before/after lines of a diff —
// the next two declarations (and the two inner loops below) are old/new
// variants; only one of each exists in the real file.  The new form reduces
// into a thread-private `val` instead of the shared vector element.
std::vector<reduce_t> value(arg.threads.z);
std::vector<reduce_t> value(arg.threads.z, t.init());
for (int k = 0; k < static_cast<int>(arg.threads.z); k++) {
value[k] = t.init();
auto val = t.init();

#pragma omp parallel for collapse(2) reduction(multi_reduce : val)
for (int j = 0; j < static_cast<int>(arg.threads.y); j++) {
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { value[k] = t(value[k], i, j, k); }
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { val = t(val, i, j, k); }
}

value[k] = val;
}

return value;
Expand Down
2 changes: 1 addition & 1 deletion include/targets/hip/atomic_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ namespace quda
// Host-side (non-device) specialisation of atomic max for the HIP target:
// serialise the whole read-modify-write of *addr across OpenMP threads.
// NOTE(review): the two pragmas below are the before/after halves of a diff —
// only one exists in the real file.  `omp atomic update` accepts only
// restricted statement forms (x = x op expr) and cannot express the
// std::max() call, hence the replacement with a critical section.
template <bool is_device> struct atomic_fetch_abs_max_impl {
template <typename T> inline void operator()(T *addr, T val)
{
#pragma omp atomic update
#pragma omp critical
*addr = std::max(*addr, val);
}
};
Expand Down
8 changes: 8 additions & 0 deletions lib/targets/cuda/target_cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,14 @@ target_compile_options(
-fsanitize=undefined>
>)

# When OpenMP is requested, nvcc does not accept the host compiler's OpenMP
# flags directly, so forward them to the host compiler via -Xcompiler.
# Applies only to CUDA sources compiled with the NVIDIA toolchain.
# NOTE(review): if OpenMP_CXX_FLAGS expands to several flags, the quoting
# passes them as a single -Xcompiler argument — nvcc accepts a comma/space
# separated list there, but confirm with a multi-flag OpenMP configuration.
if(QUDA_OPENMP)
target_compile_options(
quda
PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:
"-Xcompiler=${OpenMP_CXX_FLAGS}"
>)
endif()

# malloc.cpp uses both the driver and runtime api So we need to find the CUDA_CUDA_LIBRARY (driver api) or the stub
target_link_libraries(quda PUBLIC CUDA::cuda_driver)
target_link_libraries(quda PUBLIC CUDA::nvml)
Expand Down

0 comments on commit d604f80

Please sign in to comment.