diff --git a/CMakeLists.txt b/CMakeLists.txt
index f4e09a30ab..4f44a59233 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -280,7 +280,6 @@ mark_as_advanced(QUDA_RECONSTRUCT)
 mark_as_advanced(QUDA_CLOVER_CHOLESKY_PROMOTE)
 mark_as_advanced(QUDA_MULTIGRID_DSLASH_PROMOTE)
 mark_as_advanced(QUDA_CTEST_SEP_DSLASH_POLICIES)
-mark_as_advanced(QUDA_OPENMP)
 
 mark_as_advanced(QUDA_BACKWARDS)
 
diff --git a/include/targets/cuda/atomic_helper.h b/include/targets/cuda/atomic_helper.h
index 43424e0f62..7620ee7cf2 100644
--- a/include/targets/cuda/atomic_helper.h
+++ b/include/targets/cuda/atomic_helper.h
@@ -81,7 +81,9 @@ namespace quda
   template <bool is_device> struct atomic_fetch_abs_max_impl {
     template <typename T> inline void operator()(T *addr, T val)
     {
-#pragma omp atomic update
+      // Use a named critical section: all unnamed critical regions share a single global lock,
+      // which would serialize this update against every unrelated unnamed critical region.
+#pragma omp critical(quda_atomic_fetch_abs_max)
       *addr = std::max(*addr, val);
     }
   };
diff --git a/include/targets/generic/block_reduction_kernel_host.h b/include/targets/generic/block_reduction_kernel_host.h
index 1a356d4f1c..f0997a7d98 100644
--- a/include/targets/generic/block_reduction_kernel_host.h
+++ b/include/targets/generic/block_reduction_kernel_host.h
@@ -5,6 +5,11 @@ namespace quda
   {
     Functor<Arg> t(arg);
-    dim3 block(0, 0, 0);
-    for (block.y = 0; block.y < arg.grid_dim.y; block.y++) {
-      for (block.x = 0; block.x < arg.grid_dim.x; block.x++) { t(block, dim3(0, 0, 0)); }
+    // OpenMP worksharing loops need a plain integer induction variable (not a struct member),
+    // and each iteration must own its block index: a single shared dim3 would be written
+    // concurrently by the inner loop of every thread.
+#pragma omp parallel for
+    for (unsigned int by = 0; by < arg.grid_dim.y; by++) {
+      for (unsigned int bx = 0; bx < arg.grid_dim.x; bx++) {
+        dim3 block(bx, by, 0);
+        t(block, dim3(0, 0, 0));
+      }
     }
diff --git a/include/targets/generic/kernel_host.h b/include/targets/generic/kernel_host.h
index 96523df955..1416b3a536 100644
--- a/include/targets/generic/kernel_host.h
+++ b/include/targets/generic/kernel_host.h
@@ -6,12 +6,14 @@ namespace quda
   template <template <typename> class Functor, typename Arg> void Kernel1D_host(const Arg &arg)
   {
     Functor<Arg> f(const_cast<Arg &>(arg));
+#pragma omp parallel for
     for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { f(i); }
   }
 
   template <template <typename> class Functor, typename Arg> void Kernel2D_host(const Arg &arg)
   {
     Functor<Arg> f(const_cast<Arg &>(arg));
+#pragma omp parallel for
     for (int i = 0; i < static_cast<int>(arg.threads.x); i++) {
       for (int j = 0; j < static_cast<int>(arg.threads.y); j++) { f(i, j); }
     }
@@ -20,6 +22,7 @@ namespace quda
   template <template <typename> class Functor, typename Arg> void Kernel3D_host(const Arg &arg)
   {
     Functor<Arg> f(const_cast<Arg &>(arg));
+#pragma omp parallel for
     for (int i = 0; i < static_cast<int>(arg.threads.x); i++) {
       for (int j = 0; j < static_cast<int>(arg.threads.y); j++) {
         for (int k = 0; k < static_cast<int>(arg.threads.z); k++) { f(i, j, k); }
diff --git a/include/targets/generic/reduction_kernel_host.h b/include/targets/generic/reduction_kernel_host.h
index c456de8744..4aadbb898c 100644
--- a/include/targets/generic/reduction_kernel_host.h
+++ b/include/targets/generic/reduction_kernel_host.h
@@ -11,7 +11,13 @@ namespace quda
     Functor<Arg> t(arg);
 
     reduce_t value = t.init();
 
+    // A reduction clause only accepts built-in operators or identifiers introduced by a
+    // declare-reduction directive, so register Functor<Arg>::apply as the combiner (with
+    // Functor<Arg>::init() seeding each thread's private copy) before using it.
+#pragma omp declare reduction(functor_reduce : reduce_t : omp_out = Functor <Arg>::apply(omp_out, omp_in))            \
+  initializer(omp_priv = Functor <Arg>::init())
+#pragma omp parallel for collapse(2) reduction(functor_reduce : value)
     for (int j = 0; j < static_cast<int>(arg.threads.y); j++) {
       for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { value = t(value, i, j); }
     }
@@ -21,16 +27,24 @@ namespace quda
 
   template <template <typename> class Functor, typename Arg> auto MultiReduction_host(const Arg &arg)
   {
+#pragma omp declare reduction(multi_reduce                                                                             \
+                              : typename Functor <Arg>::reduce_t                                                       \
+                              : omp_out = Functor <Arg>::apply(omp_out, omp_in))                                       \
+  initializer(omp_priv = Functor <Arg>::init())
+
     using reduce_t = typename Functor<Arg>::reduce_t;
     Functor<Arg> t(arg);
 
-    std::vector<reduce_t> value(arg.threads.z);
+    std::vector<reduce_t> value(arg.threads.z, t.init());
     for (int k = 0; k < static_cast<int>(arg.threads.z); k++) {
-      value[k] = t.init();
+      auto val = t.init();
 
+#pragma omp parallel for collapse(2) reduction(multi_reduce : val)
       for (int j = 0; j < static_cast<int>(arg.threads.y); j++) {
-        for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { value[k] = t(value[k], i, j, k); }
+        for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { val = t(val, i, j, k); }
       }
+
+      value[k] = val;
     }
 
     return value;
diff --git a/include/targets/hip/atomic_helper.h b/include/targets/hip/atomic_helper.h
index 3204becea7..5d6ff7a5db 100644
--- a/include/targets/hip/atomic_helper.h
+++ b/include/targets/hip/atomic_helper.h
@@ -48,7 +48,9 @@ namespace quda
   template <bool is_device> struct atomic_fetch_abs_max_impl {
     template <typename T> inline void operator()(T *addr, T val)
     {
-#pragma omp atomic update
+      // Use a named critical section: all unnamed critical regions share a single global lock,
+      // which would serialize this update against every unrelated unnamed critical region.
+#pragma omp critical(quda_atomic_fetch_abs_max)
       *addr = std::max(*addr, val);
     }
   };
diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake
index 4b2ce8e89d..0f9484b1b8 100644
--- a/lib/targets/cuda/target_cuda.cmake
+++ b/lib/targets/cuda/target_cuda.cmake
@@ -325,6 +325,14 @@ target_compile_options(
           -fsanitize=undefined>
           >)
 
+if(QUDA_OPENMP)
+  target_compile_options(
+    quda
+    PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:
+    "-Xcompiler=${OpenMP_CXX_FLAGS}"
+    >)
+endif()
+
 # malloc.cpp uses both the driver and runtime api So we need to find the CUDA_CUDA_LIBRARY (driver api) or the stub
 target_link_libraries(quda PUBLIC CUDA::cuda_driver)
 target_link_libraries(quda PUBLIC CUDA::nvml)