diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index 3c5ccf194e..4801bdbf48 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -426,6 +426,12 @@ namespace quda
      */
     ColorSpinorField &operator=(ColorSpinorField &&field);
 
+    /**
+       @brief Returns if the object is empty (not initialized)
+       @return true if the object has not been allocated, otherwise false
+    */
+    bool empty() const { return !init; }
+
     /**
        @brief Copy the source field contents into this
        @param[in] src Source from which we are copying
diff --git a/include/gauge_field.h b/include/gauge_field.h
index b1757087cb..948960b36f 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -476,6 +476,17 @@ namespace quda {
       return reinterpret_cast<T>(gauge_array[d].data());
     }
 
+    void *raw_pointer() const
+    {
+      if (is_pointer_array(order)) {
+        static void *data_array[8];
+        for (int i = 0; i < site_dim; i++) data_array[i] = gauge_array[i].data();
+        return data_array;
+      } else {
+        return gauge.data();
+      }
+    }
+
     /**
        @brief Return array of pointers to the per dimension gauge field allocation(s).
        @tparam T Optional type to cast the pointer to (default is
diff --git a/include/quda_arch.h b/include/quda_arch.h
index ed88fb0b8e..45a8ed34e4 100644
--- a/include/quda_arch.h
+++ b/include/quda_arch.h
@@ -14,5 +14,8 @@
 
 #elif defined(QUDA_TARGET_SYCL)
 #include <targets/sycl/quda_sycl.h>
+#endif
 
+#ifdef QUDA_OPENMP
+#include <omp.h>
 #endif
diff --git a/include/util_quda.h b/include/util_quda.h
index 533df01970..3d68fb5a2e 100644
--- a/include/util_quda.h
+++ b/include/util_quda.h
@@ -66,7 +66,7 @@ char *getPrintBuffer();
    number of OMP threads for CPU functions recorded in the tune cache.
    @return Returns the string
 */
-char* getOmpThreadStr();
+const char *getOmpThreadStr();
 
 void errorQuda_(const char *func, const char *file, int line, ...);
 
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index d02a26370d..6c18ccae43 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -470,6 +470,7 @@ endif()
 
 if(QUDA_OPENMP)
   target_link_libraries(quda PUBLIC OpenMP::OpenMP_CXX)
+  target_compile_definitions(quda PUBLIC QUDA_OPENMP)
 endif()
 
 # set which precisions to enable
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index e949080b79..46c190f433 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -75,7 +75,7 @@ namespace quda
   {
     if (&src != this) {
       // if field not already initialized then move the field
-      if (!init || are_compatible(*this, src)) {
+      if (!init || are_compatible(*this, src) || src.empty()) {
         if (init) destroy();
         LatticeField::operator=(std::move(src));
         move(std::move(src));
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 7117677300..c9576f6f01 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -3219,11 +3219,10 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     // the split topology.
     logQuda(QUDA_DEBUG_VERBOSE, "Split grid loading gauge field...\n");
     if (!is_staggered) {
-      loadGaugeQuda(collected_gauge->data(), gauge_param);
+      loadGaugeQuda(collected_gauge->raw_pointer(), gauge_param);
     } else {
-      // freeGaugeQuda();
-      loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->data(),
-                           collected_milc_longlink_field->data());
+      loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->raw_pointer(),
+                           collected_milc_longlink_field->raw_pointer());
     }
     logQuda(QUDA_DEBUG_VERBOSE, "Split grid loaded gauge field...\n");
 
diff --git a/lib/util_quda.cpp b/lib/util_quda.cpp
index 02a78834d0..93b9fb4627 100644
--- a/lib/util_quda.cpp
+++ b/lib/util_quda.cpp
@@ -134,17 +134,17 @@ void popOutputPrefix()
 
 char *getPrintBuffer() { return buffer_; }
 
-char* getOmpThreadStr() {
-  static char omp_thread_string[128];
+const char *getOmpThreadStr()
+{
+  static std::string omp_thread_string;
   static bool init = false;
   if (!init) {
-    strcpy(omp_thread_string,"omp_threads=");
-    char *omp_threads = getenv("OMP_NUM_THREADS");
-    strcat(omp_thread_string, omp_threads ? omp_threads : "1");
-    strcat(omp_thread_string, ",");
+#ifdef QUDA_OPENMP
+    omp_thread_string = std::string("omp_threads=" + std::to_string(omp_get_max_threads()) + ",");
+#endif
     init = true;
   }
-  return omp_thread_string;
+  return omp_thread_string.c_str();
 }
 
 void errorQuda_(const char *func, const char *file, int line, ...)
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index a1f25d4faf..3a95e355e8 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -9,6 +9,15 @@ target_compile_options(
   $<IF:$<CONFIG:RELEASE>,-w,-Wall -Wextra>
   $<$<CONFIG:STRICT>:-Werror>
 )
+
+# ignore any unkown pragmas if not using OpenMP
+if(NOT ${QUDA_OPENMP})
+  target_compile_options(quda_test PUBLIC $<$<COMPILE_LANGUAGE:CXX>:
+    $<$<CXX_COMPILER_ID:Clang>:-Wno-unknown-pragmas>
+    $<$<CXX_COMPILER_ID:GNU>:-Wno-unknown-pragmas>
+    >)
+endif()
+
 if(BUILD_SHARED_LIBS)
   install(TARGETS quda_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_LIBDIR})
 endif()
@@ -265,6 +274,7 @@ if(QUDA_MPI OR QUDA_QMP)
   if(DEFINED ENV{QUDA_TEST_GRID_SIZE})
     get_test_ranks($ENV{QUDA_TEST_GRID_SIZE} QUDA_TEST_NUM_PROCS)
   endif()
+  message(STATUS "ctest will run on ${QUDA_TEST_NUM_PROCS} processes")
   set(QUDA_CTEST_LAUNCH ${MPIEXEC_EXECUTABLE};${MPIEXEC_NUMPROC_FLAG};${QUDA_TEST_NUM_PROCS};${MPIEXEC_PREFLAGS}
     CACHE STRING "CTest Launcher command for QUDA's tests")
 endif()
@@ -375,6 +385,18 @@ foreach(pol IN LISTS DSLASH_POLICIES)
       set_tests_properties(dslash_${DIRAC_NAME}_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
     endif()
 
+    add_test(NAME dslash_${DIRAC_NAME}_splitgrid_policy${pol2}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --all-partitions 0
+                     --test Dslash
+                     --dim 2 4 6 8
+                     --gtest_output=xml:dslash_${DIRAC_NAME}_splitgrid_test_pol${pol2}.xml)
+    if(polenv)
+      set_tests_properties(dslash_${DIRAC_NAME}_splitgrid_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
+    endif()
+    set_tests_properties(dslash_${DIRAC_NAME}_splitgrid_policy${pol2} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
+
     add_test(NAME benchmark_dslash_${DIRAC_NAME}_policy${pol2}
              COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
                      --dslash-type ${DIRAC_NAME}
@@ -830,6 +852,17 @@ endif()
       set_tests_properties(dslash_${DIRAC_NAME}_mat_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2})
     endif()
 
+    add_test(NAME dslash_${DIRAC_NAME}_splitgrid_policy${pol2}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --test MatPC
+                     --dim 2 4 6 8
+                     --gtest_output=xml:dslash_${DIRAC_NAME}_matpc_test_pol${pol2}.xml)
+    if(polenv)
+      set_tests_properties(dslash_${DIRAC_NAME}_splitgrid_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2})
+    endif()
+    set_tests_properties(dslash_${DIRAC_NAME}_splitgrid_policy${pol2} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
+
     add_test(NAME benchmark_dslash_${DIRAC_NAME}_policy${pol2}
     COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_dslash_ctest> ${MPIEXEC_POSTFLAGS}
             --dslash-type ${DIRAC_NAME}
@@ -931,6 +964,20 @@ foreach(prec IN LISTS TEST_PRECS)
       --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
       --enable-testing true
       --gtest_output=xml:invert_test_wilson_${prec}.xml)
+
+      if(DEFINED ENV{QUDA_ENABLE_TUNING})
+        if($ENV{QUDA_ENABLE_TUNING} EQUAL 0)
+          add_test(NAME invert_test_splitgrid_wilson_${prec}
+            COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+            --dslash-type wilson --ngcrkrylov 8
+            --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
+            --nsrc ${QUDA_TEST_NUM_PROCS}
+            --enable-testing true
+            --gtest_output=xml:invert_test_splitgrid_wilson_${prec}.xml)
+
+          set_tests_properties(invert_test_splitgrid_wilson_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
+        endif()
+      endif()
   endif()
   
   if(QUDA_DIRAC_TWISTED_MASS)
diff --git a/tests/dslash_ctest.cpp b/tests/dslash_ctest.cpp
index d75d1f3da9..631d640203 100644
--- a/tests/dslash_ctest.cpp
+++ b/tests/dslash_ctest.cpp
@@ -37,10 +37,6 @@ class DslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, in
       return true;
     }
 
-    // work out if test_split_grid is enabled
-    bool test_split_grid = (grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3] > 1);
-    if (::testing::get<2>(GetParam()) > 0 && test_split_grid) { return true; }
-
     const std::array<bool, 16> partition_enabled {true, true, true,  false,  true,  false, false, false,
                                                   true, false, false, false, true, false, true, true};
     if (!ctest_all_partitions && !partition_enabled[::testing::get<2>(GetParam())]) return true;
@@ -68,8 +64,6 @@ class DslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, in
   }
 
 public:
-  DslashTest() : dslash_test_wrapper(dtest_type) { }
-
   virtual void SetUp()
   {
     int prec = ::testing::get<0>(GetParam());
@@ -94,12 +88,20 @@ class DslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, in
     commDimPartitionedReset();
   }
 
-  static void SetUpTestCase() { initQuda(device_ordinal); }
+  static void SetUpTestCase()
+  {
+    initQuda(device_ordinal);
+    DslashTestWrapper::dtest_type = dtest_type;
+  }
 
   // Per-test-case tear-down.
   // Called after the last test in this test case.
   // Can be omitted if not needed.
-  static void TearDownTestCase() { endQuda(); }
+  static void TearDownTestCase()
+  {
+    DslashTestWrapper::destroy();
+    endQuda();
+  }
 };
 
 TEST_P(DslashTest, verify)
diff --git a/tests/dslash_test.cpp b/tests/dslash_test.cpp
index 2eec7d4bba..11f8265e7f 100644
--- a/tests/dslash_test.cpp
+++ b/tests/dslash_test.cpp
@@ -33,8 +33,6 @@ class DslashTest : public ::testing::Test
   }
 
 public:
-  DslashTest() : dslash_test_wrapper(dtest_type) { }
-
   virtual void SetUp()
   {
     dslash_test_wrapper.init_test(argc_copy, argv_copy);
@@ -43,12 +41,20 @@ class DslashTest : public ::testing::Test
 
   virtual void TearDown() { dslash_test_wrapper.end(); }
 
-  static void SetUpTestCase() { initQuda(device_ordinal); }
+  static void SetUpTestCase()
+  {
+    initQuda(device_ordinal);
+    DslashTestWrapper::dtest_type = dtest_type;
+  }
 
   // Per-test-case tear-down.
   // Called after the last test in this test case.
   // Can be omitted if not needed.
-  static void TearDownTestCase() { endQuda(); }
+  static void TearDownTestCase()
+  {
+    DslashTestWrapper::destroy();
+    endQuda();
+  }
 };
 
 TEST_F(DslashTest, benchmark) { dslash_test_wrapper.run_test(niter, /**show_metrics =*/true); }
diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h
index a06dff6147..3d13d11478 100644
--- a/tests/dslash_test_utils.h
+++ b/tests/dslash_test_utils.h
@@ -50,14 +50,14 @@ struct DslashTime {
 struct DslashTestWrapper {
 
   // CPU color spinor fields
-  ColorSpinorField spinor;
-  ColorSpinorField spinorOut;
-  ColorSpinorField spinorRef;
-  ColorSpinorField spinorTmp;
+  static inline ColorSpinorField spinor;
+  static inline ColorSpinorField spinorOut;
+  static inline ColorSpinorField spinorRef;
+  static inline ColorSpinorField spinorTmp;
   // For split grid
-  std::vector<ColorSpinorField> vp_spinor;
-  std::vector<ColorSpinorField> vp_spinorOut;
-  std::vector<ColorSpinorField> vp_spinorRef;
+  static inline std::vector<ColorSpinorField> vp_spinor;
+  static inline std::vector<ColorSpinorField> vp_spinorOut;
+  static inline std::vector<ColorSpinorField> vp_spinorRef;
 
   // CUDA color spinor fields
   ColorSpinorField cudaSpinor;
@@ -69,9 +69,9 @@ struct DslashTestWrapper {
   quda::DiracDomainWall4DPC *dirac_4dpc = nullptr;
 
   // Raw pointers
-  void *hostGauge[4] = {nullptr};
-  void *hostClover = nullptr;
-  void *hostCloverInv = nullptr;
+  static inline void *hostGauge[4] = {nullptr};
+  static inline void *hostClover = nullptr;
+  static inline void *hostCloverInv = nullptr;
 
   // Parameters
   QudaGaugeParam gauge_param;
@@ -79,14 +79,12 @@ struct DslashTestWrapper {
 
   // Test options
   QudaParity parity = QUDA_EVEN_PARITY;
-  dslash_test_type dtest_type = dslash_test_type::Dslash;
-  bool test_split_grid = false;
+  static inline dslash_test_type dtest_type = dslash_test_type::Dslash;
+  static inline bool test_split_grid = false;
   int num_src = 1;
 
   const bool transfer = false;
 
-  DslashTestWrapper(dslash_test_type dtest) : dtest_type(dtest) { }
-
   void init_ctest(int argc, char **argv, int precision, QudaReconstructType link_recon)
   {
     cuda_prec = getPrecision(precision);
@@ -108,7 +106,12 @@ struct DslashTestWrapper {
 
     inv_param.cuda_prec = cuda_prec;
 
-    init(argc, argv);
+    static bool first_time = true;
+    if (first_time) {
+      init_host(argc, argv);
+      first_time = false;
+    }
+    init();
   }
 
   void init_test(int argc, char **argv)
@@ -118,15 +121,16 @@ struct DslashTestWrapper {
     setWilsonGaugeParam(gauge_param);
     setInvertParam(inv_param);
 
-    init(argc, argv);
+    static bool first_time = true;
+    if (first_time) {
+      init_host(argc, argv);
+      first_time = false;
+    }
+    init();
   }
 
-  void init(int argc, char **argv)
+  void init_host(int argc, char **argv)
   {
-    num_src = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
-    test_split_grid = num_src > 1;
-    if (test_split_grid) { dtest_type = dslash_test_type::Dslash; }
-
     if (dslash_type == QUDA_ASQTAD_DSLASH || dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) {
       errorQuda("Asqtad not supported.  Please try staggered_dslash_test instead");
     } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH || dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH
@@ -138,6 +142,13 @@ struct DslashTestWrapper {
       Ls = 1;
     }
 
+    if (inv_param.cpu_prec != gauge_param.cpu_prec) errorQuda("Gauge and spinor CPU precisions must match");
+
+    for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i];
+    num_src = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
+    test_split_grid = num_src > 1;
+    if (test_split_grid) { dtest_type = dslash_test_type::Dslash; }
+
     inv_param.dagger = dagger ? QUDA_DAG_YES : QUDA_DAG_NO;
     inv_param.solve_type = (dtest_type == dslash_test_type::Mat || dtest_type == dslash_test_type::MatDagMat) ?
       QUDA_DIRECT_SOLVE :
@@ -178,8 +189,6 @@ struct DslashTestWrapper {
       }
     }
 
-    if (inv_param.cpu_prec != gauge_param.cpu_prec) errorQuda("Gauge and spinor CPU precisions must match");
-
     // construct input fields
     for (int dir = 0; dir < 4; dir++) hostGauge[dir] = safe_malloc((size_t)V * gauge_site_size * gauge_param.cpu_prec);
 
@@ -187,8 +196,18 @@ struct DslashTestWrapper {
         || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
       hostClover = safe_malloc((size_t)V * clover_site_size * inv_param.clover_cpu_prec);
       hostCloverInv = safe_malloc((size_t)V * clover_site_size * inv_param.clover_cpu_prec);
+
+      if (compute_clover)
+        printfQuda("Computing clover field on GPU\n");
+      else {
+        printfQuda("Sending clover field to GPU\n");
+        constructHostCloverField(hostClover, hostCloverInv, inv_param);
+      }
     }
 
+    printfQuda("Randomizing fields... ");
+    constructHostGaugeField(hostGauge, gauge_param, argc, argv);
+
     ColorSpinorParam csParam;
     csParam.nColor = 3;
     csParam.nSpin = 4;
@@ -200,11 +219,7 @@ struct DslashTestWrapper {
       csParam.x[4] = Ls;
     }
 
-    if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
-      csParam.pc_type = QUDA_5D_PC;
-    } else {
-      csParam.pc_type = QUDA_4D_PC;
-    }
+    csParam.pc_type = dslash_type == QUDA_DOMAIN_WALL_DSLASH ? QUDA_5D_PC : QUDA_4D_PC;
 
     // ndeg_tm
     if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
@@ -236,11 +251,6 @@ struct DslashTestWrapper {
 
     spinor.Source(QUDA_RANDOM_SOURCE);
 
-    inv_param.split_grid[0] = grid_partition[0];
-    inv_param.split_grid[1] = grid_partition[1];
-    inv_param.split_grid[2] = grid_partition[2];
-    inv_param.split_grid[3] = grid_partition[3];
-
     if (test_split_grid) {
       inv_param.num_src = num_src;
       inv_param.num_src_per_sub_partition = 1;
@@ -251,26 +261,18 @@ struct DslashTestWrapper {
       std::fill(vp_spinor.begin(), vp_spinor.end(), spinor);
     }
 
-    csParam.x[0] = gauge_param.X[0];
-
     // set verbosity prior to loadGaugeQuda
     setVerbosity(verbosity);
     inv_param.verbosity = verbosity;
+  }
 
-    printfQuda("Randomizing fields... ");
-    constructHostGaugeField(hostGauge, gauge_param, argc, argv);
-
+  void init()
+  {
     printfQuda("Sending gauge field to GPU\n");
     loadGaugeQuda(hostGauge, &gauge_param);
 
     if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH
         || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) {
-      if (compute_clover)
-        printfQuda("Computing clover field on GPU\n");
-      else {
-        printfQuda("Sending clover field to GPU\n");
-        constructHostCloverField(hostClover, hostCloverInv, inv_param);
-      }
       inv_param.compute_clover = compute_clover;
       inv_param.return_clover = compute_clover;
       inv_param.compute_clover_inverse = true;
@@ -280,28 +282,16 @@ struct DslashTestWrapper {
     }
 
     if (!transfer) {
+      ColorSpinorParam csParam(spinor);
       csParam.location = QUDA_CUDA_FIELD_LOCATION;
       csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
       csParam.setPrecision(inv_param.cuda_prec, inv_param.cuda_prec, true);
 
-      if (inv_param.solution_type == QUDA_MAT_SOLUTION || inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) {
-        csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
-      } else {
-        csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
-        csParam.x[0] /= 2;
-      }
-
       printfQuda("Creating cudaSpinor with nParity = %d\n", csParam.siteSubset);
       cudaSpinor = ColorSpinorField(csParam);
       printfQuda("Creating cudaSpinorOut with nParity = %d\n", csParam.siteSubset);
       cudaSpinorOut = ColorSpinorField(csParam);
 
-      if (inv_param.solution_type == QUDA_MAT_SOLUTION || inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) {
-        csParam.x[0] /= 2;
-      }
-
-      csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
-
       printfQuda("Sending spinor field to GPU\n");
       cudaSpinor = spinor;
 
@@ -330,12 +320,28 @@ struct DslashTestWrapper {
         dirac = nullptr;
       }
     }
+  }
+
+  static void destroy()
+  {
+    for (int dir = 0; dir < 4; dir++)
+      if (hostGauge[dir]) host_free(hostGauge[dir]);
 
-    for (int dir = 0; dir < 4; dir++) host_free(hostGauge[dir]);
     if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH
         || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) {
-      host_free(hostClover);
-      host_free(hostCloverInv);
+      if (hostClover) host_free(hostClover);
+      if (hostCloverInv) host_free(hostCloverInv);
+    }
+
+    spinor = {};
+    spinorOut = {};
+    spinorRef = {};
+    spinorTmp = {};
+
+    if (test_split_grid) {
+      vp_spinor.clear();
+      vp_spinorOut.clear();
+      vp_spinorRef.clear();
     }
   }
 
diff --git a/tests/gauge_path_test.cpp b/tests/gauge_path_test.cpp
index f15bcdbda5..370470fc50 100644
--- a/tests/gauge_path_test.cpp
+++ b/tests/gauge_path_test.cpp
@@ -204,10 +204,14 @@ void gauge_force_test(bool compute_force = true)
   void *refmom = Mom_ref_milc.data();
   int *check_out = compute_force ? &force_check : &path_check;
   if (verify_results) {
+    quda::host_timer_t verify_timer;
+    verify_timer.start();
     gauge_force_reference(refmom, eb3, U_qdp, input_path_buf, length, loop_coeff, num_paths, compute_force);
     *check_out
       = compare_floats(Mom_milc.data(), refmom, 4 * V * mom_site_size, getTolerance(cuda_prec), gauge_param.cpu_prec);
     if (compute_force) strong_check_mom(Mom_milc.data(), refmom, 4 * V, gauge_param.cpu_prec);
+    verify_timer.stop();
+    printfQuda("Verification time = %.2f ms\n", verify_timer.last());
   }
 
   if (compute_force) {
@@ -317,6 +321,9 @@ void gauge_loop_test()
   std::vector<quda::Complex> traces_ref(num_paths);
 
   if (verify_results) {
+    quda::host_timer_t verify_timer;
+    verify_timer.start();
+
     gauge_loop_trace_reference(U_qdp, traces_ref, scale_factor, trace_path_p, trace_loop_length_p, trace_loop_coeff_p,
                                num_paths);
 
@@ -348,6 +355,9 @@ void gauge_loop_test()
             "Plaquette loop space %e time %e total %e ; plaqQuda space %e time %e total %e ; deviation %e\n",
             plaq_loop[0], plaq_loop[1], plaq_loop[2], obsParam.plaquette[0], obsParam.plaquette[1],
             obsParam.plaquette[2], plaq_deviation);
+
+    verify_timer.stop();
+    printfQuda("Verification time = %.2f ms\n", verify_timer.last());
   }
 
   double perf = 1.0 * niter * flops * V / (host_timer.last() * 1e+9);
diff --git a/tests/hisq_stencil_test.cpp b/tests/hisq_stencil_test.cpp
index cb6572b25b..98c2ae91d3 100644
--- a/tests/hisq_stencil_test.cpp
+++ b/tests/hisq_stencil_test.cpp
@@ -104,8 +104,10 @@ static void hisq_test()
   double u4 = u2 * u2;
   double u6 = u4 * u2;
 
+  std::array<std::array<double, 6>, 3> act_paths;
+
   // First path: create V, W links
-  double act_path_coeff_1[6] = {
+  act_paths[0] = {
     (1.0 / 8.0),                             /* one link */
     u2 * (0.0),                              /* Naik */
     u2 * (-1.0 / 8.0) * 0.5,                 /* simple staple */
@@ -115,7 +117,7 @@ static void hisq_test()
   };
 
   // Second path: create X, long links
-  double act_path_coeff_2[6] = {
+  act_paths[1] = {
     ((1.0 / 8.0) + (2.0 * 6.0 / 16.0) + (1.0 / 8.0)), /* one link */
                                                       /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */
     (-1.0 / 24.0),                                    /* Naik */
@@ -126,7 +128,7 @@ static void hisq_test()
   };
 
   // Paths for epsilon corrections. Not used if n_naiks = 1.
-  double act_path_coeff_3[6] = {
+  act_paths[2] = {
     (1.0 / 8.0),   /* one link b/c of Naik */
     (-1.0 / 24.0), /* Naik */
     0.0,           /* simple staple */
@@ -185,7 +187,7 @@ static void hisq_test()
   // Tuning run...
   {
     printfQuda("Tuning...\n");
-    computeKSLinkQuda(vlink, longlink, wlink, milc_sitelink, act_path_coeff_2, &qudaGaugeParam);
+    computeKSLinkQuda(vlink, longlink, wlink, milc_sitelink, act_paths[1].data(), &qudaGaugeParam);
   }
 
   struct timeval t0, t1;
@@ -196,11 +198,11 @@ static void hisq_test()
     // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying!
 
     // Create V links (fat7 links) and W links (unitarized V links), 1st path table set
-    computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_path_coeff_1, &qudaGaugeParam);
+    computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &qudaGaugeParam);
 
     if (n_naiks > 1) {
       // Create Naiks, 3rd path table set
-      computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_path_coeff_3, &qudaGaugeParam);
+      computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &qudaGaugeParam);
 
       // Rescale+copy Naiks into Naik field
       cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size);
@@ -211,7 +213,7 @@ static void hisq_test()
     }
 
     // Create X and long links, 2nd path table set
-    computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_path_coeff_2, &qudaGaugeParam);
+    computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &qudaGaugeParam);
 
     if (n_naiks > 1) {
       // Add into Naik field
@@ -244,9 +246,6 @@ static void hisq_test()
   }
 
   if (verify_results) {
-
-    double *act_paths[3] = {act_path_coeff_1, act_path_coeff_2, act_path_coeff_3};
-
     computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, sitelink, &qudaGaugeParam,
                         act_paths, eps_naik);
   }
diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp
index c43a7ca07e..0f27fa9572 100644
--- a/tests/host_reference/gauge_force_reference.cpp
+++ b/tests/host_reference/gauge_force_reference.cpp
@@ -69,8 +69,16 @@ struct fcomplex {
 struct dcomplex {
   double real;
   double imag;
+
+  void operator+=(const dcomplex &other)
+  {
+    real += other.real;
+    imag += other.imag;
+  }
 };
 
+#pragma omp declare reduction(dcomplex_sum:dcomplex : omp_out += omp_in)
+
 struct fsu3_matrix {
   using real_t = float;
   using complex_t = fcomplex;
@@ -322,9 +330,7 @@ int gf_neighborIndexFullLattice(size_t i, int dx[], const lattice_t &lat)
 template <typename su3_matrix>
 static su3_matrix compute_gauge_path(su3_matrix **sitelink, int i, int *path, int len, int dx[4], const lattice_t &lat)
 {
-  su3_matrix prev_matrix, curr_matrix;
-
-  memset(&curr_matrix, 0, sizeof(curr_matrix));
+  su3_matrix prev_matrix, curr_matrix = {};
 
   curr_matrix.e[0][0].real = 1;
   curr_matrix.e[1][1].real = 1;
@@ -366,16 +372,14 @@ template <typename su3_matrix, typename Float>
 static void compute_path_product(su3_matrix *staple, su3_matrix **sitelink, int *path, int len, Float loop_coeff,
                                  int dir, const lattice_t &lat)
 {
-  su3_matrix curr_matrix, tmat;
-  int dx[4];
-
+#pragma omp parallel for
   for (size_t i = 0; i < lat.volume; i++) {
-    memset(dx, 0, sizeof(dx));
-
+    int dx[4] = {};
     dx[dir] = 1;
 
-    curr_matrix = compute_gauge_path(sitelink, i, path, len, dx, lat);
+    su3_matrix curr_matrix = compute_gauge_path(sitelink, i, path, len, dx, lat);
 
+    su3_matrix tmat;
     su3_adjoint(&curr_matrix, &tmat);
     scalar_mult_add_su3_matrix(staple + i, &tmat, loop_coeff, staple + i);
   } // i
@@ -384,16 +388,14 @@ static void compute_path_product(su3_matrix *staple, su3_matrix **sitelink, int
 template <typename su3_matrix>
 static dcomplex compute_loop_trace(su3_matrix **sitelink, int *path, int len, double loop_coeff, const lattice_t &lat)
 {
-  su3_matrix tmat;
-  dcomplex accum;
-  memset(&accum, 0, sizeof(accum));
-  int dx[4];
+  dcomplex accum = {};
 
+#pragma omp parallel for reduction(dcomplex_sum : accum)
   for (size_t i = 0; i < lat.volume; i++) {
-    memset(dx, 0, sizeof(dx));
-    tmat = compute_gauge_path(sitelink, i, path, len, dx, lat);
+    int dx[4] = {};
+    su3_matrix tmat = compute_gauge_path(sitelink, i, path, len, dx, lat);
     auto tr = trace_su3(&tmat);
-    CSUM(accum, tr);
+    accum += dcomplex {tr.real, tr.imag};
   }
 
   CSCALE(accum, loop_coeff);
@@ -405,6 +407,7 @@ template <typename su3_matrix, typename anti_hermitmat, typename Float>
 static void update_mom(anti_hermitmat *momentum, int dir, su3_matrix **sitelink, su3_matrix *staple, Float eb3,
                        const lattice_t &lat)
 {
+#pragma omp parallel for
   for (size_t i = 0; i < lat.volume; i++) {
     su3_matrix tmat1;
     su3_matrix tmat2;
@@ -426,6 +429,7 @@ template <typename su3_matrix, typename Float>
 static void update_gauge(su3_matrix *gauge, int dir, su3_matrix **sitelink, su3_matrix *staple, Float eb3,
                          const lattice_t &lat)
 {
+#pragma omp parallel for
   for (size_t i = 0; i < lat.volume; i++) {
     su3_matrix tmat;
 
diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp
index 5179f38a66..5e85375bb3 100644
--- a/tests/host_reference/hisq_force_reference.cpp
+++ b/tests/host_reference/hisq_force_reference.cpp
@@ -99,8 +99,9 @@ template <typename su3_vector, typename su3_matrix> void su3_projector(su3_vecto
 template <typename su3_matrix, typename su3_vector>
 void computeLinkOrderedOuterProduct(su3_vector *src, quda::GaugeField &dest, size_t nhops)
 {
-  int dx[4];
+#pragma omp parallel for
   for (int i = 0; i < V; ++i) {
+    int dx[4];
     for (int dir = 0; dir < 4; ++dir) {
       dx[3] = dx[2] = dx[1] = dx[0] = 0;
       dx[dir] = nhops;
@@ -894,10 +895,12 @@ void computeMiddleLinkField(const int dim[4], const Real *const oprod, const Rea
   // To keep the code as close to the GPU code as possible, we'll
   // loop over the even sites first and then the odd sites
   LoadStore<Real> ls(volume);
+#pragma omp parallel for
   for (int site = 0; site < loop_count; ++site) {
     computeMiddleLinkSite<Real, 0>(site, dim, oprod, Qprev, link, sig, mu, coeff, ls, Pmu, P3, Qmu, newOprod);
   }
   // Loop over odd lattice sites
+#pragma omp parallel for
   for (int site = 0; site < loop_count; ++site) {
     computeMiddleLinkSite<Real, 1>(site, dim, oprod, Qprev, link, sig, mu, coeff, ls, Pmu, P3, Qmu, newOprod);
   }
@@ -988,10 +991,12 @@ void computeSideLinkField(const int dim[4], const Real *const P3,
 #endif
   LoadStore<Real> ls(volume);
 
+#pragma omp parallel for
   for (int site = 0; site < loop_count; ++site) {
     computeSideLinkSite<Real, 0>(site, dim, P3, Qprod, link, sig, mu, coeff, accumu_coeff, ls, shortP, newOprod);
   }
 
+#pragma omp parallel for
   for (int site = 0; site < loop_count; ++site) {
     computeSideLinkSite<Real, 1>(site, dim, P3, Qprod, link, sig, mu, coeff, accumu_coeff, ls, shortP, newOprod);
   }
@@ -1098,6 +1103,7 @@ void computeAllLinkField(const int dim[4], const Real *const oprod, const Real *
 #endif
 
   LoadStore<Real> ls(volume);
+#pragma omp parallel for
   for (int site = 0; site < loop_count; ++site) {
 
     computeAllLinkSite<Real, 0>(site, dim, oprod, Qprev, link, sig, mu, coeff, accumu_coeff, ls, shortP, newOprod);
@@ -1297,10 +1303,12 @@ void computeLongLinkField(const int dim[4], const Real *const oprod, const Real
   const int half_volume = volume / 2;
 
   LoadStore<Real> ls(volume);
+#pragma omp parallel for
   for (int site = 0; site < half_volume; ++site) {
     computeLongLinkSite<Real, 0>(site, dim, oprod, link, sig, coeff, ls, output);
   }
   // Loop over odd lattice sites
+#pragma omp parallel for
   for (int site = 0; site < half_volume; ++site) {
     computeLongLinkSite<Real, 1>(site, dim, oprod, link, sig, coeff, ls, output);
   }
@@ -1363,7 +1371,9 @@ void completeForceField(const int dim[4], const Real *const oprod, const Real *c
   const int half_volume = volume / 2;
   LoadStore<Real> ls(volume);
 
+#pragma omp parallel for
   for (int site = 0; site < half_volume; ++site) { completeForceSite<Real, 0>(site, dim, oprod, link, sig, ls, mom); }
+#pragma omp parallel for
   for (int site = 0; site < half_volume; ++site) { completeForceSite<Real, 1>(site, dim, oprod, link, sig, ls, mom); }
 }
 
diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp
index 5716ce5de8..14852a9f22 100644
--- a/tests/host_reference/staggered_dslash_reference.cpp
+++ b/tests/host_reference/staggered_dslash_reference.cpp
@@ -42,6 +42,7 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink,
                               sFloat **, sFloat **, int oddBit, int daggerBit, QudaDslashType dslash_type)
 #endif
 {
+#pragma omp parallel for
   for (auto i = 0lu; i < Vh * stag_spinor_site_size; i++) res[i] = 0.0;
 
   gFloat *fatlinkEven[4], *fatlinkOdd[4];
@@ -66,6 +67,7 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink,
 #endif
   }
 
+#pragma omp parallel for
   for (int sid = 0; sid < Vh; sid++) {
     int offset = stag_spinor_site_size * sid;
 
diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp
index 20af8bd390..eda5c778a7 100644
--- a/tests/invert_test.cpp
+++ b/tests/invert_test.cpp
@@ -20,6 +20,7 @@ QudaMultigridParam mg_param;
 QudaInvertParam mg_inv_param;
 QudaEigParam mg_eig_param[QUDA_MAX_MG_LEVEL];
 QudaEigParam eig_param;
+bool use_split_grid = false;
 
 // if --enable-testing true is passed, we run the tests defined in here
 #include <invert_test_gtest.hpp>
@@ -206,7 +207,7 @@ std::vector<std::array<double, 2>> solve(test_t param)
   // params corresponds to split grid
   for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i];
   int num_sub_partition = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
-  bool use_split_grid = num_sub_partition > 1;
+  use_split_grid = num_sub_partition > 1;
 
   // Now QUDA is initialised and the fields are loaded, we may setup the preconditioner
   void *mg_preconditioner = nullptr;
diff --git a/tests/invert_test_gtest.hpp b/tests/invert_test_gtest.hpp
index 3d16341bcb..74866dbcd7 100644
--- a/tests/invert_test_gtest.hpp
+++ b/tests/invert_test_gtest.hpp
@@ -117,6 +117,8 @@ bool skip_test(test_t param)
     return true;
 #endif
   }
+  // split-grid doesn't support split-grid at present
+  if (use_split_grid && multishift > 1) return true;
 
   return false;
 }
diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp
index 8505b04fe6..78a735ad13 100644
--- a/tests/staggered_dslash_ctest.cpp
+++ b/tests/staggered_dslash_ctest.cpp
@@ -62,8 +62,6 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple<int
   }
 
 public:
-  virtual ~StaggeredDslashTest() { }
-
   virtual void SetUp()
   {
     int prec = ::testing::get<0>(GetParam());
@@ -92,7 +90,11 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple<int
   // Per-test-case tear-down.
   // Called after the last test in this test case.
   // Can be omitted if not needed.
-  static void TearDownTestCase() { endQuda(); }
+  static void TearDownTestCase()
+  {
+    StaggeredDslashTestWrapper::destroy();
+    endQuda();
+  }
 };
 
 TEST_P(StaggeredDslashTest, verify)
@@ -102,7 +104,10 @@ TEST_P(StaggeredDslashTest, verify)
 
   double deviation = dslash_test_wrapper.verify();
   double tol = getTolerance(dslash_test_wrapper.inv_param.cuda_prec);
-
+  if ((dslash_test_wrapper.gauge_param.reconstruct == QUDA_RECONSTRUCT_8
+       || dslash_test_wrapper.gauge_param.reconstruct == QUDA_RECONSTRUCT_9)
+      && dslash_test_wrapper.inv_param.cuda_prec >= QUDA_HALF_PRECISION)
+    tol *= 10; // if recon 8, we tolerate a greater deviation
   ASSERT_LE(deviation, tol) << "Reference CPU and QUDA implementations do not agree";
 }
 
diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp
index f61e62d88c..0a3a063e45 100644
--- a/tests/staggered_dslash_test.cpp
+++ b/tests/staggered_dslash_test.cpp
@@ -22,8 +22,6 @@ class StaggeredDslashTest : public ::testing::Test
   }
 
 public:
-  StaggeredDslashTest() = default;
-
   virtual void SetUp()
   {
     dslash_test_wrapper.init_test(argc_copy, argv_copy);
@@ -37,7 +35,11 @@ class StaggeredDslashTest : public ::testing::Test
   // Per-test-case tear-down.
   // Called after the last test in this test case.
   // Can be omitted if not needed.
-  static void TearDownTestCase() { endQuda(); }
+  static void TearDownTestCase()
+  {
+    StaggeredDslashTestWrapper::destroy();
+    endQuda();
+  }
 };
 
 TEST_F(StaggeredDslashTest, benchmark) { dslash_test_wrapper.run_test(niter, /**show_metrics =*/true); }
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 6c5d40dd30..a4aaac347b 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -45,7 +45,10 @@ struct DslashTime {
 
 struct StaggeredDslashTestWrapper {
 
-  void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr};
+  static inline void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr};
+  // In the HISQ case, we include building fat/long links in this unit test
+  static inline void *qdp_fatlink_cpu[4] = {};
+  static inline void *qdp_longlink_cpu[4] = {};
 
   QudaGaugeParam gauge_param;
   QudaInvertParam inv_param;
@@ -56,19 +59,16 @@ struct StaggeredDslashTestWrapper {
   GaugeField *cpuFat = nullptr;
   GaugeField *cpuLong = nullptr;
 
-  ColorSpinorField spinor;
-  ColorSpinorField spinorOut;
-  ColorSpinorField spinorRef;
-  ColorSpinorField tmpCpu;
+  static inline ColorSpinorField spinor;
+  static inline ColorSpinorField spinorOut;
+  static inline ColorSpinorField spinorRef;
+  static inline ColorSpinorField tmpCpu;
   ColorSpinorField cudaSpinor;
   ColorSpinorField cudaSpinorOut;
 
-  std::vector<ColorSpinorField> vp_spinor;
-  std::vector<ColorSpinorField> vp_spinor_out;
+  static inline std::vector<ColorSpinorField> vp_spinor;
+  static inline std::vector<ColorSpinorField> vp_spinor_out;
 
-  // In the HISQ case, we include building fat/long links in this unit test
-  void *qdp_fatlink_cpu[4] = {};
-  void *qdp_longlink_cpu[4] = {};
   void *ghost_fatlink_cpu[4] = {};
   void *ghost_longlink_cpu[4] = {};
 
@@ -81,7 +81,7 @@ struct StaggeredDslashTestWrapper {
   char **argv_copy;
 
   // Split grid options
-  bool test_split_grid = false;
+  static inline bool test_split_grid = false;
   int num_src = 1;
 
   void staggeredDslashRef()
@@ -133,7 +133,12 @@ struct StaggeredDslashTestWrapper {
 
     link_recon = link_recon_;
 
-    init(argc, argv);
+    static bool first_time = true;
+    if (first_time) {
+      init_host(argc, argv);
+      first_time = false;
+    }
+    init();
   }
 
   void init_test(int argc, char **argv)
@@ -144,52 +149,122 @@ struct StaggeredDslashTestWrapper {
     setStaggeredGaugeParam(gauge_param);
     setStaggeredInvertParam(inv_param);
 
-    init(argc, argv);
+    static bool first_time = true;
+    if (first_time) {
+      init_host(argc, argv);
+      first_time = false;
+    }
+    init();
   }
 
-  void init(int argc, char **argv)
+  void init_host(int argc, char **argv)
   {
-    inv_param.split_grid[0] = grid_partition[0];
-    inv_param.split_grid[1] = grid_partition[1];
-    inv_param.split_grid[2] = grid_partition[2];
-    inv_param.split_grid[3] = grid_partition[3];
+    setDims(gauge_param.X);
+    dw_setDims(gauge_param.X, 1);
+    if (Nsrc != 1) {
+      warningQuda("Ignoring Nsrc = %d, setting to 1.", Nsrc);
+      Nsrc = 1;
+    }
 
+    for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i];
     num_src = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
     test_split_grid = num_src > 1;
     if (test_split_grid) { dtest_type = dslash_test_type::Dslash; }
 
-    inv_param.dagger = dagger ? QUDA_DAG_YES : QUDA_DAG_NO;
+    for (int dir = 0; dir < 4; dir++) {
+      qdp_inlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      qdp_fatlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      qdp_longlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+    }
 
-    setDims(gauge_param.X);
-    dw_setDims(gauge_param.X, 1);
-    if (Nsrc != 1) {
-      warningQuda("Ignoring Nsrc = %d, setting to 1.", Nsrc);
-      Nsrc = 1;
+    bool compute_on_gpu = false; // reference fat/long fields should be computed on cpu
+    constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_fatlink_cpu, gauge_param, argc, argv,
+                                     compute_on_gpu);
+
+    ColorSpinorParam csParam;
+    csParam.nColor = 3;
+    csParam.nSpin = 1;
+    csParam.nDim = 4;
+    for (int d = 0; d < 4; d++) { csParam.x[d] = gauge_param.X[d]; }
+    csParam.x[4] = 1;
+
+    csParam.setPrecision(inv_param.cpu_prec);
+    // inv_param.solution_type = QUDA_MAT_SOLUTION;
+    csParam.pad = 0;
+    if (dtest_type != dslash_test_type::Mat && dslash_type != QUDA_LAPLACE_DSLASH) {
+      csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
+      csParam.x[0] /= 2;
+      inv_param.solution_type = QUDA_MATPC_SOLUTION;
+    } else {
+      csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
+      inv_param.solution_type = QUDA_MAT_SOLUTION;
     }
 
-    // Allocate a lot of memory because I'm very confused
-    void *milc_fatlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-    void *milc_longlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
+    csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
+    csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
+    csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered
+    csParam.create = QUDA_ZERO_FIELD_CREATE;
+    csParam.pc_type = QUDA_4D_PC;
+    csParam.location = QUDA_CPU_FIELD_LOCATION;
 
-    milc_fatlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-    milc_longlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
+    spinor = ColorSpinorField(csParam);
+    spinorOut = ColorSpinorField(csParam);
+    spinorRef = ColorSpinorField(csParam);
+    tmpCpu = ColorSpinorField(csParam);
+
+    spinor.Source(QUDA_RANDOM_SOURCE);
+
+    if (test_split_grid) {
+      inv_param.num_src = num_src;
+      inv_param.num_src_per_sub_partition = 1;
+      resize(vp_spinor, num_src, csParam);
+      resize(vp_spinor_out, num_src, csParam);
+      std::fill(vp_spinor.begin(), vp_spinor.end(), spinor);
+    }
+
+    inv_param.dagger = dagger ? QUDA_DAG_YES : QUDA_DAG_NO;
 
+    // set verbosity prior to loadGaugeQuda
+    setVerbosity(verbosity);
+  }
+
+  void init()
+  {
+    // Prepare the fields to be used for the GPU computation
     void *qdp_fatlink_gpu[4];
     void *qdp_longlink_gpu[4];
-
     for (int dir = 0; dir < 4; dir++) {
-      qdp_inlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-
       qdp_fatlink_gpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
       qdp_longlink_gpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-
-      qdp_fatlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-      qdp_longlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+    }
+    // QUDA_STAGGERED_DSLASH follows the same codepath whether or not you
+    // "compute" the fat/long links or not.
+    if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) {
+      for (int dir = 0; dir < 4; dir++) {
+        memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
+        memset(qdp_longlink_gpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size);
+      }
+    } else {
+      // QUDA_ASQTAD_DSLASH
+      if (compute_fatlong) {
+        computeFatLongGPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_inlink, gauge_param, host_gauge_data_type_size,
+                          n_naiks, eps_naik);
+      } else {
+        // Not computing FatLong
+        for (int dir = 0; dir < 4; dir++) {
+          memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
+          memcpy(qdp_longlink_gpu[dir], qdp_longlink_cpu[dir], V * gauge_site_size * host_gauge_data_type_size);
+        }
+      }
     }
 
-    bool gauge_loaded = false;
-    constructStaggeredHostDeviceGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_longlink_gpu, qdp_fatlink_cpu,
-                                           qdp_fatlink_gpu, gauge_param, argc, argv, gauge_loaded);
+    // Create ghost zones for CPU fields,
+    // prepare and load the GPU fields
+    void *milc_fatlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
+    void *milc_longlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
+
+    milc_fatlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
+    milc_longlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
 
     // Alright, we've created all the void** links.
     // Create the void* pointers
@@ -197,8 +272,6 @@ struct StaggeredDslashTestWrapper {
     reorderQDPtoMILC(milc_fatlink_cpu, qdp_fatlink_cpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
     reorderQDPtoMILC(milc_longlink_gpu, qdp_longlink_gpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
     reorderQDPtoMILC(milc_longlink_cpu, qdp_longlink_cpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-    // Create ghost zones for CPU fields,
-    // prepare and load the GPU fields
 
 #ifdef MULTI_GPU
     gauge_param.type = (dslash_type == QUDA_ASQTAD_DSLASH) ? QUDA_ASQTAD_FAT_LINKS : QUDA_SU3_LINKS;
@@ -227,9 +300,6 @@ struct StaggeredDslashTestWrapper {
       gauge_param.reconstruct = gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
     }
 
-    // set verbosity prior to loadGaugeQuda
-    setVerbosity(verbosity);
-
     printfQuda("Sending fat links to GPU\n");
     loadGaugeQuda(milc_fatlink_gpu, &gauge_param);
 
@@ -249,47 +319,7 @@ struct StaggeredDslashTestWrapper {
       loadGaugeQuda(milc_longlink_gpu, &gauge_param);
     }
 
-    ColorSpinorParam csParam;
-    csParam.nColor = 3;
-    csParam.nSpin = 1;
-    csParam.nDim = 4;
-    for (int d = 0; d < 4; d++) { csParam.x[d] = gauge_param.X[d]; }
-    csParam.x[4] = 1;
-
-    csParam.setPrecision(inv_param.cpu_prec);
-    // inv_param.solution_type = QUDA_MAT_SOLUTION;
-    csParam.pad = 0;
-    if (dtest_type != dslash_test_type::Mat && dslash_type != QUDA_LAPLACE_DSLASH) {
-      csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
-      csParam.x[0] /= 2;
-      inv_param.solution_type = QUDA_MATPC_SOLUTION;
-    } else {
-      csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
-      inv_param.solution_type = QUDA_MAT_SOLUTION;
-    }
-
-    csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
-    csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
-    csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered
-    csParam.create = QUDA_ZERO_FIELD_CREATE;
-    csParam.pc_type = QUDA_4D_PC;
-    csParam.location = QUDA_CPU_FIELD_LOCATION;
-
-    spinor = ColorSpinorField(csParam);
-    spinorOut = ColorSpinorField(csParam);
-    spinorRef = ColorSpinorField(csParam);
-    tmpCpu = ColorSpinorField(csParam);
-
-    spinor.Source(QUDA_RANDOM_SOURCE);
-
-    if (test_split_grid) {
-      inv_param.num_src = num_src;
-      inv_param.num_src_per_sub_partition = 1;
-      resize(vp_spinor, num_src, csParam);
-      resize(vp_spinor_out, num_src, csParam);
-      std::fill(vp_spinor.begin(), vp_spinor.end(), spinor);
-    }
-
+    ColorSpinorParam csParam(spinor);
     csParam.fieldOrder = colorspinor::getNative(inv_param.cuda_prec, 1);
     csParam.pad = 0;
     csParam.setPrecision(inv_param.cuda_prec);
@@ -305,28 +335,23 @@ struct StaggeredDslashTestWrapper {
     setDiracParam(diracParam, &inv_param, pc);
     dirac = Dirac::create(diracParam);
 
-    for (int dir = 0; dir < 4; dir++) {
-      host_free(qdp_fatlink_gpu[dir]);
-      host_free(qdp_longlink_gpu[dir]);
-      host_free(qdp_inlink[dir]);
-    }
     host_free(milc_fatlink_cpu);
     host_free(milc_longlink_cpu);
-  }
 
-  void end()
-  {
     for (int dir = 0; dir < 4; dir++) {
-      if (qdp_fatlink_cpu[dir] != nullptr) {
-        host_free(qdp_fatlink_cpu[dir]);
-        qdp_fatlink_cpu[dir] = nullptr;
+      if (qdp_fatlink_gpu[dir] != nullptr) {
+        host_free(qdp_fatlink_gpu[dir]);
+        qdp_fatlink_gpu[dir] = nullptr;
       }
-      if (qdp_longlink_cpu[dir] != nullptr) {
-        host_free(qdp_longlink_cpu[dir]);
-        qdp_longlink_cpu[dir] = nullptr;
+      if (qdp_longlink_gpu[dir] != nullptr) {
+        host_free(qdp_longlink_gpu[dir]);
+        qdp_longlink_gpu[dir] = nullptr;
       }
     }
+  }
 
+  void end()
+  {
     if (dirac != nullptr) {
       delete dirac;
       dirac = nullptr;
@@ -350,6 +375,25 @@ struct StaggeredDslashTestWrapper {
     commDimPartitionedReset();
   }
 
+  static void destroy()
+  {
+    for (int dir = 0; dir < 4; dir++) {
+      if (qdp_inlink[dir]) host_free(qdp_inlink[dir]);
+      if (qdp_fatlink_cpu[dir]) host_free(qdp_fatlink_cpu[dir]);
+      if (qdp_longlink_cpu[dir]) host_free(qdp_longlink_cpu[dir]);
+    }
+
+    spinor = {};
+    spinorOut = {};
+    spinorRef = {};
+    tmpCpu = {};
+
+    if (test_split_grid) {
+      vp_spinor.clear();
+      vp_spinor_out.clear();
+    }
+  }
+
   DslashTime dslashCUDA(int niter)
   {
     DslashTime dslash_time;
diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp
index 70877d36d2..a495a11251 100644
--- a/tests/staggered_eigensolve_test.cpp
+++ b/tests/staggered_eigensolve_test.cpp
@@ -132,7 +132,7 @@ int main(int argc, char **argv)
   milc_fatlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
   milc_longlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
 
-  constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv);
+  constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv, true);
 
   // Compute plaquette. Routine is aware that the gauge fields already have the phases on them.
   double plaq[3];
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 5d4a96540c..5fcd338b4b 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -181,7 +181,8 @@ void test(int argc, char **argv)
   void *qdp_inlink[4] = {cpuIn.data(0), cpuIn.data(1), cpuIn.data(2), cpuIn.data(3)};
   void *qdp_fatlink[4] = {cpuFatQDP.data(0), cpuFatQDP.data(1), cpuFatQDP.data(2), cpuFatQDP.data(3)};
   void *qdp_longlink[4] = {cpuLongQDP.data(0), cpuLongQDP.data(1), cpuLongQDP.data(2), cpuLongQDP.data(3)};
-  constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv);
+  constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv, true);
+
   // Reorder gauge fields to MILC order
   cpuFatMILC = cpuFatQDP;
   cpuLongMILC = cpuLongQDP;
diff --git a/tests/utils/face_gauge.cpp b/tests/utils/face_gauge.cpp
index 4b4af5ca62..bebaae5622 100644
--- a/tests/utils/face_gauge.cpp
+++ b/tests/utils/face_gauge.cpp
@@ -906,15 +906,10 @@ void do_exchange_cpu_staple(Float *staple, Float **ghost_staple, Float **staple_
     Float *ghost_staple_back = ghost_staple[dir];
     Float *ghost_staple_fwd = ghost_staple[dir] + 2 * Vsh[dir] * gauge_site_size;
 
-    MsgHandle *mh_recv_back;
-    MsgHandle *mh_recv_fwd;
-    MsgHandle *mh_send_fwd;
-    MsgHandle *mh_send_back;
-
-    mh_recv_back = comm_declare_receive_relative(ghost_staple_back, dir, -1, 2 * len[dir]);
-    mh_recv_fwd = comm_declare_receive_relative(ghost_staple_fwd, dir, +1, 2 * len[dir]);
-    mh_send_fwd = comm_declare_send_relative(staple_fwd_sendbuf[dir], dir, +1, 2 * len[dir]);
-    mh_send_back = comm_declare_send_relative(staple_back_sendbuf[dir], dir, -1, 2 * len[dir]);
+    MsgHandle *mh_recv_back = comm_declare_receive_relative(ghost_staple_back, dir, -1, 2 * len[dir]);
+    MsgHandle *mh_recv_fwd = comm_declare_receive_relative(ghost_staple_fwd, dir, +1, 2 * len[dir]);
+    MsgHandle *mh_send_fwd = comm_declare_send_relative(staple_fwd_sendbuf[dir], dir, +1, 2 * len[dir]);
+    MsgHandle *mh_send_back = comm_declare_send_relative(staple_back_sendbuf[dir], dir, -1, 2 * len[dir]);
 
     comm_start(mh_recv_back);
     comm_start(mh_recv_fwd);
diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp
index 65ea318840..70aea9cdc2 100644
--- a/tests/utils/host_utils.cpp
+++ b/tests/utils/host_utils.cpp
@@ -1054,6 +1054,7 @@ template <typename Float> void constructUnitGaugeField(Float **res, QudaGaugePar
   }
 
   for (int dir = 0; dir < 4; dir++) {
+#pragma omp parallel for
     for (int i = 0; i < Vh; i++) {
       for (int m = 0; m < 3; m++) {
         for (int n = 0; n < 3; n++) {
@@ -1269,14 +1270,21 @@ template <typename Float> static void checkGauge(Float **oldG, Float **newG, dou
 
   for (int d = 0; d < 4; d++) {
     for (int eo = 0; eo < 2; eo++) {
+#pragma omp parallel for
       for (int i = 0; i < Vh; i++) {
         int ga_idx = (eo * Vh + i);
         for (int j = 0; j < 18; j++) {
           double diff = fabs(newG[d][ga_idx * 18 + j] - oldG[d][ga_idx * 18 + j]); /// fabs(oldG[d][ga_idx*18+j]);
 
           for (int f = 0; f < fail_check; f++)
-            if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) fail[d][f]++;
-          if (diff > epsilon || std::isnan(diff)) iter[d][j]++;
+            if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) {
+#pragma omp atomic
+              fail[d][f]++;
+            }
+          if (diff > epsilon || std::isnan(diff)) {
+#pragma omp atomic
+            iter[d][j]++;
+          }
         }
       }
     }
@@ -1311,6 +1319,7 @@ void createSiteLinkCPU(void *const *link, QudaPrecision precision, int phase)
   }
 
   if (phase == SITELINK_PHASE_MILC) {
+#pragma omp parallel for
     for (int i = 0; i < V; i++) {
       for (int dir = XUP; dir <= TUP; dir++) {
         int idx = i;
@@ -1459,14 +1468,21 @@ template <typename Float> int compareLink(Float **linkA, Float **linkB, int len)
   for (int i = 0; i < 18; i++) iter[i] = 0;
 
   for (int dir = 0; dir < 4; dir++) {
+#pragma omp parallel for
     for (int i = 0; i < len; i++) {
       for (int j = 0; j < 18; j++) {
         int is = i * 18 + j;
         double diff = fabs(linkA[dir][is] - linkB[dir][is]);
         for (int f = 0; f < fail_check; f++)
-          if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) fail[f]++;
+          if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) {
+#pragma omp atomic
+            fail[f]++;
+          }
         // if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
-        if (diff > 1e-3 || std::isnan(diff)) iter[j]++;
+        if (diff > 1e-3 || std::isnan(diff)) {
+#pragma omp atomic
+          iter[j]++;
+        }
       }
     }
   }
@@ -1624,14 +1640,21 @@ template <typename Float> int compare_mom(Float *momA, Float *momB, int len)
   int iter[mom_site_size];
   for (auto i = 0lu; i < mom_site_size; i++) iter[i] = 0;
 
+#pragma omp parallel for
   for (int i = 0; i < len; i++) {
     for (auto j = 0lu; j < mom_site_size - 1; j++) {
       int is = i * mom_site_size + j;
       double diff = fabs(momA[is] - momB[is]);
       for (int f = 0; f < fail_check; f++)
-        if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) fail[f]++;
+        if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) {
+#pragma omp atomic
+          fail[f]++;
+        }
       // if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
-      if (diff > 1e-3 || std::isnan(diff)) iter[j]++;
+      if (diff > 1e-3 || std::isnan(diff)) {
+#pragma omp atomic
+        iter[j]++;
+      }
     }
   }
 
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index 75e0d2ad1f..b788cc3081 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -49,15 +49,15 @@ void setQudaStaggeredInvTestParams();
 //------------------------------------------------------
 void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longlink_cpu, void **qdp_longlink_gpu,
                                             void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param,
-                                            int argc, char **argv, bool &gauge_loaded);
+                                            int argc, char **argv);
 void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink,
-                                      QudaGaugeParam &gauge_param, int argc, char **argv);
+                                      QudaGaugeParam &gauge_param, int argc, char **argv, bool compute_on_gpu);
 void constructFatLongGaugeField(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *,
                                 QudaDslashType dslash_type);
 void loadFatLongGaugeQuda(void *milc_fatlink, void *milc_longlink, QudaGaugeParam &gauge_param);
 void computeLongLinkCPU(void **longlink, void **sitelink, QudaPrecision prec, void *act_path_coeff);
 void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void **longlink_eps, void **sitelink,
-                         void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik);
+                         void *qudaGaugeParamPtr, std::array<std::array<double, 6>, 3> &act_path_coeffs, double eps_naik);
 void computeTwoLinkCPU(void **twolink, void **sitelink, QudaGaugeParam *gauge_param);
 void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], const quda::GaugeField &twolnk,
                                    quda::ColorSpinorField &in, QudaGaugeParam *qudaGaugeParam, QudaInvertParam *inv_param,
diff --git a/tests/utils/llfat_utils.cpp b/tests/utils/llfat_utils.cpp
index 43d2acaffa..548c6976b1 100644
--- a/tests/utils/llfat_utils.cpp
+++ b/tests/utils/llfat_utils.cpp
@@ -29,10 +29,6 @@ template <typename su3_matrix, typename Real>
 void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matrix *mulink, su3_matrix **sitelink,
                                     void **fatlink, Real coef, int use_staple)
 {
-  su3_matrix tmat1, tmat2;
-  int i;
-  su3_matrix *fat1;
-
   /* Upper staple */
   /* Computes the staple :
    *                mu (B)
@@ -46,16 +42,15 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr
    * It also adds the computed staple to the fatlink[mu] with weight coef.
    */
 
-  int dx[4];
-
   /* upper staple */
 
-  for (i = 0; i < V; i++) {
+#pragma omp parallel for
+  for (int i = 0; i < V; i++) {
 
-    fat1 = ((su3_matrix *)fatlink[mu]) + i;
+    auto fat1 = ((su3_matrix *)fatlink[mu]) + i;
     su3_matrix *A = sitelink[nu] + i;
 
-    memset(dx, 0, sizeof(dx));
+    int dx[4] = {};
     dx[nu] = 1;
     int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
     su3_matrix *B;
@@ -70,6 +65,7 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr
     nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
     su3_matrix *C = sitelink[nu] + nbr_idx;
 
+    su3_matrix tmat1, tmat2;
     llfat_mult_su3_nn(A, B, &tmat1);
 
     if (staple != NULL) { /* Save the staple */
@@ -89,10 +85,11 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr
    *
    *********************************************/
 
-  for (i = 0; i < V; i++) {
+#pragma omp parallel for
+  for (int i = 0; i < V; i++) {
 
-    fat1 = ((su3_matrix *)fatlink[mu]) + i;
-    memset(dx, 0, sizeof(dx));
+    auto fat1 = ((su3_matrix *)fatlink[mu]) + i;
+    int dx[4] = {};
     dx[nu] = -1;
     int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
     if (nbr_idx >= V || nbr_idx < 0) {
@@ -113,6 +110,7 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr
     nbr_idx = neighborIndexFullLattice(nbr_idx, dx[3], dx[2], dx[1], dx[0]);
     su3_matrix *C = sitelink[nu] + nbr_idx;
 
+    su3_matrix tmat1, tmat2;
     llfat_mult_su3_an(A, B, &tmat1);
     llfat_mult_su3_nn(&tmat1, C, &tmat2);
 
@@ -148,6 +146,7 @@ void llfat_cpu(void **fatlink, su3_matrix **sitelink, Float *act_path_coeff)
   for (int dir = XUP; dir <= TUP; dir++) {
 
     // Intialize fat links with c_1*U_\mu(x)
+#pragma omp parallel for
     for (int i = 0; i < V; i++) {
       su3_matrix *fat1 = ((su3_matrix *)fatlink[dir]) + i;
       llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1);
@@ -210,10 +209,6 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m
                                        su3_matrix **ghost_mulink, su3_matrix **sitelink, su3_matrix **ghost_sitelink,
                                        su3_matrix **ghost_sitelink_diag, void **fatlink, Real coef, int use_staple)
 {
-  su3_matrix tmat1, tmat2;
-  int i;
-  su3_matrix *fat1;
-
   int X1 = Z[0];
   int X2 = Z[1];
   int X3 = Z[2];
@@ -237,11 +232,10 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m
    * It also adds the computed staple to the fatlink[mu] with weight coef.
    */
 
-  int dx[4];
-
   // upper staple
 
-  for (i = 0; i < V; i++) {
+#pragma omp parallel for
+  for (int i = 0; i < V; i++) {
 
     int half_index = i;
     int oddBit = 0;
@@ -264,10 +258,10 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m
     int space_con[4] = {(x4 * X3X2 + x3 * X2 + x2) / 2, (x4 * X3X1 + x3 * X1 + x1) / 2, (x4 * X2X1 + x2 * X1 + x1) / 2,
                         (x3 * X2X1 + x2 * X1 + x1) / 2};
 
-    fat1 = ((su3_matrix *)fatlink[mu]) + i;
+    auto fat1 = ((su3_matrix *)fatlink[mu]) + i;
     su3_matrix *A = sitelink[nu] + i;
 
-    memset(dx, 0, sizeof(dx));
+    int dx[4] = {};
     dx[nu] = 1;
     int nbr_idx;
 
@@ -299,6 +293,7 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m
       C = sitelink[nu] + nbr_idx;
     }
 
+    su3_matrix tmat1, tmat2;
     llfat_mult_su3_nn(A, B, &tmat1);
 
     if (staple != NULL) { /* Save the staple */
@@ -318,7 +313,8 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m
    *
    *********************************************/
 
-  for (i = 0; i < V; i++) {
+#pragma omp parallel for
+  for (int i = 0; i < V; i++) {
 
     int half_index = i;
     int oddBit = 0;
@@ -342,11 +338,11 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m
 
     // int x4 = x4_from_full_index(i);
 
-    fat1 = ((su3_matrix *)fatlink[mu]) + i;
+    auto fat1 = ((su3_matrix *)fatlink[mu]) + i;
 
     // we could be in the ghost link area if nu is T and we are at low T boundary
     su3_matrix *A;
-    memset(dx, 0, sizeof(dx));
+    int dx[4] = {};
     dx[nu] = -1;
 
     int nbr_idx;
@@ -412,6 +408,7 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m
     } else {
       C = sitelink[nu] + nbr_idx;
     }
+    su3_matrix tmat1, tmat2;
     llfat_mult_su3_an(A, B, &tmat1);
     llfat_mult_su3_nn(&tmat1, C, &tmat2);
 
@@ -430,12 +427,7 @@ template <typename su3_matrix, typename Float>
 void llfat_cpu_mg(void **fatlink, su3_matrix **sitelink, su3_matrix **ghost_sitelink, su3_matrix **ghost_sitelink_diag,
                   Float *act_path_coeff)
 {
-  QudaPrecision prec;
-  if (sizeof(Float) == 4) {
-    prec = QUDA_SINGLE_PRECISION;
-  } else {
-    prec = QUDA_DOUBLE_PRECISION;
-  }
+  QudaPrecision prec = sizeof(Float) == 4 ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION;
 
   su3_matrix *staple = (su3_matrix *)safe_malloc(V * sizeof(su3_matrix));
 
@@ -455,6 +447,7 @@ void llfat_cpu_mg(void **fatlink, su3_matrix **sitelink, su3_matrix **ghost_site
   for (int dir = XUP; dir <= TUP; dir++) {
 
     // Intialize fat links with c_1*U_\mu(x)
+#pragma omp parallel for
     for (int i = 0; i < V; i++) {
       su3_matrix *fat1 = ((su3_matrix *)fatlink[dir]) + i;
       llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1);
diff --git a/tests/utils/staggered_gauge_utils.cpp b/tests/utils/staggered_gauge_utils.cpp
index 2759e3489b..b020939c5f 100644
--- a/tests/utils/staggered_gauge_utils.cpp
+++ b/tests/utils/staggered_gauge_utils.cpp
@@ -23,8 +23,9 @@ static double max_allowed_error = 1e-11;
 
 // Wrap everything for the GPU construction of fat/long links here
 void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fatlink_eps, void **qdp_longlink_eps,
-                         void **qdp_inlink, QudaGaugeParam &gauge_param_in, double **act_path_coeffs, double eps_naik,
-                         size_t gSize, int n_naiks)
+                         void **qdp_inlink, QudaGaugeParam &gauge_param_in,
+                         std::array<std::array<double, 6>, 3> &act_path_coeffs, double eps_naik, size_t gSize,
+                         int n_naiks)
 {
   // since a lot of intermediaries can be general matrices, override the recon in `gauge_param_in`
   auto gauge_param = gauge_param_in;
@@ -52,11 +53,11 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat
   }
 
   // Create V links (fat7 links) and W links (unitarized V links), 1st path table set
-  computeKSLinkQuda(milc_vlink, nullptr, milc_wlink, milc_inlink, act_path_coeffs[0], &gauge_param);
+  computeKSLinkQuda(milc_vlink, nullptr, milc_wlink, milc_inlink, act_path_coeffs[0].data(), &gauge_param);
 
   if (n_naiks > 1) {
     // Create Naiks, 3rd path table set
-    computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, act_path_coeffs[2], &gauge_param);
+    computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, act_path_coeffs[2].data(), &gauge_param);
 
     // Rescale+copy Naiks into Naik field
     cpu_axy(gauge_param.cpu_prec, eps_naik, milc_fatlink, milc_fatlink_eps, V * 4 * gauge_site_size);
@@ -67,7 +68,7 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat
   }
 
   // Create X and long links, 2nd path table set
-  computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, act_path_coeffs[1], &gauge_param);
+  computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, act_path_coeffs[1].data(), &gauge_param);
 
   if (n_naiks > 1) {
     // Add into Naik field
@@ -98,7 +99,7 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat
   }
 }
 
-void setActionPaths(double **act_paths)
+template <class T> void setActionPaths(T &act_paths)
 {
   ///////////////////////////
   // Set path coefficients //
@@ -160,8 +161,7 @@ void setActionPaths(double **act_paths)
 void computeFatLongGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param,
                        size_t gSize, int n_naiks, double eps_naik)
 {
-  double **act_paths = new double *[3];
-  for (int i = 0; i < 3; i++) act_paths[i] = new double[6];
+  std::array<std::array<double, 6>, 3> act_paths;
   setActionPaths(act_paths);
 
   ///////////////////////////////////////////////////////////////////////
@@ -196,17 +196,12 @@ void computeFatLongGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlin
       host_free(qdp_longlink_naik_temp[dir]);
     }
   }
-
-  for (int i = 0; i < 3; i++) delete[] act_paths[i];
-  delete[] act_paths;
 }
 
-void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu,
-                             void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize,
-                             int n_naiks, double eps_naik)
+void computeFatLongCPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param,
+                       size_t gSize, int n_naiks, double eps_naik)
 {
-  double **act_paths = new double *[3];
-  for (int i = 0; i < 3; i++) act_paths[i] = new double[6];
+  std::array<std::array<double, 6>, 3> act_paths;
   setActionPaths(act_paths);
 
   ///////////////////////////////////////////////////////////////////////
@@ -229,41 +224,26 @@ void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, vo
   //////////////////////////
 
   // defined in "llfat_reference.cpp"
-  computeHISQLinksCPU(qdp_fatlink_cpu, qdp_longlink_cpu, (n_naiks == 2) ? qdp_fatlink_naik_temp : nullptr,
+  computeHISQLinksCPU(qdp_fatlink, qdp_longlink, (n_naiks == 2) ? qdp_fatlink_naik_temp : nullptr,
                       (n_naiks == 2) ? qdp_longlink_naik_temp : nullptr, qdp_inlink, &gauge_param, act_paths, eps_naik);
 
   if (n_naiks == 2) {
     // Override the naik fields into the fat/long link fields
     for (int dir = 0; dir < 4; dir++) {
-      memcpy(qdp_fatlink_cpu[dir], qdp_fatlink_naik_temp[dir], V * gauge_site_size * gSize);
-      memcpy(qdp_longlink_cpu[dir], qdp_longlink_naik_temp[dir], V * gauge_site_size * gSize);
-      memset(qdp_fatlink_naik_temp[dir], 0, V * gauge_site_size * gSize);
-      memset(qdp_longlink_naik_temp[dir], 0, V * gauge_site_size * gSize);
-    }
-  }
-
-  //////////////////////////
-  // Create the GPU links //
-  //////////////////////////
-
-  // Skip eps field for now
-  // Note: GPU link creation only works for single and double precision
-  computeHISQLinksGPU(qdp_fatlink_gpu, qdp_longlink_gpu, (n_naiks == 2) ? qdp_fatlink_naik_temp : nullptr,
-                      (n_naiks == 2) ? qdp_longlink_naik_temp : nullptr, qdp_inlink, gauge_param, act_paths, eps_naik,
-                      gSize, n_naiks);
-
-  if (n_naiks == 2) {
-    // Override the naik fields into the fat/long link fields
-    for (int dir = 0; dir < 4; dir++) {
-      memcpy(qdp_fatlink_gpu[dir], qdp_fatlink_naik_temp[dir], V * gauge_site_size * gSize);
-      memcpy(qdp_longlink_gpu[dir], qdp_longlink_naik_temp[dir], V * gauge_site_size * gSize);
+      memcpy(qdp_fatlink[dir], qdp_fatlink_naik_temp[dir], V * gauge_site_size * gSize);
+      memcpy(qdp_longlink[dir], qdp_longlink_naik_temp[dir], V * gauge_site_size * gSize);
       host_free(qdp_fatlink_naik_temp[dir]);
       host_free(qdp_longlink_naik_temp[dir]);
     }
   }
+}
 
-  for (int i = 0; i < 3; i++) delete[] act_paths[i];
-  delete[] act_paths;
+void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu,
+                             void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize,
+                             int n_naiks, double eps_naik)
+{
+  computeFatLongGPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_inlink, gauge_param, gSize, n_naiks, eps_naik);
+  computeFatLongCPU(qdp_fatlink_cpu, qdp_longlink_cpu, qdp_inlink, gauge_param, gSize, n_naiks, eps_naik);
 }
 
 // Routine that takes in a QDP-ordered field and outputs the plaquette.
diff --git a/tests/utils/staggered_gauge_utils.h b/tests/utils/staggered_gauge_utils.h
index f2cc4e4749..d969d437e1 100644
--- a/tests/utils/staggered_gauge_utils.h
+++ b/tests/utils/staggered_gauge_utils.h
@@ -18,6 +18,9 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat
 void computeFatLongGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param,
                        size_t gSize, int n_naiks, double eps_naik);
 
+void computeFatLongCPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param,
+                       size_t gSize, int n_naiks, double eps_naik);
+
 void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu,
                              void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize,
                              int n_naiks, double eps_naik);
diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp
index df1d787ff0..95efb0d85a 100644
--- a/tests/utils/staggered_host_utils.cpp
+++ b/tests/utils/staggered_host_utils.cpp
@@ -26,25 +26,24 @@ template <typename T> using complex = std::complex<T>;
 
 // Staggered gauge field utils
 //------------------------------------------------------
-void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longlink_cpu, void **qdp_longlink_gpu,
-                                            void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param,
-                                            int argc, char **argv, bool &gauge_loaded)
+void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink,
+                                      QudaGaugeParam &gauge_param, int argc, char **argv, bool compute_on_gpu)
 {
+  gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
+
   // load a field WITHOUT PHASES
   if (latfile.size() > 0) {
-    if (!gauge_loaded) {
-      read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv);
-      if (dslash_type != QUDA_LAPLACE_DSLASH) {
-        applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec);
-      }
-      gauge_loaded = true;
-    } // else it's already been loaded
+    // load in the command line supplied gauge field using QIO and LIME
+    read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv);
+    if (dslash_type != QUDA_LAPLACE_DSLASH) {
+      applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec);
+    }
   } else {
     int construct_type = (unit_gauge) ? 0 : 1;
     if (dslash_type == QUDA_LAPLACE_DSLASH) {
       constructQudaGaugeField(qdp_inlink, construct_type, gauge_param.cpu_prec, &gauge_param);
     } else {
-      constructFatLongGaugeField(qdp_inlink, qdp_longlink_cpu, construct_type, gauge_param.cpu_prec, &gauge_param,
+      constructFatLongGaugeField(qdp_inlink, qdp_longlink, construct_type, gauge_param.cpu_prec, &gauge_param,
                                  compute_fatlong ? QUDA_STAGGERED_DSLASH : dslash_type);
     }
   }
@@ -53,62 +52,49 @@ void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longli
   // "compute" the fat/long links or not.
   if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) {
     for (int dir = 0; dir < 4; dir++) {
-      memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
-      memcpy(qdp_fatlink_cpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
-      memset(qdp_longlink_gpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size);
-      memset(qdp_longlink_cpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size);
+      memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
+      memset(qdp_longlink[dir], 0, V * gauge_site_size * host_gauge_data_type_size);
     }
   } else {
     // QUDA_ASQTAD_DSLASH
     if (compute_fatlong) {
-      computeFatLongGPUandCPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_fatlink_cpu, qdp_longlink_cpu, qdp_inlink,
-                              gauge_param, host_gauge_data_type_size, n_naiks, eps_naik);
+      if (compute_on_gpu)
+        computeFatLongGPU(qdp_fatlink, qdp_longlink, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks,
+                          eps_naik);
+      else
+        computeFatLongCPU(qdp_fatlink, qdp_longlink, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks,
+                          eps_naik);
     } else {
-      // Not computing FatLong
       for (int dir = 0; dir < 4; dir++) {
-        memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
-        memcpy(qdp_fatlink_cpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
-        memcpy(qdp_longlink_gpu[dir], qdp_longlink_cpu[dir], V * gauge_site_size * host_gauge_data_type_size);
+        memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
       }
     }
   }
 }
 
-void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink,
-                                      QudaGaugeParam &gauge_param, int argc, char **argv)
+void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longlink_cpu, void **qdp_longlink_gpu,
+                                            void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param,
+                                            int argc, char **argv)
 {
-  gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
-
-  if (latfile.size() > 0) {
-    // load in the command line supplied gauge field using QIO and LIME
-    read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv);
-    if (dslash_type != QUDA_LAPLACE_DSLASH) {
-      applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec);
-    }
-  } else {
-    int construct_type = (unit_gauge) ? 0 : 1;
-    if (dslash_type == QUDA_LAPLACE_DSLASH) {
-      constructQudaGaugeField(qdp_inlink, construct_type, gauge_param.cpu_prec, &gauge_param);
-    } else {
-      constructFatLongGaugeField(qdp_inlink, qdp_longlink, construct_type, gauge_param.cpu_prec, &gauge_param,
-                                 compute_fatlong ? QUDA_STAGGERED_DSLASH : dslash_type);
-    }
-  }
+  constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_fatlink_cpu, gauge_param, argc, argv, false);
 
   // QUDA_STAGGERED_DSLASH follows the same codepath whether or not you
   // "compute" the fat/long links or not.
   if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) {
     for (int dir = 0; dir < 4; dir++) {
-      memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
-      memset(qdp_longlink[dir], 0, V * gauge_site_size * host_gauge_data_type_size);
+      memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
+      memset(qdp_longlink_gpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size);
     }
   } else {
     // QUDA_ASQTAD_DSLASH
     if (compute_fatlong) {
-      computeFatLongGPU(qdp_fatlink, qdp_longlink, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks, eps_naik);
+      computeFatLongGPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks,
+                        eps_naik);
     } else {
+      // Not computing FatLong
       for (int dir = 0; dir < 4; dir++) {
-        memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
+        memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
+        memcpy(qdp_longlink_gpu[dir], qdp_longlink_cpu[dir], V * gauge_site_size * host_gauge_data_type_size);
       }
     }
   }
@@ -188,6 +174,7 @@ void constructFatLongGaugeField(void **fatlink, void **longlink, int type, QudaP
   // FIXME: may break host comparison
   if (dslash_type == QUDA_STAGGERED_DSLASH) {
     for (int dir = 0; dir < 4; ++dir) {
+#pragma omp parallel for
       for (int i = 0; i < V; ++i) {
         for (auto j = 0lu; j < gauge_site_size; j += 2) {
           if (precision == QUDA_DOUBLE_PRECISION) {
@@ -252,11 +239,12 @@ void loadFatLongGaugeQuda(void *milc_fatlink, void *milc_longlink, QudaGaugePara
 template <typename su3_matrix, typename Float>
 void computeLongLinkCPU(void **longlink, su3_matrix **sitelink, Float *act_path_coeff)
 {
-  su3_matrix temp;
   for (int dir = XUP; dir <= TUP; ++dir) {
-    int dx[4] = {0, 0, 0, 0};
+#pragma omp parallel for
     for (int i = 0; i < V; ++i) {
+      int dx[4] = {0, 0, 0, 0};
       // Initialize the longlinks
+      su3_matrix temp;
       su3_matrix *llink = ((su3_matrix *)longlink[dir]) + i;
       llfat_scalar_mult_su3_matrix(sitelink[dir] + i, act_path_coeff[1], llink);
       dx[dir] = 1;
@@ -267,7 +255,6 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelink, Float *act_path_
       llfat_mult_su3_nn(&temp, sitelink[dir] + nbr_idx, llink);
     }
   }
-  return;
 }
 #else
 
@@ -278,7 +265,7 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelinkEx, Float *act_pat
   for (int dir = 0; dir < 4; ++dir) E[dir] = Z[dir] + 4;
   const int extended_volume = E[3] * E[2] * E[1] * E[0];
 
-  su3_matrix temp;
+#pragma omp parallel for
   for (int t = 0; t < Z[3]; ++t) {
     for (int z = 0; z < Z[2]; ++z) {
       for (int y = 0; y < Z[1]; ++y) {
@@ -294,6 +281,7 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelinkEx, Float *act_pat
             llfat_scalar_mult_su3_matrix(sitelinkEx[dir] + large_index, act_path_coeff[1], llink);
             dx[dir] = 1;
             int nbr_index = neighborIndexFullLattice(E, large_index, dx);
+            su3_matrix temp;
             llfat_mult_su3_nn(llink, sitelinkEx[dir] + nbr_index, &temp);
             dx[dir] = 2;
             nbr_index = neighborIndexFullLattice(E, large_index, dx);
@@ -303,7 +291,6 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelinkEx, Float *act_pat
       }   // y
     }     // z
   }       // t
-  return;
 }
 #endif
 
@@ -399,6 +386,7 @@ void staggeredTwoLinkGaussianSmear(sFloat *res, gFloat **twolink, gFloat **ghost
   }
 
   {
+#pragma omp parallel for
     for (int i = 0; i < Vh; i++) {
       // Get local time-slice index:
       const int local_t = i / Vsh_t;
@@ -484,7 +472,7 @@ void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &, void **, const quda
 // If "eps_naik" is 0, there's no naik correction,
 // and this routine skips building the paths in "act_path_coeffs[2]"
 void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void **longlink_eps, void **sitelink,
-                         void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik)
+                         void *qudaGaugeParamPtr, std::array<std::array<double, 6>, 3> &act_path_coeffs, double eps_naik)
 {
   // Prepare various things
   QudaGaugeParam &qudaGaugeParam = *((QudaGaugeParam *)qudaGaugeParamPtr);
@@ -517,6 +505,7 @@ void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, vo
   int X3 = Z[2];
   int X4 = Z[3];
 
+#pragma omp parallel for
   for (int i = 0; i < V_ex; i++) {
     int sid = i;
     int oddBit = 0;
@@ -656,6 +645,7 @@ void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, vo
   // Prepare for extended W fields //
   ///////////////////////////////////
 
+#pragma omp parallel for
   for (int i = 0; i < V_ex; i++) {
     int sid = i;
     int oddBit = 0;
@@ -872,6 +862,7 @@ void reorderQDPtoMILC(void *milc_out, void **qdp_in, int V, int siteSize, QudaPr
 
 template <typename Out, typename In> void reorderMILCtoQDP(Out **qdp_out, In *milc_in, int V, int siteSize)
 {
+#pragma omp parallel for
   for (int i = 0; i < V; i++) {
     for (int dir = 0; dir < 4; dir++) {
       for (int j = 0; j < siteSize; j++) {
@@ -926,6 +917,7 @@ void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, Q
   for (int d = 0; d < 3; d++) {
 
     // even
+#pragma omp parallel for
     for (int i = 0; i < Vh; i++) {
 
       int index = fullLatticeIndex(i, 0);
@@ -974,6 +966,7 @@ void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, Q
 
   // Apply boundary conditions to temporal links
   if (param->t_boundary == QUDA_ANTI_PERIODIC_T && last_node_in_t()) {
+#pragma omp parallel for
     for (int j = 0; j < Vh; j++) {
       int sign = 1;
       if (dslash_type == QUDA_ASQTAD_DSLASH) {