diff --git a/include/blas_helper.cuh b/include/blas_helper.cuh
index e2617e879a..1c55c5c2e3 100644
--- a/include/blas_helper.cuh
+++ b/include/blas_helper.cuh
@@ -111,9 +111,7 @@ namespace quda
       {}
 
       data_t(const ColorSpinorField &x) :
-        spinor(static_cast<store_t *>(const_cast<ColorSpinorField &>(x).V())),
-        stride(x.VolumeCB()),
-        cb_offset(x.Bytes() / (2 * sizeof(store_t) * N))
+        spinor(x.data<store_t *>()), stride(x.VolumeCB()), cb_offset(x.Bytes() / (2 * sizeof(store_t) * N))
       {}
     };
 
@@ -141,8 +139,8 @@ namespace quda
       {}
 
       data_t(const ColorSpinorField &x) :
-        spinor(static_cast<store_t *>(const_cast<ColorSpinorField &>(x).V())),
-        norm(static_cast<norm_t *>(const_cast<ColorSpinorField &>(x).Norm())),
+        spinor(x.data<store_t *>()),
+        norm(static_cast<norm_t *>(x.Norm())),
         stride(x.VolumeCB()),
         cb_offset(x.Bytes() / (2 * sizeof(store_t) * N)),
         cb_norm_offset(x.Bytes() / (2 * sizeof(norm_t)))
diff --git a/include/blas_quda.h b/include/blas_quda.h
index 3fc051d3ff..07b09f1209 100644
--- a/include/blas_quda.h
+++ b/include/blas_quda.h
@@ -23,9 +23,6 @@ namespace quda {
 
     void setParam(int kernel, int prec, int threads, int blocks);
 
-    extern unsigned long long flops;
-    extern unsigned long long bytes;
-
     inline void zero(cvector_ref<ColorSpinorField> &x)
     {
       for (auto i = 0u; i < x.size(); i++) x[i].zero();
@@ -33,7 +30,7 @@ namespace quda {
 
     inline void copy(ColorSpinorField &dst, const ColorSpinorField &src)
     {
-      if (dst.V() == src.V()) {
+      if (dst.data() == src.data()) {
         // check the fields are equivalent else error
         if (ColorSpinorField::are_compatible(dst, src))
           return;
diff --git a/include/clover_field.h b/include/clover_field.h
index fcbf7ffd7a..41920d0bc1 100644
--- a/include/clover_field.h
+++ b/include/clover_field.h
@@ -178,9 +178,10 @@ namespace quda {
     int nColor = 0;
     int nSpin = 0;
 
-    void *clover = nullptr;
-    void *cloverInv = nullptr;
+    quda_ptr clover = {};
+    quda_ptr cloverInv = {};
 
+    bool inverse = false;
     double diagonal = 0.0;
     array<double, 2> max = {};
 
@@ -213,12 +214,18 @@ namespace quda {
 
   public:
     CloverField(const CloverFieldParam &param);
-    virtual ~CloverField();
 
     static CloverField *Create(const CloverFieldParam &param);
 
-    void* V(bool inverse=false) { return inverse ? cloverInv : clover; }
-    const void* V(bool inverse=false) const { return inverse ? cloverInv : clover; }
+    template <typename T = void *> auto data(bool inverse = false) const
+    {
+      return inverse ? reinterpret_cast<T>(cloverInv.data()) : reinterpret_cast<T>(clover.data());
+    }
+
+    /**
+       @return whether the inverse has been explicitly allocated
+     */
+    bool Inverse() const { return inverse; }
 
     /**
        @return diagonal scaling factor applied to the identity
@@ -406,10 +413,6 @@ namespace quda {
     */
     void copy_from_buffer(void *buffer);
 
-    friend class DiracClover;
-    friend class DiracCloverPC;
-    friend class DiracTwistedClover;
-    friend class DiracTwistedCloverPC;
   };
 
   /**
diff --git a/include/clover_field_order.h b/include/clover_field_order.h
index 1464a02629..65d5ef6cff 100644
--- a/include/clover_field_order.h
+++ b/include/clover_field_order.h
@@ -312,7 +312,7 @@ namespace quda {
       static constexpr int N = nColor * nSpin / 2;
       reconstruct_t<Float, N * N, clover::reconstruct()> recon;
       FloatNAccessor(const CloverField &A, bool inverse = false) :
-        a(static_cast<Float *>(const_cast<void *>(A.V(inverse)))),
+        a(A.Bytes() ? A.data<Float *>(inverse) : nullptr),
         stride(A.VolumeCB()),
         offset_cb(A.Bytes() / (2 * sizeof(Float))),
         compressed_block_size(A.compressed_block_size()),
@@ -403,7 +403,7 @@ namespace quda {
       const int N = nSpin * nColor / 2;
       const complex<Float> zero;
       Accessor(const CloverField &A, bool inverse = false) :
-        a(static_cast<Float *>(const_cast<void *>(A.V(inverse)))),
+        a(A.Bytes() ? A.data<Float *>(inverse) : nullptr),
         offset_cb(A.Bytes() / (2 * sizeof(Float))),
         zero(complex<Float>(0.0, 0.0))
       {
@@ -639,7 +639,7 @@ namespace quda {
           if (clover.max_element(is_inverse) == 0.0 && isFixed<Float>::value)
             errorQuda("%p max_element(%d) appears unset", &clover, is_inverse);
           if (clover.Diagonal() == 0.0 && clover.Reconstruct()) errorQuda("%p diagonal appears unset", &clover);
-          this->clover = clover_ ? clover_ : (Float *)(clover.V(is_inverse));
+          this->clover = clover_ ? clover_ : clover.data<Float *>(is_inverse);
         }
 
         QudaTwistFlavorType TwistFlavor() const { return twist_flavor; }
@@ -844,7 +844,7 @@ namespace quda {
           if (clover.Order() != QUDA_PACKED_CLOVER_ORDER) {
             errorQuda("Invalid clover order %d for this accessor", clover.Order());
           }
-          this->clover = clover_ ? clover_ : (Float *)(clover.V(inverse));
+          this->clover = clover_ ? clover_ : clover.data<Float *>(inverse);
         }
 
         QudaTwistFlavorType TwistFlavor() const { return twist_flavor; }
@@ -892,8 +892,8 @@ namespace quda {
           if (clover.Order() != QUDA_QDPJIT_CLOVER_ORDER) {
             errorQuda("Invalid clover order %d for this accessor", clover.Order());
           }
-          offdiag = clover_ ? ((Float **)clover_)[0] : ((Float **)clover.V(inverse))[0];
-          diag = clover_ ? ((Float **)clover_)[1] : ((Float **)clover.V(inverse))[1];
+          offdiag = clover_ ? ((Float **)clover_)[0] : clover.data<Float **>(inverse)[0];
+          diag = clover_ ? ((Float **)clover_)[1] : clover.data<Float **>(inverse)[1];
         }
 
         QudaTwistFlavorType TwistFlavor() const { return twist_flavor; }
@@ -970,7 +970,7 @@ namespace quda {
           if (clover.Order() != QUDA_BQCD_CLOVER_ORDER) {
             errorQuda("Invalid clover order %d for this accessor", clover.Order());
           }
-          this->clover[0] = clover_ ? clover_ : (Float *)(clover.V(inverse));
+          this->clover[0] = clover_ ? clover_ : clover.data<Float *>(inverse);
           this->clover[1] = (Float *)((char *)this->clover[0] + clover.Bytes() / 2);
         }
 
diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index 76fa31b943..4801bdbf48 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -121,18 +121,13 @@ namespace quda
     }
   };
 
-  class ColorSpinorParam : public LatticeFieldParam
-  {
-
-  public:
+  struct ColorSpinorParam : public LatticeFieldParam {
     int nColor = 0; // Number of colors of the field
     int nSpin = 0;  // =1 for staggered, =2 for coarse Dslash, =4 for 4d spinor
     int nVec = 1;   // number of packed vectors (for multigrid transfer operator)
 
     QudaTwistFlavorType twistFlavor = QUDA_TWIST_INVALID; // used by twisted mass
-
     QudaSiteOrder siteOrder = QUDA_INVALID_SITE_ORDER; // defined for full fields
-
     QudaFieldOrder fieldOrder = QUDA_INVALID_FIELD_ORDER; // Float, Float2, Float4 etc.
     QudaGammaBasis gammaBasis = QUDA_INVALID_GAMMA_BASIS;
     QudaFieldCreate create = QUDA_INVALID_FIELD_CREATE;
@@ -179,7 +174,6 @@ namespace quda
     ColorSpinorParam() = default;
 
     // used to create cpu params
-
     ColorSpinorParam(void *V, QudaInvertParam &inv_param, const lat_dim_t &X, const bool pc_solution,
                      QudaFieldLocation location = QUDA_CPU_FIELD_LOCATION) :
       LatticeFieldParam(4, X, 0, location, inv_param.cpu_prec),
@@ -188,20 +182,12 @@ namespace quda
              || inv_param.dslash_type == QUDA_LAPLACE_DSLASH) ?
               1 :
               4),
-      nVec(1),
       twistFlavor(inv_param.twist_flavor),
-      siteOrder(QUDA_INVALID_SITE_ORDER),
-      fieldOrder(QUDA_INVALID_FIELD_ORDER),
       gammaBasis(inv_param.gamma_basis),
       create(QUDA_REFERENCE_FIELD_CREATE),
       pc_type(inv_param.dslash_type == QUDA_DOMAIN_WALL_DSLASH ? QUDA_5D_PC : QUDA_4D_PC),
-      v(V),
-      is_composite(false),
-      composite_dim(0),
-      is_component(false),
-      component_id(0)
+      v(V)
     {
-
       if (nDim > QUDA_MAX_DIM) errorQuda("Number of dimensions too great");
       for (int d = 0; d < nDim; d++) x[d] = X[d];
 
@@ -343,8 +329,7 @@ namespace quda
 
     size_t length = 0; // length including pads, but not norm zone
 
-    void *v = nullptr;      // the field elements
-    void *v_h = nullptr;    // the field elements
+    quda_ptr v = {};        // the field elements
     size_t norm_offset = 0; /** offset to the norm (if applicable) */
 
     // multi-GPU parameters
@@ -441,6 +426,12 @@ namespace quda
      */
     ColorSpinorField &operator=(ColorSpinorField &&field);
 
+    /**
+       @brief Returns whether the object is empty (not initialized)
+       @return true if the object has not been allocated, otherwise false
+    */
+    bool empty() const { return !init; }
+
     /**
        @brief Copy the source field contents into this
        @param[in] src Source from which we are copying
@@ -477,37 +468,19 @@ namespace quda
     /**
        @brief Return pointer to the field allocation
     */
-    void *V()
-    {
-      if (ghost_only) errorQuda("Not defined for ghost-only field");
-      return v;
-    }
-
-    /**
-       @brief Return pointer to the field allocation
-    */
-    const void *V() const
-    {
-      if (ghost_only) errorQuda("Not defined for ghost-only field");
-      return v;
-    }
-
-    /**
-       @brief Return pointer to the norm base pointer in the field allocation
-    */
-    void *Norm()
+    template <typename T = void *> auto data() const
     {
       if (ghost_only) errorQuda("Not defined for ghost-only field");
-      return static_cast<char *>(v) + norm_offset;
+      return reinterpret_cast<T>(v.data());
     }
 
     /**
        @brief Return pointer to the norm base pointer in the field allocation
     */
-    const void *Norm() const
+    void *Norm() const
     {
       if (ghost_only) errorQuda("Not defined for ghost-only field");
-      return static_cast<char *>(v) + norm_offset;
+      return static_cast<char *>(v.data()) + norm_offset;
     }
 
     size_t NormOffset() const { return norm_offset; }
@@ -938,7 +911,7 @@ namespace quda
     static void test_compatible_weak(const ColorSpinorField &a, const ColorSpinorField &b);
 
     friend std::ostream &operator<<(std::ostream &out, const ColorSpinorField &);
-    friend class ColorSpinorParam;
+    friend struct ColorSpinorParam;
   };
 
   /**
@@ -1022,28 +995,30 @@ namespace quda
 
   /**
      @brief Generate a random noise spinor.  This variant allows the user to manage the RNG state.
-     @param src The colorspinorfield
-     @param randstates Random state
-     @param type The type of noise to create (QUDA_NOISE_GAUSSIAN or QUDA_NOISE_UNIFORM)
+     @param[out] src The colorspinorfield
+     @param[in,out] randstates Random state
+     @param[in] type The type of noise to create (QUDA_NOISE_GAUSSIAN or QUDA_NOISE_UNIFORM)
   */
   void spinorNoise(ColorSpinorField &src, RNG &randstates, QudaNoiseType type);
 
   /**
      @brief Generate a random noise spinor.  This variant just
      requires a seed and will create and destroy the random number state.
-     @param src The colorspinorfield
-     @param seed Seed
-     @param type The type of noise to create (QUDA_NOISE_GAUSSIAN or QUDA_NOISE_UNIFORM)
+     @param[out] src The colorspinorfield
+     @param[in] seed Seed
+     @param[in] type The type of noise to create (QUDA_NOISE_GAUSSIAN or QUDA_NOISE_UNIFORM)
   */
   void spinorNoise(ColorSpinorField &src, unsigned long long seed, QudaNoiseType type);
 
   /**
      @brief Generate a set of diluted color spinors from a single source.
-     @param v Diluted vector set
-     @param src The input source
-     @param type The type of dilution to apply (QUDA_DILUTION_SPIN_COLOR, etc.)
+     @param[out] v Diluted vector set
+     @param[in] src The input source
+     @param[in] type The type of dilution to apply (QUDA_DILUTION_SPIN_COLOR, etc.)
+     @param[in] local_block The local block size to use when using QUDA_DILUTION_BLOCK dilution
   */
-  void spinorDilute(std::vector<ColorSpinorField> &v, const ColorSpinorField &src, QudaDilutionType type);
+  void spinorDilute(std::vector<ColorSpinorField> &v, const ColorSpinorField &src, QudaDilutionType type,
+                    const lat_dim_t &local_block = {});
 
   /**
      @brief Helper function for determining if the preconditioning
diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index 67e2192122..022d2a2b69 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -877,14 +877,13 @@ namespace quda
       FieldOrderCB(const ColorSpinorField &field, int nFace = 1, void *const v_ = 0, void *const *ghost_ = 0) :
         GhostOrder(field, nFace, ghost_), volumeCB(field.VolumeCB()), accessor(field)
       {
-        v.v = v_ ? static_cast<complex<storeFloat> *>(const_cast<void *>(v_)) :
-                   static_cast<complex<storeFloat> *>(const_cast<void *>(field.V()));
+        v.v = v_ ? static_cast<complex<storeFloat> *>(const_cast<void *>(v_)) : field.data<complex<storeFloat> *>();
         resetScale(field.Scale());
 
         if constexpr (fixed && block_float) {
           if constexpr (nColor == 3 && nSpin == 1 && nVec == 1 && order == 2)
             // special case where the norm is packed into the per site struct
-            v.norm = reinterpret_cast<norm_t *>(const_cast<void *>(field.V()));
+            v.norm = field.data<norm_t *>();
           else
             v.norm = static_cast<norm_t *>(const_cast<void *>(field.Norm()));
           v.norm_offset = field.Bytes() / (2 * sizeof(norm_t));
@@ -1075,33 +1074,37 @@ namespace quda
       using GhostVector = typename VectorType<Float, N_ghost>::type;
       using AllocInt = typename AllocType<huge_alloc>::type;
       using norm_type = float;
-      Float *field;
-      norm_type *norm;
-      const AllocInt offset; // offset can be 32-bit or 64-bit
-      const AllocInt norm_offset;
-      int volumeCB;
-      int faceVolumeCB[4];
-      mutable Float *ghost[8];
-      mutable norm_type *ghost_norm[8];
-      int nParity;
-      void *backup_h; //! host memory for backing up the field when tuning
-      size_t bytes;
+      Float *field = nullptr;
+      norm_type *norm = nullptr;
+      AllocInt offset = 0; // offset can be 32-bit or 64-bit
+      AllocInt norm_offset = 0;
+      int volumeCB = 0;
+      array<int, 4> faceVolumeCB = {};
+      mutable array<Float *, 8> ghost = {};
+      mutable array<norm_type *, 8> ghost_norm = {};
+      int nParity = 0;
+      void *backup_h = nullptr; //! host memory for backing up the field when tuning
+      size_t bytes = 0;
+
+      FloatNOrder() = default;
+      FloatNOrder(const FloatNOrder &) = default;
 
       FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) :
-        field(buffer ? buffer : (Float *)a.V()),
+        field(buffer ? buffer : a.data<Float *>()),
         norm(buffer ? reinterpret_cast<norm_type *>(reinterpret_cast<char *>(buffer) + a.NormOffset()) :
                       const_cast<norm_type *>(reinterpret_cast<const norm_type *>(a.Norm()))),
         offset(a.Bytes() / (2 * sizeof(Float) * N)),
         norm_offset(a.Bytes() / (2 * sizeof(norm_type))),
         volumeCB(a.VolumeCB()),
         nParity(a.SiteSubset()),
-        backup_h(nullptr),
         bytes(a.Bytes())
       {
         for (int i = 0; i < 4; i++) { faceVolumeCB[i] = a.SurfaceCB(i) * nFace; }
         resetGhost(ghost_ ? (void **)ghost_ : a.Ghost());
       }
 
+      FloatNOrder &operator=(const FloatNOrder &) = default;
+
       void resetGhost(void *const *ghost_) const
       {
         for (int dim = 0; dim < 4; dim++) {
@@ -1306,27 +1309,31 @@ namespace quda
       using GhostVector = int4; // 128-bit packed type
       using AllocInt = typename AllocType<huge_alloc>::type;
       using norm_type = float;
-      Float *field;
-      const AllocInt offset; // offset can be 32-bit or 64-bit
-      int volumeCB;
-      int faceVolumeCB[4];
-      mutable Float *ghost[8];
-      int nParity;
-      void *backup_h; //! host memory for backing up the field when tuning
-      size_t bytes;
+      Float *field = nullptr;
+      AllocInt offset = 0; // offset can be 32-bit or 64-bit (non-const so the defaulted copy assignment is usable)
+      int volumeCB = 0;
+      array<int, 4> faceVolumeCB = {};
+      mutable array<Float *, 8> ghost = {};
+      int nParity = 0;
+      void *backup_h = nullptr; //! host memory for backing up the field when tuning
+      size_t bytes = 0;
+
+      FloatNOrder() = default;
+      FloatNOrder(const FloatNOrder &) = default;
 
       FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) :
-        field(buffer ? buffer : (Float *)a.V()),
+        field(buffer ? buffer : a.data<Float *>()),
         offset(a.Bytes() / (2 * sizeof(Vector))),
         volumeCB(a.VolumeCB()),
         nParity(a.SiteSubset()),
-        backup_h(nullptr),
         bytes(a.Bytes())
       {
         for (int i = 0; i < 4; i++) { faceVolumeCB[i] = a.SurfaceCB(i) * nFace; }
         resetGhost(ghost_ ? (void **)ghost_ : a.Ghost());
       }
 
+      FloatNOrder &operator=(const FloatNOrder &) = default;
+
       void resetGhost(void *const *ghost_) const
       {
         for (int dim = 0; dim < 4; dim++) {
@@ -1505,7 +1512,7 @@ namespace quda
       int faceVolumeCB[4];
       int nParity;
       SpaceColorSpinorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) :
-        field(field_ ? field_ : (Float *)a.V()),
+        field(field_ ? field_ : a.data<Float *>()),
         offset(a.Bytes() / (2 * sizeof(Float))),
         volumeCB(a.VolumeCB()),
         nParity(a.SiteSubset())
@@ -1589,7 +1596,7 @@ namespace quda
       int faceVolumeCB[4];
       int nParity;
       SpaceSpinorColorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) :
-        field(field_ ? field_ : (Float *)a.V()),
+        field(field_ ? field_ : a.data<Float *>()),
         offset(a.Bytes() / (2 * sizeof(Float))),
         volumeCB(a.VolumeCB()),
         nParity(a.SiteSubset())
@@ -1668,7 +1675,7 @@ namespace quda
       int exDim[4]; // full field dimensions
       PaddedSpaceSpinorColorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0,
                                   Float **ghost_ = 0) :
-        field(field_ ? field_ : (Float *)a.V()),
+        field(field_ ? field_ : a.data<Float *>()),
         volumeCB(a.VolumeCB()),
         exVolumeCB(1),
         nParity(a.SiteSubset()),
@@ -1763,7 +1770,7 @@ namespace quda
       int volumeCB;
       int nParity;
       QDPJITDiracOrder(const ColorSpinorField &a, int = 1, Float *field_ = 0, float * = 0) :
-        field(field_ ? field_ : (Float *)a.V()), volumeCB(a.VolumeCB()), nParity(a.SiteSubset())
+        field(field_ ? field_ : a.data<Float *>()), volumeCB(a.VolumeCB()), nParity(a.SiteSubset())
       {
       }
 
diff --git a/include/dirac_quda.h b/include/dirac_quda.h
index 37573f68ee..1aa339139f 100644
--- a/include/dirac_quda.h
+++ b/include/dirac_quda.h
@@ -52,9 +52,9 @@ namespace quda {
 
     QudaMatPCType matpcType;
     QudaDagType dagger;
-    cudaGaugeField *gauge;
-    cudaGaugeField *fatGauge;  // used by staggered only
-    cudaGaugeField *longGauge; // used by staggered only
+    GaugeField *gauge;
+    GaugeField *fatGauge;  // used by staggered only
+    GaugeField *longGauge; // used by staggered only
     int laplace3D;
     CloverField *clover;
     GaugeField *xInvKD; // used for the Kahler-Dirac operator only
@@ -168,13 +168,12 @@ namespace quda {
     friend class DiracG5M;
 
   protected:
-    cudaGaugeField *gauge;
+    GaugeField *gauge;
     double kappa;
     double mass;
     int laplace3D;
     QudaMatPCType matpcType;
     mutable QudaDagType dagger; // mutable to simplify implementation of Mdag
-    mutable unsigned long long flops;
     QudaDiracType type;
     mutable QudaPrecision halo_precision; // only does something for DiracCoarse at present
 
@@ -404,16 +403,6 @@ namespace quda {
     */
     virtual bool AllowTruncation() const { return false; }
 
-    /**
-       @brief  returns and then zeroes flopcount
-    */
-    unsigned long long Flops() const
-    {
-      unsigned long long rtn = flops;
-      flops = 0;
-      return rtn;
-    }
-
     /**
        @brief returns preconditioning type
     */
@@ -450,7 +439,7 @@ namespace quda {
 
         @return Error for non-staggered operators
     */
-    virtual cudaGaugeField *getStaggeredShortLinkField() const
+    virtual GaugeField *getStaggeredShortLinkField() const
     {
       errorQuda("Invalid dirac type %d", getDiracType());
       return nullptr;
@@ -461,7 +450,7 @@ namespace quda {
 
         @return Error for non-improved staggered operators
     */
-    virtual cudaGaugeField *getStaggeredLongLinkField() const
+    virtual GaugeField *getStaggeredLongLinkField() const
     {
       errorQuda("Invalid dirac type %d", getDiracType());
       return nullptr;
@@ -476,10 +465,7 @@ namespace quda {
      *  @param long_gauge_in Updated long links
      *  @param clover_in Updated clover field
      */
-    virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *)
-    {
-      gauge = gauge_in;
-    }
+    virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *) { gauge = gauge_in; }
 
     /**
      * @brief Create the coarse operator (virtual parent)
@@ -623,7 +609,7 @@ namespace quda {
      *  @param long_gauge_in Updated long links
      *  @param clover_in Updated clover field
      */
-    virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *clover_in)
+    virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *clover_in)
     {
       DiracWilson::updateFields(gauge_in, nullptr, nullptr, nullptr);
       clover = clover_in;
@@ -979,7 +965,7 @@ namespace quda {
   class DiracMobiusPC : public DiracMobius {
 
   protected:
-    mutable cudaGaugeField *extended_gauge;
+    mutable GaugeField *extended_gauge;
 
   private:
   public:
@@ -1227,7 +1213,7 @@ namespace quda {
      *  @param long_gauge_in Updated long links
      *  @param clover_in Updated clover field
      */
-    virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *clover_in)
+    virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *clover_in)
     {
       DiracWilson::updateFields(gauge_in, nullptr, nullptr, nullptr);
       clover = clover_in;
@@ -1365,7 +1351,7 @@ namespace quda {
 
        @return Gauge field
    */
-    virtual cudaGaugeField *getStaggeredShortLinkField() const { return gauge; }
+    virtual GaugeField *getStaggeredShortLinkField() const { return gauge; }
 
     /**
      * @brief Create the coarse staggered operator.
@@ -1500,7 +1486,7 @@ namespace quda {
      *  @param long_gauge_in Updated long links
      *  @param clover_in Updated clover field
      */
-    virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in,
+    virtual void updateFields(GaugeField *gauge_in, GaugeField *fat_gauge_in, GaugeField *long_gauge_in,
                               CloverField *clover_in);
 
     /**
@@ -1541,8 +1527,8 @@ namespace quda {
   class DiracImprovedStaggered : public Dirac {
 
   protected:
-    cudaGaugeField *fatGauge;
-    cudaGaugeField *longGauge;
+    GaugeField *fatGauge;
+    GaugeField *longGauge;
 
   public:
     DiracImprovedStaggered(const DiracParam &param);
@@ -1569,14 +1555,14 @@ namespace quda {
 
         @return fat link field
     */
-    virtual cudaGaugeField *getStaggeredShortLinkField() const { return fatGauge; }
+    virtual GaugeField *getStaggeredShortLinkField() const { return fatGauge; }
 
     /**
         @brief return the long link field for staggered operators for MG setup
 
         @return long link field
     */
-    virtual cudaGaugeField *getStaggeredLongLinkField() const { return longGauge; }
+    virtual GaugeField *getStaggeredLongLinkField() const { return longGauge; }
 
     /**
      *  @brief Update the internal gauge, fat gauge, long gauge, clover field pointer as appropriate.
@@ -1587,7 +1573,7 @@ namespace quda {
      *  @param long_gauge_in Updated long links
      *  @param clover_in Updated clover field
      */
-    virtual void updateFields(cudaGaugeField *, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in, CloverField *)
+    virtual void updateFields(GaugeField *, GaugeField *fat_gauge_in, GaugeField *long_gauge_in, CloverField *)
     {
       Dirac::updateFields(fat_gauge_in, nullptr, nullptr, nullptr);
       fatGauge = fat_gauge_in;
@@ -1736,7 +1722,7 @@ namespace quda {
      *  @param long_gauge_in Updated long links
      *  @param clover_in Updated clover field
      */
-    virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in,
+    virtual void updateFields(GaugeField *gauge_in, GaugeField *fat_gauge_in, GaugeField *long_gauge_in,
                               CloverField *clover_in);
 
     /**
@@ -1791,19 +1777,19 @@ namespace quda {
     const bool dslash_use_mma;   /** Whether to use tensor cores or not */
     const bool need_aos_gauge_copy; // Whether or not we need an AoS copy of the gauge fields
 
-    mutable std::shared_ptr<cpuGaugeField> Y_h;    /** CPU copy of the coarse link field */
-    mutable std::shared_ptr<cpuGaugeField> X_h;    /** CPU copy of the coarse clover term */
-    mutable std::shared_ptr<cpuGaugeField> Xinv_h; /** CPU copy of the inverse coarse clover term */
-    mutable std::shared_ptr<cpuGaugeField> Yhat_h; /** CPU copy of the preconditioned coarse link field */
+    mutable std::shared_ptr<GaugeField> Y_h;    /** CPU copy of the coarse link field */
+    mutable std::shared_ptr<GaugeField> X_h;    /** CPU copy of the coarse clover term */
+    mutable std::shared_ptr<GaugeField> Xinv_h; /** CPU copy of the inverse coarse clover term */
+    mutable std::shared_ptr<GaugeField> Yhat_h; /** CPU copy of the preconditioned coarse link field */
 
-    mutable std::shared_ptr<cudaGaugeField> Y_d;        /** GPU copy of the coarse link field */
-    mutable std::shared_ptr<cudaGaugeField> X_d;        /** GPU copy of the coarse clover term */
-    mutable std::shared_ptr<cudaGaugeField> Y_aos_d;    /** AoS GPU copy of the coarse link field */
-    mutable std::shared_ptr<cudaGaugeField> X_aos_d;    /** AoS GPU copy of the coarse clover term */
-    mutable std::shared_ptr<cudaGaugeField> Xinv_d;     /** GPU copy of inverse coarse clover term */
-    mutable std::shared_ptr<cudaGaugeField> Yhat_d;     /** GPU copy of the preconditioned coarse link field */
-    mutable std::shared_ptr<cudaGaugeField> Xinv_aos_d; /** AoS GPU copy of inverse coarse clover term */
-    mutable std::shared_ptr<cudaGaugeField> Yhat_aos_d; /** AoS GPU copy of the preconditioned coarse link field */
+    mutable std::shared_ptr<GaugeField> Y_d;        /** GPU copy of the coarse link field */
+    mutable std::shared_ptr<GaugeField> X_d;        /** GPU copy of the coarse clover term */
+    mutable std::shared_ptr<GaugeField> Y_aos_d;    /** AoS GPU copy of the coarse link field */
+    mutable std::shared_ptr<GaugeField> X_aos_d;    /** AoS GPU copy of the coarse clover term */
+    mutable std::shared_ptr<GaugeField> Xinv_d;     /** GPU copy of inverse coarse clover term */
+    mutable std::shared_ptr<GaugeField> Yhat_d;     /** GPU copy of the preconditioned coarse link field */
+    mutable std::shared_ptr<GaugeField> Xinv_aos_d; /** AoS GPU copy of inverse coarse clover term */
+    mutable std::shared_ptr<GaugeField> Yhat_aos_d; /** AoS GPU copy of the preconditioned coarse link field */
 
     /**
        @brief Initialize the coarse gauge fields.  Location is
@@ -1862,10 +1848,10 @@ namespace quda {
        @param[in] Xinv_d GPU coarse inverse clover field
        @param[in] Yhat_d GPU coarse preconditioned link field
      */
-    DiracCoarse(const DiracParam &param, std::shared_ptr<cpuGaugeField> Y_h, std::shared_ptr<cpuGaugeField> X_h,
-                std::shared_ptr<cpuGaugeField> Xinv_h, std::shared_ptr<cpuGaugeField> Yhat_h,
-                std::shared_ptr<cudaGaugeField> Y_d = nullptr, std::shared_ptr<cudaGaugeField> X_d = nullptr,
-                std::shared_ptr<cudaGaugeField> Xinv_d = nullptr, std::shared_ptr<cudaGaugeField> Yhat_d = nullptr);
+    DiracCoarse(const DiracParam &param, std::shared_ptr<GaugeField> Y_h, std::shared_ptr<GaugeField> X_h,
+                std::shared_ptr<GaugeField> Xinv_h, std::shared_ptr<GaugeField> Yhat_h,
+                std::shared_ptr<GaugeField> Y_d = nullptr, std::shared_ptr<GaugeField> X_d = nullptr,
+                std::shared_ptr<GaugeField> Xinv_d = nullptr, std::shared_ptr<GaugeField> Yhat_d = nullptr);
 
     /**
        @param[in] dirac Another operator instance to clone from (shallow copy)
@@ -1955,7 +1941,7 @@ namespace quda {
 
     virtual QudaDiracType getDiracType() const { return QUDA_COARSE_DIRAC; }
 
-    virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *)
+    virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *)
     {
       Dirac::updateFields(gauge_in, nullptr, nullptr, nullptr);
       warningQuda("Coarse gauge links cannot be trivially updated for DiracCoarse(PC). Perform an MG update instead.");
@@ -2027,10 +2013,10 @@ namespace quda {
        @param[in] Xinv_d GPU coarse inverse clover field
        @param[in] Yhat_d GPU coarse preconditioned link field
      */
-    DiracCoarsePC(const DiracParam &param, std::shared_ptr<cpuGaugeField> Y_h, std::shared_ptr<cpuGaugeField> X_h,
-                  std::shared_ptr<cpuGaugeField> Xinv_h, std::shared_ptr<cpuGaugeField> Yhat_h,
-                  std::shared_ptr<cudaGaugeField> Y_d = nullptr, std::shared_ptr<cudaGaugeField> X_d = nullptr,
-                  std::shared_ptr<cudaGaugeField> Xinv_d = nullptr, std::shared_ptr<cudaGaugeField> Yhat_d = nullptr);
+    DiracCoarsePC(const DiracParam &param, std::shared_ptr<GaugeField> Y_h, std::shared_ptr<GaugeField> X_h,
+                  std::shared_ptr<GaugeField> Xinv_h, std::shared_ptr<GaugeField> Yhat_h,
+                  std::shared_ptr<GaugeField> Y_d = nullptr, std::shared_ptr<GaugeField> X_d = nullptr,
+                  std::shared_ptr<GaugeField> Xinv_d = nullptr, std::shared_ptr<GaugeField> Yhat_d = nullptr);
 
     /**
        @param[in] dirac Another operator instance to clone from (shallow copy)
@@ -2243,8 +2229,6 @@ namespace quda {
      */
     virtual void operator()(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const = 0;
 
-    unsigned long long flops() const { return dirac->Flops(); }
-
     QudaMatPCType getMatPCType() const { return dirac->getMatPCType(); }
 
     virtual int getStencilSteps() const = 0;
diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index 836b474cf0..e67582b682 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -305,8 +305,8 @@ namespace quda
 #endif
 
     // constructor needed for staggered to set xpay from derived class
-    DslashArg(const ColorSpinorField &in, const GaugeField &U, int parity, bool dagger, bool xpay, int nFace,
-              int spin_project, const int *comm_override,
+    DslashArg(const ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, const ColorSpinorField &x,
+              int parity, bool dagger, bool xpay, int nFace, int spin_project, const int *comm_override,
 #ifdef NVSHMEM_COMMS
               int shmem_ = 0) :
 #else
@@ -348,8 +348,14 @@ namespace quda
       retcount_intra(dslash::get_shmem_retcount_intra()),
       retcount_inter(dslash::get_shmem_retcount_inter())
 #endif
-
     {
+      if (in.data() == out.data()) errorQuda("Aliasing pointers");
+      checkOrder(out, in, x);        // check all orders match
+      checkPrecision(out, in, x, U); // check all precisions match
+      checkLocation(out, in, x, U);  // check all locations match
+      if (!in.isNative() || !U.isNative())
+        errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder());
+
       for (int d = 0; d < 4; d++) {
         commDim[d] = (comm_override[d] == 0) ? 0 : comm_dim_partitioned(d);
       }
diff --git a/include/enum_quda.h b/include/enum_quda.h
index 0aa7966d55..c4cbb59901 100644
--- a/include/enum_quda.h
+++ b/include/enum_quda.h
@@ -10,8 +10,11 @@ typedef enum qudaError_t { QUDA_SUCCESS = 0, QUDA_ERROR = 1, QUDA_ERROR_UNINITIA
 
 typedef enum QudaMemoryType_s {
   QUDA_MEMORY_DEVICE,
-  QUDA_MEMORY_PINNED,
+  QUDA_MEMORY_DEVICE_PINNED,
+  QUDA_MEMORY_HOST,
+  QUDA_MEMORY_HOST_PINNED,
   QUDA_MEMORY_MAPPED,
+  QUDA_MEMORY_MANAGED,
   QUDA_MEMORY_INVALID = QUDA_INVALID_ENUM
 } QudaMemoryType;
 
@@ -394,6 +397,7 @@ typedef enum QudaDilutionType_s {
   QUDA_DILUTION_COLOR,
   QUDA_DILUTION_SPIN_COLOR,
   QUDA_DILUTION_SPIN_COLOR_EVEN_ODD,
+  QUDA_DILUTION_BLOCK,
   QUDA_DILUTION_INVALID = QUDA_INVALID_ENUM
 } QudaDilutionType;
 
@@ -532,7 +536,7 @@ typedef enum QudaGhostExchange_s {
 typedef enum QudaStaggeredPhase_s {
   QUDA_STAGGERED_PHASE_NO = 0,
   QUDA_STAGGERED_PHASE_MILC = 1,
-  QUDA_STAGGERED_PHASE_CPS = 2,
+  QUDA_STAGGERED_PHASE_CHROMA = 2,
   QUDA_STAGGERED_PHASE_TIFR = 3,
   QUDA_STAGGERED_PHASE_INVALID = QUDA_INVALID_ENUM
 } QudaStaggeredPhase;
diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h
index c24dd3b869..6a8708948a 100644
--- a/include/enum_quda_fortran.h
+++ b/include/enum_quda_fortran.h
@@ -9,7 +9,7 @@
 #   gfortran).
 #*/
 
-#define QUDA_INVALID_ENUM (-Z'7fffffff' - 1)
+#define QUDA_INVALID_ENUM (-int(Z'7FFFFFFF') - 1)
 
 #define QudaLinkType integer(4)
 
@@ -17,11 +17,6 @@
 #define QUDA_ERROR 1
 #define QUDA_ERROR_UNINITIALIZED 2
 
-#define QUDA_MEMORY_DEVICE 0
-#define QUDA_MEMORY_PINNED 1
-#define QUDA_MEMORY_MAPPED 2
-#define QUDA_MEMORY_INVALID QUDA_INVALID_ENUM
-
 #define QUDA_SU3_LINKS      0
 #define QUDA_GENERAL_LINKS  1
 #define QUDA_THREE_LINKS    2
@@ -366,6 +361,7 @@
 #define QUDA_DILUTION_COLOR 1
 #define QUDA_DILUTION_SPIN_COLOR 2
 #define QUDA_DILUTION_SPIN_COLOR_EVEN_ODD 3
+#define QUDA_DILUTION_BLOCK 4
 #define QUDA_DILUTION_INVALID QUDA_INVALID_ENUM
 
 #define QudaProjectionType integer(4)
@@ -474,10 +470,10 @@
 #define QUDA_GHOST_EXCHANGE_INVALID QUDA_INVALID_ENUM
 
 #define QudaStaggeredPhase integer(4)
-#define QUDA_STAGGERED_PHASE_NO   0
-#define QUDA_STAGGERED_PHASE_MILC 1
-#define QUDA_STAGGERED_PHASE_CPS  2
-#define QUDA_STAGGERED_PHASE_TIFR 3
+#define QUDA_STAGGERED_PHASE_NO     0
+#define QUDA_STAGGERED_PHASE_MILC   1
+#define QUDA_STAGGERED_PHASE_CHROMA 2
+#define QUDA_STAGGERED_PHASE_TIFR   3
 #define QUDA_STAGGERED_PHASE_INVALID QUDA_INVALID_ENUM
 
 #define QudaContractType integer(4)
diff --git a/include/gauge_field.h b/include/gauge_field.h
index e4ec3ae09d..948960b36f 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -37,113 +37,76 @@ namespace quda {
   } // namespace gauge
 
   struct GaugeFieldParam : public LatticeFieldParam {
+    int nColor = 3;
+    int nFace = 0;
 
-    int nColor;
-    int nFace;
+    QudaGaugeFieldOrder order = QUDA_INVALID_GAUGE_ORDER;
+    QudaGaugeFixed fixed = QUDA_GAUGE_FIXED_NO;
+    QudaLinkType link_type = QUDA_WILSON_LINKS;
+    QudaTboundary t_boundary = QUDA_INVALID_T_BOUNDARY;
+    QudaReconstructType reconstruct = QUDA_RECONSTRUCT_NO;
 
-    QudaReconstructType reconstruct;
-    QudaGaugeFieldOrder order;
-    QudaGaugeFixed fixed;
-    QudaLinkType link_type;
-    QudaTboundary t_boundary;
+    double anisotropy = 1.0;
+    double tadpole = 1.0;
+    GaugeField *field = nullptr; // pointer to a pre-allocated field
+    void *gauge = nullptr;       // used when we use a reference to an external field
 
-    double anisotropy;
-    double tadpole;
-    void *gauge; // used when we use a reference to an external field
+    QudaFieldCreate create = QUDA_REFERENCE_FIELD_CREATE; // used to determine the type of field created
 
-    QudaFieldCreate create; // used to determine the type of field created
-
-    QudaFieldGeometry geometry; // whether the field is a scale, vector or tensor
+    QudaFieldGeometry geometry = QUDA_VECTOR_GEOMETRY; // whether the field is a scalar, vector or tensor
 
     // whether we need to compute the fat link maxima
     // FIXME temporary flag until we have a kernel that can do this, then we just do this in copy()
     // always set to false, requires external override
-    bool compute_fat_link_max;
+    bool compute_fat_link_max = false;
 
     /** The staggered phase convention to use */
-    QudaStaggeredPhase staggeredPhaseType;
+    QudaStaggeredPhase staggeredPhaseType = QUDA_STAGGERED_PHASE_NO;
 
     /** Whether the staggered phase factor has been applied */
-    bool staggeredPhaseApplied;
+    bool staggeredPhaseApplied = false;
 
     /** Imaginary chemical potential */
-    double i_mu;
+    double i_mu = 0.0;
 
     /** Offset into MILC site struct to the desired matrix field (only if gauge_order=MILC_SITE_GAUGE_ORDER) */
-    size_t site_offset;
+    size_t site_offset = 0;
 
     /** Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER) */
-    size_t site_size;
+    size_t site_size = 0;
 
     // Default constructor
-    GaugeFieldParam(void *const h_gauge = NULL) :
-      LatticeFieldParam(),
-      nColor(3),
-      nFace(0),
-      reconstruct(QUDA_RECONSTRUCT_NO),
-      order(QUDA_INVALID_GAUGE_ORDER),
-      fixed(QUDA_GAUGE_FIXED_NO),
-      link_type(QUDA_WILSON_LINKS),
-      t_boundary(QUDA_INVALID_T_BOUNDARY),
-      anisotropy(1.0),
-      tadpole(1.0),
-      gauge(h_gauge),
-      create(QUDA_REFERENCE_FIELD_CREATE),
-      geometry(QUDA_VECTOR_GEOMETRY),
-      compute_fat_link_max(false),
-      staggeredPhaseType(QUDA_STAGGERED_PHASE_NO),
-      staggeredPhaseApplied(false),
-      i_mu(0.0),
-      site_offset(0),
-      site_size(0)
-    {
-    }
+    GaugeFieldParam(void *const h_gauge = nullptr) : gauge(h_gauge) { }
 
     GaugeFieldParam(const GaugeField &u);
 
     GaugeFieldParam(const lat_dim_t &x, QudaPrecision precision, QudaReconstructType reconstruct, int pad,
                     QudaFieldGeometry geometry, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_PAD) :
       LatticeFieldParam(4, x, pad, QUDA_INVALID_FIELD_LOCATION, precision, ghostExchange),
-      nColor(3),
-      nFace(0),
       reconstruct(reconstruct),
-      order(QUDA_INVALID_GAUGE_ORDER),
-      fixed(QUDA_GAUGE_FIXED_NO),
-      link_type(QUDA_WILSON_LINKS),
-      t_boundary(QUDA_INVALID_T_BOUNDARY),
-      anisotropy(1.0),
-      tadpole(1.0),
-      gauge(0),
       create(QUDA_NULL_FIELD_CREATE),
-      geometry(geometry),
-      compute_fat_link_max(false),
-      staggeredPhaseType(QUDA_STAGGERED_PHASE_NO),
-      staggeredPhaseApplied(false),
-      i_mu(0.0),
-      site_offset(0),
-      site_size(0)
+      geometry(geometry)
     {
     }
 
     GaugeFieldParam(const QudaGaugeParam &param, void *h_gauge = nullptr, QudaLinkType link_type_ = QUDA_INVALID_LINKS) :
       LatticeFieldParam(param),
-      nColor(3),
-      nFace(0),
-      reconstruct(QUDA_RECONSTRUCT_NO),
       order(param.gauge_order),
       fixed(param.gauge_fix),
       link_type(link_type_ != QUDA_INVALID_LINKS ? link_type_ : param.type),
-      t_boundary(param.t_boundary),
+      t_boundary(link_type == QUDA_ASQTAD_MOM_LINKS ? QUDA_PERIODIC_T : param.t_boundary),
+      // if we have momentum field and not using TIFR field, then we always have recon-10
+      reconstruct(link_type == QUDA_ASQTAD_MOM_LINKS && order != QUDA_TIFR_GAUGE_ORDER
+                      && order != QUDA_TIFR_PADDED_GAUGE_ORDER ?
+                    QUDA_RECONSTRUCT_10 :
+                    QUDA_RECONSTRUCT_NO),
       anisotropy(param.anisotropy),
       tadpole(param.tadpole_coeff),
       gauge(h_gauge),
-      create(QUDA_REFERENCE_FIELD_CREATE),
-      geometry(QUDA_VECTOR_GEOMETRY),
-      compute_fat_link_max(false),
       staggeredPhaseType(param.staggered_phase_type),
       staggeredPhaseApplied(param.staggered_phase_applied),
       i_mu(param.i_mu),
-      site_offset(param.gauge_offset),
+      site_offset(link_type == QUDA_ASQTAD_MOM_LINKS ? param.mom_offset : param.gauge_offset),
       site_size(param.site_size)
     {
       switch (link_type) {
@@ -183,90 +146,217 @@ namespace quda {
   };
 
   std::ostream& operator<<(std::ostream& output, const GaugeFieldParam& param);
+  std::ostream &operator<<(std::ostream &output, const GaugeField &param);
 
   class GaugeField : public LatticeField {
 
+    friend std::ostream &operator<<(std::ostream &output, const GaugeField &param);
+
+  private:
+    /**
+       @brief Create the field as specified by the param
+       @param[in] param Parameter struct
+    */
+    void create(const GaugeFieldParam &param);
+
+    /**
+       @brief Move the contents of a field to this
+       @param[in,out] other Field we are moving from
+    */
+    void move(GaugeField &&other);
+
+    /**
+       @brief Fills the param with this field's meta data (used for
+       creating a cloned field)
+       @param[out] param The parameter we are filling
+    */
+    void fill(GaugeFieldParam &) const;
+
   protected:
-      size_t bytes;        // bytes allocated per full field
-      size_t phase_offset; // offset in bytes to gauge phases - useful to keep track of texture alignment
-      size_t phase_bytes;  // bytes needed to store the phases
-      size_t length;
-      size_t real_length;
-      int nColor;
-      int nFace;
-      QudaFieldGeometry geometry; // whether the field is a scale, vector or tensor
-
-      QudaReconstructType reconstruct;
-      int nInternal; // number of degrees of freedom per link matrix
-      QudaGaugeFieldOrder order;
-      QudaGaugeFixed fixed;
-      QudaLinkType link_type;
-      QudaTboundary t_boundary;
-
-      double anisotropy;
-      double tadpole;
-      double fat_link_max;
-
-      QudaFieldCreate create; // used to determine the type of field created
-
-      mutable void *ghost[2 * QUDA_MAX_DIM]; // stores the ghost zone of the gauge field (non-native fields only)
-
-      mutable int ghostFace[QUDA_MAX_DIM]; // the size of each face
-
-      /**
-         The staggered phase convention to use
-      */
-      QudaStaggeredPhase staggeredPhaseType;
-
-      /**
-         Whether the staggered phase factor has been applied
-      */
-      bool staggeredPhaseApplied;
-
-      /**
-         @brief Exchange the buffers across all dimensions in a given direction
-         @param[out] recv Receive buffer
-         @param[in] send Send buffer
-         @param[in] dir Direction in which we are sending (forwards OR backwards only)
-      */
-      void exchange(void **recv, void **send, QudaDirection dir) const;
-
-      /**
-         Imaginary chemical potential
-      */
-      double i_mu;
-
-      /**
-         Offset into MILC site struct to the desired matrix field (only if gauge_order=MILC_SITE_GAUGE_ORDER)
-      */
-      size_t site_offset;
-
-      /**
-         Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER)
-      */
-      size_t site_size;
-
-      /**
-         Compute the required extended ghost zone sizes and offsets
-         @param[in] R Radius of the ghost zone
-         @param[in] no_comms_fill If true we create a full halo
-         regardless of partitioning
-         @param[in] bidir Is this a bi-directional exchange - if not
-         then we alias the fowards and backwards offsetss
-      */
-      void createGhostZone(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const;
-
-      /**
-         @brief Set the vol_string and aux_string for use in tuning
-      */
-      void setTuningString();
+    bool init = false;
+    quda_ptr gauge = {};                 /** The gauge field allocation */
+    array<quda_ptr, 8> gauge_array = {}; /** Array of pointers to each subset (e.g., QDP or QDPJIT order) */
+    size_t bytes = 0;                    // bytes allocated per full field
+    size_t phase_offset = 0;             // offset in bytes to gauge phases - useful to keep track of texture alignment
+    size_t phase_bytes = 0;              // bytes needed to store the phases
+    size_t length = 0;
+    size_t real_length = 0;
+    int nColor = 0;
+    int nFace = 0;
+    QudaFieldGeometry geometry = QUDA_INVALID_GEOMETRY; // whether the field is a scalar, vector or tensor
+    int site_dim = 0; // the dimensionality of each site (number of matrices per lattice site)
+
+    QudaReconstructType reconstruct = QUDA_RECONSTRUCT_INVALID;
+    int nInternal = 0; // number of degrees of freedom per link matrix
+    QudaGaugeFieldOrder order = QUDA_INVALID_GAUGE_ORDER;
+    QudaGaugeFixed fixed = QUDA_GAUGE_FIXED_INVALID;
+    QudaLinkType link_type = QUDA_INVALID_LINKS;
+    QudaTboundary t_boundary = QUDA_INVALID_T_BOUNDARY;
+
+    double anisotropy = 0.0;
+    double tadpole = 0.0;
+    double fat_link_max = 0.0;
+
+    mutable array<quda_ptr, 2 *QUDA_MAX_DIM> ghost
+      = {}; // stores the ghost zone of the gauge field (non-native fields only)
+
+    mutable array<int, QUDA_MAX_DIM> ghostFace = {}; // the size of each face
+
+    /**
+       The staggered phase convention to use
+    */
+    QudaStaggeredPhase staggeredPhaseType = QUDA_STAGGERED_PHASE_INVALID;
+
+    /**
+       Whether the staggered phase factor has been applied
+    */
+    bool staggeredPhaseApplied = false;
+
+    /**
+       Imaginary chemical potential
+    */
+    double i_mu = 0.0;
+
+    /**
+       Offset into MILC site struct to the desired matrix field (only if gauge_order=MILC_SITE_GAUGE_ORDER)
+    */
+    size_t site_offset = 0;
+
+    /**
+       Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER)
+    */
+    size_t site_size = 0;
+
+    /**
+       @brief Exchange the buffers across all dimensions in a given direction
+       @param[out] recv Receive buffer
+       @param[in] send Send buffer
+       @param[in] dir Direction in which we are sending (forwards OR backwards only)
+    */
+    void exchange(void **recv, void **send, QudaDirection dir) const;
+
+    /**
+       Compute the required extended ghost zone sizes and offsets
+       @param[in] R Radius of the ghost zone
+       @param[in] no_comms_fill If true we create a full halo
+       regardless of partitioning
+       @param[in] bidir Is this a bi-directional exchange - if not
+       then we alias the forwards and backwards offsets
+    */
+    void createGhostZone(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const;
+
+    /**
+       @brief Set the vol_string and aux_string for use in tuning
+    */
+    void setTuningString();
+
+    /**
+       @brief Initialize the padded region to 0
+     */
+    void zeroPad();
 
   public:
+    /**
+       @brief Default constructor
+    */
+    GaugeField() = default;
+
+    /**
+       @brief Copy constructor for creating a GaugeField from another GaugeField
+       @param field Instance of GaugeField from which we are cloning
+    */
+    GaugeField(const GaugeField &field) noexcept;
+
+    /**
+       @brief Move constructor for creating a GaugeField from another GaugeField
+       @param field Instance of GaugeField from which we are moving
+    */
+    GaugeField(GaugeField &&field) noexcept;
+
+    /**
+       @brief Constructor for creating a GaugeField from a GaugeFieldParam
+       @param param Contains the metadata for creating the field
+    */
     GaugeField(const GaugeFieldParam &param);
-    virtual ~GaugeField();
 
-    virtual void exchangeGhost(QudaLinkDirection = QUDA_LINK_BACKWARDS) = 0;
-    virtual void injectGhost(QudaLinkDirection = QUDA_LINK_BACKWARDS) = 0;
+    /**
+       @brief Copy assignment operator
+       @param[in] field Instance from which we are copying
+       @return Reference to this field
+     */
+    GaugeField &operator=(const GaugeField &field);
+
+    /**
+       @brief Move assignment operator
+       @param[in] field Instance from which we are moving
+       @return Reference to this field
+     */
+    GaugeField &operator=(GaugeField &&field);
+
+    /**
+       @brief Returns if the object is empty (not initialized)
+       @return true if the object has not been allocated, otherwise false
+    */
+    bool empty() const { return !init; }
+
+    /**
+       @brief Create the communication handlers and buffers
+       @param[in] R The thickness of the extended region in each dimension
+       @param[in] no_comms_fill Do local exchange to fill out the extended
+       region in non-partitioned dimensions
+       @param[in] bidir Whether to allocate communication buffers to
+       allow for simultaneous bi-directional exchange.  If false, then
+       the forwards and backwards buffers will alias (saving memory).
+    */
+    void createComms(const lat_dim_t &R, bool no_comms_fill, bool bidir = true);
+
+    /**
+       @brief Allocate the ghost buffers
+       @param[in] R The thickness of the extended region in each dimension
+       @param[in] no_comms_fill Do local exchange to fill out the extended
+       region in non-partitioned dimensions
+       @param[in] bidir Is this a bi-directional exchange - if not
+       then we alias the forwards and backwards offsets
+    */
+    void allocateGhostBuffer(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const;
+
+    /**
+       @brief Start the receive communicators
+       @param[in] dim The communication dimension
+       @param[in] dir The communication direction (0=backwards, 1=forwards)
+    */
+    void recvStart(int dim, int dir);
+
+    /**
+       @brief Start the sending communicators
+       @param[in] dim The communication dimension
+       @param[in] dir The communication direction (0=backwards, 1=forwards)
+       @param[in] stream_p Pointer to CUDA stream to post the
+       communication in (if 0, then use null stream)
+    */
+    void sendStart(int dim, int dir, const qudaStream_t &stream_p);
+
+    /**
+       @brief Wait for communication to complete
+       @param[in] dim The communication dimension
+       @param[in] dir The communication direction (0=backwards, 1=forwards)
+    */
+    void commsComplete(int dim, int dir);
+
+    /**
+       @brief Exchange the ghost and store it in the padded region
+       @param[in] link_direction Which links are we exchanging: this
+       flag only applies to bi-directional coarse-link fields
+     */
+    void exchangeGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS);
+
+    /**
+       @brief The opposite of exchangeGhost: take the ghost zone on x,
+       send to node x-1, and inject back into the field
+       @param[in] link_direction Which links are we injecting: this
+       flag only applies to bi-directional coarse-link fields
+     */
+    void injectGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS);
 
     size_t Length() const { return length; }
     int Ncolor() const { return nColor; }
@@ -315,7 +405,7 @@ namespace quda {
        @param no_comms_fill Do local exchange to fill out the extended
        region in non-partitioned dimensions
     */
-    virtual void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false) = 0;
+    void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false);
 
     /**
        @brief This routine will populate the border / halo region
@@ -326,7 +416,7 @@ namespace quda {
        @param no_comms_fill Do local exchange to fill out the extended
        region in non-partitioned dimensions
     */
-    virtual void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false) = 0;
+    void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false);
 
     void checkField(const LatticeField &) const;
 
@@ -342,22 +432,82 @@ namespace quda {
 
     size_t TotalBytes() const { return bytes; }
 
-    virtual void* Gauge_p() { errorQuda("Not implemented"); return (void*)0;}
-    virtual void* Even_p() { errorQuda("Not implemented"); return (void*)0;}
-    virtual void* Odd_p() { errorQuda("Not implemented"); return (void*)0;}
+    /**
+       @brief Helper function that returns true if the gauge order is an array of pointers
+       @param[in] order The gauge order requested
+       @return If the order is an array of pointers
+     */
+    constexpr bool is_pointer_array(QudaGaugeFieldOrder order) const
+    {
+      // only the QDP and QDPJIT orders are classed as arrays of pointers;
+      // any other order value yields false
+      const bool is_qdp = (order == QUDA_QDP_GAUGE_ORDER);
+      const bool is_qdpjit = (order == QUDA_QDPJIT_GAUGE_ORDER);
+      return is_qdp || is_qdpjit;
+    }
 
-    virtual const void* Gauge_p() const { errorQuda("Not implemented"); return (void*)0;}
-    virtual const void* Even_p() const { errorQuda("Not implemented"); return (void*)0;}
-    virtual const void* Odd_p() const { errorQuda("Not implemented"); return (void*)0;}
+    /**
+       @brief Return base pointer to the gauge field allocation.
+       @tparam T Optional type to cast the pointer to (default is void*).
+       @return Base pointer to the gauge field allocation
+     */
+    template <typename T = void *>
+    std::enable_if_t<std::is_pointer_v<T> && !std::is_pointer_v<std::remove_pointer_t<T>>, T> data() const
+    {
+      if (is_pointer_array(order)) errorQuda("Non dim-array ordered field requested but order is %d", order);
+      return reinterpret_cast<T>(gauge.data()); // base pointer of the gauge member, reinterpreted as T
+    }
 
-    virtual int full_dim(int d) const { return x[d]; }
+    /**
+       @brief Return base pointer to the gauge field allocation
+       specified by the array index.  This is for geometry-array
+       ordered fields, e.g., QDP or QDPJIT.
 
-    const void** Ghost() const {
-      if ( isNative() ) errorQuda("No ghost zone pointer for quda-native gauge fields");
-      return (const void**)ghost;
+       @tparam T Optional type to cast the pointer to (default is void*)
+       @param[in] d Dimension index when the allocation is an array type
+       @return Base pointer to the gauge field allocation
+     */
+    template <typename T = void *> auto data(unsigned int d) const
+    {
+      static_assert(std::is_pointer_v<T> && !std::is_pointer_v<typename std::remove_pointer<T>::type>,
+                    "data() requires a pointer cast type");
+      if (d >= static_cast<unsigned>(geometry)) errorQuda("Invalid array index %u for geometry %d field", d, geometry);
+      if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order);
+      return reinterpret_cast<T>(gauge_array[d].data());
+    }
 
-    void** Ghost() {
+    /** @brief Return the storage as an opaque pointer: the base allocation, or a table of per-subset pointers */
+    void *raw_pointer() const
+    {
+      if (!is_pointer_array(order)) return gauge.data();
+      // NOTE: the table below is a function-local static, so it is shared by all fields
+      // and is overwritten by each later call made on a pointer-array ordered field
+      static void *data_array[8];
+      for (int i = 0; i < site_dim; i++) data_array[i] = gauge_array[i].data();
+      return data_array;
+    }
+
+    /**
+       @brief Return array of pointers to the per dimension gauge field allocation(s).
+       @tparam T Optional type to cast the pointer to (default is
+       void*).  this is for geometry-array ordered fields, e.g., QDP
+       or QDPJIT.
+       @return Array of pointers to the gauge field allocations
+     */
+    template <typename T = void *>
+    std::enable_if_t<std::is_pointer_v<T> && !std::is_pointer_v<std::remove_pointer_t<T>>, array<T, QUDA_MAX_DIM>>
+    data_array() const
+    {
+      if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order);
+      array<T, QUDA_MAX_DIM> ptrs = {}; // entries not written by the loop keep their = {} initialization
+      for (auto dim = 0; dim < geometry; dim++) ptrs[dim] = static_cast<T>(gauge_array[dim]);
+      return ptrs;
+    }
+
+    virtual int full_dim(int d) const { return x[d]; }
+
+    auto &Ghost() const
+    {
       if ( isNative() ) errorQuda("No ghost zone pointer for quda-native gauge fields");
       return ghost;
     }
@@ -375,15 +525,15 @@ namespace quda {
     size_t SiteSize() const { return site_size; }
 
     /**
-       Set all field elements to zero (virtual)
+       Set all field elements to zero
     */
-    virtual void zero() = 0;
+    void zero();
 
     /**
      * Generic gauge field copy
      * @param[in] src Source from which we are copying
      */
-    virtual void copy(const GaugeField &src) = 0;
+    void copy(const GaugeField &src);
 
     /**
        @brief Compute the L1 norm of the field
@@ -431,175 +581,15 @@ namespace quda {
     */
     static GaugeField* Create(const GaugeFieldParam &param);
 
-  };
-
-  class cudaGaugeField : public GaugeField {
-
-  private:
-    void *gauge;
-    void *gauge_h; // mapped-memory pointer when allocating on the host
-    void *even;
-    void *odd;
-
-    /**
-       @brief Initialize the padded region to 0
-     */
-    void zeroPad();
-
-  public:
-    cudaGaugeField(const GaugeFieldParam &);
-    virtual ~cudaGaugeField();
-
     /**
-       @brief Exchange the ghost and store store in the padded region
-       @param[in] link_direction Which links are we exchanging: this
-       flag only applies to bi-directional coarse-link fields
-     */
-    void exchangeGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS);
-
-    /**
-       @brief The opposite of exchangeGhost: take the ghost zone on x,
-       send to node x-1, and inject back into the field
-       @param[in] link_direction Which links are we injecting: this
-       flag only applies to bi-directional coarse-link fields
-     */
-    void injectGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS);
-
-    /**
-       @brief Create the communication handlers and buffers
-       @param[in] R The thickness of the extended region in each dimension
-       @param[in] no_comms_fill Do local exchange to fill out the extended
-       region in non-partitioned dimensions
-       @param[in] bidir Whether to allocate communication buffers to
-       allow for simultaneous bi-directional exchange.  If false, then
-       the forwards and backwards buffers will alias (saving memory).
-    */
-    void createComms(const lat_dim_t &R, bool no_comms_fill, bool bidir = true);
-
-    /**
-       @brief Allocate the ghost buffers
-       @param[in] R The thickness of the extended region in each dimension
-       @param[in] no_comms_fill Do local exchange to fill out the extended
-       @param[in] bidir Is this a bi-directional exchange - if not
-       then we alias the fowards and backwards offsetss
-       region in non-partitioned dimensions
-    */
-    void allocateGhostBuffer(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const;
-
-    /**
-       @brief Start the receive communicators
-       @param[in] dim The communication dimension
-       @param[in] dir The communication direction (0=backwards, 1=forwards)
-    */
-    void recvStart(int dim, int dir);
-
-    /**
-       @brief Start the sending communicators
-       @param[in] dim The communication dimension
-       @param[in] dir The communication direction (0=backwards, 1=forwards)
-       @param[in] stream_p Pointer to CUDA stream to post the
-       communication in (if 0, then use null stream)
-    */
-    void sendStart(int dim, int dir, const qudaStream_t &stream_p);
-
-    /**
-       @brief Wait for communication to complete
-       @param[in] dim The communication dimension
-       @param[in] dir The communication direction (0=backwards, 1=forwards)
-    */
-    void commsComplete(int dim, int dir);
-
-    /**
-       @brief This does routine will populate the border / halo region of a
-       gauge field that has been created using copyExtendedGauge.
-       @param R The thickness of the extended region in each dimension
-       @param no_comms_fill Do local exchange to fill out the extended
-       region in non-partitioned dimensions
-    */
-    void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false);
-
-    /**
-       @brief This does routine will populate the border / halo region
-       of a gauge field that has been created using copyExtendedGauge.
-       Overloaded variant that will start and stop a comms profile.
-       @param R The thickness of the extended region in each dimension
-       @param profile TimeProfile intance which will record the time taken
-       @param no_comms_fill Do local exchange to fill out the extended
-       region in non-partitioned dimensions
-    */
-    void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false);
-
-    /**
-     * Generic gauge field copy
-     * @param[in] src Source from which we are copying
-     */
-    void copy(const GaugeField &src);
-
-    /**
-       @brief Download into this field from a CPU field
-       @param[in] cpu The CPU field source
-    */
-    void loadCPUField(const cpuGaugeField &cpu);
-
-    /**
-       @brief Download into this field from a CPU field.  Overloaded
-       variant that includes profiling
-       @param[in] cpu The CPU field source
-       @param[in] profile Time profile to record the transfer
-    */
-    void loadCPUField(const cpuGaugeField &cpu, TimeProfile &profile);
-
-    /**
-       @brief Upload from this field into a CPU field
-       @param[out] cpu The CPU field source
-    */
-    void saveCPUField(cpuGaugeField &cpu) const;
-
-    /**
-       @brief Upload from this field into a CPU field.  Overloaded
-       variant that includes profiling.
-       @param[out] cpu The CPU field source
-       @param[in] profile Time profile to record the transfer
-    */
-    void saveCPUField(cpuGaugeField &cpu, TimeProfile &profile) const;
-
-    // (ab)use with care
-    void* Gauge_p() { return gauge; }
-    void* Even_p() { return even; }
-    void* Odd_p() { return odd; }
-
-    const void* Gauge_p() const { return gauge; }
-    const void* Even_p() const { return even; }
-    const void *Odd_p() const { return odd; }
-
-    /**
-      @brief Copy all contents of the field to a host buffer.
-      @param[in] the host buffer to copy to.
+       @brief Create a field that aliases this field's storage.  The
+       alias field can use a different precision than this field,
+       though it cannot be greater.  This functionality is useful for
+       the case where we have multiple temporaries in different
+       precisions, but do not need them simultaneously.  Use this functionality with caution.
+       @param[in] param Parameters for the alias field
     */
-    virtual void copy_to_buffer(void *buffer) const;
-
-    /**
-      @brief Copy all contents of the field from a host buffer to this field.
-      @param[in] the host buffer to copy from.
-    */
-    virtual void copy_from_buffer(void *buffer);
-
-    void setGauge(void* _gauge); //only allowed when create== QUDA_REFERENCE_FIELD_CREATE
-
-    /**
-       Set all field elements to zero
-    */
-    void zero();
-
-    /**
-       @brief Backs up the cudaGaugeField to CPU memory
-    */
-    void backup() const;
-
-    /**
-       @brief Restores the cudaGaugeField to CUDA memory
-    */
-    void restore() const;
+    GaugeField create_alias(const GaugeFieldParam &param = GaugeFieldParam());
 
     /**
       @brief If managed memory and prefetch is enabled, prefetch
@@ -608,101 +598,47 @@ namespace quda {
       @param[in] stream Which stream to run the prefetch in (default 0)
     */
     void prefetch(QudaFieldLocation mem_space, qudaStream_t stream = device::get_default_stream()) const;
-  };
-
-  class cpuGaugeField : public GaugeField {
-
-    friend void cudaGaugeField::copy(const GaugeField &cpu);
-    friend void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu);
-    friend void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const;
-
-  private:
-    void **gauge; // the actual gauge field
-
-  public:
-    /**
-       @brief Constructor for cpuGaugeField from a GaugeFieldParam
-       @param[in,out] param Parameter struct - note that in the case
-       that we are wrapping host-side extended fields, this param is
-       modified for subsequent creation of fields that are not
-       extended.
-    */
-    cpuGaugeField(const GaugeFieldParam &param);
-    virtual ~cpuGaugeField();
-
-    /**
-       @brief Exchange the ghost and store store in the padded region
-       @param[in] link_direction Which links are we extracting: this
-       flag only applies to bi-directional coarse-link fields
-     */
-    void exchangeGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS);
-
-    /**
-       @brief The opposite of exchangeGhost: take the ghost zone on x,
-       send to node x-1, and inject back into the field
-       @param[in] link_direction Which links are we injecting: this
-       flag only applies to bi-directional coarse-link fields
-     */
-    void injectGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS);
 
     /**
-       @brief This does routine will populate the border / halo region of a
-       gauge field that has been created using copyExtendedGauge.
-
-       @param R The thickness of the extended region in each dimension
-       @param no_comms_fill Do local exchange to fill out the extended
-       region in non-partitioned dimenions
+       @brief Backs up the GaugeField
     */
-    void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false);
+    void backup() const;
 
     /**
-       @brief This does routine will populate the border / halo region
-       of a gauge field that has been created using copyExtendedGauge.
-       Overloaded variant that will start and stop a comms profile.
-       @param R The thickness of the extended region in each dimension
-       @param profile TimeProfile intance which will record the time taken
-       @param no_comms_fill Do local exchange to fill out the extended
-       region in non-partitioned dimensions
+       @brief Restores the GaugeField
     */
-    void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false);
-
-    /**
-     * Generic gauge field copy
-     * @param[in] src Source from which we are copying
-     */
-    void copy(const GaugeField &src);
-
-    void* Gauge_p() { return gauge; }
-    const void* Gauge_p() const { return gauge; }
+    void restore() const;
 
     /**
       @brief Copy all contents of the field to a host buffer.
       @param[in] the host buffer to copy to.
     */
-    virtual void copy_to_buffer(void *buffer) const;
+    void copy_to_buffer(void *buffer) const;
 
     /**
       @brief Copy all contents of the field from a host buffer to this field.
       @param[in] the host buffer to copy from.
     */
-    virtual void copy_from_buffer(void *buffer);
-
-    void setGauge(void** _gauge); //only allowed when create== QUDA_REFERENCE_FIELD_CREATE
+    void copy_from_buffer(void *buffer);
 
     /**
-       Set all field elements to zero
-    */
-    void zero();
+       @brief Check if two instances are compatible
+       @param[in] a Input field
+       @param[in] b Input field
+       @return Return true if two fields are compatible
+     */
+    static bool are_compatible(const GaugeField &a, const GaugeField &b);
 
     /**
-       @brief Backs up the cpuGaugeField
-    */
-    void backup() const;
+       @brief Check if two instances are weakly compatible (precision
+       and order can differ)
+       @param[in] a Input field
+       @param[in] b Input field
+       @return Return true if two fields are weakly compatible
+     */
+    static bool are_compatible_weak(const GaugeField &a, const GaugeField &b);
 
-    /**
-       @brief Restores the cpuGaugeField
-    */
-    void restore() const;
+    friend struct GaugeFieldParam;
   };
 
   /**
@@ -775,8 +711,8 @@ namespace quda {
      @param recon The reconsturction type
      @return the pointer to the extended gauge field
   */
-  cudaGaugeField *createExtendedGauge(cudaGaugeField &in, const lat_dim_t &R, TimeProfile &profile,
-                                      bool redundant_comms = false, QudaReconstructType recon = QUDA_RECONSTRUCT_INVALID);
+  GaugeField *createExtendedGauge(GaugeField &in, const lat_dim_t &R, TimeProfile &profile,
+                                  bool redundant_comms = false, QudaReconstructType recon = QUDA_RECONSTRUCT_INVALID);
 
   /**
      This function is used for creating an exteneded (cpu) gauge field from the input,
@@ -785,7 +721,7 @@ namespace quda {
      @param R By how many do we want to extend the gauge field in each direction
      @return the pointer to the extended gauge field
   */
-  cpuGaugeField *createExtendedGauge(void **gauge, QudaGaugeParam &gauge_param, const lat_dim_t &R);
+  GaugeField *createExtendedGauge(void **gauge, QudaGaugeParam &gauge_param, const lat_dim_t &R);
 
   /**
      This function is used for  extracting the gauge ghost zone from a
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 899f187ad7..a2c0300a1d 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -375,10 +375,9 @@ namespace quda {
         scale(static_cast<Float>(1.0)),
         scale_inv(static_cast<Float>(1.0))
       {
-	for (int d=0; d<U.Geometry(); d++)
-	  u[d] = gauge_ ? static_cast<complex<storeFloat>**>(gauge_)[d] :
-	    static_cast<complex<storeFloat>**>(const_cast<void*>(U.Gauge_p()))[d];
-	resetScale(U.Scale());
+        for (int d = 0; d < U.Geometry(); d++)
+          u[d] = gauge_ ? static_cast<complex<storeFloat> **>(gauge_)[d] : U.data<complex<storeFloat> *>(d);
+        resetScale(U.Scale());
       }
 
       void resetScale(Float max)
@@ -437,27 +436,30 @@ namespace quda {
     template <typename Float, int nColor, bool native_ghost, typename storeFloat>
     struct GhostAccessor<Float, nColor, QUDA_QDP_GAUGE_ORDER, native_ghost, storeFloat> {
       using wrapper = fieldorder_wrapper<Float, storeFloat>;
-      complex<storeFloat> *ghost[8];
-      unsigned int ghostOffset[8];
-      Float scale;
-      Float scale_inv;
+      complex<storeFloat> *ghost[8] = {};
+      unsigned int ghostOffset[8] = {};
+      Float scale = static_cast<Float>(1.0);
+      Float scale_inv = static_cast<Float>(1.0);
       static constexpr bool fixed = fixed_point<Float,storeFloat>();
 
-      GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) :
-        scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
+      GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr)
       {
         for (int d=0; d<4; d++) {
-	  ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
-	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d]));
-	  ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
-
-	  ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
-	    ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
-	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4]));
-	  ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
-	}
+          ghost[d] = ghost_ ? static_cast<complex<storeFloat> *>(ghost_[d]) :
+            U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ?
+                              static_cast<complex<storeFloat> *>(const_cast<void *>(U.Ghost()[d].data())) :
+                              nullptr;
+          ghostOffset[d] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor();
+
+          ghost[d + 4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
+            ghost_                                              ? static_cast<complex<storeFloat> *>(ghost_[d + 4]) :
+            U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ?
+                     static_cast<complex<storeFloat> *>(const_cast<void *>(U.Ghost()[d + 4].data())) :
+                     nullptr;
+          ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor();
+        }
 
-	resetScale(U.Scale());
+        resetScale(U.Scale());
       }
 
       void resetScale(Float max)
@@ -486,8 +488,7 @@ namespace quda {
       static constexpr bool fixed = fixed_point<Float,storeFloat>();
 
       Accessor(const GaugeField &U, void *gauge_ = nullptr, void ** = nullptr) :
-        u(gauge_ ? static_cast<complex<storeFloat> *>(gauge_) :
-                   static_cast<complex<storeFloat> *>(const_cast<void *>(U.Gauge_p()))),
+        u(gauge_ ? static_cast<complex<storeFloat> *>(gauge_) : U.data<complex<storeFloat> *>()),
         volumeCB(U.VolumeCB()),
         geometry(U.Geometry()),
         scale(static_cast<Float>(1.0)),
@@ -559,27 +560,30 @@ namespace quda {
     template <typename Float, int nColor, bool native_ghost, typename storeFloat>
     struct GhostAccessor<Float, nColor, QUDA_MILC_GAUGE_ORDER, native_ghost, storeFloat> {
       using wrapper = fieldorder_wrapper<Float, storeFloat>;
-      complex<storeFloat> *ghost[8];
-      unsigned int ghostOffset[8];
-      Float scale;
-      Float scale_inv;
+      complex<storeFloat> *ghost[8] = {};
+      unsigned int ghostOffset[8] = {};
+      Float scale = static_cast<Float>(1.0);
+      Float scale_inv = static_cast<Float>(1.0);
       static constexpr bool fixed = fixed_point<Float,storeFloat>();
 
-      GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) :
-        scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
+      GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr)
       {
         for (int d=0; d<4; d++) {
-	  ghost[d] = ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d]) :
-	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d]));
-	  ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
-
-	  ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
-	    ghost_ ? static_cast<complex<storeFloat>*>(ghost_[d+4]) :
-	    static_cast<complex<storeFloat>*>(const_cast<void*>(U.Ghost()[d+4]));
-	  ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor();
-	}
+          ghost[d] = ghost_ ? static_cast<complex<storeFloat> *>(ghost_[d]) :
+            U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ?
+                              static_cast<complex<storeFloat> *>(const_cast<void *>(U.Ghost()[d].data())) :
+                              nullptr;
+          ghostOffset[d] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor();
+
+          ghost[d + 4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr :
+            ghost_                                              ? static_cast<complex<storeFloat> *>(ghost_[d + 4]) :
+            U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ?
+                     static_cast<complex<storeFloat> *>(const_cast<void *>(U.Ghost()[d + 4].data())) :
+                     nullptr;
+          ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor();
+        }
 
-	resetScale(U.Scale());
+        resetScale(U.Scale());
       }
 
       void resetScale(Float max)
@@ -624,8 +628,7 @@ namespace quda {
       static constexpr bool fixed = fixed_point<Float,storeFloat>();
 
       Accessor(const GaugeField &U, void *gauge_ = nullptr, void ** = nullptr) :
-        u(gauge_ ? static_cast<complex<storeFloat> *>(gauge_) :
-                   static_cast<complex<storeFloat> *>(const_cast<void *>(U.Gauge_p()))),
+        u(gauge_ ? static_cast<complex<storeFloat> *>(gauge_) : U.data<complex<storeFloat> *>()),
         offset_cb((U.Bytes() >> 1) / sizeof(complex<storeFloat>)),
         volumeCB(U.VolumeCB()),
         stride(U.Stride()),
@@ -691,26 +694,26 @@ namespace quda {
     template <typename Float, int nColor, bool native_ghost, typename storeFloat>
     struct GhostAccessor<Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, native_ghost, storeFloat> {
       using wrapper = fieldorder_wrapper<Float, storeFloat>;
-      complex<storeFloat> *ghost[8];
-      const int volumeCB;
-      unsigned int ghostVolumeCB[8];
-      Float scale;
-      Float scale_inv;
+      complex<storeFloat> *ghost[8] = {};
+      const unsigned int volumeCB;
+      unsigned int ghostVolumeCB[8] = {};
+      Float scale = static_cast<Float>(1.0);
+      Float scale_inv = static_cast<Float>(1.0);
       static constexpr bool fixed = fixed_point<Float,storeFloat>();
       Accessor<Float, nColor, QUDA_FLOAT2_GAUGE_ORDER, storeFloat> accessor;
 
       GhostAccessor(const GaugeField &U, void *gauge_, void **ghost_ = 0) :
         volumeCB(U.VolumeCB()),
-        scale(static_cast<Float>(1.0)),
-        scale_inv(static_cast<Float>(1.0)),
         accessor(U, gauge_, ghost_)
       {
         if constexpr (!native_ghost) assert(ghost_ != nullptr);
         for (int d = 0; d < 4; d++) {
           ghost[d] = !native_ghost ? static_cast<complex<storeFloat>*>(ghost_[d]) : nullptr;
-	  ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d);
-	  ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? static_cast<complex<storeFloat>*>(ghost_[d+4]) : nullptr;
-	  ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d);
+          ghostVolumeCB[d] = U.Nface() * U.SurfaceCB(d);
+          ghost[d + 4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY ?
+            static_cast<complex<storeFloat> *>(ghost_[d + 4]) :
+            nullptr;
+          ghostVolumeCB[d + 4] = U.Nface() * U.SurfaceCB(d);
         }
         resetScale(U.Scale());
       }
@@ -755,7 +758,7 @@ namespace quda {
       using wrapper = fieldorder_wrapper<Float, storeFloat>;
 
       /** An internal reference to the actual field we are accessing */
-      const int volumeCB;
+      const unsigned int volumeCB;
       const int nDim;
       const int_fastdiv geometry;
       const QudaFieldLocation location;
@@ -874,13 +877,13 @@ namespace quda {
 	__device__ __host__ inline int Ncolor() const { return nColor; }
 
 	/** Returns the field volume */
-	__device__ __host__ inline int Volume() const { return 2*volumeCB; }
+        __device__ __host__ inline auto Volume() const { return 2 * volumeCB; }
 
-	/** Returns the field volume */
-	__device__ __host__ inline int VolumeCB() const { return volumeCB; }
+        /** Returns the checkerboard field volume */
+        __device__ __host__ inline auto VolumeCB() const { return volumeCB; }
 
-	/** Returns the field geometric dimension */
-	__device__ __host__ inline int Ndim() const { return nDim; }
+        /** Returns the field geometric dimension */
+        __device__ __host__ inline int Ndim() const { return nDim; }
 
 	/** Returns the field geometry */
 	__device__ __host__ inline int Geometry() const { return geometry; }
@@ -1501,7 +1504,7 @@ namespace quda {
       {
         switch (phase) {
         case QUDA_STAGGERED_PHASE_MILC:
-        case QUDA_STAGGERED_PHASE_CPS:
+        case QUDA_STAGGERED_PHASE_CHROMA:
         case QUDA_STAGGERED_PHASE_TIFR: return true;
         default: return false;
         }
@@ -1530,7 +1533,7 @@ namespace quda {
         int coords[QUDA_MAX_DIM];
         int_fastdiv X[QUDA_MAX_DIM];
         int R[QUDA_MAX_DIM];
-        const int volumeCB;
+        const unsigned int volumeCB;
         int faceVolumeCB[4];
         const int stride;
         const int geometry;
@@ -1539,7 +1542,7 @@ namespace quda {
 
         FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
           reconstruct(u),
-          gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
+          gauge(gauge_ ? gauge_ : u.data<Float *>()),
           offset(u.Bytes() / (2 * sizeof(Float) * N)),
           ghostExchange(u.GhostExchange()),
           volumeCB(u.VolumeCB()),
@@ -1767,7 +1770,7 @@ namespace quda {
 
       /**
          @brief The LegacyOrder defines the ghost zone storage and ordering for
-         all cpuGaugeFields, which use the same ghost zone storage.
+         all non-native fields, which use the same ghost zone storage.
       */
       template <typename Float, int length_> struct LegacyOrder {
         static constexpr int length = length_;
@@ -1775,9 +1778,9 @@ namespace quda {
         using store_t = Float;
         using real = typename mapper<Float>::type;
         using complex = complex<real>;
-        Float *ghost[QUDA_MAX_DIM];
-        int faceVolumeCB[QUDA_MAX_DIM];
-        const int volumeCB;
+        Float *ghost[QUDA_MAX_DIM] = {};
+        int faceVolumeCB[QUDA_MAX_DIM] = {};
+        const unsigned int volumeCB;
         const int stride;
         const int geometry;
         const int hasPhase;
@@ -1792,7 +1795,9 @@ namespace quda {
             errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone");
 
           for (int i = 0; i < 4; i++) {
-            ghost[i] = (ghost_) ? ghost_[i] : (Float *)(u.Ghost()[i]);
+            ghost[i] = (ghost_)                            ? ghost_[i] :
+              u.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? (Float *)(u.Ghost()[i].data()) :
+                                                             nullptr;
             faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth
           }
         }
@@ -1849,10 +1854,12 @@ namespace quda {
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       Float *gauge[QUDA_MAX_DIM];
-      const int volumeCB;
-    QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
-      : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
-	{ for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; }
+      const unsigned int volumeCB;
+      QDPOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
+        LegacyOrder<Float, length>(u, ghost_), volumeCB(u.VolumeCB())
+      {
+        for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data<Float *>(i);
+      }
 
         __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const
         {
@@ -1893,10 +1900,12 @@ namespace quda {
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       Float *gauge[QUDA_MAX_DIM];
-      const int volumeCB;
-    QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
-      : LegacyOrder<Float,length>(u, ghost_), volumeCB(u.VolumeCB())
-	{ for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; }
+      const unsigned int volumeCB;
+      QDPJITOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
+        LegacyOrder<Float, length>(u, ghost_), volumeCB(u.VolumeCB())
+      {
+        for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data<Float *>(i);
+      }
 
         __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const
         {
@@ -1941,16 +1950,21 @@ namespace quda {
     using real = typename mapper<Float>::type;
     using complex = complex<real>;
     Float *gauge;
-    const int volumeCB;
+    const unsigned int volumeCB;
     const int geometry;
-  MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) :
-    LegacyOrder<Float,length>(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()),
-      volumeCB(u.VolumeCB()), geometry(u.Geometry()) { ; }
+    MILCOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
+      LegacyOrder<Float, length>(u, ghost_),
+      gauge(gauge_ ? gauge_ : u.data<Float *>()),
+      volumeCB(u.VolumeCB()),
+      geometry(u.Geometry())
+    {
+      ;
+    }
 
-  __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const
-  {
-    auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
-    block_load<complex, length / 2>(v, reinterpret_cast<complex *>(in));
+    __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const
+    {
+      auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length];
+      block_load<complex, length / 2>(v, reinterpret_cast<complex *>(in));
     }
 
     __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const
@@ -1997,13 +2011,13 @@ namespace quda {
     using real = typename mapper<Float>::type;
     using complex = complex<real>;
     Float *gauge;
-    const int volumeCB;
+    const unsigned int volumeCB;
     const int geometry;
     const size_t offset;
     const size_t size;
     MILCSiteOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
       LegacyOrder<Float, length>(u, ghost_),
-      gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
+      gauge(gauge_ ? gauge_ : u.data<Float *>()),
       volumeCB(u.VolumeCB()),
       geometry(u.Geometry()),
       offset(u.SiteOffset()),
@@ -2056,14 +2070,14 @@ namespace quda {
     using real = typename mapper<Float>::type;
     using complex = complex<real>;
     Float *gauge;
-    const int volumeCB;
+    const unsigned int volumeCB;
     const real anisotropy;
     const real anisotropy_inv;
     static constexpr int Nc = 3;
     const int geometry;
     CPSOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
       LegacyOrder<Float, length>(u, ghost_),
-      gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
+      gauge(gauge_ ? gauge_ : u.data<Float *>()),
       volumeCB(u.VolumeCB()),
       anisotropy(u.Anisotropy()),
       anisotropy_inv(1.0 / anisotropy),
@@ -2125,13 +2139,11 @@ namespace quda {
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       Float *gauge;
-      const int volumeCB;
-      int exVolumeCB; // extended checkerboard volume
+      const unsigned int volumeCB;
+      unsigned int exVolumeCB; // extended checkerboard volume
       static constexpr int Nc = 3;
       BQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
-        LegacyOrder<Float, length>(u, ghost_),
-        gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
-        volumeCB(u.VolumeCB())
+        LegacyOrder<Float, length>(u, ghost_), gauge(gauge_ ? gauge_ : u.data<Float *>()), volumeCB(u.VolumeCB())
       {
         if constexpr (length != 18) errorQuda("Gauge length %d not supported", length);
         // compute volumeCB + halo region
@@ -2189,13 +2201,13 @@ namespace quda {
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       Float *gauge;
-      const int volumeCB;
+      const unsigned int volumeCB;
       static constexpr int Nc = 3;
       const real scale;
       const real scale_inv;
       TIFROrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
         LegacyOrder<Float, length>(u, ghost_),
-        gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
+        gauge(gauge_ ? gauge_ : u.data<Float *>()),
         volumeCB(u.VolumeCB()),
         scale(u.Scale()),
         scale_inv(1.0 / scale)
@@ -2253,7 +2265,7 @@ namespace quda {
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       Float *gauge;
-      const int volumeCB;
+      const unsigned int volumeCB;
       int exVolumeCB;
       static constexpr int Nc = 3;
       const real scale;
@@ -2262,7 +2274,7 @@ namespace quda {
       const int exDim[4];
       TIFRPaddedOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
         LegacyOrder<Float, length>(u, ghost_),
-        gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
+        gauge(gauge_ ? gauge_ : u.data<Float *>()),
         volumeCB(u.VolumeCB()),
         exVolumeCB(1),
         scale(u.Scale()),
diff --git a/include/gauge_tools.h b/include/gauge_tools.h
index 503c20bc9f..9b7d68db37 100644
--- a/include/gauge_tools.h
+++ b/include/gauge_tools.h
@@ -9,9 +9,8 @@ namespace quda
    * @param[in] Gauge field upon which we are measuring.
    * @param[in,out] param Parameter struct that defines which
    * observables we are making and the resulting observables.
-   * @param[in] profile TimeProfile instance used for profiling.
    */
-  void gaugeObservables(GaugeField &u, QudaGaugeObservableParam &param, TimeProfile &profile);
+  void gaugeObservables(GaugeField &u, QudaGaugeObservableParam &param);
 
   /**
    * @brief Project the input gauge field onto the SU(3) group.  This
diff --git a/include/invert_quda.h b/include/invert_quda.h
index 0a2ca1b335..11ac64708e 100644
--- a/include/invert_quda.h
+++ b/include/invert_quda.h
@@ -225,12 +225,6 @@ namespace quda {
     /** The type of accelerator type to use for preconditioner */
     QudaAcceleratorType accelerator_type_precondition;
 
-    /**< The time taken by the solver */
-    double secs;
-
-    /**< The Gflops rate of the solver */
-    double gflops;
-
     // Incremental EigCG solver parameters
     /**< The precision of the Ritz vectors */
     QudaPrecision precision_ritz;//also search space precision
@@ -333,8 +327,6 @@ namespace quda {
       ca_lambda_max_precondition(param.ca_lambda_max_precondition),
       schwarz_type(param.schwarz_type),
       accelerator_type_precondition(param.accelerator_type_precondition),
-      secs(param.secs),
-      gflops(param.gflops),
       precision_ritz(param.cuda_prec_ritz),
       n_ev(param.n_ev),
       m(param.max_search_dim),
@@ -422,8 +414,6 @@ namespace quda {
       ca_lambda_max_precondition(param.ca_lambda_max_precondition),
       schwarz_type(param.schwarz_type),
       accelerator_type_precondition(param.accelerator_type_precondition),
-      secs(param.secs),
-      gflops(param.gflops),
       precision_ritz(param.precision_ritz),
       n_ev(param.n_ev),
       m(param.m),
@@ -466,9 +456,6 @@ namespace quda {
       param.true_res = true_res;
       param.true_res_hq = true_res_hq;
       param.iter += iter;
-      comm_allreduce_sum(gflops);
-      param.gflops += gflops;
-      param.secs += secs;
       if (offset >= 0) {
 	param.true_res_offset[offset] = true_res_offset[offset];
         param.iter_res_offset[offset] = iter_res_offset[offset];
@@ -786,12 +773,6 @@ namespace quda {
     static void computeCAKrylovSpace(const DiracMatrix &diracm, std::vector<ColorSpinorField> &Ap,
                                      std::vector<ColorSpinorField> &p, int n_krylov, QudaCABasis basis, double m_map,
                                      double b_map, Args &&...args);
-
-    /**
-     * @brief Return flops
-     * @return flops expended by this operator
-     */
-    virtual double flops() const { return 0; }
   };
 
   /**
@@ -1641,8 +1622,6 @@ namespace quda {
     bool apply_mat; //! Whether to compute q = Ap or assume it is provided
     bool hermitian; //! Whether A is hermitian or not
 
-    TimeProfile &profile;
-
     /**
        @brief Solve the equation A p_k psi_k = q_k psi_k = b by minimizing the
        residual and using Eigen's SVD algorithm for numerical stability
@@ -1661,7 +1640,7 @@ namespace quda {
        @param apply_mat Whether to apply the operator in place or assume q already contains this
        @profile Timing profile to use
     */
-    MinResExt(const DiracMatrix &mat, bool orthogonal, bool apply_mat, bool hermitian, TimeProfile &profile = dummy);
+    MinResExt(const DiracMatrix &mat, bool orthogonal, bool apply_mat, bool hermitian);
 
     /**
        @param x The optimum for the solution vector.
diff --git a/include/kernels/covDev.cuh b/include/kernels/covDev.cuh
index b86e989bf7..28c52e9b38 100644
--- a/include/kernels/covDev.cuh
+++ b/include/kernels/covDev.cuh
@@ -37,19 +37,13 @@ namespace quda
 
     CovDevArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int mu, int parity, bool dagger,
               const int *comm_override) :
-      DslashArg<Float, nDim>(in, U, parity, dagger, false, 1, spin_project, comm_override),
+      DslashArg<Float, nDim>(out, in, U, in, parity, dagger, false, 1, spin_project, comm_override),
       out(out),
       in(in),
       in_pack(in),
       U(U),
       mu(mu)
     {
-      if (in.V() == out.V()) errorQuda("Aliasing pointers");
-      checkOrder(out, in);        // check all orders match
-      checkPrecision(out, in, U); // check all precisions match
-      checkLocation(out, in, U);  // check all locations match
-      if (!out.isNative() || !in.isNative() || !U.isNative())
-        errorQuda("Unsupported field order colorspinor(in)=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder());
     }
   };
 
diff --git a/include/kernels/dslash_gamma_helper.cuh b/include/kernels/dslash_gamma_helper.cuh
index 3b5e27492a..5261ea5b32 100644
--- a/include/kernels/dslash_gamma_helper.cuh
+++ b/include/kernels/dslash_gamma_helper.cuh
@@ -78,11 +78,11 @@ namespace quda {
     {
       ColorSpinor<typename Arg::real, Arg::nColor, 4> in = arg.in(x_cb, parity);
       switch(arg.d) {
-      case 0: arg.out(x_cb, parity) = in.gamma(0);
-      case 1: arg.out(x_cb, parity) = in.gamma(1);
-      case 2: arg.out(x_cb, parity) = in.gamma(2);
-      case 3: arg.out(x_cb, parity) = in.gamma(3);
-      case 4: arg.out(x_cb, parity) = in.gamma(4);
+      case 0: arg.out(x_cb, parity) = in.gamma(0); break;
+      case 1: arg.out(x_cb, parity) = in.gamma(1); break;
+      case 2: arg.out(x_cb, parity) = in.gamma(2); break;
+      case 3: arg.out(x_cb, parity) = in.gamma(3); break;
+      case 4: arg.out(x_cb, parity) = in.gamma(4); break;
       }
     }
   };
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 8f772165bf..deb38455f8 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -51,7 +51,7 @@ namespace quda
 
     StaggeredArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, const GaugeField &L, double a,
                  const ColorSpinorField &x, int parity, bool dagger, const int *comm_override) :
-      DslashArg<Float, nDim>(in, U, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, spin_project,
+      DslashArg<Float, nDim>(out, in, U, x, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, spin_project,
                              comm_override),
       out(out),
       in(in, improved_ ? 3 : 1),
@@ -65,12 +65,6 @@ namespace quda
       is_last_time_slice(comm_coord(3) == comm_dim(3) - 1 ? true : false),
       dagger_scale(dagger ? static_cast<real>(-1.0) : static_cast<real>(1.0))
     {
-      if (in.V() == out.V()) errorQuda("Aliasing pointers");
-      checkOrder(out, in, x);        // check all orders match
-      checkPrecision(out, in, x, U); // check all precisions match
-      checkLocation(out, in, x, U);  // check all locations match
-      if (!in.isNative() || !U.isNative())
-        errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder());
     }
   };
 
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index cd7575974a..f87e8f9865 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -38,7 +38,7 @@ namespace quda
 
     WilsonArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a,
               const ColorSpinorField &x, int parity, bool dagger, const int *comm_override) :
-      DslashArg<Float, nDim>(in, U, parity, dagger, a != 0.0 ? true : false, 1, spin_project, comm_override),
+      DslashArg<Float, nDim>(out, in, U, x, parity, dagger, a != 0.0 ? true : false, 1, spin_project, comm_override),
       out(out),
       in(in),
       in_pack(in),
@@ -46,12 +46,6 @@ namespace quda
       U(U),
       a(a)
     {
-      if (in.V() == out.V()) errorQuda("Aliasing pointers");
-      checkOrder(out, in, x);        // check all orders match
-      checkPrecision(out, in, x, U); // check all precisions match
-      checkLocation(out, in, x, U);  // check all locations match
-      if (!in.isNative() || !U.isNative())
-        errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder());
     }
   };
 
diff --git a/include/kernels/gauge_phase.cuh b/include/kernels/gauge_phase.cuh
index ef57369cb8..73def9ab49 100644
--- a/include/kernels/gauge_phase.cuh
+++ b/include/kernels/gauge_phase.cuh
@@ -63,9 +63,10 @@ namespace quda {
       } else if (dim == 3) { // also apply boundary condition
 	phase = (t == arg.X[3]-1) ? arg.tBoundary : 1.0;
       }
-    } else if (Arg::phase == QUDA_STAGGERED_PHASE_CPS) {
+    } else if (Arg::phase == QUDA_STAGGERED_PHASE_CHROMA) {
+      // Chroma follows CPS convention, but uses -Dslash instead of Dslash compared to QUDA
       if (dim==0) {
-	phase = 1.0;
+	phase = -1.0;
       } else if (dim == 1) {
 	phase = (1.0 - 2.0 * ((1 + x) % 2) );
       } else if (dim == 2) {
diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh
index ac09ddc5ed..a029242210 100644
--- a/include/kernels/laplace.cuh
+++ b/include/kernels/laplace.cuh
@@ -40,8 +40,7 @@ namespace quda
 
     LaplaceArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir, double a, double b,
                const ColorSpinorField &x, int parity, bool dagger, const int *comm_override) :
-
-      DslashArg<Float, nDim>(in, U, parity, dagger, a != 0.0 ? true : false, 1, false, comm_override),
+      DslashArg<Float, nDim>(out, in, U, x, parity, dagger, a != 0.0 ? true : false, 1, false, comm_override),
       out(out),
       in(in),
       in_pack(in),
@@ -51,12 +50,6 @@ namespace quda
       b(b),
       dir(dir)
     {
-      if (in.V() == out.V()) errorQuda("Aliasing pointers");
-      checkOrder(out, in, x);        // check all orders match
-      checkPrecision(out, in, x, U); // check all precisions match
-      checkLocation(out, in, x, U);  // check all locations match
-      if (!in.isNative() || !U.isNative())
-        errorQuda("Unsupported field order colorspinor(in)=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder());
       if (dir < 3 || dir > 4) errorQuda("Unsupported laplace direction %d (must be 3 or 4)", dir);
     }
   };
diff --git a/include/kernels/spinor_dilute.cuh b/include/kernels/spinor_dilute.cuh
index 538610ff44..956b559092 100644
--- a/include/kernels/spinor_dilute.cuh
+++ b/include/kernels/spinor_dilute.cuh
@@ -18,7 +18,7 @@ namespace quda {
     case QUDA_DILUTION_COLOR: return nColor;
     case QUDA_DILUTION_SPIN_COLOR: return nSpin * nColor;
     case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: return nSpin * nColor * 2;
-    default: return 1;
+    default: return 128;
     }
   }
 
@@ -28,10 +28,15 @@ namespace quda {
     static constexpr int nSpin = nSpin_;
     static constexpr int nColor = nColor_;
     static constexpr QudaDilutionType type = type_;
-    static constexpr int dilution_size = get_size<nSpin, nColor>(type);
+    static constexpr int max_dilution_size = get_size<nSpin, nColor>(type);
     using V = typename colorspinor_mapper<store_t, nSpin, nColor>::type;
-    V v[dilution_size];
+    int dilution_size;
+    V v[max_dilution_size];
     V src;
+    int nParity;
+    lat_dim_t dims = {};
+    lat_dim_t dilution_block_dims = {};
+    lat_dim_t dilution_block_grid = {};
 
     /**
        @brief Constructor for the dilution arg
@@ -39,14 +44,36 @@ namespace quda {
        @param src The source vector we are diluting
      */
     template <std::size_t... S>
-    SpinorDiluteArg(std::vector<ColorSpinorField> &v, const ColorSpinorField &src, std::index_sequence<S...>) :
+    SpinorDiluteArg(std::vector<ColorSpinorField> &v, const ColorSpinorField &src, const lat_dim_t &dilution_block_dims,
+                    std::index_sequence<S...>) :
       kernel_param(dim3(src.VolumeCB(), src.SiteSubset(), 1)),
-      v{v[S]...},
-      src(src)
+      dilution_size(v.size()),
+      src(src),
+      nParity(src.SiteSubset()),
+      dims(static_cast<const LatticeField &>(src).X()),
+      dilution_block_dims(dilution_block_dims)
     {
+      for (auto i = 0u; i < v.size(); i++) this->v[i] = V(v[i]);
+      if (nParity == 1) { // dimensions need to be full-field
+        this->dims[0] *= 2;
+        this->dilution_block_dims[0] *= 2;
+      }
+      for (auto i = 0; i < src.Ndim() && type == QUDA_DILUTION_BLOCK; i++)
+        dilution_block_grid[i] = (dims[i] * comms_dim[i]) / this->dilution_block_dims[i];
     }
   };
 
+  template <typename coord_t, typename Arg>
+  __device__ __host__ void getCoordsGlobal(coord_t &coords, int x_cb, int parity, const Arg &arg)
+  {
+    getCoords(coords, x_cb, arg.dims, parity);
+
+    // first 4 dimensions are potentially distributed so include global offsets
+    for (int i = 0; i < 4; i++) {
+      coords[i] += arg.comms_coord[i] * arg.dims[i]; // global coordinate
+    }
+  }
+
   /**
      Functor for diluting the src vector
    */
@@ -69,8 +96,8 @@ namespace quda {
       case QUDA_DILUTION_COLOR: return c == i;
       case QUDA_DILUTION_SPIN_COLOR: return (s * Arg::nColor + c) == i;
       case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: return ((parity * Arg::nSpin + s) * Arg::nColor + c) == i;
+      default: return 0;
       }
-      return 0;
     }
 
     __device__ __host__ void operator()(int x_cb, int parity)
@@ -78,16 +105,30 @@ namespace quda {
       using vector = ColorSpinor<typename Arg::real, Arg::nColor, Arg::nSpin>;
       vector src = arg.src(x_cb, parity);
 
-      for (int i = 0; i < Arg::dilution_size; i++) {
-        vector v;
+      if (Arg::type == QUDA_DILUTION_BLOCK) {
+        lat_dim_t coords;
+        getCoordsGlobal(coords, x_cb, parity, arg);
+
+        lat_dim_t block_coords;
+        for (int i = 0; i < coords.size(); i++) block_coords[i] = coords[i] / arg.dilution_block_dims[i];
+        int block_idx = ((block_coords[3] * arg.dilution_block_grid[2] + block_coords[2]) * arg.dilution_block_grid[1]
+                         + block_coords[1])
+            * arg.dilution_block_grid[0]
+          + block_coords[0];
+
+        for (int i = 0; i < arg.dilution_size; i++) { arg.v[i](x_cb, parity) = i == block_idx ? src : vector(); }
+      } else {
+        for (int i = 0; i < Arg::max_dilution_size; i++) { // for these types max = actual size
+          vector v;
 
-        for (int s = 0; s < Arg::nSpin; s++) {
-          for (int c = 0; c < Arg::nColor; c++) {
-            v(s, c) = write_source(i, s, c, parity) ? src(s, c) : complex<typename Arg::real>(0.0, 0.0);
+          for (int s = 0; s < Arg::nSpin; s++) {
+            for (int c = 0; c < Arg::nColor; c++) {
+              v(s, c) = write_source(i, s, c, parity) ? src(s, c) : complex<typename Arg::real>(0.0, 0.0);
+            }
           }
-        }
 
-        arg.v[i](x_cb, parity) = v;
+          arg.v[i](x_cb, parity) = v;
+        }
       }
     }
 
diff --git a/include/kernels/staggered_kd_apply_xinv_kernel.cuh b/include/kernels/staggered_kd_apply_xinv_kernel.cuh
index bbe8b70166..f5b137486f 100644
--- a/include/kernels/staggered_kd_apply_xinv_kernel.cuh
+++ b/include/kernels/staggered_kd_apply_xinv_kernel.cuh
@@ -39,7 +39,7 @@ namespace quda {
       X0h(out.X()[0]/2),
       volumeCB(in.VolumeCB())
     {
-      if (in.V() == out.V()) errorQuda("Aliasing pointers");
+      if (in.data() == out.data()) errorQuda("Aliasing pointers");
       checkOrder(out, in); // check all orders match
       checkPrecision(out, in, xInv); // check all precisions match
       checkLocation(out, in, xInv);
diff --git a/include/kernels/staggered_quark_smearing.cuh b/include/kernels/staggered_quark_smearing.cuh
index 2fdb42f17a..9f4db096e8 100644
--- a/include/kernels/staggered_quark_smearing.cuh
+++ b/include/kernels/staggered_quark_smearing.cuh
@@ -45,8 +45,7 @@ namespace quda
 
     StaggeredQSmearArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int t0,
                        bool is_t0_kernel, int parity, int dir, bool dagger, const int *comm_override) :
-
-      DslashArg<Float, nDim>(in, U, parity, dagger, false, 3, false, comm_override),
+      DslashArg<Float, nDim>(out, in, U, in, parity, dagger, false, 3, false, comm_override),
       out(out, 3),
       in(in, 3),
       in_pack(in, 3),
@@ -56,12 +55,6 @@ namespace quda
       is_t0_kernel(is_t0_kernel),
       t0_offset(is_t0_kernel ? in.VolumeCB() / in.X(3) : 0)
     {
-      if (in.V() == out.V()) errorQuda("Aliasing pointers");
-      checkOrder(out, in);        // check all orders match
-      checkPrecision(out, in, U); // check all precisions match
-      checkLocation(out, in, U);  // check all locations match
-      if (!in.isNative() || !U.isNative())
-        errorQuda("Unsupported field order colorspinor(in)=%d gauge=%d combination", in.FieldOrder(), U.FieldOrder());
       if (dir < 3 || dir > 4) errorQuda("Unsupported laplace direction %d (must be 3 or 4)", dir);
 
       for (int i = 0; i < 4; i++) {
diff --git a/include/lattice_field.h b/include/lattice_field.h
index 04190db9f3..b92297eabc 100644
--- a/include/lattice_field.h
+++ b/include/lattice_field.h
@@ -34,9 +34,6 @@ namespace quda {
   class cudaEigVecSet;
 
   class GaugeField;
-  class cpuGaugeField;
-  class cudaGaugeField;
-
   class CloverField;
 
   enum class QudaOffsetCopyMode { COLLECT, DISPERSE };
@@ -71,11 +68,14 @@ namespace quda {
     /** Array storing the length of dimension */
     lat_dim_t x = {};
 
+    /** Padding to be added to the checker-boarded volume (only for native field ordering) */
     int pad = 0;
 
+    /** Whether the field is full or single parity */
     QudaSiteSubset siteSubset = QUDA_INVALID_SITE_SUBSET;
 
-    QudaMemoryType mem_type = QUDA_MEMORY_DEVICE;
+    /** The type of memory allocation to use for the field */
+    QudaMemoryType mem_type = QUDA_MEMORY_INVALID;
 
     /** The type of ghost exchange to be done with this field */
     QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
@@ -108,7 +108,7 @@ namespace quda {
       nDim(nDim),
       pad(pad),
       siteSubset(QUDA_FULL_SITE_SUBSET),
-      mem_type(QUDA_MEMORY_DEVICE),
+      mem_type(location == QUDA_CUDA_FIELD_LOCATION ? QUDA_MEMORY_DEVICE : QUDA_MEMORY_HOST),
       ghostExchange(ghostExchange),
       scale(1.0)
     {
@@ -126,14 +126,14 @@ namespace quda {
        @param[in] param Contains the metadata for filling out the LatticeFieldParam
     */
     LatticeFieldParam(const QudaGaugeParam &param) :
-      location(QUDA_CPU_FIELD_LOCATION),
+      location(param.location),
       precision(param.cpu_prec),
       ghost_precision(param.cpu_prec),
       init(true),
       nDim(4),
       pad(0),
       siteSubset(QUDA_FULL_SITE_SUBSET),
-      mem_type(QUDA_MEMORY_DEVICE),
+      mem_type(QUDA_MEMORY_HOST),
       ghostExchange(QUDA_GHOST_EXCHANGE_NO),
       scale(param.scale)
     {
@@ -144,15 +144,18 @@ namespace quda {
     }
 
     /**
-       @brief Contructor for creating LatticeFieldParam from a LatticeField
+       @brief Constructor for creating LatticeFieldParam from a LatticeField
     */
     LatticeFieldParam(const LatticeField &field);
   };
 
   std::ostream& operator<<(std::ostream& output, const LatticeFieldParam& param);
+  std::ostream &operator<<(std::ostream &output, const LatticeField &field);
 
   class LatticeField : public Object {
 
+    friend std::ostream &operator<<(std::ostream &output, const LatticeField &param);
+
     /**
        @brief Create the field as specified by the param
        @param[in] Parameter struct
@@ -178,9 +181,13 @@ namespace quda {
     /** Checkerboarded local volume */
     size_t localVolumeCB = 0;
 
+    /** Stride used for native field ordering (stride = volumeCB + pad) */
     size_t stride = 0;
+
+    /** Padding to be added to the checker-boarded volume (only for native field ordering) */
     int pad = 0;
 
+    /** Total size of the allocation */
     size_t total_bytes = 0;
 
     /** Number of field dimensions */
@@ -463,9 +470,7 @@ namespace quda {
       }
     }
 
-    mutable char *backup_h = nullptr;
-    mutable char *backup_norm_h = nullptr;
-    mutable bool backed_up = false;
+    mutable std::vector<quda_ptr> backup_h = {};
 
   public:
     /**
diff --git a/include/llfat_quda.h b/include/llfat_quda.h
index 696c67d3f8..0bf9f5b249 100644
--- a/include/llfat_quda.h
+++ b/include/llfat_quda.h
@@ -11,7 +11,7 @@ namespace quda {
      @param u[in] The input gauge field
      @param coeff[in] Array of path coefficients
   */
-  void fatKSLink(GaugeField *fat, const GaugeField &u, const double *coeff);
+  void fatKSLink(GaugeField &fat, const GaugeField &u, const double *coeff);
 
   /**
      @brief Compute the long links for an improved staggered (Kogut-Susskind) fermions.
@@ -19,6 +19,6 @@ namespace quda {
      @param u[in] The input gauge field
      @param coeff[in] Array of path coefficients
   */
-  void longKSLink(GaugeField *lng, const GaugeField &u, const double *coeff);
+  void longKSLink(GaugeField &lng, const GaugeField &u, const double *coeff);
 
 } // namespace quda
diff --git a/include/malloc_quda.h b/include/malloc_quda.h
index 8df59bbf56..05a36fcd77 100644
--- a/include/malloc_quda.h
+++ b/include/malloc_quda.h
@@ -114,6 +114,9 @@ namespace quda {
 #define register_pinned(ptr, bytes) quda::register_pinned_(__func__, quda::file_name(__FILE__), __LINE__, ptr, bytes)
 #define unregister_pinned(size) quda::unregister_pinned_(__func__, quda::file_name(__FILE__), __LINE__, ptr)
 
+#define quda_malloc(size) quda::quda_malloc_(__func__, quda::file_name(__FILE__), __LINE__, size)
+#define quda_free(ptr) quda::quda_free_(__func__, quda::file_name(__FILE__), __LINE__, ptr)
+
 namespace quda {
 
   namespace pool {
diff --git a/include/multi_blas_helper.cuh b/include/multi_blas_helper.cuh
index 6a470fe576..78aaa1ac4b 100644
--- a/include/multi_blas_helper.cuh
+++ b/include/multi_blas_helper.cuh
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <algorithm>
+#include <utility>
 #include <register_traits.h>
 #include <blas_helper.cuh>
 #include <reduce_helper.h>
diff --git a/include/multigrid.h b/include/multigrid.h
index 2d50bcec97..e5981baac2 100644
--- a/include/multigrid.h
+++ b/include/multigrid.h
@@ -397,9 +397,9 @@ namespace quda {
        @brief This method only resets the KD operators with the updated fine links and rebuilds
               the KD inverse
      */
-    void resetStaggeredKD(cudaGaugeField *gauge_in, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in,
-                          cudaGaugeField *gauge_sloppy_in, cudaGaugeField *fat_gauge_sloppy_in,
-                          cudaGaugeField *long_gauge_sloppy_in, double mass);
+    void resetStaggeredKD(GaugeField *gauge_in, GaugeField *fat_gauge_in, GaugeField *long_gauge_in,
+                          GaugeField *gauge_sloppy_in, GaugeField *fat_gauge_sloppy_in,
+                          GaugeField *long_gauge_sloppy_in, double mass);
 
     /**
        @brief Dump the null-space vectors to disk.  Will recurse dumping all levels.
@@ -486,11 +486,6 @@ namespace quda {
     */
     void buildFreeVectors(std::vector<ColorSpinorField*> &B);
 
-    /**
-       @brief Return the total flops done on this and all coarser levels.
-     */
-    double flops() const;
-
     /**
       @brief Return if we're on a fine grid right now
     */
@@ -634,13 +629,13 @@ namespace quda {
      operator we are constructing the coarse grid operator from.
      For staggered, should always be QUDA_MATPC_INVALID.
    */
-  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge,
-                         const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
+  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge,
+                         const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
                          QudaDiracType dirac, QudaMatPCType matpc);
 
   template <int fineColor, int coarseColor>
-  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge,
-                         const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
+  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge,
+                         const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
                          QudaDiracType dirac, QudaMatPCType matpc);
 
   /**
diff --git a/include/quda.h b/include/quda.h
index 1d97e6c737..1f8e642b98 100644
--- a/include/quda.h
+++ b/include/quda.h
@@ -62,7 +62,7 @@ extern "C" {
 
     QudaGaugeFixed gauge_fix; /**< Whether the input gauge field is in the axial gauge or not */
 
-    int ga_pad;       /**< The pad size that the cudaGaugeField will use (default=0) */
+    int ga_pad; /**< The pad size that native GaugeFields will use (default=0) */
 
     int site_ga_pad;  /**< Used by link fattening and the gauge and fermion forces */
 
@@ -1497,7 +1497,7 @@ extern "C" {
   void  saveGaugeFieldQuda(void* outGauge, void* inGauge, QudaGaugeParam* param);
 
   /**
-   * Reinterpret gauge as a pointer to cudaGaugeField and call destructor.
+   * Reinterpret gauge as a pointer to a GaugeField and call destructor.
    *
    * @param gauge Gauge field to be freed
    */
@@ -1698,12 +1698,10 @@ extern "C" {
    * @param[in] reunit_interval, reunitarize gauge field when iteration count is a multiple of this
    * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value
    * @param[in] param The parameters of the external fields and the computation settings
-   * @param[out] timeinfo
    */
   int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps,
                                 const unsigned int verbose_interval, const double relax_boost, const double tolerance,
-                                const unsigned int reunit_interval, const unsigned int stopWtheta,
-                                QudaGaugeParam *param, double *timeinfo);
+                                const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param);
 
   /**
    * @brief Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
@@ -1717,12 +1715,10 @@ extern "C" {
    * iteration reachs the maximum number of steps defined by Nsteps
    * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value
    * @param[in] param The parameters of the external fields and the computation settings
-   * @param[out] timeinfo
    */
   int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps,
                                 const unsigned int verbose_interval, const double alpha, const unsigned int autotune,
-                                const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param,
-                                double *timeinfo);
+                                const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param);
 
   /**
    * @brief Strided Batched GEMM
@@ -1779,9 +1775,11 @@ extern "C" {
     int delete_2link;
     /** Set if the input spinor is on a time slice **/
     int t0;
+    /** Time taken for the smearing operations **/
+    double secs;
     /** Flops count for the smearing operations **/
-    int gflops;
-    
+    double gflops;
+
   } QudaQuarkSmearParam;
 
   /**
diff --git a/include/quda_api.h b/include/quda_api.h
index 45c226ba19..e1ec69bbe1 100644
--- a/include/quda_api.h
+++ b/include/quda_api.h
@@ -3,6 +3,7 @@
 #include <quda_define.h>
 #include <string>
 #include <enum_quda.h>
+#include <quda_ptr.h>
 
 /**
    @file quda_api.h
@@ -22,7 +23,7 @@ enum qudaMemcpyKind {
 namespace quda
 {
 
-  class TuneParam;
+  struct TuneParam;
 
   struct qudaStream_t {
     int idx;
@@ -42,6 +43,16 @@ namespace quda
   void qudaMemcpy_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const char *func, const char *file,
                    const char *line);
 
+  /**
+     @brief Wrapper around cudaMemcpy or driver API equivalent
+     @param[out] dst Destination pointer
+     @param[in] src Source pointer
+     @param[in] count Size of transfer
+     @param[in] kind Type of memory copy
+  */
+  void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func,
+                   const char *file, const char *line);
+
   /**
      @brief Wrapper around cudaMemcpyAsync or driver API equivalent
      @param[out] dst Destination pointer
@@ -63,6 +74,14 @@ namespace quda
   void qudaMemcpyP2PAsync_(void *dst, const void *src, size_t count, const qudaStream_t &stream, const char *func,
                            const char *file, const char *line);
 
+  /**
+     @brief Heterogeneous memset function
+     @param[out] ptr Heterogeneous pointer
+     @param[in] value Value to set for each byte of specified memory
+     @param[in] count Size in bytes to set
+   */
+  void qudaMemset_(quda_ptr &ptr, int value, size_t count, const char *func, const char *file, const char *line);
+
   /**
      @brief Wrapper around cudaMemset or driver API equivalent
      @param[out] ptr Starting address pointer
@@ -72,15 +91,14 @@ namespace quda
   void qudaMemset_(void *ptr, int value, size_t count, const char *func, const char *file, const char *line);
 
   /**
-     @brief Wrapper around cudaMemset2D or driver API equivalent
+     @brief Wrapper around cudaMemsetAsync or driver API equivalent
      @param[out] ptr Starting address pointer
-     @param[in] Pitch in bytes
      @param[in] value Value to set for each byte of specified memory
-     @param[in] width Width in bytes
-     @param[in] height Height in bytes
+     @param[in] count Size in bytes to set
+     @param[in] stream Stream to issue memset
    */
-  void qudaMemset2D_(void *ptr, size_t pitch, int value, size_t width, size_t height, const char *func,
-                     const char *file, const char *line);
+  void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
+                        const char *file, const char *line);
 
   /**
      @brief Wrapper around cudaMemsetAsync or driver API equivalent
@@ -89,20 +107,21 @@ namespace quda
      @param[in] count Size in bytes to set
      @param[in] stream Stream to issue memset
    */
-  void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
+  void qudaMemsetAsync_(quda_ptr &ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
                         const char *file, const char *line);
 
   /**
-     @brief Wrapper around cudaMemsetAsync or driver API equivalent
+     @brief Asynchronous heterogeneous memset2d function
      @param[out] ptr Starting address pointer
+     @param[in] offset Initial offset from pointer
      @param[in] Pitch in bytes
      @param[in] value Value to set for each byte of specified memory
      @param[in] width Width in bytes
      @param[in] height Height in bytes
      @param[in] stream Stream to issue memset
    */
-  void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream,
-                          const char *func, const char *file, const char *line);
+  void qudaMemset2DAsync_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height,
+                          const qudaStream_t &stream, const char *func, const char *file, const char *line);
 
   /**
      @brief Wrapper around cudaMemPrefetchAsync or driver API equivalent
@@ -224,14 +243,11 @@ namespace quda
 #define qudaMemset(ptr, value, count)                                                                                  \
   ::quda::qudaMemset_(ptr, value, count, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
 
-#define qudaMemset2D(ptr, pitch, value, width, height)                                                                 \
-  ::quda::qudaMemset2D_(ptr, pitch, value, width, height, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
-
 #define qudaMemsetAsync(ptr, value, count, stream)                                                                     \
   ::quda::qudaMemsetAsync_(ptr, value, count, stream, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__))
 
-#define qudaMemset2DAsync(ptr, pitch, value, width, height, stream)                                                    \
-  ::quda::qudaMemset2DAsync_(ptr, pitch, value, width, height, stream, __func__, quda::file_name(__FILE__),            \
+#define qudaMemset2DAsync(ptr, offset, pitch, value, width, height, stream)                                            \
+  ::quda::qudaMemset2DAsync_(ptr, offset, pitch, value, width, height, stream, __func__, quda::file_name(__FILE__),    \
                              __STRINGIFY__(__LINE__))
 
 #define qudaMemPrefetchAsync(ptr, count, mem_space, stream)                                                            \
diff --git a/include/quda_arch.h b/include/quda_arch.h
index ed88fb0b8e..45a8ed34e4 100644
--- a/include/quda_arch.h
+++ b/include/quda_arch.h
@@ -14,5 +14,8 @@
 
 #elif defined(QUDA_TARGET_SYCL)
 #include <targets/sycl/quda_sycl.h>
+#endif
 
+#ifdef QUDA_OPENMP
+#include <omp.h>
 #endif
diff --git a/include/quda_internal.h b/include/quda_internal.h
index 756d5822e0..dd8a6c8177 100644
--- a/include/quda_internal.h
+++ b/include/quda_internal.h
@@ -49,6 +49,7 @@
 #include <object.h>
 #include <device.h>
 #include <array.h>
+#include "timer.h"
 
 namespace quda {
 
diff --git a/include/quda_milc_interface.h b/include/quda_milc_interface.h
index 23b8bccc87..c3207cfaa2 100644
--- a/include/quda_milc_interface.h
+++ b/include/quda_milc_interface.h
@@ -1026,7 +1026,7 @@ extern "C" {
 			  void* inGauge);
 
   /**
-   * Reinterpret gauge as a pointer to cudaGaugeField and call destructor.
+   * Reinterpret gauge as a pointer to a GaugeField and call destructor.
    *
    * @param gauge Gauge field to be freed
    */
diff --git a/include/quda_ptr.h b/include/quda_ptr.h
new file mode 100644
index 0000000000..aab76f6b89
--- /dev/null
+++ b/include/quda_ptr.h
@@ -0,0 +1,106 @@
+#pragma once
+
+#include <ostream>
+#include "malloc_quda.h"
+
+namespace quda
+{
+
+  /**
+     Object that stores a memory allocation with different views for
+     host or device.  Depending on the nature of the underlying memory
+     type, only one of the two views may be defined
+
+     type                       defined views
+     QUDA_MEMORY_DEVICE         device only
+     QUDA_MEMORY_DEVICE_PINNED  device only
+     QUDA_MEMORY_HOST           host only
+     QUDA_MEMORY_HOST_PINNED    both
+     QUDA_MEMORY_MAPPED         both (pinned to host)
+     QUDA_MEMORY_MANAGED        both
+   */
+  class quda_ptr
+  {
+    friend std::ostream &operator<<(std::ostream &output, const quda_ptr &ptr);
+    QudaMemoryType type = QUDA_MEMORY_INVALID; /** Memory type of the allocation */
+    size_t size = 0;                           /** Size of the allocation */
+    bool pool = false;                         /** Is the allocation pooled */
+    void *device = nullptr;                    /** Device-view of the allocation */
+    void *host = nullptr;                      /** Host-view of the allocation */
+    bool reference = false;                    /** Is this a reference to another allocation */
+
+    /**
+       @brief Internal deallocation routine
+     */
+    void destroy();
+
+  public:
+    quda_ptr() = default;
+    quda_ptr(quda_ptr &&) = default;
+    quda_ptr &operator=(quda_ptr &&);
+    quda_ptr(const quda_ptr &) = delete;
+    quda_ptr &operator=(const quda_ptr &) = delete;
+
+    /**
+       @brief Constructor for quda_ptr
+       @param[in] type The memory type of the allocation
+       @param[in] size The size of the allocation
+       @param[in] pool Whether the allocation should be in the memory pool (default is true)
+    */
+    quda_ptr(QudaMemoryType type, size_t size, bool pool = true);
+
+    /**
+       @brief Constructor for quda_ptr where we are wrapping a non-owned pointer
+       @param[in] ptr Raw base pointer
+       @param[in] type The memory type of the allocation
+    */
+    quda_ptr(void *ptr, QudaMemoryType type);
+
+    /**
+       @brief Destructor for the quda_ptr
+    */
+    virtual ~quda_ptr();
+
+    /**
+       @brief Specialized exchange function to use in place of
+       std::exchange when exchanging quda_ptr objects: moves obj to
+       *this, and moves new_value to obj
+       @param[in,out] obj
+       @param[in] new_value New value for obj to take
+     */
+    void exchange(quda_ptr &obj, quda_ptr &&new_value);
+
+    /**
+       @return Returns true if allocation is visible to the device
+    */
+    bool is_device() const;
+
+    /**
+       @return Returns true if allocation is visible to the host
+    */
+    bool is_host() const;
+
+    /**
+       Return view of the pointer.  For mapped memory we return the device view.
+     */
+    void *data() const;
+
+    /**
+       Return the device view of the pointer
+     */
+    void *data_device() const;
+
+    /**
+       Return the host view of the pointer
+     */
+    void *data_host() const;
+
+    /**
+       Return if the instance is a reference rather than an allocation
+     */
+    bool is_reference() const;
+  };
+
+  std::ostream &operator<<(std::ostream &output, const quda_ptr &ptr);
+
+} // namespace quda
diff --git a/include/reference_wrapper_helper.h b/include/reference_wrapper_helper.h
index 2b85c497fd..3f73709ca6 100644
--- a/include/reference_wrapper_helper.h
+++ b/include/reference_wrapper_helper.h
@@ -1,6 +1,8 @@
 #pragma once
 
+#include <type_traits>
 #include <functional>
+#include <iterator>
 #include <initializer_list>
 #include <enum_quda.h>
 #include <util_quda.h>
diff --git a/include/staggered_kd_build_xinv.h b/include/staggered_kd_build_xinv.h
index fdf57eccf8..2bd1b4f600 100644
--- a/include/staggered_kd_build_xinv.h
+++ b/include/staggered_kd_build_xinv.h
@@ -14,7 +14,7 @@ namespace quda
      @param mass [in] Mass of the original staggered operator w/out factor of 2 convention
      @param dagger_approximation[in] Whether or not to use the dagger approximation, using the dagger of X instead of Xinv
   */
-  void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const cudaGaugeField &gauge, const double mass,
+  void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const GaugeField &gauge, const double mass,
                                         const bool dagger_approximation);
 
   /**
@@ -34,7 +34,7 @@ namespace quda
      @param dagger_approximation[in] Whether or not to use the dagger approximation, using the dagger of X instead of Xinv
      @return constructed Xinv
   */
-  std::shared_ptr<GaugeField> AllocateAndBuildStaggeredKahlerDiracInverse(const cudaGaugeField &gauge, const double mass,
+  std::shared_ptr<GaugeField> AllocateAndBuildStaggeredKahlerDiracInverse(const GaugeField &gauge, const double mass,
                                                                           const bool dagger_approximation);
 
 } // namespace quda
diff --git a/include/targets/cuda/externals/jitify.hpp b/include/targets/cuda/externals/jitify.hpp
index 46a51a97cd..110be5d22e 100644
--- a/include/targets/cuda/externals/jitify.hpp
+++ b/include/targets/cuda/externals/jitify.hpp
@@ -365,7 +365,7 @@ inline std::string path_base(std::string p) {
   // "foo/bar"  -> "foo"
   // "foo/bar/" -> "foo/bar"
 #if defined _WIN32 || defined _WIN64
-  char sep = '\\';
+  const char* sep = "\\/";
 #else
   char sep = '/';
 #endif
@@ -496,10 +496,13 @@ inline std::string comment_out_code_line(int line_num, std::string source) {
 inline void print_with_line_numbers(std::string const& source) {
   int linenum = 1;
   std::stringstream source_ss(source);
+  std::stringstream output_ss;
+  output_ss.imbue(std::locale::classic());
   for (std::string line; std::getline(source_ss, line); ++linenum) {
-    std::cout << std::setfill(' ') << std::setw(3) << linenum << " " << line
+    output_ss << std::setfill(' ') << std::setw(3) << linenum << " " << line
               << std::endl;
   }
+  std::cout << output_ss.str();
 }
 
 inline void print_compile_log(std::string program_name,
@@ -554,7 +557,7 @@ inline bool load_source(
     std::string filename, std::map<std::string, std::string>& sources,
     std::string current_dir = "",
     std::vector<std::string> include_paths = std::vector<std::string>(),
-    file_callback_type file_callback = 0,
+    file_callback_type file_callback = 0, std::string* program_name = nullptr,
     std::map<std::string, std::string>* fullpaths = nullptr,
     bool search_current_dir = true) {
   std::istream* source_stream = 0;
@@ -568,6 +571,9 @@ inline bool load_source(
     string_stream << source;
     source_stream = &string_stream;
   }
+  if (program_name) {
+    *program_name = filename;
+  }
   if (sources.count(filename)) {
     // Already got this one
     return true;
@@ -672,6 +678,8 @@ inline bool load_source(
       // TODO: Handle block comments (currently they cause a compilation error).
       size_t comment_start = line_after_pragma.find("//");
       std::string pragma_args = line_after_pragma.substr(0, comment_start);
+      // handle quote character used in #pragma expression
+      pragma_args = replace_token(pragma_args, "\"", "\\\"");
       std::string comment = comment_start != std::string::npos
                                 ? line_after_pragma.substr(comment_start)
                                 : "";
@@ -682,7 +690,7 @@ inline bool load_source(
     source += line + "\n";
   }
   // HACK TESTING (WAR for cub)
-  // source = "#define cudaDeviceSynchronize() cudaSuccess\n" + source;
+  source = "#define cudaDeviceSynchronize() cudaSuccess\n" + source;
   ////source = "cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }\n" +
   /// source;
 
@@ -690,6 +698,7 @@ inline bool load_source(
   //   of the same header from different paths.
   if (pragma_once) {
     std::stringstream ss;
+    ss.imbue(std::locale::classic());
     ss << std::uppercase << std::hex << std::setw(8) << std::setfill('0')
        << hash;
     std::string include_guard_name = "_JITIFY_INCLUDE_GUARD_" + ss.str() + "\n";
@@ -1385,7 +1394,16 @@ static const char* jitsafe_header_preinclude_h = R"(
 // WAR to allow exceptions to be parsed
 #define try
 #define catch(...)
-)";
+)"
+#if defined(_WIN32) || defined(_WIN64)
+// WAR for NVRTC <= 11.0 not defining _WIN64.
+R"(
+#ifndef _WIN64
+#define _WIN64 1
+#endif
+)"
+#endif
+;
 
 static const char* jitsafe_header_float_h = R"(
 #pragma once
@@ -1403,12 +1421,12 @@ static const char* jitsafe_header_float_h = R"(
 #define DBL_MAX_EXP     1024
 #define FLT_MAX_10_EXP  38
 #define DBL_MAX_10_EXP  308
-#define FLT_MAX         3.4028234e38f 
-#define DBL_MAX         1.7976931348623157e308 
-#define FLT_EPSILON     1.19209289e-7f 
-#define DBL_EPSILON     2.220440492503130e-16 
-#define FLT_MIN         1.1754943e-38f; 
-#define DBL_MIN         2.2250738585072013e-308 
+#define FLT_MAX         3.4028234e38f
+#define DBL_MAX         1.7976931348623157e308
+#define FLT_EPSILON     1.19209289e-7f
+#define DBL_EPSILON     2.220440492503130e-16
+#define FLT_MIN         1.1754943e-38f
+#define DBL_MIN         2.2250738585072013e-308
 #define FLT_ROUNDS      1
 #if defined __cplusplus && __cplusplus >= 201103L
 #define FLT_EVAL_METHOD 0
@@ -1596,14 +1614,28 @@ struct IntegerLimits {
 #endif  // __cplusplus >= 201103L
 	enum {
        is_specialized = true,
-       digits = (Digits == -1) ? (int)(sizeof(T)*8 - (Min != 0)) : Digits,
-       digits10   = (digits * 30103) / 100000,
-       is_signed  = ((T)(-1)<0),
-       is_integer = true,
-       is_exact   = true,
-       radix      = 2,
-       is_bounded = true,
-       is_modulo  = false
+       digits            = (Digits == -1) ? (int)(sizeof(T)*8 - (Min != 0)) : Digits,
+       digits10          = (digits * 30103) / 100000,
+       is_signed         = ((T)(-1)<0),
+       is_integer        = true,
+       is_exact          = true,
+       has_infinity      = false,
+       has_quiet_NaN     = false,
+       has_signaling_NaN = false,
+       has_denorm        = 0,
+       has_denorm_loss   = false,
+       round_style       = 0,
+       is_iec559         = false,
+       is_bounded        = true,
+       is_modulo         = !(is_signed || Max == 1 /*is bool*/),
+       max_digits10      = 0,
+       radix             = 2,
+       min_exponent      = 0,
+       min_exponent10    = 0,
+       max_exponent      = 0,
+       max_exponent10    = 0,
+       tinyness_before   = false,
+       traps             = false
 	};
 };
 } // namespace __jitify_detail
@@ -1910,6 +1942,46 @@ static const char* jitsafe_header_type_traits = R"(
     template<size_t len, size_t alignment> struct aligned_storage { struct type { alignas(alignment) char data[len]; }; };
     template <class T> struct alignment_of : std::integral_constant<size_t,alignof(T)> {};
 
+    template <typename T> struct make_unsigned;
+    template <> struct make_unsigned<signed char>        { typedef unsigned char type; };
+    template <> struct make_unsigned<signed short>       { typedef unsigned short type; };
+    template <> struct make_unsigned<signed int>         { typedef unsigned int type; };
+    template <> struct make_unsigned<signed long>        { typedef unsigned long type; };
+    template <> struct make_unsigned<signed long long>   { typedef unsigned long long type; };
+    template <> struct make_unsigned<unsigned char>      { typedef unsigned char type; };
+    template <> struct make_unsigned<unsigned short>     { typedef unsigned short type; };
+    template <> struct make_unsigned<unsigned int>       { typedef unsigned int type; };
+    template <> struct make_unsigned<unsigned long>      { typedef unsigned long type; };
+    template <> struct make_unsigned<unsigned long long> { typedef unsigned long long type; };
+    template <> struct make_unsigned<char>               { typedef unsigned char type; };
+    #if defined _WIN32 || defined _WIN64
+    template <> struct make_unsigned<wchar_t>            { typedef unsigned short type; };
+    #else
+    template <> struct make_unsigned<wchar_t>            { typedef unsigned int type; };
+    #endif
+
+    template <typename T> struct make_signed;
+    template <> struct make_signed<signed char>        { typedef signed char type; };
+    template <> struct make_signed<signed short>       { typedef signed short type; };
+    template <> struct make_signed<signed int>         { typedef signed int type; };
+    template <> struct make_signed<signed long>        { typedef signed long type; };
+    template <> struct make_signed<signed long long>   { typedef signed long long type; };
+    template <> struct make_signed<unsigned char>      { typedef signed char type; };
+    template <> struct make_signed<unsigned short>     { typedef signed short type; };
+    template <> struct make_signed<unsigned int>       { typedef signed int type; };
+    template <> struct make_signed<unsigned long>      { typedef signed long type; };
+    template <> struct make_signed<unsigned long long> { typedef signed long long type; };
+    template <> struct make_signed<char>               { typedef signed char type; };
+    #if defined _WIN32 || defined _WIN64
+    template <> struct make_signed<wchar_t>            { typedef signed short type; };
+    #else
+    template <> struct make_signed<wchar_t>            { typedef signed int type; };
+    #endif
+
+    #if __cplusplus >= 201703L
+    template< typename... Ts > struct make_void { typedef void type; };
+    template< typename... Ts > using void_t = typename make_void<Ts...>::type;
+    #endif  // __cplusplus >= 201703L
     }  // namespace std
     #endif // c++11
 )";
@@ -1949,8 +2021,8 @@ static const char* jitsafe_header_stdint_h =
     "#define INT8_MIN    SCHAR_MIN\n"
     "#define INT16_MIN   SHRT_MIN\n"
     "#if defined _WIN32 || defined _WIN64\n"
-    "#define WCHAR_MIN   SHRT_MIN\n"
-    "#define WCHAR_MAX   SHRT_MAX\n"
+    "#define WCHAR_MIN   0\n"
+    "#define WCHAR_MAX   USHRT_MAX\n"
     "typedef unsigned long long uintptr_t; //optional\n"
     "#else\n"
     "#define WCHAR_MIN   INT_MIN\n"
@@ -2083,24 +2155,33 @@ static const char* jitsafe_header_sstream =
     "#include <ostream>\n"
     "#include <istream>\n";
 
-static const char* jitsafe_header_utility =
-    "#pragma once\n"
-    "namespace std {\n"
-    "template<class T1, class T2>\n"
-    "struct pair {\n"
-    "	T1 first;\n"
-    "	T2 second;\n"
-    "	inline pair() {}\n"
-    "	inline pair(T1 const& first_, T2 const& second_)\n"
-    "		: first(first_), second(second_) {}\n"
-    "	// TODO: Standard includes many more constructors...\n"
-    "	// TODO: Comparison operators\n"
-    "};\n"
-    "template<class T1, class T2>\n"
-    "pair<T1,T2> make_pair(T1 const& first, T2 const& second) {\n"
-    "	return pair<T1,T2>(first, second);\n"
-    "}\n"
-    "}  // namespace std\n";
+static const char* jitsafe_header_utility = R"(
+    #pragma once
+    namespace std {
+    template<class T1, class T2>
+    struct pair {
+        T1 first;
+        T2 second;
+        inline pair() {}
+        inline pair(T1 const& first_, T2 const& second_): first(first_), second(second_) {}
+        // TODO: Standard includes many more constructors...
+        // TODO: Comparison operators
+    };
+    template<class T1, class T2>
+    pair<T1,T2> make_pair(T1 const& first, T2 const& second) {
+        return pair<T1,T2>(first, second);
+    }
+
+    template<typename T>
+    constexpr bool always_false = false;
+
+    template<typename T>
+    typename std::add_rvalue_reference<T>::type declval() noexcept
+    {
+    static_assert(always_false<T>, "declval not allowed in an evaluated context");
+    }
+    }  // namespace std
+)";
 
 // TODO: incomplete
 static const char* jitsafe_header_vector =
@@ -2340,14 +2421,81 @@ static const char* jitsafe_header_tuple = R"(
     #if __cplusplus >= 201103L
     namespace std {
     template<class... Types > class tuple;
+
+    template< size_t I, class T >
+    struct tuple_element;
+    // recursive case
+    template< size_t I, class Head, class... Tail >
+    struct tuple_element<I, tuple<Head, Tail...>>
+        : tuple_element<I-1, tuple<Tail...>> { };
+    // base case
+    template< class Head, class... Tail >
+    struct tuple_element<0, tuple<Head, Tail...>> {
+      using type = Head;
+    };
     } // namespace std
     #endif
  )";
 
+static const char* jitsafe_header_functional = R"(
+    #pragma once
+    #if __cplusplus >= 201103L
+    namespace std {
+    template<class T>
+    class reference_wrapper
+    {
+    public:
+    // types
+    using type = T;
+    reference_wrapper(const reference_wrapper&) noexcept = default;
+    // assignment
+    reference_wrapper& operator=(const reference_wrapper& x) noexcept = default;
+    // access
+    constexpr operator T& () const noexcept { return *_ptr; }
+    constexpr T& get() const noexcept { return *_ptr; }
+    private:
+    T* _ptr;
+    };
+    } // namespace std
+    #endif
+)";
+
+static const char* jitsafe_header_map = R"(
+    #pragma once
+    namespace std {
+    template<class Key, class T, class Compare = void, class Allocator = void> class map {};
+    } // namespace std
+)";
+
+static const char* jitsafe_header_stack = R"(
+    #pragma once
+    namespace std {
+    template<class T, class = void> class stack {};
+    } // namespace std
+)";
+
+static const char* jitsafe_header_initializer_list = R"(
+    #pragma once
+)";
+
 static const char* jitsafe_header_assert = R"(
     #pragma once
  )";
 
+static const char* jitsafe_header_sys_time = R"(
+    #pragma once
+    struct timeval {
+    unsigned long long tv_sec;
+    unsigned long long tv_usec;
+    };
+    // POSIX interval-timer value taken by getitimer()/setitimer() below
+    struct itimerval { struct timeval it_interval; struct timeval it_value; };
+    int getitimer(int, struct itimerval *);
+    int gettimeofday(struct timeval *, void *);
+    int setitimer(int, const struct itimerval *, struct itimerval *);
+    int utimes(const char *, const struct timeval [2]);
+ )";
+
 // WAR: These need to be pre-included as a workaround for NVRTC implicitly using
 // /usr/include as an include path. The other built-in headers will be included
 // lazily as needed.
@@ -2406,8 +2554,13 @@ static const std::map<std::string, std::string>& get_jitsafe_headers_map() {
       {"time.h", jitsafe_header_time_h},
       {"ctime", jitsafe_header_time_h},
       {"tuple", jitsafe_header_tuple},
+      {"functional", jitsafe_header_functional},
+      {"map", jitsafe_header_map},
+      {"stack", jitsafe_header_stack},
+      {"initializer_list", jitsafe_header_initializer_list},
       {"assert.h", jitsafe_header_assert},
-      {"cassert", jitsafe_header_assert}};
+      {"cassert", jitsafe_header_assert},
+      {"sys/time.h", jitsafe_header_sys_time}};
   return jitsafe_headers_map;
 }
 
@@ -2673,6 +2826,17 @@ inline nvrtcResult compile_kernel(std::string program_name,
       &nvrtc_program, program_source.c_str(), program_name.c_str(), num_headers,
       header_sources_c.data(), header_names_c.data()));
 
+  // Ensure nvrtc_program gets destroyed.
+  struct ScopedNvrtcProgramDestroyer {
+    nvrtcProgram& nvrtc_program_;
+    ScopedNvrtcProgramDestroyer(nvrtcProgram& nvrtc_program)
+        : nvrtc_program_(nvrtc_program) {}
+    ~ScopedNvrtcProgramDestroyer() { nvrtcDestroyProgram(&nvrtc_program_); }
+    ScopedNvrtcProgramDestroyer(const ScopedNvrtcProgramDestroyer&) = delete;
+    ScopedNvrtcProgramDestroyer& operator=(const ScopedNvrtcProgramDestroyer&) =
+        delete;
+  } nvrtc_program_scope_guard{nvrtc_program};
+
 #if CUDA_VERSION >= 8000
   if (!instantiation.empty()) {
     CHECK_NVRTC(nvrtcAddNameExpression(nvrtc_program, instantiation.c_str()));
@@ -2720,7 +2884,6 @@ inline nvrtcResult compile_kernel(std::string program_name,
 #endif
   }
 
-  CHECK_NVRTC(nvrtcDestroyProgram(&nvrtc_program));
 #undef CHECK_NVRTC
   return NVRTC_SUCCESS;
 }
@@ -2746,10 +2909,9 @@ inline void load_program(std::string const& cuda_source,
 
   // Load program source
   if (!detail::load_source(cuda_source, *program_sources, "", *include_paths,
-                           file_callback)) {
+                           file_callback, program_name)) {
     throw std::runtime_error("Source not found: " + cuda_source);
   }
-  *program_name = program_sources->begin()->first;
 
   // Maps header include names to their full file paths.
   std::map<std::string, std::string> header_fullpaths;
@@ -2757,7 +2919,7 @@ inline void load_program(std::string const& cuda_source,
   // Load header sources
   for (std::string const& header : headers) {
     if (!detail::load_source(header, *program_sources, "", *include_paths,
-                             file_callback, &header_fullpaths)) {
+                             file_callback, nullptr, &header_fullpaths)) {
       // **TODO: Deal with source not found
       throw std::runtime_error("Source not found: " + header);
     }
@@ -2816,8 +2978,8 @@ inline void load_program(std::string const& cuda_source,
     std::string include_parent_fullpath = header_fullpaths[include_parent];
     std::string include_path = detail::path_base(include_parent_fullpath);
     if (detail::load_source(include_name, *program_sources, include_path,
-                            *include_paths, file_callback, &header_fullpaths,
-                            is_included_with_quotes)) {
+                            *include_paths, file_callback, nullptr,
+                            &header_fullpaths, is_included_with_quotes)) {
 #if JITIFY_PRINT_HEADER_PATHS
       std::cout << "Found #include " << include_name << " from "
                 << include_parent << ":" << line_num << " ["
@@ -3067,6 +3229,7 @@ class KernelLauncher {
   std::unique_ptr<KernelLauncher_impl const> _impl;
 
  public:
+  KernelLauncher() = default;
   inline KernelLauncher(KernelInstantiation const& kernel_inst, dim3 grid,
                         dim3 block, unsigned int smem = 0,
                         cudaStream_t stream = 0);
@@ -3135,6 +3298,7 @@ class KernelInstantiation {
   std::unique_ptr<KernelInstantiation_impl const> _impl;
 
  public:
+  KernelInstantiation() = default;
   inline KernelInstantiation(Kernel const& kernel,
                              std::vector<std::string> const& template_args);
 
@@ -3282,6 +3446,7 @@ class Kernel {
   std::unique_ptr<Kernel_impl const> _impl;
 
  public:
+  Kernel() = default;
   Kernel(Program const& program, std::string name,
          jitify::detail::vector<std::string> options = 0);
 
@@ -3346,6 +3511,7 @@ class Program {
   std::unique_ptr<Program_impl const> _impl;
 
  public:
+  Program() = default;
   Program(JitCache& cache, std::string source,
           jitify::detail::vector<std::string> headers = 0,
           jitify::detail::vector<std::string> options = 0,
diff --git a/include/targets/hip/FFT_Plans.h b/include/targets/hip/FFT_Plans.h
index d43415cd99..d4de34d368 100644
--- a/include/targets/hip/FFT_Plans.h
+++ b/include/targets/hip/FFT_Plans.h
@@ -2,7 +2,7 @@
 
 #include <quda_hip_api.h>
 #include <quda_internal.h>
-#include <hipfft.h>
+#include <hipfft/hipfft.h>
 
 #define FFT_FORWARD HIPFFT_FORWARD
 #define FFT_INVERSE HIPFFT_BACKWARD
diff --git a/include/timer.h b/include/timer.h
index 4c1557b7ce..0b69d5d466 100644
--- a/include/timer.h
+++ b/include/timer.h
@@ -2,10 +2,6 @@
 
 #include <sys/time.h>
 
-#ifdef INTERFACE_NVTX
-#include "nvtx3/nvToolsExt.h"
-#endif
-
 #include <quda_internal.h>
 #include <util_quda.h>
 #include <device.h>
@@ -65,15 +61,16 @@ namespace quda {
       }
     }
 
+    int ref_count = 0;
+
     /**
        @brief Start the timer
      */
-    void start(const char *func = nullptr, const char *file = nullptr, int line = 0)
+    void start(const char * = nullptr, const char * = nullptr, int = 0)
     {
-      if (running) {
-        printfQuda("ERROR: Cannot start an already running timer (%s:%d in %s())", file ? file : "", line,
-                   func ? func : "");
-        errorQuda("Aborting");
+      if (running) { // if the timer has already started, we increment the ref counter and return
+        ref_count++;
+        return;
       }
       if (!device) {
         gettimeofday(&host_start, NULL);
@@ -110,6 +107,10 @@ namespace quda {
      */
     void stop(const char *func = nullptr, const char *file = nullptr, int line = 0)
     {
+      if (ref_count > 0) {
+        ref_count--;
+        return;
+      }
       peek(func, file, line);
       time += last_interval;
       count++;
@@ -186,70 +187,26 @@ namespace quda {
     QUDA_PROFILE_COUNT  /**< The total number of timers we have.  Must be last enum type. */
   };
 
-#ifdef INTERFACE_NVTX
-
-#define PUSH_RANGE(name,cid) { \
-    int color_id = cid; \
-    color_id = color_id%nvtx_num_colors;\
-    nvtxEventAttributes_t eventAttrib = {}; \
-    eventAttrib.version = NVTX_VERSION; \
-    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
-    eventAttrib.colorType = NVTX_COLOR_ARGB; \
-    eventAttrib.color = nvtx_colors[color_id]; \
-    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
-    eventAttrib.message.ascii = name; \
-    eventAttrib.category = cid;\
-    nvtxRangePushEx(&eventAttrib); \
-}
-#define POP_RANGE nvtxRangePop();
-#else
-#define PUSH_RANGE(name,cid)
-#define POP_RANGE
-#endif
-
   class TimeProfile {
     std::string fname;  /**< Which function are we profiling */
 #ifdef INTERFACE_NVTX
     static const uint32_t nvtx_colors[];// = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff };
     static const int nvtx_num_colors;// = sizeof(nvtx_colors)/sizeof(uint32_t);
 #endif
-    host_timer_t profile[QUDA_PROFILE_COUNT];
+    array<host_timer_t, QUDA_PROFILE_COUNT> profile;
     static std::string pname[];
 
     bool switchOff;
     bool use_global;
 
-    // global timer
-    static host_timer_t global_profile[QUDA_PROFILE_COUNT];
-    static bool global_switchOff[QUDA_PROFILE_COUNT];
-    static int global_total_level[QUDA_PROFILE_COUNT]; // zero initialize
-
-    static void StopGlobal(const char *func, const char *file, int line, QudaProfileType idx) {
-
-      global_total_level[idx]--;
-      if (global_total_level[idx] == 0) global_profile[idx].stop(func, file, line);
-
-      // switch off total timer if we need to
-      if (global_switchOff[idx]) {
-        global_total_level[idx]--;
-        if (global_total_level[idx] == 0) global_profile[idx].stop(func, file, line);
-        global_switchOff[idx] = false;
-      }
-    }
-
-    static void StartGlobal(const char *func, const char *file, int line, QudaProfileType idx) {
-      // if total timer isn't running, then start it running
-      if (!global_profile[idx].running) {
-        global_profile[idx].start(func, file, line);
-        global_total_level[idx]++;
-        global_switchOff[idx] = true;
-      }
-
-      if (global_total_level[idx] == 0) global_profile[idx].start(func, file, line);
-      global_total_level[idx]++;
-    }
+    static void StopGlobal(const char *func, const char *file, int line, QudaProfileType idx);
+    static void StartGlobal(const char *func, const char *file, int line, QudaProfileType idx);
 
   public:
+    TimeProfile() = default;
+    TimeProfile(const TimeProfile &) = default;
+    TimeProfile &operator=(const TimeProfile &) = default;
+
     TimeProfile(std::string fname) : fname(fname), switchOff(false), use_global(true) { ; }
 
     TimeProfile(std::string fname, bool use_global) : fname(fname), switchOff(false), use_global(use_global) { ; }
@@ -257,30 +214,8 @@ namespace quda {
     /**< Print out the profile information */
     void Print();
 
-    void Start_(const char *func, const char *file, int line, QudaProfileType idx)
-    {
-      // if total timer isn't running, then start it running
-      if (!profile[QUDA_PROFILE_TOTAL].running && idx != QUDA_PROFILE_TOTAL) {
-        profile[QUDA_PROFILE_TOTAL].start(func, file, line);
-        switchOff = true;
-      }
-
-      profile[idx].start(func, file, line);
-      PUSH_RANGE(fname.c_str(),idx)
-	if (use_global) StartGlobal(func,file,line,idx);
-    }
-
-    void Stop_(const char *func, const char *file, int line, QudaProfileType idx) {
-      profile[idx].stop(func, file, line);
-      POP_RANGE
-
-      // switch off total timer if we need to
-      if (switchOff && idx != QUDA_PROFILE_TOTAL) {
-        profile[QUDA_PROFILE_TOTAL].stop(func, file, line);
-        switchOff = false;
-      }
-      if (use_global) StopGlobal(func,file,line,idx);
-    }
+    void Start_(const char *func, const char *file, int line, QudaProfileType idx);
+    void Stop_(const char *func, const char *file, int line, QudaProfileType idx);
 
     void Reset_(const char *func, const char *file, int line) {
       for (int idx = 0; idx < QUDA_PROFILE_COUNT; idx++) profile[idx].reset(func, file, line);
@@ -294,12 +229,29 @@ namespace quda {
 
   };
 
-  static TimeProfile dummy("dummy");
+  /**
+     @brief Container that we use for pushing a profile onto the
+     profile stack.  While this object is in scope it will exist on
+     the profile stack, and be popped when its destructor is called.
+   */
+  struct pushProfile {
+    static inline double secs_dummy = 0;
+    static inline double gflops_dummy = 0;
+    TimeProfile &profile;
+    double &secs;
+    double &gflops;
+    uint64_t flops;
+    pushProfile(TimeProfile &profile, double &secs = secs_dummy, double &gflops = gflops_dummy);
+    virtual ~pushProfile();
+  };
 
-} // namespace quda
+  /**
+     @brief Return a reference to the present profile at the top of
+     the stack
+   */
+  TimeProfile &getProfile();
 
-#undef PUSH_RANGE
-#undef POP_RANGE
+} // namespace quda
 
 #define TPSTART(idx) Start_(__func__, __FILE__, __LINE__, idx)
 #define TPSTOP(idx) Stop_(__func__, __FILE__, __LINE__, idx)
diff --git a/include/tune_quda.h b/include/tune_quda.h
index 1511f6f881..9da6a82411 100644
--- a/include/tune_quda.h
+++ b/include/tune_quda.h
@@ -17,34 +17,26 @@
 
 namespace quda {
 
-  class TuneParam {
-
-  public:
-    dim3 block;
+  struct TuneParam {
+    dim3 block = {1, 1, 1};
     dim3 grid;
-    unsigned int shared_bytes;
-    bool set_max_shared_bytes; // whether to opt in to max shared bytes per thread block
-    int4 aux; // free parameter that can be used as an arbitrary autotuning dimension outside of launch parameters
+    unsigned int shared_bytes = 0;
+    bool set_max_shared_bytes = false; // whether to opt in to max shared bytes per thread block
+    int4 aux = {1, 1, 1, 1};           // free parameter used as an arbitrary autotuning dimension
 
     std::string comment;
-    float time;
-    long long n_calls;
+    float time = FLT_MAX;
+    long long n_calls = 0;
 
     TuneParam();
     TuneParam(const TuneParam &) = default;
     TuneParam(TuneParam &&) = default;
     TuneParam &operator=(const TuneParam &) = default;
     TuneParam &operator=(TuneParam &&) = default;
-
-    friend std::ostream& operator<<(std::ostream& output, const TuneParam& param) {
-      output << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
-      output << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), ";
-      output << "shared_bytes=" << param.shared_bytes;
-      output << ", aux=(" << param.aux.x << "," << param.aux.y << "," << param.aux.z << "," << param.aux.w << ")";
-      return output;
-    }
   };
 
+  std::ostream &operator<<(std::ostream &, const TuneParam &);
+
   /**
    * @brief Returns a reference to the tunecache map
    * @return tunecache reference
@@ -53,6 +45,10 @@ namespace quda {
 
   class Tunable {
 
+    friend TuneParam tuneLaunch(Tunable &, QudaTune, QudaVerbosity);
+    static inline uint64_t _flops_global = 0;
+    static inline uint64_t _bytes_global = 0;
+
   protected:
     virtual long long flops() const { return 0; }
     virtual long long bytes() const { return 0; }
@@ -68,20 +64,7 @@ namespace quda {
     virtual bool tuneGridDim() const { return true; }
     virtual bool tuneAuxDim() const { return false; }
 
-    virtual bool tuneSharedBytes() const
-    {
-      static bool tune_shared = true;
-      static bool init = false;
-
-      if (!init) {
-        char *enable_shared_env = getenv("QUDA_ENABLE_TUNING_SHARED");
-        if (enable_shared_env) {
-          if (strcmp(enable_shared_env, "0") == 0) { tune_shared = false; }
-        }
-        init = true;
-      }
-      return tune_shared;
-    }
+    virtual bool tuneSharedBytes() const;
 
     virtual bool advanceGridDim(TuneParam &param) const
     {
@@ -239,16 +222,7 @@ namespace quda {
        @brief Whether the present instance has already been tuned or not
        @return True if tuned, false if not
     */
-    bool tuned()
-    {
-      // not tuning is equivalent to already tuned
-      if (!getTuning()) return true;
-
-      TuneKey key = tuneKey();
-      if (use_managed_memory()) strcat(key.aux, ",managed");
-      // if key is present in cache then already tuned
-      return getTuneCache().find(key) != getTuneCache().end();
-    }
+    bool tuned() const;
 
   public:
     Tunable() : launch_error(QUDA_SUCCESS) { aux[0] = '\0'; }
@@ -287,24 +261,9 @@ namespace quda {
      */
     virtual float min_tune_time() const { return 1e-3; }
 
-    virtual std::string paramString(const TuneParam &param) const
-    {
-      std::stringstream ps;
-      ps << param;
-      return ps.str();
-    }
-
-    virtual std::string perfString(float time) const
-    {
-      float gflops = flops() / (1e9 * time);
-      float gbytes = bytes() / (1e9 * time);
-      std::stringstream ss;
-      ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, ";
-      ss << gbytes << " GB/s";
-      return ss.str();
-    }
-
-    virtual std::string miscString(const TuneParam &) const { return std::string(); }
+    virtual std::string paramString(const TuneParam &param) const;
+    virtual std::string perfString(float time) const;
+    virtual std::string miscString(const TuneParam &) const;
 
     virtual void initTuneParam(TuneParam &param) const
     {
@@ -385,6 +344,12 @@ namespace quda {
 
     qudaError_t launchError() const { return launch_error; }
     qudaError_t &launchError() { return launch_error; }
+
+    static void flops_global(uint64_t value) { _flops_global = value; }
+    static uint64_t flops_global() { return _flops_global; }
+
+    static void bytes_global(uint64_t value) { _bytes_global = value; }
+    static uint64_t bytes_global() { return _bytes_global; }
   };
 
   /**
diff --git a/include/util_quda.h b/include/util_quda.h
index 533df01970..3d68fb5a2e 100644
--- a/include/util_quda.h
+++ b/include/util_quda.h
@@ -66,7 +66,7 @@ char *getPrintBuffer();
    number of OMP threads for CPU functions recorded in the tune cache.
    @return Returns the string
 */
-char* getOmpThreadStr();
+const char *getOmpThreadStr();
 
 void errorQuda_(const char *func, const char *file, int line, ...);
 
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 60c245b743..6c18ccae43 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -36,7 +36,7 @@ set (QUDA_OBJS
   field_cache.cpp
   gauge_covdev.cpp dirac.cpp
   clover_field.cpp lattice_field.cpp gauge_field.cpp
-  cpu_gauge_field.cpp cuda_gauge_field.cpp extract_gauge_ghost.cu
+  extract_gauge_ghost.cu
   gauge_norm.cu gauge_update_quda.cu
   max_clover.cu dirac_clover.cpp dirac_wilson.cpp dirac_staggered.cpp
   dirac_staggered_kd.cpp dirac_clover_hasenbusch_twist.cpp
@@ -84,7 +84,7 @@ set (QUDA_OBJS
   clover_sigma_outer_product.cu momentum.cu gauge_qcharge.cu
   deflation.cpp checksum.cu transform_reduce.cu
   dslash5_mobius_eofa.cu
-  madwf_ml.cpp
+  madwf_ml.cpp quda_ptr.cpp
   instantiate.cpp version.cpp
   block_transpose.cu )
 # cmake-format: on
@@ -174,6 +174,7 @@ configure_file(extract_gauge_ghost.in.cu extract_gauge_ghost.cu @ONLY)
 configure_file(gauge_noise.in.cu gauge_noise.cu @ONLY)
 configure_file(gauge_norm.in.cu gauge_norm.cu @ONLY)
 configure_file(spinor_noise.in.cu spinor_noise.cu @ONLY)
+configure_file(spinor_dilute.in.cu spinor_dilute.cu @ONLY)
 configure_file(copy_color_spinor_mg.in.hpp copy_color_spinor_mg.hpp @ONLY)
 configure_file(color_spinor_pack.in.cu color_spinor_pack.cu @ONLY)
 configure_file(color_spinor_util.in.cu color_spinor_util.cu @ONLY)
@@ -469,6 +470,7 @@ endif()
 
 if(QUDA_OPENMP)
   target_link_libraries(quda PUBLIC OpenMP::OpenMP_CXX)
+  target_compile_definitions(quda PUBLIC QUDA_OPENMP)
 endif()
 
 # set which precisions to enable
diff --git a/lib/blas_quda.cu b/lib/blas_quda.cu
index 4c8719f309..f84f2eeb59 100644
--- a/lib/blas_quda.cu
+++ b/lib/blas_quda.cu
@@ -7,9 +7,6 @@ namespace quda {
 
   namespace blas {
 
-    unsigned long long flops;
-    unsigned long long bytes;
-
     template <template <typename real> class Functor, typename store_t, typename y_store_t,
               int nSpin, typename coeff_t>
     class Blas : public TunableGridStrideKernel2D
@@ -56,9 +53,6 @@ namespace quda {
         }
 
         apply(device::get_default_stream());
-
-        blas::bytes += bytes();
-        blas::flops += flops();
       }
 
       TuneKey tuneKey() const override { return TuneKey(vol, typeid(f).name(), aux); }
diff --git a/lib/block_orthogonalize.in.cu b/lib/block_orthogonalize.in.cu
index 27b7d68f22..64651fb55f 100644
--- a/lib/block_orthogonalize.in.cu
+++ b/lib/block_orthogonalize.in.cu
@@ -278,7 +278,7 @@ namespace quda {
                 QUDA_PRECISION, V.Precision(), B[0]->Precision());
 
     if constexpr (is_enabled_multigrid()) {
-      if (B[0]->V() == nullptr) {
+      if (B[0]->data() == nullptr) {
         warningQuda("Trying to BlockOrthogonalize staggered transform, skipping...");
         return;
       }
diff --git a/lib/clover_deriv_quda.cu b/lib/clover_deriv_quda.cu
index 5071dcdc8b..3a1ab75569 100644
--- a/lib/clover_deriv_quda.cu
+++ b/lib/clover_deriv_quda.cu
@@ -66,6 +66,7 @@ namespace quda {
 #if defined(GPU_CLOVER_DIRAC) && (QUDA_PRECISION & 8)
   void cloverDerivative(GaugeField &force, GaugeField &gauge, GaugeField &oprod, double coeff, QudaParity parity)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     assert(oprod.Geometry() == QUDA_TENSOR_GEOMETRY);
     assert(force.Geometry() == QUDA_VECTOR_GEOMETRY);
 
@@ -81,6 +82,7 @@ namespace quda {
     } else {
       errorQuda("Precision %d not supported", force.Precision());
     }
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else
   void cloverDerivative(GaugeField &, GaugeField &, GaugeField &, double, QudaParity)
diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp
index bb952ba324..0d859c4fdc 100644
--- a/lib/clover_field.cpp
+++ b/lib/clover_field.cpp
@@ -15,9 +15,7 @@ namespace quda {
   CloverFieldParam::CloverFieldParam(const CloverField &a) :
     LatticeFieldParam(a),
     reconstruct(clover::reconstruct()),
-    inverse(a.V(true)),
-    clover(nullptr),
-    cloverInv(nullptr),
+    inverse(a.Inverse()),
     csw(a.Csw()),
     coeff(a.Coeff()),
     twist_flavor(a.TwistFlavor()),
@@ -36,21 +34,16 @@ namespace quda {
   CloverField::CloverField(const CloverFieldParam &param) :
     LatticeField(param),
     reconstruct(param.reconstruct),
-    bytes(0),
     nColor(3),
     nSpin(4),
-    clover(nullptr),
-    cloverInv(nullptr),
-    diagonal(0.0),
-    max {0, 0},
+    inverse(param.inverse),
     csw(param.csw),
     coeff(param.coeff),
     twist_flavor(param.twist_flavor),
     mu2(param.mu2),
     rho(param.rho),
     order(param.order),
-    create(param.create),
-    trlog {0, 0}
+    create(param.create)
   {
     if (siteSubset != QUDA_FULL_SITE_SUBSET) errorQuda("Unexpected siteSubset %d", siteSubset);
     if (nDim != 4) errorQuda("Number of dimensions must be 4, not %d", nDim);
@@ -79,53 +72,26 @@ namespace quda {
 
     if (bytes) {
       if (create != QUDA_REFERENCE_FIELD_CREATE) {
-        if (location == QUDA_CUDA_FIELD_LOCATION) {
-          clover = pool_device_malloc(bytes);
-        } else {
-          clover = safe_malloc(bytes);
-        }
-
+        clover = quda_ptr(mem_type, bytes);
       } else {
-        clover = param.clover;
+        clover = quda_ptr(param.clover, mem_type);
       }
 
       total_bytes += bytes;
 
-      if (param.inverse) {
+      if (inverse) {
         if (create != QUDA_REFERENCE_FIELD_CREATE) {
-          if (location == QUDA_CUDA_FIELD_LOCATION) {
-            cloverInv = pool_device_malloc(bytes);
-          } else {
-            cloverInv = safe_malloc(bytes);
-          }
+          cloverInv = quda_ptr(mem_type, bytes);
         } else {
-          cloverInv = param.cloverInv;
+          cloverInv = quda_ptr(param.cloverInv, mem_type);
         }
 
         total_bytes += bytes;
       }
 
       if (create == QUDA_ZERO_FIELD_CREATE) {
-        if (location == QUDA_CUDA_FIELD_LOCATION) {
-          qudaMemset(clover, '\0', bytes);
-          if (param.inverse) qudaMemset(cloverInv, '\0', bytes);
-        } else {
-          memset(clover, '\0', bytes);
-          if (param.inverse) memset(cloverInv, '\0', bytes);
-        }
-      }
-    }
-  }
-
-  CloverField::~CloverField()
-  {
-    if (create != QUDA_REFERENCE_FIELD_CREATE) {
-      if (location == QUDA_CUDA_FIELD_LOCATION) {
-        if (clover) pool_device_free(clover);
-        if (cloverInv) pool_device_free(cloverInv);
-      } else {
-        if (clover) host_free(clover);
-        if (cloverInv) host_free(cloverInv);
+        qudaMemset(clover, '\0', bytes);
+        if (inverse) qudaMemset(cloverInv, '\0', bytes);
       }
     }
   }
@@ -141,38 +107,31 @@ namespace quda {
 
   void CloverField::backup(bool which) const
   {
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      qudaMemcpy(backup_h + which * bytes, V(which), bytes, qudaMemcpyDeviceToHost);
-    } else {
-      memcpy(backup_h + which * bytes, V(which), bytes);
-    }
+    qudaMemcpy(backup_h[which], which ? cloverInv : clover, bytes, qudaMemcpyDefault);
   }
 
   void CloverField::backup() const
   {
-    if (backup_h) errorQuda("Already allocated host backup");
-    backup_h = static_cast<char *>(safe_malloc(2 * bytes));
+    if (backup_h.size()) errorQuda("Already allocated host backup");
+    backup_h.resize(2);
+    for (auto &b : backup_h) b = quda_ptr(QUDA_MEMORY_HOST, bytes);
 
-    if (V(false)) backup(false);
-    if (V(true)) backup(true);
+    backup(false);
+    if (inverse) backup(true);
   }
 
   void CloverField::restore(bool which) const
   {
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      qudaMemcpy((void *)V(which), backup_h + which * bytes, bytes, qudaMemcpyHostToDevice);
-    } else {
-      memcpy((void *)V(which), backup_h + which * bytes, bytes);
-    }
+    qudaMemcpy(which ? cloverInv : clover, backup_h[which], bytes, qudaMemcpyDefault);
   }
 
   void CloverField::restore() const
   {
-    if (V(false)) restore(false);
-    if (V(true)) restore(true);
+    if (!backup_h.size()) errorQuda("Cannot restore since not backed up");
+    restore(false);
+    if (inverse) restore(true);
 
-    host_free(backup_h);
-    backup_h = nullptr;
+    backup_h.resize(0);
   }
 
   CloverField *CloverField::Create(const CloverFieldParam &param) { return new CloverField(param); }
@@ -184,9 +143,15 @@ namespace quda {
 
   void CloverField::copy(const CloverField &src, bool is_inverse)
   {
+    if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
+      getProfile().TPSTART(QUDA_PROFILE_D2H);
+    } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
+      getProfile().TPSTART(QUDA_PROFILE_H2D);
+    }
+
     // special case where we wish to make a copy of the inverse field when dynamic_inverse is enabled
     static bool dynamic_inverse_copy = false;
-    if (is_inverse && clover::dynamic_inverse() && V(true) && !src.V(true) && !dynamic_inverse_copy) {
+    if (is_inverse && clover::dynamic_inverse() && inverse && !src.inverse && !dynamic_inverse_copy) {
       dynamic_inverse_copy = true;
       // create a copy of the clover field that we will invert in place and use as the source
       CloverFieldParam param(src);
@@ -201,11 +166,11 @@ namespace quda {
     }
 
     checkField(src);
-    if (!V(is_inverse)) errorQuda("Destination field's is_inverse=%d component does not exist", is_inverse);
-    if (!src.V(is_inverse) && !dynamic_inverse_copy)
+    if (is_inverse && !inverse) errorQuda("Destination field's is_inverse=%d component does not exist", is_inverse);
+    if (is_inverse && !src.Inverse() && !dynamic_inverse_copy)
       errorQuda("Source field's is_inverse=%d component does not exist", is_inverse);
 
-    auto src_v = dynamic_inverse_copy ? src.V(false) : src.V(is_inverse);
+    auto src_v = dynamic_inverse_copy ? src.data(false) : src.data(is_inverse);
 
     // if we copying to a reconstruction field, we must find the overall scale factor to allow us to reconstruct
     if (Reconstruct()) {
@@ -227,7 +192,7 @@ namespace quda {
         void *packClover = pool_pinned_malloc(bytes);
 
         copyGenericClover(*this, src, is_inverse, QUDA_CPU_FIELD_LOCATION, packClover, src_v);
-        qudaMemcpy(V(is_inverse), packClover, bytes, qudaMemcpyHostToDevice);
+        qudaMemcpy(data(is_inverse), packClover, bytes, qudaMemcpyHostToDevice);
 
         pool_pinned_free(packClover);
       } else if (reorder_location() == QUDA_CUDA_FIELD_LOCATION && src.Location() == QUDA_CPU_FIELD_LOCATION) {
@@ -252,13 +217,17 @@ namespace quda {
         void *packClover = pool_device_malloc(bytes);
 
         copyGenericClover(*this, src, is_inverse, QUDA_CUDA_FIELD_LOCATION, packClover, src_v);
-        qudaMemcpy(V(is_inverse), packClover, bytes, qudaMemcpyDeviceToHost);
+        qudaMemcpy(data(is_inverse), packClover, bytes, qudaMemcpyDeviceToHost);
 
         pool_device_free(packClover);
       }
     }
 
-    qudaDeviceSynchronize();
+    if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
+      getProfile().TPSTOP(QUDA_PROFILE_D2H);
+    } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
+      getProfile().TPSTOP(QUDA_PROFILE_H2D);
+    }
   }
 
   void CloverField::copy(const CloverField &src)
@@ -270,26 +239,22 @@ namespace quda {
   void CloverField::copy_to_buffer(void *buffer) const
   {
     size_t buffer_offset = 0;
-    if (V(false)) { // direct
-      qudaMemcpy(buffer, clover, bytes, qudaMemcpyDefault);
-      buffer_offset += bytes;
-    }
+    qudaMemcpy(buffer, clover.data(), bytes, qudaMemcpyDefault);
+    buffer_offset += bytes;
 
-    if (V(true)) { // inverse
-      qudaMemcpy(static_cast<char *>(buffer) + buffer_offset, cloverInv, bytes, qudaMemcpyDefault);
+    if (inverse) { // inverse
+      qudaMemcpy(static_cast<char *>(buffer) + buffer_offset, cloverInv.data(), bytes, qudaMemcpyDefault);
     }
   }
 
   void CloverField::copy_from_buffer(void *buffer)
   {
     size_t buffer_offset = 0;
-    if (V(false)) { // direct
-      qudaMemcpy(clover, static_cast<char *>(buffer), bytes, qudaMemcpyDefault);
-      buffer_offset += bytes;
-    }
+    qudaMemcpy(clover.data(), static_cast<char *>(buffer), bytes, qudaMemcpyDefault);
+    buffer_offset += bytes;
 
-    if (V(true)) { // inverse
-      qudaMemcpy(cloverInv, static_cast<char *>(buffer) + buffer_offset, bytes, qudaMemcpyDefault);
+    if (inverse) { // inverse
+      qudaMemcpy(cloverInv.data(), static_cast<char *>(buffer) + buffer_offset, bytes, qudaMemcpyDefault);
     }
   }
 
@@ -303,12 +268,12 @@ namespace quda {
                              QudaParity parity) const
   {
     if (location == QUDA_CUDA_FIELD_LOCATION && is_prefetch_enabled()) {
-      auto clover_parity = clover;
-      auto cloverInv_parity = cloverInv;
       auto bytes_parity = parity == QUDA_INVALID_PARITY ? bytes : bytes / 2;
+      auto clover_parity = clover.data();
+      auto cloverInv_parity = inverse ? cloverInv.data() : nullptr;
       if (parity == QUDA_ODD_PARITY) {
-        clover_parity = clover ? static_cast<char *>(clover_parity) + bytes_parity : nullptr;
-        cloverInv_parity = cloverInv ? static_cast<char *>(cloverInv_parity) + bytes_parity : nullptr;
+        clover_parity = static_cast<char *>(clover_parity) + bytes_parity;
+        cloverInv_parity = inverse ? static_cast<char *>(cloverInv_parity) + bytes_parity : nullptr;
       }
 
       switch (type) {
@@ -366,25 +331,23 @@ namespace quda {
     spinor_param.fieldOrder = colorspinor::getNative(a.Precision(), a.Nspin());
     spinor_param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
     spinor_param.create = QUDA_REFERENCE_FIELD_CREATE;
-    spinor_param.v = (void*)a.V(inverse);
+    spinor_param.v = a.data(inverse);
     spinor_param.location = a.Location();
     return spinor_param;
   }
 
   // Return the L2 norm squared of the clover field
-  double norm2(const CloverField &a, bool inverse) {
-    ColorSpinorField *b = ColorSpinorField::Create(colorSpinorParam(a, inverse));
-    double nrm2 = blas::norm2(*b);
-    delete b;
-    return nrm2;
+  double norm2(const CloverField &a, bool inverse)
+  {
+    ColorSpinorField b(colorSpinorParam(a, inverse));
+    return blas::norm2(b);
   }
 
   // Return the L1 norm of the clover field
-  double norm1(const CloverField &a, bool inverse) {
-    ColorSpinorField *b = ColorSpinorField::Create(colorSpinorParam(a, inverse));
-    double nrm1 = blas::norm1(*b);
-    delete b;
-    return nrm1;
+  double norm1(const CloverField &a, bool inverse)
+  {
+    ColorSpinorField b(colorSpinorParam(a, inverse));
+    return blas::norm1(b);
   }
 
 } // namespace quda
diff --git a/lib/clover_invert.cu b/lib/clover_invert.cu
index ac7f15fbfe..903ce4e76c 100644
--- a/lib/clover_invert.cu
+++ b/lib/clover_invert.cu
@@ -49,9 +49,11 @@ namespace quda {
 #ifdef GPU_CLOVER_DIRAC
   void cloverInvert(CloverField &clover, bool computeTraceLog)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (clover.Reconstruct()) errorQuda("Cannot store the inverse with a reconstruct field");
     if (clover.Precision() < QUDA_SINGLE_PRECISION) errorQuda("Cannot use fixed-point precision here");
     instantiate<CloverInvert>(clover, computeTraceLog);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else
   void cloverInvert(CloverField &, bool)
diff --git a/lib/clover_outer_product.cu b/lib/clover_outer_product.cu
index c24595b58d..8ac8744bf8 100644
--- a/lib/clover_outer_product.cu
+++ b/lib/clover_outer_product.cu
@@ -136,6 +136,7 @@ namespace quda {
   void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector<ColorSpinorField *> &x,
                           std::vector<ColorSpinorField *> &p, std::vector<double> &coeff)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     checkNative(*x[0], *p[0], force, U);
     checkPrecision(*x[0], *p[0], force, U);
 
@@ -162,6 +163,7 @@ namespace quda {
         instantiate<CloverForce, ReconstructNo12>(U, force, inA, inB, inC, inD, parity, coeff[i]);
       }
     }
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else // GPU_CLOVER_DIRAC not defined
   void computeCloverForce(GaugeField &, const GaugeField &, std::vector<ColorSpinorField *> &,
diff --git a/lib/clover_quda.cu b/lib/clover_quda.cu
index 853fdbe156..c000310f6b 100644
--- a/lib/clover_quda.cu
+++ b/lib/clover_quda.cu
@@ -37,9 +37,11 @@ namespace quda {
 #ifdef GPU_CLOVER_DIRAC
   void computeClover(CloverField &clover, const GaugeField& f, double coeff)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (clover.Precision() < QUDA_SINGLE_PRECISION) errorQuda("Cannot use fixed-point precision here");
     clover.Diagonal(0.5); // 0.5 comes from scaling used on native fields
     instantiate<ComputeClover>(clover, f, coeff);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else
   void computeClover(CloverField &, const GaugeField &, double)
diff --git a/lib/clover_sigma_outer_product.cu b/lib/clover_sigma_outer_product.cu
index 1c34a7ff33..370ada813f 100644
--- a/lib/clover_sigma_outer_product.cu
+++ b/lib/clover_sigma_outer_product.cu
@@ -61,6 +61,7 @@ namespace quda {
   void computeCloverSigmaOprod(GaugeField& oprod, std::vector<ColorSpinorField*> &x,
 			       std::vector<ColorSpinorField*> &p, std::vector<std::vector<double> > &coeff)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (x.size() > MAX_NVECTOR) {
       // divide and conquer
       std::vector<ColorSpinorField*> x0(x.begin(), x.begin()+x.size()/2);
@@ -83,6 +84,7 @@ namespace quda {
     }
 
     instantiate<CloverSigmaOprod>(oprod, x, p, coeff);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else // GPU_CLOVER_DIRAC not defined
   void computeCloverSigmaOprod(GaugeField &, std::vector<ColorSpinorField*> &,
diff --git a/lib/coarse_op.cuh b/lib/coarse_op.cuh
index 4c23f50781..564d1ed55c 100644
--- a/lib/coarse_op.cuh
+++ b/lib/coarse_op.cuh
@@ -887,8 +887,8 @@ namespace quda {
 	X_atomic.backup();
         break;
       case COMPUTE_CONVERT:
-	if (Y_atomic.Gauge_p() == Y.Gauge_p()) Y.backup();
-	if (X_atomic.Gauge_p() == X.Gauge_p()) X.backup();
+        if (Y_atomic.data() == Y.data()) Y.backup();
+        if (X_atomic.data() == X.data()) X.backup();
         break;
       case COMPUTE_RESCALE:
         Y.backup();
@@ -921,8 +921,8 @@ namespace quda {
 	X_atomic.restore();
         break;
       case COMPUTE_CONVERT:
-	if (Y_atomic.Gauge_p() == Y.Gauge_p()) Y.restore();
-	if (X_atomic.Gauge_p() == X.Gauge_p()) X.restore();
+        if (Y_atomic.data() == Y.data()) Y.restore();
+        if (X_atomic.data() == X.data()) X.restore();
         break;
       case COMPUTE_RESCALE:
         Y.restore();
diff --git a/lib/coarse_op.in.cu b/lib/coarse_op.in.cu
index 320c14bf12..c586f8f1dd 100644
--- a/lib/coarse_op.in.cu
+++ b/lib/coarse_op.in.cu
@@ -97,7 +97,7 @@ namespace quda {
       gCoarseAtomic yAccessorAtomic(const_cast<GaugeField&>(Yatomic));
       gCoarseAtomic xAccessorAtomic(const_cast<GaugeField&>(Xatomic));
       cFine cAccessor(const_cast<CloverField&>(c), false);
-      cFine cInvAccessor(const_cast<CloverField&>(c), true);
+      cFine cInvAccessor(const_cast<CloverField &>(c), c.Inverse());
 
       calculateY<use_mma, QUDA_CUDA_FIELD_LOCATION, false,Float,fineSpin,fineColor,coarseSpin,coarseColor>
         (yAccessor, xAccessor, yAccessorAtomic, xAccessorAtomic, uvAccessor,
@@ -173,17 +173,17 @@ namespace quda {
       gf_param.nFace = 1;
       gf_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
 
-      U = new cpuGaugeField(gf_param);
+      U = new GaugeField(gf_param);
 
       //Copy the cuda gauge field to the cpu
-      static_cast<const cudaGaugeField&>(gauge).saveCPUField(*static_cast<cpuGaugeField*>(U));
+      U->copy(gauge);
     } else if (location == QUDA_CUDA_FIELD_LOCATION && gauge.Reconstruct() != QUDA_RECONSTRUCT_NO) {
       //Create a copy of the gauge field with no reconstruction, required for fine-grained access
       GaugeFieldParam gf_param(gauge);
       gf_param.reconstruct = QUDA_RECONSTRUCT_NO;
       gf_param.order = QUDA_FLOAT2_GAUGE_ORDER;
       gf_param.setPrecision(gf_param.Precision());
-      U = new cudaGaugeField(gf_param);
+      U = new GaugeField(gf_param);
 
       U->copy(gauge);
     }
@@ -197,7 +197,7 @@ namespace quda {
     for (int i = 0; i < cf_param.nDim; i++) cf_param.x[i] = clover ? clover->X()[i] : 0;
 
     // only create inverse if not doing dynamic clover and one already exists
-    cf_param.inverse = !clover::dynamic_inverse() && clover && clover->V(true);
+    cf_param.inverse = !clover::dynamic_inverse() && clover && clover->Inverse();
     cf_param.clover = nullptr;
     cf_param.cloverInv = nullptr;
     cf_param.create = QUDA_NULL_FIELD_CREATE;
diff --git a/lib/coarse_op_preconditioned.in.cu b/lib/coarse_op_preconditioned.in.cu
index a6eeb63451..4ac89b7c0b 100644
--- a/lib/coarse_op_preconditioned.in.cu
+++ b/lib/coarse_op_preconditioned.in.cu
@@ -166,7 +166,7 @@ namespace quda
           GaugeFieldParam param(X);
           param.order = gOrder_milc;
           param.setPrecision(X.Precision() < QUDA_SINGLE_PRECISION ? QUDA_SINGLE_PRECISION : X.Precision());
-          output = cudaGaugeField::Create(param);
+          output = new GaugeField(param);
           if (copy_content) output->copy(X);
         }
         return output;
@@ -175,8 +175,8 @@ namespace quda
       GaugeField *X_aos = create_gauge_copy(X, true);
       Xinv_aos = create_gauge_copy(Xinv, false);
 
-      blas::flops += invert((void *)Xinv_aos->Gauge_p(), (void *)X_aos->Gauge_p(), n, X_aos->Volume(),
-                            X_aos->Precision(), X.Location());
+      Tunable::flops_global(invert(Xinv_aos->data(), X_aos->data(), n, X_aos->Volume(), X_aos->Precision(), X.Location())
+                            + Tunable::flops_global());
 
       if (&Xinv != Xinv_aos) {
         if (Xinv.Precision() < QUDA_SINGLE_PRECISION) Xinv.Scale(Xinv_aos->abs_max());
@@ -187,9 +187,8 @@ namespace quda
       if (!use_mma) { delete Xinv_aos; }
 
     } else if (X.Location() == QUDA_CPU_FIELD_LOCATION && X.Order() == QUDA_QDP_GAUGE_ORDER) {
-      const cpuGaugeField *X_h = static_cast<const cpuGaugeField*>(&X);
-      cpuGaugeField *Xinv_h = static_cast<cpuGaugeField*>(&Xinv);
-      blas::flops += invert(*(void**)Xinv_h->Gauge_p(), *(void**)X_h->Gauge_p(), n, X_h->Volume(), X.Precision(), X.Location());
+      Tunable::flops_global(invert(Xinv.data<void *>(0), X.data<void *>(0), n, X.Volume(), X.Precision(), X.Location())
+                            + Tunable::flops_global());
     } else {
       errorQuda("Unsupported location=%d and order=%d", X.Location(), X.Order());
     }
@@ -213,7 +212,7 @@ namespace quda
             param.order = order;
             // if we did the exchange on AoS order, then this zero initialize wouldn't be needed
             if (!copy_content) param.create = QUDA_ZERO_FIELD_CREATE;
-            output = cudaGaugeField::Create(param);
+            output = new GaugeField(param);
             if (copy_content) output->copy(X);
           }
           return output;
diff --git a/lib/coarsecoarse_op_mma.in.cu b/lib/coarsecoarse_op_mma.in.cu
index ee18191dbb..eee43a43ac 100644
--- a/lib/coarsecoarse_op_mma.in.cu
+++ b/lib/coarsecoarse_op_mma.in.cu
@@ -40,10 +40,10 @@ namespace quda {
       } else {
         GaugeFieldParam param(X);
         param.order = order;
-        output = cudaGaugeField::Create(param);
+        output = new GaugeField(param);
         if (copy_content) output->copy(X);
       }
-      return static_cast<cudaGaugeField *>(output);
+      return output;
     };
 
     auto Y_order = create_gauge_copy(Y, gOrder, false);
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index 2666913072..46c190f433 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -23,15 +23,6 @@ namespace quda
     composite_descr(param.is_composite, param.composite_dim, param.is_component, param.component_id),
     components(0)
   {
-    // this must come before create
-    if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
-      v = param.v;
-      norm_offset = param.norm_offset;
-      reference = true;
-    } else if (param.create == QUDA_GHOST_FIELD_CREATE) {
-      ghost_only = true;
-    }
-
     create(param);
 
     switch (param.create) {
@@ -84,7 +75,7 @@ namespace quda
   {
     if (&src != this) {
       // if field not already initialized then move the field
-      if (!init || are_compatible(*this, src)) {
+      if (!init || are_compatible(*this, src) || src.empty()) {
         if (init) destroy();
         LatticeField::operator=(std::move(src));
         move(std::move(src));
@@ -157,21 +148,13 @@ namespace quda
       errorQuda("Subset not implemented");
 
     if (param.create != QUDA_REFERENCE_FIELD_CREATE && param.create != QUDA_GHOST_FIELD_CREATE) {
-      if (location == QUDA_CPU_FIELD_LOCATION) {
-        v = safe_malloc(bytes);
-      } else if (location == QUDA_CUDA_FIELD_LOCATION) {
-        switch (mem_type) {
-        case QUDA_MEMORY_DEVICE: v = pool_device_malloc(bytes); break;
-        case QUDA_MEMORY_MAPPED:
-          v_h = mapped_malloc(bytes);
-          v = get_mapped_device_pointer(v_h);
-          break;
-        default: errorQuda("Unsupported memory type %d", mem_type);
-        }
-      } else {
-        errorQuda("Unexpected field location %d", location);
-      }
+      v = quda_ptr(mem_type, bytes);
       alloc = true;
+    } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
+      v = quda_ptr(param.v, mem_type);
+      reference = true;
+    } else if (param.create == QUDA_GHOST_FIELD_CREATE) {
+      ghost_only = true;
     }
 
     if (composite_descr.is_composite && param.create != QUDA_REFERENCE_FIELD_CREATE
@@ -186,7 +169,7 @@ namespace quda
       components.reserve(composite_descr.dim);
       for (int cid = 0; cid < composite_descr.dim; cid++) {
         param.component_id = cid;
-        param.v = static_cast<void *>(static_cast<char *>(v) + cid * bytes / composite_descr.dim);
+        param.v = static_cast<void *>(static_cast<char *>(v.data()) + cid * bytes / composite_descr.dim);
         components.push_back(new ColorSpinorField(param));
       }
     }
@@ -203,7 +186,7 @@ namespace quda
       param.is_component = composite_descr.is_component;
       param.component_id = composite_descr.id;
       even = new ColorSpinorField(param);
-      param.v = static_cast<char *>(v) + bytes / 2;
+      param.v = !ghost_only ? static_cast<char *>(v.data()) + bytes / 2 : nullptr;
       odd = new ColorSpinorField(param);
     }
 
@@ -225,17 +208,13 @@ namespace quda
 
   void ColorSpinorField::zeroPad()
   {
+    if (!isNative()) return;
     // zero the region added for alignment reasons
     if (bytes != bytes_raw) {
       size_t subset_bytes = bytes / siteSubset;
       size_t subset_bytes_raw = bytes_raw / siteSubset;
-      for (int subset = 0; subset < siteSubset; subset++) {
-        if (location == QUDA_CUDA_FIELD_LOCATION)
-          qudaMemsetAsync(static_cast<char *>(v) + subset_bytes_raw + subset_bytes * subset, 0,
-                          subset_bytes - subset_bytes_raw, device::get_default_stream());
-        else
-          memset(static_cast<char *>(v) + subset_bytes_raw + subset_bytes * subset, 0, subset_bytes - subset_bytes_raw);
-      }
+      qudaMemset2DAsync(v, subset_bytes_raw, subset_bytes, 0, subset_bytes - subset_bytes_raw, siteSubset,
+                        device::get_default_stream());
     }
   }
 
@@ -252,8 +231,7 @@ namespace quda
     pc_type = std::exchange(src.pc_type, QUDA_PC_INVALID);
     suggested_parity = std::exchange(src.suggested_parity, QUDA_INVALID_PARITY);
     length = std::exchange(src.length, 0);
-    v = std::exchange(src.v, nullptr);
-    v_h = std::exchange(src.v_h, nullptr);
+    v.exchange(src.v, {}); // cannot use std::exchange for quda_ptr
     norm_offset = std::exchange(src.norm_offset, 0);
     ghost = std::exchange(src.ghost, {});
     ghostFace = std::exchange(src.ghostFace, {});
@@ -274,18 +252,7 @@ namespace quda
   void ColorSpinorField::destroy()
   {
     if (alloc) {
-      if (location == QUDA_CPU_FIELD_LOCATION) {
-        host_free(v);
-      } else { // device field
-        switch (mem_type) {
-        case QUDA_MEMORY_DEVICE: pool_device_free(v); break;
-        case QUDA_MEMORY_MAPPED: host_free(v_h); break;
-        default: errorQuda("Unsupported memory type %d", mem_type);
-        }
-      }
       alloc = false;
-      v = nullptr;
-      v_h = nullptr;
 
       if (composite_descr.is_composite) {
         CompositeColorSpinorField::iterator vec;
@@ -430,18 +397,18 @@ namespace quda
     ghost_precision_allocated = ghost_precision;
   } // createGhostZone
 
-  void ColorSpinorField::zero()
-  {
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      qudaMemsetAsync(v, 0, bytes, device::get_default_stream());
-    } else {
-      memset(v, '\0', bytes);
-    }
-  }
+  void ColorSpinorField::zero() { qudaMemsetAsync(v, 0, bytes, device::get_default_stream()); }
 
   void ColorSpinorField::copy(const ColorSpinorField &src)
   {
     test_compatible_weak(*this, src);
+
+    if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
+      getProfile().TPSTART(QUDA_PROFILE_D2H);
+    } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
+      getProfile().TPSTART(QUDA_PROFILE_H2D);
+    }
+
     if (Location() == src.Location()) { // H2H and D2D
 
       copyGenericColorSpinor(*this, src, Location());
@@ -452,24 +419,24 @@ namespace quda
         void *buffer = pool_pinned_malloc(bytes);
         memset(buffer, 0, bytes); // FIXME (temporary?) bug fix for padding
         copyGenericColorSpinor(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, 0);
-        qudaMemcpy(v, buffer, bytes, qudaMemcpyDefault);
+        qudaMemcpy(v.data(), buffer, bytes, qudaMemcpyDefault);
         pool_pinned_free(buffer);
 
       } else { // reorder on device
 
         if (src.FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) {
           // special case where we use mapped memory to read/write directly from application's array
-          void *src_d = get_mapped_device_pointer(src.V());
-          copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, v, src_d);
+          void *src_d = get_mapped_device_pointer(src.data());
+          copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, v.data(), src_d);
         } else {
           void *Src = nullptr, *buffer = nullptr;
           if (!zeroCopy) {
             buffer = pool_device_malloc(src.Bytes());
             Src = buffer;
-            qudaMemcpy(Src, src.V(), src.Bytes(), qudaMemcpyDefault);
+            qudaMemcpy(Src, src.data(), src.Bytes(), qudaMemcpyDefault);
           } else {
             buffer = pool_pinned_malloc(src.Bytes());
-            memcpy(buffer, src.V(), src.Bytes());
+            memcpy(buffer, src.data(), src.Bytes());
             Src = get_mapped_device_pointer(buffer);
           }
 
@@ -482,13 +449,12 @@ namespace quda
             pool_device_free(buffer);
         }
       }
-      qudaDeviceSynchronize(); // include sync here for accurate host-device profiling
 
     } else if (Location() == QUDA_CPU_FIELD_LOCATION && src.Location() == QUDA_CUDA_FIELD_LOCATION) { // D2H
 
       if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // reorder on the host
         void *buffer = pool_pinned_malloc(bytes);
-        qudaMemcpy(buffer, v, bytes, qudaMemcpyDefault);
+        qudaMemcpy(buffer, v.data(), bytes, qudaMemcpyDefault);
         copyGenericColorSpinor(*this, src, QUDA_CPU_FIELD_LOCATION, 0, buffer);
         pool_pinned_free(buffer);
 
@@ -496,8 +462,8 @@ namespace quda
 
         if (FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) {
           // special case where we use zero-copy memory to read/write directly from application's array
-          void *dest_d = get_mapped_device_pointer(v);
-          copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, dest_d, src.V());
+          void *dest_d = get_mapped_device_pointer(v.data());
+          copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, dest_d, src.data());
         } else {
           void *dst = nullptr, *buffer = nullptr;
           if (!zeroCopy) {
@@ -511,10 +477,10 @@ namespace quda
           copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, dst, 0);
 
           if (!zeroCopy) {
-            qudaMemcpy(v, dst, Bytes(), qudaMemcpyDefault);
+            qudaMemcpy(v.data(), dst, Bytes(), qudaMemcpyDefault);
           } else {
             qudaDeviceSynchronize();
-            memcpy(v, buffer, bytes);
+            memcpy(v.data(), buffer, bytes);
           }
 
           if (zeroCopy)
@@ -526,6 +492,12 @@ namespace quda
 
       qudaDeviceSynchronize(); // need to sync before data can be used on CPU
     }
+
+    if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
+      getProfile().TPSTOP(QUDA_PROFILE_D2H);
+    } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
+      getProfile().TPSTOP(QUDA_PROFILE_H2D);
+    }
   }
 
   // Fills the param with the contents of this field
@@ -533,7 +505,7 @@ namespace quda
   {
     LatticeField::fill(param);
     param.field = const_cast<ColorSpinorField *>(this);
-    param.v = v;
+    param.v = !ghost_only ? v.data() : nullptr;
     param.nColor = nColor;
     param.nSpin = nSpin;
     param.nVec = nVec;
@@ -863,7 +835,7 @@ namespace quda
       errorQuda("Cannot create an alias to source with lower precision than the alias");
     ColorSpinorParam param = param_.init ? param_ : ColorSpinorParam(*this);
     param.create = QUDA_REFERENCE_FIELD_CREATE;
-    param.v = V();
+    param.v = data();
 
     return ColorSpinorField(param);
   }
@@ -874,7 +846,7 @@ namespace quda
       errorQuda("Cannot create an alias to source with lower precision than the alias");
     ColorSpinorParam param(param_);
     param.create = QUDA_REFERENCE_FIELD_CREATE;
-    param.v = V();
+    param.v = data();
 
     return new ColorSpinorField(param);
   }
@@ -915,9 +887,7 @@ namespace quda
     coarseParam.setPrecision(new_precision);
 
     // set where we allocate the field
-    coarseParam.mem_type = (new_mem_type != QUDA_MEMORY_INVALID) ?
-      new_mem_type :
-      (new_location == QUDA_CUDA_FIELD_LOCATION ? QUDA_MEMORY_DEVICE : QUDA_MEMORY_PINNED);
+    coarseParam.mem_type = new_mem_type;
 
     return new ColorSpinorField(coarseParam);
   }
@@ -948,9 +918,7 @@ namespace quda
     }
 
     // set where we allocate the field
-    fineParam.mem_type = (new_mem_type != QUDA_MEMORY_INVALID) ?
-      new_mem_type :
-      (new_location == QUDA_CUDA_FIELD_LOCATION ? QUDA_MEMORY_DEVICE : QUDA_MEMORY_PINNED);
+    fineParam.mem_type = new_mem_type;
 
     return new ColorSpinorField(fineParam);
   }
@@ -1508,49 +1476,29 @@ namespace quda
 
   void ColorSpinorField::backup() const
   {
-    if (backed_up) errorQuda("ColorSpinorField already backed up");
-
-    backup_h = new char[bytes];
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      qudaMemcpy(backup_h, v, bytes, qudaMemcpyDefault);
-    } else {
-      memcpy(backup_h, v, bytes);
-    }
-
-    backed_up = true;
+    if (backup_h.size()) errorQuda("ColorSpinorField already backed up"); // non-empty backup_h marks an existing backup
+    backup_h.resize(1); // a single slot holds the backup of v
+    backup_h[0] = quda_ptr(QUDA_MEMORY_HOST, bytes); // presumably allocates `bytes` of host memory - confirm against quda_ptr
+    qudaMemcpy(backup_h[0], v, bytes, qudaMemcpyDefault); // save field data; no Location() branch, direction presumably inferred
   }
 
   void ColorSpinorField::restore() const
   {
-    if (!backed_up) errorQuda("Cannot restore since not backed up");
-
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      qudaMemcpy(v, backup_h, bytes, qudaMemcpyDefault);
-      delete[] backup_h;
-    } else {
-      memcpy(v, backup_h, bytes);
-      delete[] backup_h;
-    }
-
-    backed_up = false;
+    if (!backup_h.size()) errorQuda("Cannot restore since not backed up"); // empty backup_h: nothing was saved by backup()
+    qudaMemcpy(v, backup_h[0], bytes, qudaMemcpyDefault); // copy the saved data back into the field storage v
+    backup_h.resize(0); // drop the backup slot so backup() may be called again; presumably frees the host buffer - confirm
   }
 
   void ColorSpinorField::copy_to_buffer(void *buffer) const
   {
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      qudaMemcpy(buffer, v, bytes, qudaMemcpyDeviceToHost);
-    } else {
-      std::memcpy(buffer, v, bytes);
-    }
+    quda_ptr buf(buffer, QUDA_MEMORY_HOST); // wrap the caller's raw pointer, tagged as host memory
+    qudaMemcpy(buf, v, bytes, qudaMemcpyDefault); // copy `bytes` of field data from v into buffer (destination first)
   }
 
   void ColorSpinorField::copy_from_buffer(void *buffer)
   {
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      qudaMemcpy(v, buffer, bytes, qudaMemcpyHostToDevice);
-    } else {
-      std::memcpy(v, buffer, bytes);
-    }
+    quda_ptr buf(buffer, QUDA_MEMORY_HOST); // wrap the caller's raw pointer, tagged as host memory
+    qudaMemcpy(v, buf, bytes, qudaMemcpyDefault); // copy `bytes` from buffer into the field storage v (destination first)
   }
 
   void ColorSpinorField::prefetch(QudaFieldLocation mem_space, qudaStream_t stream) const
@@ -1558,7 +1506,7 @@ namespace quda
     if (Location() == QUDA_CUDA_FIELD_LOCATION) {
       // conditionals based on destructor
       if (is_prefetch_enabled() && alloc && mem_type == QUDA_MEMORY_DEVICE)
-        qudaMemPrefetchAsync(v, bytes, mem_space, stream);
+        qudaMemPrefetchAsync(v.data(), bytes, mem_space, stream);
     }
   }
 
@@ -1599,7 +1547,7 @@ namespace quda
   std::ostream &operator<<(std::ostream &out, const ColorSpinorField &a)
   {
     out << "location = " << a.Location() << std::endl;
-    out << "v = " << a.v << std::endl;
+    out << "v = " << a.v.data() << std::endl;
     out << "alloc = " << a.alloc << std::endl;
     out << "reference = " << a.reference << std::endl;
     out << "init = " << a.init << std::endl;
diff --git a/lib/color_spinor_util.in.cu b/lib/color_spinor_util.in.cu
index c0baeb0ec5..0b9355d4d1 100644
--- a/lib/color_spinor_util.in.cu
+++ b/lib/color_spinor_util.in.cu
@@ -417,7 +417,8 @@ namespace quda {
 
     param.create = create;
     if (create == QUDA_COPY_FIELD_CREATE) param.field = &const_cast<ColorSpinorField&>(src);
-    else if (create == QUDA_REFERENCE_FIELD_CREATE) param.v = const_cast<ColorSpinorField&>(src).V();
+    else if (create == QUDA_REFERENCE_FIELD_CREATE)
+      param.v = src.data();
 
     resize(v, new_size, param);
   }
diff --git a/lib/contract.cu b/lib/contract.cu
index 491652ae9c..74206419c6 100644
--- a/lib/contract.cu
+++ b/lib/contract.cu
@@ -58,12 +58,14 @@ public:
 #ifdef GPU_CONTRACT
   void contractQuda(const ColorSpinorField &x, const ColorSpinorField &y, void *result, const QudaContractType cType)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE); // presumably starts the COMPUTE timer; covers the checks below as well
     checkPrecision(x, y);
     if (x.GammaBasis() != QUDA_DEGRAND_ROSSI_GAMMA_BASIS || y.GammaBasis() != QUDA_DEGRAND_ROSSI_GAMMA_BASIS)
       errorQuda("Unexpected gamma basis x=%d y=%d", x.GammaBasis(), y.GammaBasis());
     if (x.Nspin() != 4 || y.Nspin() != 4) errorQuda("Unexpected number of spins x=%d y=%d", x.Nspin(), y.Nspin());
 
     instantiate<Contraction>(x, y, result, cType);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); // matches the TPSTART above; NOTE(review): skipped if errorQuda does not return
   }
 #else
   void contractQuda(const ColorSpinorField &, const ColorSpinorField &, void *, const QudaContractType)
diff --git a/lib/copy_clover_offset.cu b/lib/copy_clover_offset.cu
index 1300082c24..f29e663c14 100644
--- a/lib/copy_clover_offset.cu
+++ b/lib/copy_clover_offset.cu
@@ -70,8 +70,8 @@ namespace quda
 
     if (pc_type != QUDA_4D_PC) { errorQuda("Gauge field copy must use 4d even-odd preconditioning."); }
 
-    if (in.V(true)) { instantiate<CopyCloverOffset>(out, in, offset, true); }
-    if (in.V(false)) { instantiate<CopyCloverOffset>(out, in, offset, false); }
+    if (in.Inverse()) instantiate<CopyCloverOffset>(out, in, offset, true);
+    instantiate<CopyCloverOffset>(out, in, offset, false);
   }
 #else
   void copyFieldOffset(CloverField &, const CloverField &, CommKey, QudaPCType)
diff --git a/lib/copy_color_spinor_mg.in.hpp b/lib/copy_color_spinor_mg.in.hpp
index a6678143b4..92b96b4c1a 100644
--- a/lib/copy_color_spinor_mg.in.hpp
+++ b/lib/copy_color_spinor_mg.in.hpp
@@ -117,14 +117,14 @@ namespace quda {
       }
 
       // set for the source subset ordering
-      srcFloat *srcEven = Src ? Src : (srcFloat*)src.V();
+      srcFloat *srcEven = Src ? Src : src.data<srcFloat *>();
       srcFloat *srcOdd = (srcFloat*)((char*)srcEven + src.Bytes()/2);
       if (src.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER) {
 	std::swap<srcFloat*>(srcEven, srcOdd);
       }
 
       // set for the destination subset ordering
-      dstFloat *dstEven = Dst ? Dst : (dstFloat*)dst.V();
+      dstFloat *dstEven = Dst ? Dst : dst.data<dstFloat *>();
       dstFloat *dstOdd = (dstFloat*)((char*)dstEven + dst.Bytes()/2);
       if (dst.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER) {
 	std::swap<dstFloat*>(dstEven, dstOdd);
diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp
deleted file mode 100644
index f4b27109a8..0000000000
--- a/lib/cpu_gauge_field.cpp
+++ /dev/null
@@ -1,431 +0,0 @@
-#include <quda_internal.h>
-#include <timer.h>
-#include <gauge_field.h>
-#include <assert.h>
-#include <string.h>
-#include <typeinfo>
-
-namespace quda {
-
-  cpuGaugeField::cpuGaugeField(const GaugeFieldParam &param) :
-    GaugeField(param)
-  {
-    if (precision == QUDA_HALF_PRECISION) {
-      errorQuda("CPU fields do not support half precision");
-    }
-    if (precision == QUDA_QUARTER_PRECISION) {
-      errorQuda("CPU fields do not support quarter precision");
-    }
-    if (pad != 0) {
-      errorQuda("CPU fields do not support non-zero padding");
-    }
-    if (reconstruct != QUDA_RECONSTRUCT_NO && reconstruct != QUDA_RECONSTRUCT_10) {
-      errorQuda("Reconstruction type %d not supported", reconstruct);
-    }
-    if (reconstruct == QUDA_RECONSTRUCT_10 && link_type != QUDA_ASQTAD_MOM_LINKS) {
-      errorQuda("10-reconstruction only supported with momentum links");
-    }
-
-    int siteDim=0;
-    if (geometry == QUDA_SCALAR_GEOMETRY) siteDim = 1;
-    else if (geometry == QUDA_VECTOR_GEOMETRY) siteDim = nDim;
-    else if (geometry == QUDA_TENSOR_GEOMETRY) siteDim = nDim * (nDim-1) / 2;
-    else if (geometry == QUDA_COARSE_GEOMETRY) siteDim = 2*nDim;
-    else if (geometry == QUDA_KDINVERSE_GEOMETRY)
-      siteDim = 1 << nDim;
-    else errorQuda("Unknown geometry type %d", geometry);
-
-    // compute the correct bytes size for these padded field orders
-    if (order == QUDA_TIFR_PADDED_GAUGE_ORDER) {
-      bytes = siteDim * (x[0]*x[1]*(x[2]+4)*x[3]) * nInternal * precision;
-    } else if (order == QUDA_BQCD_GAUGE_ORDER) {
-      bytes = siteDim * (x[0]+4)*(x[1]+2)*(x[2]+2)*(x[3]+2) * nInternal * precision;
-    } else if (order == QUDA_MILC_SITE_GAUGE_ORDER) {
-      bytes = volume * site_size;
-    }
-
-    if (order == QUDA_QDP_GAUGE_ORDER) {
-      gauge = (void**) safe_malloc(siteDim * sizeof(void*));
-
-      for (int d=0; d<siteDim; d++) {
-	size_t nbytes = volume * nInternal * precision;
-	if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
-          gauge[d] = nbytes ? safe_malloc(nbytes) : nullptr;
-          if (create == QUDA_ZERO_FIELD_CREATE && nbytes) memset(gauge[d], 0, nbytes);
-        } else if (create == QUDA_REFERENCE_FIELD_CREATE) {
-          gauge[d] = ((void **)param.gauge)[d];
-        } else {
-          errorQuda("Unsupported creation type %d", create);
-        }
-      }
-    
-    } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER  ||
-	       order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_TIFR_GAUGE_ORDER ||
-	       order == QUDA_TIFR_PADDED_GAUGE_ORDER || order == QUDA_MILC_SITE_GAUGE_ORDER) {
-
-      if (order == QUDA_MILC_SITE_GAUGE_ORDER && create != QUDA_REFERENCE_FIELD_CREATE) {
-	errorQuda("MILC site gauge order only supported for reference fields");
-      }
-
-      if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
-        gauge = bytes ? (void **)safe_malloc(bytes) : nullptr;
-        if (create == QUDA_ZERO_FIELD_CREATE && bytes) memset(gauge, 0, bytes);
-      } else if (create == QUDA_REFERENCE_FIELD_CREATE) {
-	gauge = (void**) param.gauge;
-      } else {
-	errorQuda("Unsupported creation type %d", create);
-      }
-
-    } else {
-      errorQuda("Unsupported gauge order type %d", order);
-    }
-  
-    // no need to exchange data if this is a momentum field
-    if (link_type != QUDA_ASQTAD_MOM_LINKS) {
-      // Ghost zone is always 2-dimensional    
-      for (int i=0; i<nDim; i++) {
-	size_t nbytes = nFace * surface[i] * nInternal * precision;
-	ghost[i] = nbytes ? safe_malloc(nbytes) : nullptr;
-	ghost[i+4] = (nbytes && geometry == QUDA_COARSE_GEOMETRY) ? safe_malloc(nbytes) : nullptr;
-      }
-
-      if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) {
-	// exchange the boundaries if a non-trivial field
-	if (create != QUDA_NULL_FIELD_CREATE && create != QUDA_ZERO_FIELD_CREATE &&
-	    (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY) )
-	  exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
-      }
-    }
-
-    // compute the fat link max now in case it is needed later (i.e., for half precision)
-    if (param.compute_fat_link_max) fat_link_max = this->abs_max();
-  }
-
-
-  cpuGaugeField::~cpuGaugeField()
-  {
-    int siteDim = 0;
-    if (geometry == QUDA_SCALAR_GEOMETRY) siteDim = 1;
-    else if (geometry == QUDA_VECTOR_GEOMETRY) siteDim = nDim;
-    else if (geometry == QUDA_TENSOR_GEOMETRY) siteDim = nDim * (nDim-1) / 2;
-    else if (geometry == QUDA_COARSE_GEOMETRY) siteDim = 2*nDim;
-    else if (geometry == QUDA_KDINVERSE_GEOMETRY)
-      siteDim = 1 << nDim;
-    else errorQuda("Unknown geometry type %d", geometry);
-
-    if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) {
-      if (order == QUDA_QDP_GAUGE_ORDER) {
-	for (int d=0; d<siteDim; d++) {
-	  if (gauge[d]) host_free(gauge[d]);
-	}
-	if (gauge) host_free(gauge);
-      } else {
-	if (gauge) host_free(gauge);
-      }
-    } else { // QUDA_REFERENCE_FIELD_CREATE 
-      if (order == QUDA_QDP_GAUGE_ORDER){
-	if (gauge) host_free(gauge);
-      }
-    }
-  
-    if (link_type != QUDA_ASQTAD_MOM_LINKS) {
-      for (int i=0; i<nDim; i++) {
-	if (ghost[i]) host_free(ghost[i]);
-	if (ghost[i+4] && geometry == QUDA_COARSE_GEOMETRY) host_free(ghost[i+4]);
-      }
-    }
-  }
-
-  // This does the exchange of the gauge field ghost zone and places it
-  // into the ghost array.
-  void cpuGaugeField::exchangeGhost(QudaLinkDirection link_direction) {
-    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY)
-      errorQuda("Cannot exchange for %d geometry gauge field", geometry);
-
-    if ( (link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == QUDA_LINK_FORWARDS) && geometry != QUDA_COARSE_GEOMETRY)
-      errorQuda("Cannot request exchange of forward links on non-coarse geometry");
-
-    void *send[2*QUDA_MAX_DIM];
-    for (int d=0; d<nDim; d++) {
-      send[d] = safe_malloc(nFace*surface[d]*nInternal*precision);
-      if (geometry == QUDA_COARSE_GEOMETRY) send[d+4] = safe_malloc(nFace*surface[d]*nInternal*precision);
-    }
-
-    if (link_direction == QUDA_LINK_BACKWARDS || link_direction == QUDA_LINK_BIDIRECTIONAL) {
-      // get the links into contiguous buffers
-      extractGaugeGhost(*this, send, true);
-
-      // communicate between nodes
-      exchange(ghost, send, QUDA_FORWARDS);
-    }
-
-    // repeat if requested and links are bi-directional
-    if (link_direction == QUDA_LINK_FORWARDS || link_direction == QUDA_LINK_BIDIRECTIONAL) {
-      extractGaugeGhost(*this, send, true, nDim);
-      exchange(ghost+nDim, send+nDim, QUDA_FORWARDS);
-    }
-
-    for (int d=0; d<geometry; d++) host_free(send[d]);
-  }
-
-  // This does the opposite of exchangeGhost and sends back the ghost
-  // zone to the node from which it came and injects it back into the
-  // field
-  void cpuGaugeField::injectGhost(QudaLinkDirection link_direction) {
-    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY)
-      errorQuda("Cannot exchange for %d geometry gauge field", geometry);
-
-    if (link_direction != QUDA_LINK_BACKWARDS)
-      errorQuda("link_direction = %d not supported", link_direction);
-
-    void *recv[2*QUDA_MAX_DIM];
-    for (int d=0; d<nDim; d++) recv[d] = safe_malloc(nFace*surface[d]*nInternal*precision);
-
-    // communicate between nodes
-    exchange(recv, ghost, QUDA_BACKWARDS);
-
-    // get the links into contiguous buffers
-    extractGaugeGhost(*this, recv, false);
-
-    for (int d=0; d<nDim; d++) host_free(recv[d]);
-  }
-
-  void cpuGaugeField::exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill)
-  {
-
-    void *send[QUDA_MAX_DIM];
-    void *recv[QUDA_MAX_DIM];
-    size_t bytes[QUDA_MAX_DIM];
-    // store both parities and directions in each
-    for (int d=0; d<nDim; d++) {
-      if (!(comm_dim_partitioned(d) || (no_comms_fill && R[d])) ) continue;
-      bytes[d] = surface[d] * R[d] * geometry * nInternal * precision;
-      send[d] = safe_malloc(2 * bytes[d]);
-      recv[d] = safe_malloc(2 * bytes[d]);
-    }
-
-    for (int d=0; d<nDim; d++) {
-      if (!(comm_dim_partitioned(d) || (no_comms_fill && R[d])) ) continue;
-      //extract into a contiguous buffer
-      extractExtendedGaugeGhost(*this, d, R, send, true);
-
-      if (comm_dim_partitioned(d)) {
-	// do the exchange
-	MsgHandle *mh_recv_back;
-	MsgHandle *mh_recv_fwd;
-	MsgHandle *mh_send_fwd;
-	MsgHandle *mh_send_back;
-	
-	mh_recv_back = comm_declare_receive_relative(recv[d], d, -1, bytes[d]);
-	mh_recv_fwd  = comm_declare_receive_relative(((char*)recv[d])+bytes[d], d, +1, bytes[d]);
-	mh_send_back = comm_declare_send_relative(send[d], d, -1, bytes[d]);
-	mh_send_fwd  = comm_declare_send_relative(((char*)send[d])+bytes[d], d, +1, bytes[d]);
-	
-	comm_start(mh_recv_back);
-	comm_start(mh_recv_fwd);
-	comm_start(mh_send_fwd);
-	comm_start(mh_send_back);
-	
-	comm_wait(mh_send_fwd);
-	comm_wait(mh_send_back);
-	comm_wait(mh_recv_back);
-	comm_wait(mh_recv_fwd);
-	
-	comm_free(mh_send_fwd);
-	comm_free(mh_send_back);
-	comm_free(mh_recv_back);
-	comm_free(mh_recv_fwd);
-      } else {
-	memcpy(static_cast<char*>(recv[d])+bytes[d], send[d], bytes[d]);
-	memcpy(recv[d], static_cast<char*>(send[d])+bytes[d], bytes[d]);
-      }      
-
-      // inject back into the gauge field
-      extractExtendedGaugeGhost(*this, d, R, recv, false);
-    }
-
-    for (int d=0; d<nDim; d++) {
-      if (!(comm_dim_partitioned(d) || (no_comms_fill && R[d])) ) continue;
-      host_free(send[d]);
-      host_free(recv[d]);
-    }
-  }
-
-  void cpuGaugeField::exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill)
-  {
-    profile.TPSTART(QUDA_PROFILE_COMMS);
-    exchangeExtendedGhost(R, no_comms_fill);
-    profile.TPSTOP(QUDA_PROFILE_COMMS);
-  }
-
-  // defined in cudaGaugeField
-  void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry);
-  void **create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry);
-  void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry);
-  void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry);
-
-  void cpuGaugeField::copy(const GaugeField &src) {
-    if (this == &src) return;
-
-    checkField(src);
-
-    if (link_type == QUDA_ASQTAD_FAT_LINKS) {
-      fat_link_max = src.LinkMax();
-      if (fat_link_max == 0.0 && precision < QUDA_SINGLE_PRECISION) fat_link_max = src.abs_max();
-    } else {
-      fat_link_max = 1.0;
-    }
-
-    if (typeid(src) == typeid(cudaGaugeField)) {
-
-      if (reorder_location() == QUDA_CPU_FIELD_LOCATION) {
-
-	if (!src.isNative()) errorQuda("Only native order is supported");
-	void *buffer = pool_pinned_malloc(src.Bytes());
-	// this copies over both even and odd
-        qudaMemcpy(buffer, static_cast<const cudaGaugeField &>(src).Gauge_p(), src.Bytes(), qudaMemcpyDeviceToHost);
-
-        copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, gauge, buffer);
-	pool_pinned_free(buffer);
-
-      } else { // else on the GPU
-
-	void *buffer = create_gauge_buffer(bytes, order, geometry);
-	size_t ghost_bytes[8];
-	int dstNinternal = reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : 2*nColor*nColor;
-	for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * dstNinternal * precision;
-	void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, order, geometry) : nullptr;
-
-	if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED) {
-	  copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0, ghost_buffer, 0);
-	  if (geometry == QUDA_COARSE_GEOMETRY) copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0, ghost_buffer, 0, 3); // forwards links if bi-directional
-	} else {
-	  copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0);
-	}
-
-	if (order == QUDA_QDP_GAUGE_ORDER) {
-	  for (int d=0; d<geometry; d++) {
-            qudaMemcpy(((void **)gauge)[d], ((void **)buffer)[d], bytes / geometry, qudaMemcpyDeviceToHost);
-          }
-	} else {
-          qudaMemcpy(gauge, buffer, bytes, qudaMemcpyHostToDevice);
-        }
-
-	if (order > 4 && ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
-	  for (int d=0; d<geometry; d++)
-            qudaMemcpy(Ghost()[d], ghost_buffer[d], ghost_bytes[d], qudaMemcpyDeviceToHost);
-
-        free_gauge_buffer(buffer, order, geometry);
-	if (nFace > 0) free_ghost_buffer(ghost_buffer, order, geometry);
-      }
-
-    } else if (typeid(src) == typeid(cpuGaugeField)) {
-      // copy field and ghost zone directly
-      copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, gauge,
-		       const_cast<void*>(static_cast<const cpuGaugeField&>(src).Gauge_p()));
-    } else {
-      errorQuda("Invalid gauge field type");
-    }
-
-    // if we have copied from a source without a pad then we need to exchange
-    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD &&
-	src.GhostExchange() != QUDA_GHOST_EXCHANGE_PAD) {
-      exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
-    }
-  }
-
-  void cpuGaugeField::setGauge(void **gauge_)
-  {
-    if(create != QUDA_REFERENCE_FIELD_CREATE) {
-      errorQuda("Setting gauge pointer is only allowed when create="
-		"QUDA_REFERENCE_FIELD_CREATE type\n");
-    }
-    gauge = gauge_;
-  }
-
-  void cpuGaugeField::backup() const {
-    if (backed_up) errorQuda("Gauge field already backed up");
-
-    if (order == QUDA_QDP_GAUGE_ORDER) {
-      char **buffer = new char*[geometry];
-      for (int d=0; d<geometry; d++) {
-	buffer[d] = new char[bytes/geometry];
-	memcpy(buffer[d], gauge[d], bytes/geometry);
-      }
-      backup_h = reinterpret_cast<char*>(buffer);
-    } else {
-      backup_h = new char[bytes];
-      memcpy(backup_h, gauge, bytes);
-    }
-
-    backed_up = true;
-  }
-
-  void cpuGaugeField::restore() const
-  {
-    if (!backed_up) errorQuda("Cannot restore since not backed up");
-
-    if (order == QUDA_QDP_GAUGE_ORDER) {
-      char **buffer = reinterpret_cast<char**>(backup_h);
-      for (int d=0; d<geometry; d++) {
-	memcpy(gauge[d], buffer[d], bytes/geometry);
-	delete []buffer[d];
-      }
-      delete []buffer;
-    } else {
-      memcpy(gauge, backup_h, bytes);
-      delete []backup_h;
-    }
-
-    backed_up = false;
-  }
-
-  void cpuGaugeField::zero() {
-    if (order != QUDA_QDP_GAUGE_ORDER) {
-      memset(gauge, 0, bytes);
-    } else {
-      for (int g=0; g<geometry; g++) memset(gauge[g], 0, volume * nInternal * precision);
-    }
-  }
-
-  void cpuGaugeField::copy_to_buffer(void *buffer) const
-  {
-
-    if (Order() == QUDA_QDP_GAUGE_ORDER || Order() == QUDA_QDPJIT_GAUGE_ORDER) {
-      void *const *p = static_cast<void *const *>(Gauge_p());
-      int dbytes = Bytes() / 4;
-      static_assert(sizeof(char) == 1, "Assuming sizeof(char) == 1");
-      char *dst_buffer = reinterpret_cast<char *>(buffer);
-      for (int d = 0; d < 4; d++) { std::memcpy(&dst_buffer[d * dbytes], p[d], dbytes); }
-    } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER
-               || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER
-               || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
-      const void *p = Gauge_p();
-      int bytes = Bytes();
-      std::memcpy(buffer, p, bytes);
-    } else {
-      errorQuda("Unsupported order = %d\n", Order());
-    }
-  }
-
-  void cpuGaugeField::copy_from_buffer(void *buffer)
-  {
-
-    if (Order() == QUDA_QDP_GAUGE_ORDER || Order() == QUDA_QDPJIT_GAUGE_ORDER) {
-      void **p = static_cast<void **>(Gauge_p());
-      size_t dbytes = Bytes() / 4;
-      static_assert(sizeof(char) == 1, "Assuming sizeof(char) == 1");
-      const char *dst_buffer = reinterpret_cast<const char *>(buffer);
-      for (int d = 0; d < 4; d++) { std::memcpy(p[d], &dst_buffer[d * dbytes], dbytes); }
-    } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER
-               || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER
-               || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
-      void *p = Gauge_p();
-      size_t bytes = Bytes();
-      std::memcpy(p, buffer, bytes);
-    } else {
-      errorQuda("Unsupported order = %d\n", Order());
-    }
-  }
-
-} // namespace quda
diff --git a/lib/cuda_gauge_field.cpp b/lib/cuda_gauge_field.cpp
deleted file mode 100644
index 568209a74f..0000000000
--- a/lib/cuda_gauge_field.cpp
+++ /dev/null
@@ -1,741 +0,0 @@
-#include <cstring>
-#include <typeinfo>
-#include <gauge_field.h>
-#include <timer.h>
-#include <blas_quda.h>
-#include <device.h>
-
-namespace quda {
-
-  cudaGaugeField::cudaGaugeField(const GaugeFieldParam &param) :
-    GaugeField(param), gauge(0), even(0), odd(0)
-  {
-    if ((order == QUDA_QDP_GAUGE_ORDER || order == QUDA_QDPJIT_GAUGE_ORDER) &&
-        create != QUDA_REFERENCE_FIELD_CREATE) {
-      errorQuda("QDP ordering only supported for reference fields");
-    }
-
-    if (order == QUDA_QDP_GAUGE_ORDER ||
-	order == QUDA_TIFR_GAUGE_ORDER || order == QUDA_TIFR_PADDED_GAUGE_ORDER ||
-	order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_CPS_WILSON_GAUGE_ORDER)
-      errorQuda("Field ordering %d presently disabled for this type", order);
-
-#ifdef MULTI_GPU
-    if (link_type != QUDA_ASQTAD_MOM_LINKS &&
-	ghostExchange == QUDA_GHOST_EXCHANGE_PAD &&
-	isNative()) {
-      bool pad_check = true;
-      for (int i=0; i<nDim; i++) {
-	// when we have coarse links we need to double the pad since we're storing forwards and backwards links
-	int minimum_pad = nFace*surfaceCB[i] * (geometry == QUDA_COARSE_GEOMETRY ? 2 : 1);
-	if (pad < minimum_pad) pad_check = false;
-	if (!pad_check)
-	  errorQuda("cudaGaugeField being constructed with insufficient padding in dim %d (%d < %d)\n", i, pad, minimum_pad);
-      }
-    }
-#endif
-
-    if (create != QUDA_NULL_FIELD_CREATE &&
-        create != QUDA_ZERO_FIELD_CREATE &&
-        create != QUDA_REFERENCE_FIELD_CREATE){
-      errorQuda("ERROR: create type(%d) not supported yet\n", create);
-    }
-
-    if (create != QUDA_REFERENCE_FIELD_CREATE) {
-      switch(mem_type) {
-      case QUDA_MEMORY_DEVICE: gauge = bytes ? pool_device_malloc(bytes) : nullptr; break;
-      case QUDA_MEMORY_MAPPED:
-        gauge_h = bytes ? mapped_malloc(bytes) : nullptr;
-        gauge = bytes ? get_mapped_device_pointer(gauge_h) : nullptr; // set the matching device pointer
-        break;
-      default:
-	errorQuda("Unsupported memory type %d", mem_type);
-      }
-      if (create == QUDA_ZERO_FIELD_CREATE && bytes) qudaMemset(gauge, 0, bytes);
-    } else {
-      gauge = param.gauge;
-    }
-
-    if ( !isNative() ) {
-      for (int i=0; i<nDim; i++) {
-        size_t nbytes = nFace * surface[i] * nInternal * precision;
-        ghost[i] = nbytes ? pool_device_malloc(nbytes) : nullptr;
-	ghost[i+4] = (nbytes && geometry == QUDA_COARSE_GEOMETRY) ? pool_device_malloc(nbytes) : nullptr;
-      }
-    }
-
-    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) {
-      if (create == QUDA_REFERENCE_FIELD_CREATE) exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
-    }
-
-    even = gauge;
-    odd = static_cast<char*>(gauge) + bytes/2;
-
-    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) {
-      if (isNative()) {
-        if (create != QUDA_ZERO_FIELD_CREATE) zeroPad();
-      } else {
-        for (int i = 0; i < nDim; i++) {
-          size_t nbytes = nFace * surface[i] * nInternal * precision;
-          qudaMemset(ghost[i], 0, nbytes);
-          if (nbytes && geometry == QUDA_COARSE_GEOMETRY) qudaMemset(ghost[i + 4], 0, nbytes);
-        }
-      }
-    }
-  }
-
-  void cudaGaugeField::zeroPad() {
-    size_t pad_bytes = (stride - volumeCB) * precision * order;
-    int Npad = (geometry * (reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : nColor * nColor * 2)) / order;
-
-    size_t pitch = stride*order*precision;
-    if (pad_bytes) {
-      qudaMemset2D(static_cast<char *>(even) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad);
-      qudaMemset2D(static_cast<char *>(odd) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad);
-    }
-  }
-
-  cudaGaugeField::~cudaGaugeField()
-  {
-    if (create != QUDA_REFERENCE_FIELD_CREATE) {
-      switch(mem_type) {
-      case QUDA_MEMORY_DEVICE:
-        if (gauge) pool_device_free(gauge);
-        break;
-      case QUDA_MEMORY_MAPPED:
-        if (gauge_h) host_free(gauge_h);
-        break;
-      default:
-        errorQuda("Unsupported memory type %d", mem_type);
-      }
-    }
-
-    if ( !isNative() ) {
-      for (int i=0; i<nDim; i++) {
-        if (ghost[i]) pool_device_free(ghost[i]);
-        if (ghost[i + 4] && geometry == QUDA_COARSE_GEOMETRY) pool_device_free(ghost[i + 4]);
-      }
-    }
-
-  }
-
-  // This does the exchange of the forwards boundary gauge field ghost zone and places
-  // it into the ghost array of the next node
-  void cudaGaugeField::exchangeGhost(QudaLinkDirection link_direction) {
-
-    if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD) errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange);
-    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) errorQuda("Invalid geometry=%d", geometry);
-    if ( (link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == QUDA_LINK_FORWARDS) && geometry != QUDA_COARSE_GEOMETRY)
-      errorQuda("Cannot request exchange of forward links on non-coarse geometry");
-    if (nFace == 0) errorQuda("nFace = 0");
-
-    const int dir = 1; // sending forwards only
-    const lat_dim_t R = {nFace, nFace, nFace, nFace};
-    const bool no_comms_fill = true; // dslash kernels presently require this
-    const bool bidir = false; // communication is only ever done in one direction at once
-    createComms(R, true, bidir); // always need to allocate space for non-partitioned dimension for copyGenericGauge
-
-    // loop over backwards and forwards links
-    const QudaLinkDirection directions[] = {QUDA_LINK_BACKWARDS, QUDA_LINK_FORWARDS};
-    for (int link_dir = 0; link_dir<2; link_dir++) {
-      if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;
-
-      void *send_d[2*QUDA_MAX_DIM] = { };
-      void *recv_d[2*QUDA_MAX_DIM] = { };
-
-      size_t offset = 0;
-      for (int d=0; d<nDim; d++) {
-        recv_d[d] = static_cast<char *>(ghost_recv_buffer_d[bufferIndex]) + offset;
-        if (bidir) offset += ghost_face_bytes_aligned[d];
-        send_d[d] = static_cast<char *>(ghost_send_buffer_d[bufferIndex]) + offset;
-        offset += ghost_face_bytes_aligned[d];
-      }
-
-      extractGaugeGhost(*this, send_d, true, link_dir*nDim); // get the links into contiguous buffers
-      qudaDeviceSynchronize(); // synchronize before issuing mem copies in different streams - could replace with event post and wait
-
-      // issue receive preposts and host-to-device copies if needed
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim)) continue;
-	recvStart(dim, dir); // prepost the receive
-	if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
-          qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
-                          ghost_face_bytes[dim], qudaMemcpyDeviceToHost, device::get_stream(2 * dim + dir));
-        }
-      }
-
-      // if gdr enabled then synchronize
-      if (comm_gdr_enabled()) qudaDeviceSynchronize();
-
-      // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim)) continue;
-        if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled())
-          qudaStreamSynchronize(device::get_stream(2 * dim + dir));
-        sendStart(dim, dir, device::get_stream(2 * dim + dir)); // start sending
-      }
-
-      // complete communication and issue host-to-device copies if needed
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim)) continue;
-	commsComplete(dim, dir);
-	if (!comm_peer2peer_enabled(1-dir,dim) && !comm_gdr_enabled()) {
-          qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1 - dir], from_face_dim_dir_h[bufferIndex][dim][1 - dir],
-                          ghost_face_bytes[dim], qudaMemcpyHostToDevice, device::get_stream(2 * dim + dir));
-        }
-      }
-
-      qudaDeviceSynchronize(); // synchronize before issuing kernels / copies in default stream - could replace with event post and wait
-
-      // fill in the halos for non-partitioned dimensions
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim) && no_comms_fill) {
-          qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
-        }
-      }
-
-      if (isNative()) {
-	copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, 0, recv_d, 1 + 2*link_dir); // 1, 3
-      } else {
-	// copy from receive buffer into ghost array
-	for (int dim=0; dim<nDim; dim++)
-          qudaMemcpy(ghost[dim + link_dir * nDim], recv_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
-      }
-
-      bufferIndex = 1-bufferIndex;
-    } // link_dir
-
-    qudaDeviceSynchronize();
-  }
-
-  // This does the opposite of exchangeGhost and sends back the ghost
-  // zone to the node from which it came and injects it back into the
-  // field
-  void cudaGaugeField::injectGhost(QudaLinkDirection link_direction)
-  {
-    if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD) errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange);
-    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) errorQuda("Invalid geometry=%d", geometry);
-    if (link_direction != QUDA_LINK_BACKWARDS) errorQuda("Invalid link_direction = %d", link_direction);
-    if (nFace == 0) errorQuda("nFace = 0");
-
-    const int dir = 0; // sending backwards only
-    const lat_dim_t R = {nFace, nFace, nFace, nFace};
-    const bool no_comms_fill = false; // injection never does no_comms_fill
-    const bool bidir = false; // communication is only ever done in one direction at once
-    createComms(R, true, bidir); // always need to allocate space for non-partitioned dimension for copyGenericGauge
-
-    // loop over backwards and forwards links (forwards links never sent but leave here just in case)
-    const QudaLinkDirection directions[] = {QUDA_LINK_BACKWARDS, QUDA_LINK_FORWARDS};
-    for (int link_dir = 0; link_dir<2; link_dir++) {
-      if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;
-
-      void *send_d[2*QUDA_MAX_DIM] = { };
-      void *recv_d[2*QUDA_MAX_DIM] = { };
-
-      size_t offset = 0;
-      for (int d=0; d<nDim; d++) {
-	// send backwards is first half of each ghost_send_buffer
-        send_d[d] = static_cast<char *>(ghost_send_buffer_d[bufferIndex]) + offset;
-        if (bidir) offset += ghost_face_bytes_aligned[d];
-        // receive from forwards is the second half of each ghost_recv_buffer
-        recv_d[d] = static_cast<char *>(ghost_recv_buffer_d[bufferIndex]) + offset;
-        offset += ghost_face_bytes_aligned[d];
-      }
-
-      if (isNative()) { // copy from padded region in gauge field into send buffer
-	copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, send_d, 0, 1 + 2*link_dir);
-      } else { // copy from receive buffer into ghost array
-        for (int dim = 0; dim < nDim; dim++)
-          qudaMemcpy(send_d[dim], ghost[dim + link_dir * nDim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
-      }
-      qudaDeviceSynchronize(); // need to synchronize before issueing copies in different streams - could replace with event post and wait
-
-      // issue receive preposts and host-to-device copies if needed
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim)) continue;
-	recvStart(dim, dir); // prepost the receive
-	if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
-          qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
-                          ghost_face_bytes[dim], qudaMemcpyDeviceToHost, device::get_stream(2 * dim + dir));
-        }
-      }
-
-      // if gdr enabled then synchronize
-      if (comm_gdr_enabled()) qudaDeviceSynchronize();
-
-      // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim)) continue;
-        if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled())
-          qudaStreamSynchronize(device::get_stream(2 * dim + dir));
-        sendStart(dim, dir, device::get_stream(2 * dim + dir)); // start sending
-      }
-
-      // complete communication and issue host-to-device copies if needed
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim)) continue;
-	commsComplete(dim, dir);
-	if (!comm_peer2peer_enabled(1-dir,dim) && !comm_gdr_enabled()) {
-          qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1 - dir], from_face_dim_dir_h[bufferIndex][dim][1 - dir],
-                          ghost_face_bytes[dim], qudaMemcpyHostToDevice, device::get_stream(2 * dim + dir));
-        }
-      }
-
-      qudaDeviceSynchronize(); // synchronize before issuing kernel / copies in default stream - could replace with event post and wait
-
-      // fill in the halos for non-partitioned dimensions
-      for (int dim=0; dim<nDim; dim++) {
-	if (!comm_dim_partitioned(dim) && no_comms_fill) {
-          qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
-        }
-      }
-
-      // get the links into contiguous buffers
-      extractGaugeGhost(*this, recv_d, false, link_dir*nDim);
-
-      bufferIndex = 1-bufferIndex;
-    } // link_dir
-
-    qudaDeviceSynchronize();
-  }
-
-  void cudaGaugeField::allocateGhostBuffer(const lat_dim_t &R, bool no_comms_fill, bool bidir) const
-  {
-    createGhostZone(R, no_comms_fill, bidir);
-    LatticeField::allocateGhostBuffer(ghost_bytes);
-  }
-
-  void cudaGaugeField::createComms(const lat_dim_t &R, bool no_comms_fill, bool bidir)
-  {
-    allocateGhostBuffer(R, no_comms_fill, bidir); // allocate the ghost buffer if not yet allocated
-
-    // ascertain if this instance needs it comms buffers to be updated
-    bool comms_reset = ghost_field_reset || // FIXME add send buffer check
-      (my_face_h[0] != ghost_pinned_send_buffer_h[0]) || (my_face_h[1] != ghost_pinned_send_buffer_h[1]) ||
-      (from_face_h[0] != ghost_pinned_recv_buffer_h[0]) || (from_face_h[1] != ghost_pinned_recv_buffer_h[1]) ||
-      ghost_bytes != ghost_bytes_old; // ghost buffer has been resized (e.g., bidir to unidir)
-
-    if (!initComms || comms_reset) LatticeField::createComms(no_comms_fill);
-
-    if (ghost_field_reset) destroyIPCComms();
-    createIPCComms();
-  }
-
-  void cudaGaugeField::recvStart(int dim, int dir)
-  {
-    if (!comm_dim_partitioned(dim)) return;
-
-    // receive from neighboring the processor
-    if (comm_peer2peer_enabled(1 - dir, dim)) {
-      comm_start(mh_recv_p2p[bufferIndex][dim][1 - dir]);
-    } else if (comm_gdr_enabled()) {
-      comm_start(mh_recv_rdma[bufferIndex][dim][1 - dir]);
-    } else {
-      comm_start(mh_recv[bufferIndex][dim][1 - dir]);
-    }
-  }
-
-  void cudaGaugeField::sendStart(int dim, int dir, const qudaStream_t &stream)
-  {
-    if (!comm_dim_partitioned(dim)) return;
-
-    if (!comm_peer2peer_enabled(dir,dim)) {
-      if (comm_gdr_enabled()) {
-        comm_start(mh_send_rdma[bufferIndex][dim][dir]);
-      } else {
-        comm_start(mh_send[bufferIndex][dim][dir]);
-      }
-    } else { // doing peer-to-peer
-
-      void *ghost_dst
-        = static_cast<char *>(ghost_remote_send_buffer_d[bufferIndex][dim][dir]) + ghost_offset[dim][(dir + 1) % 2];
-
-      qudaMemcpyP2PAsync(ghost_dst, my_face_dim_dir_d[bufferIndex][dim][dir], ghost_face_bytes[dim], stream);
-
-      // record the event
-      qudaEventRecord(ipcCopyEvent[bufferIndex][dim][dir], stream);
-      // send to the neighboring processor
-      comm_start(mh_send_p2p[bufferIndex][dim][dir]);
-    }
-  }
-
-  void cudaGaugeField::commsComplete(int dim, int dir)
-  {
-    if (!comm_dim_partitioned(dim)) return;
-
-    if (comm_peer2peer_enabled(1 - dir, dim)) {
-      comm_wait(mh_recv_p2p[bufferIndex][dim][1 - dir]);
-      qudaEventSynchronize(ipcRemoteCopyEvent[bufferIndex][dim][1 - dir]);
-    } else if (comm_gdr_enabled()) {
-      comm_wait(mh_recv_rdma[bufferIndex][dim][1 - dir]);
-    } else {
-      comm_wait(mh_recv[bufferIndex][dim][1 - dir]);
-    }
-
-    if (comm_peer2peer_enabled(dir, dim)) {
-      comm_wait(mh_send_p2p[bufferIndex][dim][dir]);
-      qudaEventSynchronize(ipcCopyEvent[bufferIndex][dim][dir]);
-    } else if (comm_gdr_enabled()) {
-      comm_wait(mh_send_rdma[bufferIndex][dim][dir]);
-    } else {
-      comm_wait(mh_send[bufferIndex][dim][dir]);
-    }
-  }
-
-  void cudaGaugeField::exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill)
-  {
-    const int b = bufferIndex;
-    void *send_d[QUDA_MAX_DIM], *recv_d[QUDA_MAX_DIM];
-
-    createComms(R, no_comms_fill);
-
-    size_t offset = 0;
-    for (int dim=0; dim<nDim; dim++) {
-      if ( !(comm_dim_partitioned(dim) || (no_comms_fill && R[dim])) ) continue;
-      send_d[dim] = static_cast<char*>(ghost_send_buffer_d[b]) + offset;
-      recv_d[dim] = static_cast<char*>(ghost_recv_buffer_d[b]) + offset;
-
-      // silence cuda-memcheck initcheck errors that arise since we
-      // have an oversized ghost buffer when doing the extended exchange
-      qudaMemsetAsync(send_d[dim], 0, 2 * ghost_face_bytes_aligned[dim], device::get_default_stream());
-      offset += 2 * ghost_face_bytes_aligned[dim]; // factor of two from fwd/back
-    }
-
-    for (int dim=0; dim<nDim; dim++) {
-      if ( !(comm_dim_partitioned(dim) || (no_comms_fill && R[dim])) ) continue;
-
-      //extract into a contiguous buffer
-      extractExtendedGaugeGhost(*this, dim, R, send_d, true);
-
-      if (comm_dim_partitioned(dim)) {
-        qudaDeviceSynchronize(); // synchronize before issuing mem copies in different streams - could replace with event post and wait
-
-        for (int dir=0; dir<2; dir++) recvStart(dim, dir);
-
-	for (int dir=0; dir<2; dir++) {
-	  // issue host-to-device copies if needed
-	  if (!comm_peer2peer_enabled(dir,dim) && !comm_gdr_enabled()) {
-            qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
-                            ghost_face_bytes[dim], qudaMemcpyDeviceToHost, device::get_stream(dir));
-          }
-        }
-
-        // if either direction is not peer-to-peer then we need to synchronize
-        if (!comm_peer2peer_enabled(0, dim) || !comm_peer2peer_enabled(1, dim)) qudaDeviceSynchronize();
-
-        for (int dir = 0; dir < 2; dir++) sendStart(dim, dir, device::get_stream(dir));
-        for (int dir = 0; dir < 2; dir++) commsComplete(dim, dir);
-
-        for (int dir = 0; dir < 2; dir++) {
-          // issue host-to-device copies if needed
-          if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) {
-            qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][dir], from_face_dim_dir_h[bufferIndex][dim][dir],
-                            ghost_face_bytes[dim], qudaMemcpyHostToDevice, device::get_stream(dir));
-          }
-        }
-
-      } else { // if just doing a local exchange to fill halo then need to swap faces
-        qudaMemcpy(from_face_dim_dir_d[b][dim][1], my_face_dim_dir_d[b][dim][0], ghost_face_bytes[dim],
-                   qudaMemcpyDeviceToDevice);
-        qudaMemcpy(from_face_dim_dir_d[b][dim][0], my_face_dim_dir_d[b][dim][1], ghost_face_bytes[dim],
-                   qudaMemcpyDeviceToDevice);
-      }
-
-      // inject back into the gauge field
-      // need to synchronize the copy streams before rejoining the compute stream - could replace with event post and wait
-      qudaDeviceSynchronize();
-      extractExtendedGaugeGhost(*this, dim, R, recv_d, false);
-    }
-
-    bufferIndex = 1-bufferIndex;
-    qudaDeviceSynchronize();
-  }
-
-  void cudaGaugeField::exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill)
-  {
-    profile.TPSTART(QUDA_PROFILE_COMMS);
-    exchangeExtendedGhost(R, no_comms_fill);
-    profile.TPSTOP(QUDA_PROFILE_COMMS);
-  }
-
-  void cudaGaugeField::setGauge(void *gauge_)
-  {
-    if(create != QUDA_REFERENCE_FIELD_CREATE) {
-      errorQuda("Setting gauge pointer is only allowed when create="
-          "QUDA_REFERENCE_FIELD_CREATE type\n");
-    }
-    gauge = gauge_;
-  }
-
-  void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
-    if (order == QUDA_QDP_GAUGE_ORDER) {
-      void **buffer = new void*[geometry];
-      for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes/geometry);
-      return ((void*)buffer);
-    } else {
-      return pool_device_malloc(bytes);
-    }
-
-  }
-
-  void **create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
-
-    if (order > 4) {
-      void **buffer = new void*[geometry];
-      for (int d=0; d<geometry; d++) buffer[d] = pool_device_malloc(bytes[d]);
-      return buffer;
-    } else {
-      return 0;
-    }
-
-  }
-
-  void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
-    if (order == QUDA_QDP_GAUGE_ORDER) {
-      for (int d=0; d<geometry; d++) pool_device_free(((void**)buffer)[d]);
-      delete []((void**)buffer);
-    } else {
-      pool_device_free(buffer);
-    }
-  }
-
-  void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) {
-    if (order > 4) {
-      for (int d=0; d<geometry; d++) pool_device_free(buffer[d]);
-      delete []buffer;
-    }
-  }
-
-  void cudaGaugeField::copy(const GaugeField &src) {
-    if (this == &src) return;
-
-    checkField(src);
-
-    if (link_type == QUDA_ASQTAD_FAT_LINKS) {
-      fat_link_max = src.LinkMax();
-      if (fat_link_max == 0.0 && precision < QUDA_SINGLE_PRECISION) fat_link_max = src.abs_max();
-    } else {
-      fat_link_max = 1.0;
-    }
-
-    if (typeid(src) == typeid(cudaGaugeField)) {
-
-      if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
-        // copy field and ghost zone into this field
-        copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge);
-
-        if (geometry == QUDA_COARSE_GEOMETRY)
-          copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge, 0, 0, 3);
-      } else {
-        copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast<const cudaGaugeField&>(src).gauge);
-        if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
-      }
-
-    } else if (typeid(src) == typeid(cpuGaugeField)) {
-      if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do reorder on the CPU
-	void *buffer = pool_pinned_malloc(bytes);
-
-	if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
-	  // copy field and ghost zone into buffer
-	  copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField&>(src).gauge);
-
-          if (geometry == QUDA_COARSE_GEOMETRY)
-            copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField &>(src).gauge,
-                             0, 0, 3);
-        } else {
-	  copyExtendedGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast<const cpuGaugeField&>(src).gauge);
-          if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
-	}
-
-	// this copies over both even and odd
-        qudaMemcpy(gauge, buffer, bytes, qudaMemcpyDefault);
-        pool_pinned_free(buffer);
-      } else { // else on the GPU
-
-        if (src.Order() == QUDA_MILC_SITE_GAUGE_ORDER ||
-            src.Order() == QUDA_BQCD_GAUGE_ORDER      ||
-            src.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
-	  // special case where we use zero-copy memory to read/write directly from application's array
-          void *src_d = get_mapped_device_pointer(src.Gauge_p());
-
-          if (src.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
-            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, src_d);
-          } else {
-            errorQuda("Ghost copy not supported here");
-          }
-
-        } else {
-	  void *buffer = create_gauge_buffer(src.Bytes(), src.Order(), src.Geometry());
-	  size_t ghost_bytes[8];
-	  int srcNinternal = src.Reconstruct() != QUDA_RECONSTRUCT_NO ? src.Reconstruct() : 2*nColor*nColor;
-	  for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * srcNinternal * src.Precision();
-	  void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, src.Order(), geometry) : nullptr;
-
-	  if (src.Order() == QUDA_QDP_GAUGE_ORDER) {
-	    for (int d=0; d<geometry; d++) {
-              qudaMemcpy(((void **)buffer)[d], ((void **)src.Gauge_p())[d], src.Bytes() / geometry, qudaMemcpyDefault);
-            }
-          } else {
-            qudaMemcpy(buffer, src.Gauge_p(), src.Bytes(), qudaMemcpyDefault);
-          }
-
-          if (src.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD
-              && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
-            for (int d = 0; d < geometry; d++)
-              qudaMemcpy(ghost_buffer[d], src.Ghost()[d], ghost_bytes[d], qudaMemcpyDefault);
-
-          if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
-            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer, 0, ghost_buffer);
-            if (geometry == QUDA_COARSE_GEOMETRY)
-              copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer, 0, ghost_buffer, 3);
-          } else {
-            copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer);
-            if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
-          }
-          free_gauge_buffer(buffer, src.Order(), src.Geometry());
-          if (nFace > 0) free_ghost_buffer(ghost_buffer, src.Order(), geometry);
-        }
-      } // reorder_location
-    } else {
-      errorQuda("Invalid gauge field type");
-    }
-
-    // if we have copied from a source without a pad then we need to exchange
-    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() != QUDA_GHOST_EXCHANGE_PAD)
-      exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
-
-    staggeredPhaseApplied = src.StaggeredPhaseApplied();
-    staggeredPhaseType = src.StaggeredPhase();
-
-    qudaDeviceSynchronize(); // include sync here for accurate host-device profiling
-  }
-
-  void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu) {
-    copy(cpu);
-    qudaDeviceSynchronize();
-  }
-
-  void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu, TimeProfile &profile) {
-    profile.TPSTART(QUDA_PROFILE_H2D);
-    loadCPUField(cpu);
-    profile.TPSTOP(QUDA_PROFILE_H2D);
-  }
-
-  void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const
-  {
-    static_cast<LatticeField&>(cpu).checkField(*this);
-
-    if (reorder_location() == QUDA_CUDA_FIELD_LOCATION) {
-
-      if (cpu.Order() == QUDA_MILC_SITE_GAUGE_ORDER ||
-          cpu.Order() == QUDA_BQCD_GAUGE_ORDER      ||
-          cpu.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
-	// special case where we use zero-copy memory to read/write directly from application's array
-        void *cpu_d = get_mapped_device_pointer(cpu.Gauge_p());
-        if (cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
-          copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, cpu_d, gauge);
-        } else {
-          errorQuda("Ghost copy not supported here");
-        }
-      } else {
-	void *buffer = create_gauge_buffer(cpu.Bytes(), cpu.Order(), cpu.Geometry());
-
-	// Allocate space for ghost zone if required
-	size_t ghost_bytes[8];
-	int cpuNinternal = cpu.Reconstruct() != QUDA_RECONSTRUCT_NO ? cpu.Reconstruct() : 2*nColor*nColor;
-	for (int d=0; d<geometry; d++) ghost_bytes[d] = nFace * surface[d%4] * cpuNinternal * cpu.Precision();
-	void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, cpu.Order(), geometry) : nullptr;
-
-	if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
-	  copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge, ghost_buffer, 0);
-	  if (geometry == QUDA_COARSE_GEOMETRY) copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge, ghost_buffer, 0, 3);
-	} else {
-	  copyExtendedGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge);
-	}
-
-	if (cpu.Order() == QUDA_QDP_GAUGE_ORDER) {
-          for (int d = 0; d < geometry; d++)
-            qudaMemcpy(((void **)cpu.gauge)[d], ((void **)buffer)[d], cpu.Bytes() / geometry, qudaMemcpyDefault);
-        } else {
-          qudaMemcpy(cpu.gauge, buffer, cpu.Bytes(), qudaMemcpyDefault);
-        }
-
-        if (cpu.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD
-            && cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
-          for (int d = 0; d < geometry; d++)
-            qudaMemcpy(cpu.Ghost()[d], ghost_buffer[d], ghost_bytes[d], qudaMemcpyDefault);
-
-        free_gauge_buffer(buffer, cpu.Order(), cpu.Geometry());
-        if (nFace > 0) free_ghost_buffer(ghost_buffer, cpu.Order(), geometry);
-      }
-    } else if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do copy then host-side reorder
-
-      void *buffer = pool_pinned_malloc(bytes);
-      qudaMemcpy(buffer, gauge, bytes, qudaMemcpyDefault);
-
-      if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
-	copyGenericGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, cpu.gauge, buffer);
-      } else {
-	copyExtendedGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, cpu.gauge, buffer);
-      }
-      pool_pinned_free(buffer);
-
-    } else {
-      errorQuda("Invalid pack location %d", reorder_location());
-    }
-
-    cpu.staggeredPhaseApplied = staggeredPhaseApplied;
-    cpu.staggeredPhaseType = staggeredPhaseType;
-
-    qudaDeviceSynchronize();
-  }
-
-  void cudaGaugeField::saveCPUField(cpuGaugeField &cpu, TimeProfile &profile) const {
-    profile.TPSTART(QUDA_PROFILE_D2H);
-    saveCPUField(cpu);
-    profile.TPSTOP(QUDA_PROFILE_D2H);
-  }
-
-  void cudaGaugeField::backup() const {
-    if (backed_up) errorQuda("Gauge field already backed up");
-    backup_h = new char[bytes];
-    qudaMemcpy(backup_h, gauge, bytes, qudaMemcpyDefault);
-    backed_up = true;
-  }
-
-  void cudaGaugeField::restore() const
-  {
-    if (!backed_up) errorQuda("Cannot restore since not backed up");
-    qudaMemcpy(gauge, backup_h, bytes, qudaMemcpyDefault);
-    delete []backup_h;
-    backed_up = false;
-  }
-
-  void cudaGaugeField::prefetch(QudaFieldLocation mem_space, qudaStream_t stream) const
-  {
-    if (is_prefetch_enabled() && mem_type == QUDA_MEMORY_DEVICE) {
-      if (gauge) qudaMemPrefetchAsync(gauge, bytes, mem_space, stream);
-      if (!isNative()) {
-        for (int i = 0; i < nDim; i++) {
-          size_t nbytes = nFace * surface[i] * nInternal * precision;
-          if (ghost[i] && nbytes) qudaMemPrefetchAsync(ghost[i], nbytes, mem_space, stream);
-          if (ghost[i + 4] && nbytes && geometry == QUDA_COARSE_GEOMETRY)
-            qudaMemPrefetchAsync(ghost[i + 4], nbytes, mem_space, stream);
-        }
-      }
-    }
-  }
-
-  void cudaGaugeField::zero() { qudaMemset(gauge, 0, bytes); }
-
-  void cudaGaugeField::copy_to_buffer(void *buffer) const
-  {
-    qudaMemcpy(buffer, Gauge_p(), Bytes(), qudaMemcpyDeviceToHost);
-  }
-
-  void cudaGaugeField::copy_from_buffer(void *buffer)
-  {
-    qudaMemcpy(Gauge_p(), buffer, Bytes(), qudaMemcpyHostToDevice);
-  }
-
-} // namespace quda
diff --git a/lib/dirac.cpp b/lib/dirac.cpp
index 6e0a5912d3..35411c1841 100644
--- a/lib/dirac.cpp
+++ b/lib/dirac.cpp
@@ -14,7 +14,6 @@ namespace quda {
     laplace3D(param.laplace3D),
     matpcType(param.matpcType),
     dagger(param.dagger),
-    flops(0),
     type(param.type),
     halo_precision(param.halo_precision),
     use_mobius_fused_kernel(param.use_mobius_fused_kernel),
@@ -29,7 +28,6 @@ namespace quda {
     laplace3D(dirac.laplace3D),
     matpcType(dirac.matpcType),
     dagger(dirac.dagger),
-    flops(0),
     type(dirac.type),
     halo_precision(dirac.halo_precision),
     profile("Dirac", false)
@@ -51,7 +49,6 @@ namespace quda {
       laplace3D = dirac.laplace3D;
       matpcType = dirac.matpcType;
       dagger = dirac.dagger;
-      flops = 0;
 
       for (int i=0; i<4; i++) commDim[i] = dirac.commDim[i];
 
@@ -115,7 +112,7 @@ namespace quda {
   }
 
   void Dirac::checkSpinorAlias(const ColorSpinorField &a, const ColorSpinorField &b) const {
-    if (a.V() == b.V()) errorQuda("Aliasing pointers");
+    if (a.data() == b.data()) errorQuda("Aliasing pointers");
   }
 
   // Dirac operator factory
diff --git a/lib/dirac_clover.cpp b/lib/dirac_clover.cpp
index cf57b39352..242fc23e24 100644
--- a/lib/dirac_clover.cpp
+++ b/lib/dirac_clover.cpp
@@ -38,7 +38,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyWilsonClover(out, in, *gauge, *clover, k, x, parity, dagger, commDim, profile);
-    flops += 1872ll*in.Volume();
   }
 
   // Public method to apply the clover term only
@@ -47,13 +46,11 @@ namespace quda {
     checkParitySpinor(in, out);
 
     ApplyClover(out, in, *clover, false, parity);
-    flops += 504ll*in.Volume();
   }
 
   void DiracClover::M(ColorSpinorField &out, const ColorSpinorField &in) const
   {
     ApplyWilsonClover(out, in, *gauge, *clover, -kappa, in, QUDA_INVALID_PARITY, dagger, commDim, profile);
-    flops += 1872ll * in.Volume();
   }
 
   void DiracClover::MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -105,7 +102,7 @@ namespace quda {
     DiracClover(param)
   {
     // For the preconditioned operator, we need to check that the inverse of the clover term is present
-    if (!clover->cloverInv && !clover::dynamic_inverse()) errorQuda("Clover inverse required for DiracCloverPC");
+    if (!clover->Inverse() && !clover::dynamic_inverse()) errorQuda("Clover inverse required for DiracCloverPC");
   }
 
   DiracCloverPC::DiracCloverPC(const DiracCloverPC &dirac) : DiracClover(dirac) { }
@@ -127,7 +124,6 @@ namespace quda {
     checkParitySpinor(in, out);
 
     ApplyClover(out, in, *clover, true, parity);
-    flops += 504ll*in.Volume();
   }
 
   // apply hopping term, then clover: (A_ee^-1 D_eo) or (A_oo^-1 D_oe),
@@ -140,7 +136,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyWilsonCloverPreconditioned(out, in, *gauge, *clover, 0.0, in, parity, dagger, commDim, profile);
-    flops += 1824ll*in.Volume();
   }
 
   // xpay version of the above
@@ -152,7 +147,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyWilsonCloverPreconditioned(out, in, *gauge, *clover, k, x, parity, dagger, commDim, profile);
-    flops += 1872ll*in.Volume();
   }
 
   // Apply the even-odd preconditioned clover-improved Dirac operator
diff --git a/lib/dirac_clover_hasenbusch_twist.cpp b/lib/dirac_clover_hasenbusch_twist.cpp
index 93600c8299..8c82d7bbd8 100644
--- a/lib/dirac_clover_hasenbusch_twist.cpp
+++ b/lib/dirac_clover_hasenbusch_twist.cpp
@@ -42,9 +42,6 @@ namespace quda
         ApplyWilsonCloverHasenbuschTwist(out.Odd(), in.Even(), *gauge, *clover, -kappa, mu, in.Odd(), QUDA_ODD_PARITY,
                                          dagger, commDim, profile);
       }
-
-      // 2 c/b applies of DiracClover + (1-imu gamma_5 A)psi_{!p}
-      flops += 2 * 1872ll * in.VolumeCB() + (48ll + 504ll) * in.VolumeCB();
     } else {
       if (matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
         ApplyWilsonClover(out.Even(), in.Odd(), *gauge, *clover, -kappa, in.Even(), QUDA_EVEN_PARITY, dagger, commDim,
@@ -57,8 +54,6 @@ namespace quda
         ApplyWilsonClover(out.Odd(), in.Even(), *gauge, *clover, -kappa, in.Odd(), QUDA_ODD_PARITY, dagger, commDim,
                           profile);
       }
-      // 2 c/b applies of DiracClover + (1-imu gamma_5)psi_{!p}
-      flops += 2 * 1872ll * in.VolumeCB() + 48ll * in.VolumeCB();
     }
   }
 
@@ -115,9 +110,6 @@ namespace quda
     checkSpinorAlias(in, out);
 
     ApplyWilsonCloverHasenbuschTwistPCClovInv(out, in, *gauge, *clover, k, b, x, parity, dagger, commDim, profile);
-
-    // DiracCloverPC.DslashXPay -/+ mu ( i gamma_5 ) A
-    flops += (1872ll + 48ll + 504ll) * in.Volume();
   }
 
   // xpay version of the above
@@ -129,9 +121,6 @@ namespace quda
     checkSpinorAlias(in, out);
 
     ApplyWilsonCloverHasenbuschTwistPCNoClovInv(out, in, *gauge, *clover, k, b, x, parity, dagger, commDim, profile);
-
-    //    DiracCloverPC.DslashXPay -/+ mu ( i gamma_5 )
-    flops += (1872ll + 48) * in.Volume();
   }
 
   // Apply the even-odd preconditioned clover-improved Dirac operator
@@ -155,7 +144,6 @@ namespace quda
 
       // applies (A + imu*g5 - kappa^2 D)-
       ApplyTwistedClover(out, tmp, *gauge, *clover, kappa2, mu, in, parity[1], dagger, commDim, profile);
-      flops += 1872ll * in.Volume();
     } else if (!dagger) { // symmetric preconditioning
       // We need two cases because M = 1-ADAD and M^\dag = 1-D^\dag A D^dag A
       // where A is actually a clover inverse.
@@ -188,7 +176,7 @@ namespace quda
   {
     // double a = - 2.0 * kappa * mu * T.Vectors().TwistFlavor();
     // CoarseOp(Y, X, T, *gauge, &clover, kappa, a, -mu_factor,QUDA_CLOVERPC_DIRAC, matpcType);
-    errorQuda("Not yet implemented\n");
+    errorQuda("Not yet implemented");
   }
 
 } // namespace quda
diff --git a/lib/dirac_coarse.cpp b/lib/dirac_coarse.cpp
index b69544510a..6956df7e69 100644
--- a/lib/dirac_coarse.cpp
+++ b/lib/dirac_coarse.cpp
@@ -29,12 +29,12 @@ namespace quda {
     initializeCoarse();
   }
 
-  DiracCoarse::DiracCoarse(const DiracParam &param, std::shared_ptr<cpuGaugeField> Y_h,
-                           std::shared_ptr<cpuGaugeField> X_h, std::shared_ptr<cpuGaugeField> Xinv_h,
-                           std::shared_ptr<cpuGaugeField> Yhat_h, // cpu link fields
-                           std::shared_ptr<cudaGaugeField> Y_d, std::shared_ptr<cudaGaugeField> X_d,
-                           std::shared_ptr<cudaGaugeField> Xinv_d,
-                           std::shared_ptr<cudaGaugeField> Yhat_d) // gpu link field
+  DiracCoarse::DiracCoarse(const DiracParam &param, std::shared_ptr<GaugeField> Y_h, std::shared_ptr<GaugeField> X_h,
+                           std::shared_ptr<GaugeField> Xinv_h,
+                           std::shared_ptr<GaugeField> Yhat_h, // cpu link fields
+                           std::shared_ptr<GaugeField> Y_d, std::shared_ptr<GaugeField> X_d,
+                           std::shared_ptr<GaugeField> Xinv_d,
+                           std::shared_ptr<GaugeField> Yhat_d) // gpu link field
     :
     Dirac(param),
     mass(param.mass),
@@ -69,7 +69,7 @@ namespace quda {
     {
       GaugeFieldParam param(X);
       param.order = gOrder;
-      auto output = std::shared_ptr<cudaGaugeField>(static_cast<cudaGaugeField *>(cudaGaugeField::Create(param)));
+      auto output = std::shared_ptr<GaugeField>(GaugeField::Create(param));
       output->copy(X);
       return output;
     };
@@ -156,12 +156,12 @@ namespace quda {
     gParam.pad = gpu ? gParam.nFace * pad * 2 : 0; // factor of 2 since we have to store bi-directional ghost zone
 
     if (gpu) {
-      Y_d = std::make_shared<cudaGaugeField>(gParam);
+      Y_d = std::make_shared<GaugeField>(gParam);
       GaugeFieldParam milcParam(*Y_d);
       milcParam.order = QUDA_MILC_GAUGE_ORDER;
-      if (need_aos_gauge_copy) { Y_aos_d = std::make_shared<cudaGaugeField>(milcParam); }
+      if (need_aos_gauge_copy) { Y_aos_d = std::make_shared<GaugeField>(milcParam); }
     } else
-      Y_h = std::make_shared<cpuGaugeField>(gParam);
+      Y_h = std::make_shared<GaugeField>(gParam);
 
     gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
     gParam.nFace = 0;
@@ -169,12 +169,12 @@ namespace quda {
     gParam.pad = 0;
 
     if (gpu) {
-      X_d = std::make_shared<cudaGaugeField>(gParam);
+      X_d = std::make_shared<GaugeField>(gParam);
       GaugeFieldParam milcParam(*X_d);
       milcParam.order = QUDA_MILC_GAUGE_ORDER;
-      if (need_aos_gauge_copy) { X_aos_d = std::make_shared<cudaGaugeField>(milcParam); }
+      if (need_aos_gauge_copy) { X_aos_d = std::make_shared<GaugeField>(milcParam); }
     } else
-      X_h = std::make_shared<cpuGaugeField>(gParam);
+      X_h = std::make_shared<GaugeField>(gParam);
   }
 
   void DiracCoarse::createYhat(bool gpu) const
@@ -208,12 +208,12 @@ namespace quda {
     gParam.pad = gpu ? gParam.nFace * pad * 2 : 0; // factor of 2 since we have to store bi-directional ghost zone
 
     if (gpu) {
-      Yhat_d = std::make_shared<cudaGaugeField>(gParam);
+      Yhat_d = std::make_shared<GaugeField>(gParam);
       GaugeFieldParam milcParam(*Yhat_d);
       milcParam.order = QUDA_MILC_GAUGE_ORDER;
-      if (need_aos_gauge_copy) { Yhat_aos_d = std::make_shared<cudaGaugeField>(milcParam); }
+      if (need_aos_gauge_copy) { Yhat_aos_d = std::make_shared<GaugeField>(milcParam); }
     } else
-      Yhat_h = std::make_shared<cpuGaugeField>(gParam);
+      Yhat_h = std::make_shared<GaugeField>(gParam);
 
     gParam.setPrecision(gpu ? X_d->Precision() : X_h->Precision());
     gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
@@ -222,12 +222,12 @@ namespace quda {
     gParam.pad = 0;
 
     if (gpu) {
-      Xinv_d = std::make_shared<cudaGaugeField>(gParam);
+      Xinv_d = std::make_shared<GaugeField>(gParam);
       GaugeFieldParam milcParam(*Xinv_d);
       milcParam.order = QUDA_MILC_GAUGE_ORDER;
-      if (need_aos_gauge_copy) { Xinv_aos_d = std::make_shared<cudaGaugeField>(milcParam); }
+      if (need_aos_gauge_copy) { Xinv_aos_d = std::make_shared<GaugeField>(milcParam); }
     } else
-      Xinv_h = std::make_shared<cpuGaugeField>(gParam);
+      Xinv_h = std::make_shared<GaugeField>(gParam);
   }
 
   void DiracCoarse::initializeCoarse()
@@ -254,7 +254,6 @@ namespace quda {
       if (setup_use_mma && dirac->isCoarse()) {
 
         dirac->createCoarseOp(*Y_aos_d, *X_aos_d, *transfer, kappa, mass, Mu(), MuFactor(), AllowTruncation());
-
         X_d->copy(*X_aos_d);
 
         if (getVerbosity() >= QUDA_VERBOSE) printfQuda("About to build the preconditioned coarse clover\n");
@@ -375,8 +374,6 @@ namespace quda {
       ApplyCoarse(out, in, in, *Y_h, *X_h, kappa, parity, false, true, dagger, commDim, QUDA_INVALID_PRECISION,
                   dslash_use_mma);
     }
-    int n = in[0].Nspin() * in[0].Ncolor();
-    flops += (8 * n * n - 2 * n) * (long long)in[0].VolumeCB() * in.size();
   }
 
   void DiracCoarse::CloverInv(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
@@ -393,8 +390,6 @@ namespace quda {
       ApplyCoarse(out, in, in, *Y_h, *Xinv_h, kappa, parity, false, true, dagger, commDim, QUDA_INVALID_PRECISION,
                   dslash_use_mma);
     }
-    int n = in[0].Nspin() * in[0].Ncolor();
-    flops += (8 * n * n - 2 * n) * (long long)in[0].VolumeCB() * in.size();
   }
 
   void DiracCoarse::Dslash(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
@@ -410,9 +405,6 @@ namespace quda {
     } else if ( location == QUDA_CPU_FIELD_LOCATION ) {
       ApplyCoarse(out, in, in, *Y_h, *X_h, kappa, parity, true, false, dagger, commDim, halo_precision, dslash_use_mma);
     }
-
-    int n = in[0].Nspin() * in[0].Ncolor();
-    flops += (8 * (8 * n * n) - 2 * n) * (long long)in[0].VolumeCB() * in[0].SiteSubset() * in.size();
   }
 
   void DiracCoarse::DslashXpay(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
@@ -429,8 +421,6 @@ namespace quda {
     } else if ( location == QUDA_CPU_FIELD_LOCATION ) {
       ApplyCoarse(out, in, x, *Y_h, *X_h, kappa, parity, true, true, dagger, commDim, halo_precision, dslash_use_mma);
     }
-    int n = in[0].Nspin() * in[0].Ncolor();
-    flops += (9 * (8 * n * n) - 2 * n) * (long long)in[0].VolumeCB() * in[0].SiteSubset() * in.size();
   }
 
   void DiracCoarse::M(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const
@@ -446,8 +436,6 @@ namespace quda {
       ApplyCoarse(out, in, in, *Y_h, *X_h, kappa, QUDA_INVALID_PARITY, true, true, dagger, commDim, halo_precision,
                   dslash_use_mma);
     }
-    int n = in[0].Nspin() * in[0].Ncolor();
-    flops += (9 * (8 * n * n) - 2 * n) * (long long)in[0].VolumeCB() * in[0].SiteSubset() * in.size();
   }
 
   void DiracCoarse::MdagM(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const
@@ -507,11 +495,10 @@ namespace quda {
     /* do nothing */
   }
 
-  DiracCoarsePC::DiracCoarsePC(const DiracParam &param, std::shared_ptr<cpuGaugeField> Y_h,
-                               std::shared_ptr<cpuGaugeField> X_h, std::shared_ptr<cpuGaugeField> Xinv_h,
-                               std::shared_ptr<cpuGaugeField> Yhat_h, std::shared_ptr<cudaGaugeField> Y_d,
-                               std::shared_ptr<cudaGaugeField> X_d, std::shared_ptr<cudaGaugeField> Xinv_d,
-                               std::shared_ptr<cudaGaugeField> Yhat_d) :
+  DiracCoarsePC::DiracCoarsePC(const DiracParam &param, std::shared_ptr<GaugeField> Y_h, std::shared_ptr<GaugeField> X_h,
+                               std::shared_ptr<GaugeField> Xinv_h, std::shared_ptr<GaugeField> Yhat_h,
+                               std::shared_ptr<GaugeField> Y_d, std::shared_ptr<GaugeField> X_d,
+                               std::shared_ptr<GaugeField> Xinv_d, std::shared_ptr<GaugeField> Yhat_d) :
     DiracCoarse(param, Y_h, X_h, Xinv_h, Yhat_h, Y_d, X_d, Xinv_d, Yhat_d)
   {
   }
@@ -537,9 +524,6 @@ namespace quda {
       ApplyCoarse(out, in, in, *Yhat_h, *X_h, kappa, parity, true, false, dagger, commDim, halo_precision,
                   dslash_use_mma);
     }
-
-    int n = in[0].Nspin() * in[0].Ncolor();
-    flops += (8 * (8 * n * n) - 2 * n) * in[0].VolumeCB() * in[0].SiteSubset() * in.size();
   }
 
   void DiracCoarsePC::DslashXpay(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
@@ -548,10 +532,6 @@ namespace quda {
     // FIXME emulated for now
     Dslash(out, in, parity);
     for (auto i = 0u; i < x.size(); i++) blas::xpay(x[i], k, out[i]);
-
-    int n = in[0].Nspin() * in[0].Ncolor();
-    flops += (8 * (8 * n * n) - 2 * n) * in[0].VolumeCB()
-      * in.size(); // blas flops counted separately so only need to count dslash flops
   }
 
   void DiracCoarsePC::M(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in) const
diff --git a/lib/dirac_domain_wall.cpp b/lib/dirac_domain_wall.cpp
index cc29f2927f..85b9a4f76b 100644
--- a/lib/dirac_domain_wall.cpp
+++ b/lib/dirac_domain_wall.cpp
@@ -49,11 +49,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall5D(out, in, *gauge, 0.0, mass, in, parity, dagger, commDim, profile);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls-2)*(in.Volume()/Ls);
-    long long wall = 2*in.Volume()/Ls;
-    flops += 1320LL*(long long)in.Volume() + 96LL*bulk + 120LL*wall;
   }
 
   void DiracDomainWall::DslashXpay(ColorSpinorField &out, const ColorSpinorField &in, 
@@ -65,11 +60,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall5D(out, in, *gauge, k, mass, x, parity, dagger, commDim, profile);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls-2)*(in.Volume()/Ls);
-    long long wall = 2*in.Volume()/Ls;
-    flops += (1320LL+48LL)*(long long)in.Volume() + 96LL*bulk + 120LL*wall;
   }
 
   void DiracDomainWall::M(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -77,11 +67,6 @@ namespace quda {
     checkFullSpinor(out, in);
 
     ApplyDomainWall5D(out, in, *gauge, -kappa5, mass, in, QUDA_INVALID_PARITY, dagger, commDim, profile);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += (1320LL + 48LL) * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   void DiracDomainWall::MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
diff --git a/lib/dirac_domain_wall_4d.cpp b/lib/dirac_domain_wall_4d.cpp
index 7c69233f70..043101e2b5 100644
--- a/lib/dirac_domain_wall_4d.cpp
+++ b/lib/dirac_domain_wall_4d.cpp
@@ -26,7 +26,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4D(out, in, *gauge, 0.0, 0.0, nullptr, nullptr, in, parity, dagger, commDim, profile);
-    flops += 1320LL*(long long)in.Volume();
   }
 
   void DiracDomainWall4D::Dslash5(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -36,11 +35,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, in, mass, 0.0, nullptr, nullptr, 0.0, dagger, Dslash5Type::DSLASH5_DWF);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls-2)*(in.Volume()/Ls);
-    long long wall = 2*in.Volume()/Ls;
-    flops += 96LL*bulk + 120LL*wall;
   }
 
   // Modification for the 4D preconditioned domain wall operator
@@ -52,8 +46,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4D(out, in, *gauge, k, 0.0, nullptr, nullptr, x, parity, dagger, commDim, profile);
-
-    flops += (1320LL+48LL)*(long long)in.Volume();
   }
 
   void DiracDomainWall4D::Dslash5Xpay(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x,
@@ -64,11 +56,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, x, mass, 0.0, nullptr, nullptr, k, dagger, Dslash5Type::DSLASH5_DWF);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls-2)*(in.Volume()/Ls);
-    long long wall = 2*in.Volume()/Ls;
-    flops += (48LL)*(long long)in.Volume() + 96LL*bulk + 120LL*wall;
   }
 
   void DiracDomainWall4D::M(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -76,13 +63,7 @@ namespace quda {
     checkFullSpinor(out, in);
 
     ApplyDomainWall4D(out, in, *gauge, 0.0, 0.0, nullptr, nullptr, in, QUDA_INVALID_PARITY, dagger, commDim, profile);
-    flops += 1320LL * (long long)in.Volume();
     ApplyDslash5(out, in, out, mass, 0.0, nullptr, nullptr, 1.0, dagger, Dslash5Type::DSLASH5_DWF);
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += (48LL) * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
-
     blas::xpay(in, -kappa5, out);
   }
 
@@ -132,9 +113,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, in, mass, m5, nullptr, nullptr, 0.0, dagger, Dslash5Type::M5_INV_DWF);
-
-    long long Ls = in.X(4);
-    flops += 144LL * (long long)in.Volume() * Ls + 3LL * Ls * (Ls - 1LL);
   }
 
   void DiracDomainWall4DPC::M5invXpay(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x,
@@ -145,9 +123,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, x, mass, m5, nullptr, nullptr, b, dagger, Dslash5Type::M5_INV_DWF);
-
-    long long Ls = in.X(4);
-    flops += (144LL * Ls + 48LL) * (long long)in.Volume() + 3LL * Ls * (Ls - 1LL);
   }
 
   // Apply the 4D even-odd preconditioned domain-wall Dirac operator
diff --git a/lib/dirac_improved_staggered.cpp b/lib/dirac_improved_staggered.cpp
index b6423f9a02..e700a88cce 100644
--- a/lib/dirac_improved_staggered.cpp
+++ b/lib/dirac_improved_staggered.cpp
@@ -32,7 +32,6 @@ namespace quda {
     checkParitySpinor(in, out);
 
     ApplyImprovedStaggered(out, in, *fatGauge, *longGauge, 0., in, parity, dagger, commDim, profile);
-    flops += 1146ll*in.Volume();
   }
 
   void DiracImprovedStaggered::DslashXpay(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity,
@@ -49,10 +48,8 @@ namespace quda {
       } else {
         ApplyImprovedStaggered(out, in, *fatGauge, *longGauge, 0., x, parity, QUDA_DAG_YES, commDim, profile);
       }
-      flops += 1146ll * in.Volume();
     } else {
       ApplyImprovedStaggered(out, in, *fatGauge, *longGauge, k, x, parity, dagger, commDim, profile);
-      flops += 1158ll * in.Volume();
     }
   }
 
@@ -69,11 +66,9 @@ namespace quda {
         ApplyImprovedStaggered(out, in, *fatGauge, *longGauge, 0., in, QUDA_INVALID_PARITY, QUDA_DAG_YES, commDim,
                                profile);
       }
-      flops += 1146ll * in.Volume();
     } else {
       ApplyImprovedStaggered(out, in, *fatGauge, *longGauge, 2. * mass, in, QUDA_INVALID_PARITY, dagger, commDim,
                              profile);
-      flops += 1158ll * in.Volume();
     }
   }
 
@@ -134,8 +129,6 @@ namespace quda {
     } else {
       ApplyStaggeredQSmear(out, in, *gauge, t0_local, is_time_slice, parity, laplace3D, dagger, comm_dim, profile);
     }
-
-    flops += ( laplace3D > 3 ? 570ll : 426ll ) * ( in.Volume() / ( is_time_slice ? in.X(3) : 1 ) );
   }  
 
   void DiracImprovedStaggered::createCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, double, double mass,
diff --git a/lib/dirac_improved_staggered_kd.cpp b/lib/dirac_improved_staggered_kd.cpp
index fdba112b7f..d7e058d3fd 100644
--- a/lib/dirac_improved_staggered_kd.cpp
+++ b/lib/dirac_improved_staggered_kd.cpp
@@ -59,29 +59,23 @@ namespace quda
       if (mass == 0.) {
         ApplyImprovedStaggered(tmp, in, *fatGauge, *longGauge, 0., in, QUDA_INVALID_PARITY, QUDA_DAG_YES, commDim,
                                profile);
-        flops += 1146ll * in.Volume();
       } else {
         ApplyImprovedStaggered(tmp, in, *fatGauge, *longGauge, 2. * mass, in, QUDA_INVALID_PARITY, dagger, commDim,
                                profile);
-        flops += 1158ll * in.Volume();
       }
 
       ApplyStaggeredKahlerDiracInverse(out, tmp, *Xinv, false);
-      flops += (8ll * 48 - 2ll) * 48 * in.Volume() / 16; // for 2^4 block
 
     } else { // QUDA_DAG_YES
 
       ApplyStaggeredKahlerDiracInverse(tmp, in, *Xinv, true);
-      flops += (8ll * 48 - 2ll) * 48 * in.Volume() / 16; // for 2^4 block
 
       if (mass == 0.) {
         ApplyImprovedStaggered(out, tmp, *fatGauge, *longGauge, 0., tmp, QUDA_INVALID_PARITY, QUDA_DAG_NO, commDim,
                                profile);
-        flops += 1146ll * in.Volume();
       } else {
         ApplyImprovedStaggered(out, tmp, *fatGauge, *longGauge, 2. * mass, tmp, QUDA_INVALID_PARITY, dagger, commDim,
                                profile);
-        flops += 1158ll * in.Volume();
       }
     }
   }
@@ -154,8 +148,8 @@ namespace quda
     // Should we support "preparing" and "reconstructing"?
   }
 
-  void DiracImprovedStaggeredKD::updateFields(cudaGaugeField *, cudaGaugeField *fat_gauge_in,
-                                              cudaGaugeField *long_gauge_in, CloverField *)
+  void DiracImprovedStaggeredKD::updateFields(GaugeField *, GaugeField *fat_gauge_in, GaugeField *long_gauge_in,
+                                              CloverField *)
   {
     Dirac::updateFields(fat_gauge_in, nullptr, nullptr, nullptr);
     fatGauge = fat_gauge_in;
diff --git a/lib/dirac_mobius.cpp b/lib/dirac_mobius.cpp
index d60d8060fc..aaf2aaf6fc 100644
--- a/lib/dirac_mobius.cpp
+++ b/lib/dirac_mobius.cpp
@@ -58,8 +58,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4D(out, in, *gauge, 0.0, 0.0, nullptr, nullptr, in, parity, dagger, commDim, profile);
-
-    flops += 1320LL * (long long)in.Volume();
   }
 
   void DiracMobius::Dslash4pre(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -69,11 +67,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, in, mass, m5, b_5, c_5, 0.0, dagger, Dslash5Type::DSLASH5_MOBIUS_PRE);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 72LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   // Unlike DWF-4d, the Mobius variant here applies the full M5 operator and not just D5
@@ -84,11 +77,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, in, mass, m5, b_5, c_5, 0.0, dagger, Dslash5Type::DSLASH5_MOBIUS);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 48LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   // Modification for the 4D preconditioned Mobius domain wall operator
@@ -100,8 +88,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4D(out, in, *gauge, k, m5, b_5, c_5, x, parity, dagger, commDim, profile);
-
-    flops += 1320LL * (long long)in.Volume();
   }
 
   void DiracMobius::Dslash4preXpay(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x,
@@ -112,12 +98,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, x, mass, m5, b_5, c_5, k, dagger, Dslash5Type::DSLASH5_MOBIUS_PRE);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-
-    flops += (72LL + 48LL) * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   // The xpay operator bakes in a factor of kappa_b^2
@@ -129,11 +109,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDslash5(out, in, x, mass, m5, b_5, c_5, k, dagger, Dslash5Type::DSLASH5_MOBIUS);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 96LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   void DiracMobius::M(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -156,13 +131,6 @@ namespace quda {
       ApplyDslash5(out, in, in, mass, m5, b_5, c_5, 0.0, dagger, Dslash5Type::DSLASH5_MOBIUS);
     }
     blas::axpy(-mobius_kappa_b, tmp, out);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 72LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall; // pre
-    flops += 1320LL * (long long)in.Volume();                            // dslash4
-    flops += 48LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall; // dslash5
   }
 
   void DiracMobius::MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -223,9 +191,6 @@ namespace quda {
 
     ApplyDslash5(out, in, in, mass, m5, b_5, c_5, 0.0, dagger,
                  zMobius ? Dslash5Type::M5_INV_ZMOBIUS : Dslash5Type::M5_INV_MOBIUS);
-
-    long long Ls = in.X(4);
-    flops += 144LL * (long long)in.Volume() * Ls + 3LL * Ls * (Ls - 1LL);
   }
 
   // The xpay operator bakes in a factor of kappa_b^2
@@ -238,9 +203,6 @@ namespace quda {
 
     ApplyDslash5(out, in, x, mass, m5, b_5, c_5, k, dagger,
                  zMobius ? Dslash5Type::M5_INV_ZMOBIUS : Dslash5Type::M5_INV_MOBIUS);
-
-    long long Ls = in.X(4);
-    flops += (144LL * Ls + 48LL) * (long long)in.Volume() + 3LL * Ls * (Ls - 1LL);
   }
 
   void DiracMobiusPC::Dslash4M5invM5pre(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
@@ -250,16 +212,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4DM5invM5pre(out, in, *gauge, 0.0, m5, b_5, c_5, in, out, parity, dagger, commDim, mass, profile);
-
-    // D4
-    flops += 1320LL * (long long)in.Volume();
-    // M5inv
-    long long Ls = in.X(4);
-    flops += 144LL * (long long)in.Volume() * Ls + 3LL * Ls * (Ls - 1LL);
-    // M5pre
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 72LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   void DiracMobiusPC::Dslash4M5preM5inv(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity) const
@@ -269,16 +221,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4DM5preM5inv(out, in, *gauge, 0.0, m5, b_5, c_5, in, out, parity, dagger, commDim, mass, profile);
-
-    // D4
-    flops += 1320LL * (long long)in.Volume();
-    // M5inv
-    long long Ls = in.X(4);
-    flops += 144LL * (long long)in.Volume() * Ls + 3LL * Ls * (Ls - 1LL);
-    // M5pre
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 72LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   void DiracMobiusPC::Dslash4M5invXpay(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity,
@@ -289,14 +231,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4DM5inv(out, in, *gauge, a, m5, b_5, c_5, x, out, parity, dagger, commDim, mass, profile);
-
-    // D4
-    flops += 1320LL * (long long)in.Volume();
-    // M5inv
-    long long Ls = in.X(4);
-    flops += 144LL * (long long)in.Volume() * Ls + 3LL * Ls * (Ls - 1LL);
-    // xpay
-    flops += 48LL * (long long)in.Volume();
   }
 
   void DiracMobiusPC::Dslash4M5preXpay(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity,
@@ -307,16 +241,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4DM5pre(out, in, *gauge, a, m5, b_5, c_5, x, out, parity, dagger, commDim, mass, profile);
-
-    // D4
-    flops += 1320LL * (long long)in.Volume();
-    // M5pre
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 72LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
-    // xpay
-    flops += 48LL * (long long)in.Volume();
   }
 
   void DiracMobiusPC::Dslash4XpayM5mob(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity,
@@ -327,16 +251,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4DM5mob(out, in, *gauge, a, m5, b_5, c_5, x, out, parity, dagger, commDim, mass, profile);
-
-    // D4
-    flops += 1320LL * (long long)in.Volume();
-    // M5mob
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 48LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
-    // xpay
-    flops += 48LL * (long long)in.Volume();
   }
 
   void DiracMobiusPC::Dslash4M5preXpayM5mob(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity,
@@ -347,18 +261,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4DM5preM5mob(out, in, *gauge, a, m5, b_5, c_5, x, out, parity, dagger, commDim, mass, profile);
-
-    // D4
-    flops += 1320LL * (long long)in.Volume();
-    // M5pre
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 72LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
-    // M5mob
-    flops += 48LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
-    // xpay
-    flops += 48LL * (long long)in.Volume();
   }
 
   void DiracMobiusPC::Dslash4M5invXpayM5inv(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity,
@@ -369,16 +271,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyDomainWall4DM5invM5inv(out, in, *gauge, a, m5, b_5, c_5, x, y, parity, dagger, commDim, mass, profile);
-
-    // D4
-    flops += 1320LL * (long long)in.Volume();
-    // M5inv
-    long long Ls = in.X(4);
-    flops += 144LL * (long long)in.Volume() * Ls + 3LL * Ls * (Ls - 1LL);
-    // M5inv
-    flops += 144LL * (long long)in.Volume() * Ls + 3LL * Ls * (Ls - 1LL);
-    // xpay
-    flops += 48LL * (long long)in.Volume();
   }
 
   // Apply the even-odd preconditioned mobius DWF operator
@@ -554,7 +446,7 @@ namespace quda {
 
   void DiracMobiusPC::MdagMLocal(ColorSpinorField &out, const ColorSpinorField &in) const
   {
-    if (zMobius) { errorQuda("DiracMobiusPC::MdagMLocal doesn't currently support zMobius.\n"); }
+    if (zMobius) errorQuda("DiracMobiusPC::MdagMLocal doesn't currently support zMobius");
 
     lat_dim_t shift0 = {0, 0, 0, 0};
     lat_dim_t shift1;
@@ -565,7 +457,7 @@ namespace quda {
       shift2[d] = comm_dim_partitioned(d) ? 2 : 0;
     }
 
-    if (extended_gauge == nullptr) { extended_gauge = createExtendedGauge(*gauge, shift2, profile, true); }
+    if (extended_gauge == nullptr) extended_gauge = createExtendedGauge(*gauge, shift2, profile, true);
 
     checkDWF(in, out);
     checkSpinorAlias(in, out);
@@ -573,67 +465,36 @@ namespace quda {
     ColorSpinorParam csParam(out);
     csParam.create = QUDA_NULL_FIELD_CREATE;
 
-    ColorSpinorField *unextended_tmp1 = ColorSpinorField::Create(csParam);
-    ColorSpinorField *unextended_tmp2 = ColorSpinorField::Create(csParam);
+    ColorSpinorField unextended_tmp1(csParam);
+    ColorSpinorField unextended_tmp2(csParam);
 
     csParam.x[0] += shift2[0]; // x direction is checkerboarded
     for (int d = 1; d < 4; ++d) { csParam.x[d] += shift2[d] * 2; }
-    ColorSpinorField *extended_tmp1 = ColorSpinorField::Create(csParam);
-    ColorSpinorField *extended_tmp2 = ColorSpinorField::Create(csParam);
+    ColorSpinorField extended_tmp1(csParam);
+    ColorSpinorField extended_tmp2(csParam);
 
     int odd_bit = (getMatPCType() == QUDA_MATPC_ODD_ODD) ? 1 : 0;
     QudaParity parity[2] = {static_cast<QudaParity>((1 + odd_bit) % 2), static_cast<QudaParity>((0 + odd_bit) % 2)};
     if (out.Precision() == QUDA_HALF_PRECISION || out.Precision() == QUDA_QUARTER_PRECISION) {
-      mobius_tensor_core::apply_fused_dslash(*unextended_tmp2, in, *extended_gauge, *unextended_tmp2, in, mass, m5, b_5,
+      mobius_tensor_core::apply_fused_dslash(unextended_tmp2, in, *extended_gauge, unextended_tmp2, in, mass, m5, b_5,
                                              c_5, dagger, parity[1], shift0.data, shift0.data,
                                              MdwfFusedDslashType::D5PRE);
 
-      mobius_tensor_core::apply_fused_dslash(*extended_tmp2, *unextended_tmp2, *extended_gauge, *extended_tmp2,
-                                             *unextended_tmp2, mass, m5, b_5, c_5, dagger, parity[0], shift1.data,
+      mobius_tensor_core::apply_fused_dslash(extended_tmp2, unextended_tmp2, *extended_gauge, extended_tmp2,
+                                             unextended_tmp2, mass, m5, b_5, c_5, dagger, parity[0], shift1.data,
                                              shift2.data, MdwfFusedDslashType::D4_D5INV_D5PRE);
 
-      mobius_tensor_core::apply_fused_dslash(*extended_tmp1, *extended_tmp2, *extended_gauge, *unextended_tmp1, in,
-                                             mass, m5, b_5, c_5, dagger, parity[1], shift0.data, shift1.data,
+      mobius_tensor_core::apply_fused_dslash(extended_tmp1, extended_tmp2, *extended_gauge, unextended_tmp1, in, mass,
+                                             m5, b_5, c_5, dagger, parity[1], shift0.data, shift1.data,
                                              MdwfFusedDslashType::D4_D5INV_D5INVDAG);
 
-      mobius_tensor_core::apply_fused_dslash(*extended_tmp2, *extended_tmp1, *extended_gauge, *extended_tmp2,
-                                             *extended_tmp1, mass, m5, b_5, c_5, dagger, parity[0], shift1.data,
+      mobius_tensor_core::apply_fused_dslash(extended_tmp2, extended_tmp1, *extended_gauge, extended_tmp2,
+                                             extended_tmp1, mass, m5, b_5, c_5, dagger, parity[0], shift1.data,
                                              shift1.data, MdwfFusedDslashType::D4DAG_D5PREDAG_D5INVDAG);
 
-      mobius_tensor_core::apply_fused_dslash(out, *extended_tmp2, *extended_gauge, out, *unextended_tmp1, mass, m5, b_5,
+      mobius_tensor_core::apply_fused_dslash(out, extended_tmp2, *extended_gauge, out, unextended_tmp1, mass, m5, b_5,
                                              c_5, dagger, parity[1], shift2.data, shift2.data,
                                              MdwfFusedDslashType::D4DAG_D5PREDAG);
-
-      const long long Ls = in.X(4);
-      const long long mat = 2ll * 4ll * Ls - 1ll; // (multiplicaiton-add) * (spin) * Ls - 1
-      const long long hop = 7ll * 8ll;            // 8 for eight directions
-
-      long long vol;
-      long long halo_vol;
-
-      vol = (2 * in.X(0)) * in.X(1) * in.X(2) * in.X(3) * Ls / 2ll;
-      flops += vol * 24ll * mat;
-
-      vol = (2 * in.X(0) + 2 * 1) * (in.X(1) + 2 * 1) * (in.X(2) + 2 * 1) * (in.X(3) + 2 * 1) * Ls / 2ll;
-      halo_vol = (2 * in.X(0)) * in.X(1) * in.X(2) * in.X(3) * Ls / 2ll;
-      flops += halo_vol * 24ll * hop + vol * 24ll * mat;
-
-      vol = (2 * in.X(0) + 2 * 2) * (in.X(1) + 2 * 2) * (in.X(2) + 2 * 2) * (in.X(3) + 2 * 2) * Ls / 2ll;
-      halo_vol = (2 * in.X(0) + 2 * 1) * (in.X(1) + 2 * 1) * (in.X(2) + 2 * 1) * (in.X(3) + 2 * 1) * Ls / 2ll;
-      flops += halo_vol * 24ll * hop + vol * 24ll * mat * 2ll;
-
-      vol = (2 * in.X(0) + 2 * 1) * (in.X(1) + 2 * 1) * (in.X(2) + 2 * 1) * (in.X(3) + 2 * 1) * Ls / 2ll;
-      flops += vol * 24ll * (hop + mat);
-
-      vol = (2 * in.X(0)) * in.X(1) * in.X(2) * in.X(3) * Ls / 2ll;
-      flops += vol * 24ll * (hop + mat);
-
-      delete extended_tmp2;
-      delete extended_tmp1;
-
-      delete unextended_tmp1;
-      delete unextended_tmp2;
-
     } else {
       errorQuda("DiracMobiusPC::MdagMLocal(...) only supports half and quarter precision");
     }
@@ -710,20 +571,13 @@ namespace quda {
 
   void DiracMobiusEofa::m5_eofa(ColorSpinorField &out, const ColorSpinorField &in) const
   {
-    if (in.Ndim() != 5 || out.Ndim() != 5) errorQuda("Wrong number of dimensions\n");
+    if (in.Ndim() != 5 || out.Ndim() != 5) errorQuda("Wrong number of dimensions");
 
     checkDWF(in, out);
     checkSpinorAlias(in, out);
 
     mobius_eofa::apply_dslash5(out, in, in, mass, m5, b_5, c_5, 0., eofa_pm, m5inv_fac, mobius_kappa, eofa_u, eofa_x,
                                eofa_y, sherman_morrison_fac, dagger, Dslash5Type::M5_EOFA);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-
-    // 96 = 48 + 48, the second 48 from EOFA
-    flops += 96LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   void DiracMobiusEofa::m5_eofa_xpay(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x,
@@ -738,13 +592,6 @@ namespace quda {
     // The kernel will actually do (m5 * in - kappa_b^2 * x)
     mobius_eofa::apply_dslash5(out, in, x, mass, m5, b_5, c_5, a, eofa_pm, m5inv_fac, mobius_kappa, eofa_u, eofa_x,
                                eofa_y, sherman_morrison_fac, dagger, Dslash5Type::M5_EOFA);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-
-    // 144 = 96 + 48, the 48 from EOFA
-    flops += 144LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall;
   }
 
   void DiracMobiusEofa::M(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -768,15 +615,6 @@ namespace quda {
                                  eofa_y, sherman_morrison_fac, dagger, Dslash5Type::M5_EOFA);
     }
     blas::axpy(-mobius_kappa_b, tmp, out);
-
-    long long Ls = in.X(4);
-    long long bulk = (Ls - 2) * (in.Volume() / Ls);
-    long long wall = 2 * in.Volume() / Ls;
-    flops += 72LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall; // pre
-    flops += 1320LL * (long long)in.Volume();                            // dslash4
-
-    // 96 = 48 + 48, the second 48 from EOFA
-    flops += 96LL * (long long)in.Volume() + 96LL * bulk + 120LL * wall; // dslash5
   }
 
   void DiracMobiusEofa::MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -816,9 +654,6 @@ namespace quda {
 
     mobius_eofa::apply_dslash5(out, in, in, mass, m5, b_5, c_5, 0., eofa_pm, m5inv_fac, mobius_kappa, eofa_u, eofa_x,
                                eofa_y, sherman_morrison_fac, dagger, Dslash5Type::M5INV_EOFA);
-
-    long long Ls = in.X(4);
-    flops += (192LL * Ls + 96LL) * (long long)in.Volume() + 3LL * Ls * (Ls - 1LL);
   }
 
   void DiracMobiusEofaPC::m5inv_eofa_xpay(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x,
@@ -834,9 +669,6 @@ namespace quda {
     // The kernel will actually do (x - kappa_b^2 * m5inv * in)
     mobius_eofa::apply_dslash5(out, in, x, mass, m5, b_5, c_5, a, eofa_pm, m5inv_fac, mobius_kappa, eofa_u, eofa_x,
                                eofa_y, sherman_morrison_fac, dagger, Dslash5Type::M5INV_EOFA);
-
-    long long Ls = in.X(4);
-    flops += (192LL * Ls + 48LL + 96LL) * (long long)in.Volume() + 3LL * Ls * (Ls - 1LL);
   }
 
   // Apply the even-odd preconditioned mobius DWF EOFA operator
diff --git a/lib/dirac_staggered.cpp b/lib/dirac_staggered.cpp
index eb04c249b1..fcb7641a3f 100644
--- a/lib/dirac_staggered.cpp
+++ b/lib/dirac_staggered.cpp
@@ -25,7 +25,6 @@ namespace quda {
     checkParitySpinor(in, out);
 
     ApplyStaggered(out, in, *gauge, 0., in, parity, dagger, commDim, profile);
-    flops += 570ll*in.Volume();
   }
 
   void DiracStaggered::DslashXpay(ColorSpinorField &out, const ColorSpinorField &in, 
@@ -43,10 +42,8 @@ namespace quda {
       } else {
         ApplyStaggered(out, in, *gauge, 0., x, parity, QUDA_DAG_YES, commDim, profile);
       }
-      flops += 570ll * in.Volume();
     } else {
       ApplyStaggered(out, in, *gauge, k, x, parity, dagger, commDim, profile);
-      flops += 582ll * in.Volume();
     }
   }
 
@@ -66,10 +63,8 @@ namespace quda {
       } else {
         ApplyStaggered(out, in, *gauge, 0., in, QUDA_INVALID_PARITY, QUDA_DAG_YES, commDim, profile);
       }
-      flops += 570ll * in.Volume();
     } else {
       ApplyStaggered(out, in, *gauge, 2. * mass, in, QUDA_INVALID_PARITY, dagger, commDim, profile);
-      flops += 582ll * in.Volume();
     }
   }
 
@@ -142,7 +137,6 @@ namespace quda {
     } else {
       ApplyStaggeredQSmear(out, in, *gauge, t0_local, is_time_slice, parity, laplace3D, dagger, comm_dim, profile);
     }
-    flops += ( laplace3D > 3 ? 570ll : 426ll ) * ( in.Volume() / ( is_time_slice ? in.X(3) : 1 ) );
   }  
   
 
diff --git a/lib/dirac_staggered_kd.cpp b/lib/dirac_staggered_kd.cpp
index 9271c8afc3..ffb1bda9e3 100644
--- a/lib/dirac_staggered_kd.cpp
+++ b/lib/dirac_staggered_kd.cpp
@@ -58,25 +58,20 @@ namespace quda
 
       if (mass == 0.) {
         ApplyStaggered(tmp, in, *gauge, 0., in, QUDA_INVALID_PARITY, QUDA_DAG_YES, commDim, profile);
-        flops += 570ll * in.Volume();
       } else {
         ApplyStaggered(tmp, in, *gauge, 2. * mass, in, QUDA_INVALID_PARITY, dagger, commDim, profile);
-        flops += 582ll * in.Volume();
       }
+
       ApplyStaggeredKahlerDiracInverse(out, tmp, *Xinv, false);
-      flops += (8ll * 48 - 2ll) * 48 * in.Volume() / 16; // for 2^4 block
 
     } else { // QUDA_DAG_YES
 
       ApplyStaggeredKahlerDiracInverse(tmp, in, *Xinv, true);
-      flops += (8ll * 48 - 2ll) * 48 * in.Volume() / 16; // for 2^4 block
 
       if (mass == 0.) {
         ApplyStaggered(out, tmp, *gauge, 0., tmp, QUDA_INVALID_PARITY, QUDA_DAG_NO, commDim, profile);
-        flops += 570ll * in.Volume();
       } else {
         ApplyStaggered(out, tmp, *gauge, 2. * mass, tmp, QUDA_INVALID_PARITY, dagger, commDim, profile);
-        flops += 582ll * in.Volume();
       }
     }
   }
@@ -150,7 +145,7 @@ namespace quda
     // Should we support "preparing" and "reconstructing"?
   }
 
-  void DiracStaggeredKD::updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *)
+  void DiracStaggeredKD::updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *)
   {
     Dirac::updateFields(gauge_in, nullptr, nullptr, nullptr);
   }
diff --git a/lib/dirac_twisted_clover.cpp b/lib/dirac_twisted_clover.cpp
index 25d91776a7..e6806e5d50 100644
--- a/lib/dirac_twisted_clover.cpp
+++ b/lib/dirac_twisted_clover.cpp
@@ -50,11 +50,6 @@ namespace quda {
   {
     checkParitySpinor(out, in);
     ApplyTwistClover(out, in, *clover, kappa, mu, epsilon, parity, dagger, twistType);
-
-    if (twistType == QUDA_TWIST_GAMMA5_INVERSE)
-      flops += (504ll + 504ll + 48ll) * in.Volume();
-    else
-      flops += (504ll + 48ll) * in.Volume();
   }
 
 
@@ -79,15 +74,10 @@ namespace quda {
       // tm_rho is a Hasenbusch mass preconditioning parameter applied just like a twisted mass
       // but *not* the inverse of M_ee or M_oo
       ApplyTwistedClover(out, in, *gauge, *clover, k, 2 * (mu + tm_rho) * kappa, x, parity, dagger, commDim, profile);
-      // wilson + chiral twist + clover
-      flops += (1320ll + 48ll + 504ll) * in.Volume();
-
     } else {
       // k * D * in + (A + i*2*mu*kappa*gamma_5 * tau_3 - 2 * kappa * epsilon * tau_1 ) * x
       ApplyNdegTwistedClover(out, in, *gauge, *clover, k, 2 * mu * kappa, -2 * kappa * epsilon, x, parity, dagger,
                              commDim, profile);
-      // wilson + chiral twist + flavour twist + clover
-      flops += (1320ll + 48ll + 48ll + 504ll) * in.Volume();
     }
   }
 
@@ -106,14 +96,10 @@ namespace quda {
       // (-kappa * D + A + i*2*mu*kappa*gamma_5 ) * in
       ApplyTwistedClover(out, in, *gauge, *clover, -kappa, 2.0 * kappa * mu, in, QUDA_INVALID_PARITY, dagger, commDim,
                          profile);
-      // wilson + chiral twist + clover
-      flops += (1320ll + 48ll + 504ll) * in.Volume();
     } else {
       // (-kappa * D + A + i*2*mu*kappa*gamma_5*tau_3 - 2*epsilon*kappa*tau_1) * in
       ApplyNdegTwistedClover(out, in, *gauge, *clover, -kappa, 2 * kappa * mu, -2 * kappa * epsilon, in,
                              QUDA_INVALID_PARITY, dagger, commDim, profile);
-      // wilson + chiral twist + flavor twist + clover
-      flops += (1320ll + 48ll + 48ll + 504ll) * in.Volume();
     }
   }
 
@@ -231,11 +217,9 @@ namespace quda {
       if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
         ApplyTwistedCloverPreconditioned(out, in, *gauge, *clover, 1.0, -2.0 * kappa * mu, false, in, parity, dagger,
                                          commDim, profile);
-        flops += (1320ll + 48ll + 504ll) * in.Volume();
       } else {
         ApplyNdegTwistedCloverPreconditioned(out, in, *gauge, *clover, 1.0, -2.0 * kappa * mu, 2.0 * kappa * epsilon,
                                              false, in, parity, dagger, commDim, profile);
-        flops += (1320ll + 48ll + 48ll + 504ll) * in.Volume();
       }
     }
   }
@@ -260,11 +244,9 @@ namespace quda {
       if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
         ApplyTwistedCloverPreconditioned(out, in, *gauge, *clover, k, -2.0 * kappa * mu, true, x, parity, dagger,
                                          commDim, profile);
-        flops += (1320ll + 48ll + 504ll) * in.Volume();
       } else {
         ApplyNdegTwistedCloverPreconditioned(out, in, *gauge, *clover, k, -2.0 * kappa * mu, 2.0 * kappa * epsilon,
                                              true, x, parity, dagger, commDim, profile);
-        flops += (1320ll + 48ll + 48ll + 504ll) * in.Volume();
       }
     }
   }
diff --git a/lib/dirac_twisted_mass.cpp b/lib/dirac_twisted_mass.cpp
index 0c2911c851..964c1191cc 100644
--- a/lib/dirac_twisted_mass.cpp
+++ b/lib/dirac_twisted_mass.cpp
@@ -36,7 +36,6 @@ namespace quda {
   {
     checkParitySpinor(out, in);
     ApplyTwistGamma(out, in, 4, kappa, mu, epsilon, dagger, twistType);
-    flops += 24ll*in.Volume();
   }
 
   // Public method to apply the twist
@@ -51,12 +50,10 @@ namespace quda {
     if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
       // this would really just be a Wilson dslash (not actually instantiated at present)
       ApplyTwistedMass(out, in, *gauge, 0.0, 2 * mu * kappa, in, parity, dagger, commDim, profile);
-      flops += 1392ll * in.Volume();
     } else {
       // this would really just be a 2-way vectorized Wilson dslash (not actually instantiated at present)
       ApplyNdegTwistedMass(
           out, in, *gauge, 0.0, 2 * mu * kappa, -2 * kappa * epsilon, in, parity, dagger, commDim, profile);
-      flops += (1440ll) * in.Volume();
     }
   }
 
@@ -67,11 +64,9 @@ namespace quda {
     if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
       // k * D * in + (1 + i*2*mu*kappa*gamma_5) *x
       ApplyTwistedMass(out, in, *gauge, k, 2 * mu * kappa, x, parity, dagger, commDim, profile);
-      flops += 1416ll * in.Volume();
     } else {
       // k * D * in + (1 + i*2*mu*kappa*gamma_5*tau_3 - 2*epsilon*kappa*tau_1) * x
       ApplyNdegTwistedMass(out, in, *gauge, k, 2 * mu * kappa, -2 * kappa * epsilon, x, parity, dagger, commDim, profile);
-      flops += (1464ll) * in.Volume();
     }
   }
 
@@ -88,11 +83,9 @@ namespace quda {
 
     if (in.TwistFlavor() == QUDA_TWIST_SINGLET) {
       ApplyTwistedMass(out, in, *gauge, -kappa, 2 * mu * kappa, in, QUDA_INVALID_PARITY, dagger, commDim, profile);
-      flops += 1416ll * in.Volume();
     } else {
       ApplyNdegTwistedMass(out, in, *gauge, -kappa, 2 * mu * kappa, -2 * kappa * epsilon, in, QUDA_INVALID_PARITY,
           dagger, commDim, profile);
-      flops += (1464ll) * in.Volume();
     }
   }
 
@@ -174,7 +167,6 @@ namespace quda {
       bool asymmetric
           = (matpcType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) && dagger;
       ApplyTwistedMassPreconditioned(out, in, *gauge, b, a, false, in, parity, dagger, asymmetric, commDim, profile);
-      flops += 1392ll * in.Volume(); // flops numbers are approximate since they will vary depending on the dagger or not
     } else {//TWIST doublet :
       double a = 2.0 * kappa * mu;
       double b = 2.0 * kappa * epsilon;
@@ -184,7 +176,6 @@ namespace quda {
           = (matpcType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) && dagger;
       ApplyNdegTwistedMassPreconditioned(out, in, *gauge, c, -2.0 * mu * kappa, 2.0 * kappa * epsilon, false, in,
           parity, dagger, asymmetric, commDim, profile);
-      flops += (1440ll) * in.Volume(); // flops are approx. since they will vary depending on the dagger or not
     }
   }
 
@@ -206,7 +197,6 @@ namespace quda {
       bool asymmetric
           = (matpcType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) && dagger;
       ApplyTwistedMassPreconditioned(out, in, *gauge, b, a, true, x, parity, dagger, asymmetric, commDim, profile);
-      flops += 1416ll * in.Volume(); // flops numbers are approximate since they will vary depending on the dagger or not
     } else {//TWIST_DOUBLET:
       double a = 2.0 * kappa * mu;
       double b = 2.0 * kappa * epsilon;
@@ -216,8 +206,6 @@ namespace quda {
           = (matpcType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || matpcType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) && dagger;
       ApplyNdegTwistedMassPreconditioned(out, in, *gauge, k * c, -2 * mu * kappa, 2 * kappa * epsilon, true, x, parity,
           dagger, asymmetric, commDim, profile);
-      flops += (1464ll)
-          * in.Volume(); // flops numbers are approximate since they will vary depending on the dagger or not
     }
   }
 
diff --git a/lib/dirac_wilson.cpp b/lib/dirac_wilson.cpp
index 336d7d38be..a73fbe7080 100644
--- a/lib/dirac_wilson.cpp
+++ b/lib/dirac_wilson.cpp
@@ -28,7 +28,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyWilson(out, in, *gauge, 0.0, in, parity, dagger, commDim, profile);
-    flops += 1320ll*in.Volume();
   }
 
   void DiracWilson::DslashXpay(ColorSpinorField &out, const ColorSpinorField &in, const QudaParity parity,
@@ -38,7 +37,6 @@ namespace quda {
     checkSpinorAlias(in, out);
 
     ApplyWilson(out, in, *gauge, k, x, parity, dagger, commDim, profile);
-    flops += 1368ll*in.Volume();
   }
 
   void DiracWilson::M(ColorSpinorField &out, const ColorSpinorField &in) const
@@ -46,7 +44,6 @@ namespace quda {
     checkFullSpinor(out, in);
 
     ApplyWilson(out, in, *gauge, -kappa, in, QUDA_INVALID_PARITY, dagger, commDim, profile);
-    flops += 1368ll * in.Volume();
   }
 
   void DiracWilson::MdagM(ColorSpinorField &out, const ColorSpinorField &in) const
diff --git a/lib/dslash_clover_helper.cu b/lib/dslash_clover_helper.cu
index 7389394ba1..fa2ba4d365 100644
--- a/lib/dslash_clover_helper.cu
+++ b/lib/dslash_clover_helper.cu
@@ -35,8 +35,14 @@ namespace quda {
       launch<CloverApply>(tp, stream, CloverArg<Float, nColor>(out, in, clover, parity));
     }
 
-    void preTune() { if (out.V() == in.V()) out.backup(); }  // Backup if in and out fields alias
-    void postTune() { if (out.V() == in.V()) out.restore(); } // Restore if the in and out fields alias
+    void preTune()
+    {
+      if (out.data() == in.data()) out.backup();
+    } // Backup if in and out fields alias
+    void postTune()
+    {
+      if (out.data() == in.data()) out.restore();
+    } // Restore if the in and out fields alias
     long long flops() const { return in.Volume()*504ll; }
     long long bytes() const { return out.Bytes() + in.Bytes() + clover.Bytes() / (3 - in.SiteSubset()); }
   };
@@ -115,8 +121,14 @@ namespace quda {
       }
     }
 
-    void preTune() { if (out.V() == in.V()) out.backup(); } // Restore if the in and out fields alias
-    void postTune() { if (out.V() == in.V()) out.restore(); } // Restore if the in and out fields alias
+    void preTune()
+    {
+      if (out.data() == in.data()) out.backup();
+    } // Backup if the in and out fields alias
+    void postTune()
+    {
+      if (out.data() == in.data()) out.restore();
+    } // Restore if the in and out fields alias
     long long flops() const { return (inverse ? 1056ll : 552ll) * in.Volume(); }
     long long bytes() const {
       long long rtn = out.Bytes() + in.Bytes() + clover.Bytes() / (3 - in.SiteSubset());
diff --git a/lib/dslash_coarse.hpp b/lib/dslash_coarse.hpp
index ebb8c03a05..fd755394b4 100644
--- a/lib/dslash_coarse.hpp
+++ b/lib/dslash_coarse.hpp
@@ -418,7 +418,7 @@ namespace quda {
      */
     inline void operator()(DslashCoarsePolicy policy)
     {
-      if (inA[0].V() == out[0].V()) errorQuda("Aliasing pointers");
+      if (inA[0].data() == out[0].data()) errorQuda("Aliasing pointers");
 
       // check all precisions match
       QudaPrecision precision = checkPrecision(out[0], inA[0], inB[0]);
diff --git a/lib/dslash_coarse.in.cpp b/lib/dslash_coarse.in.cpp
index 1077f904a8..d2a51102e8 100644
--- a/lib/dslash_coarse.in.cpp
+++ b/lib/dslash_coarse.in.cpp
@@ -74,7 +74,7 @@ namespace quda
       GaugeFieldParam param(X);
       param.order = order;
       param.location = QUDA_CUDA_FIELD_LOCATION;
-      output = static_cast<GaugeField *>(cudaGaugeField::Create(param));
+      output = static_cast<GaugeField *>(GaugeField::Create(param));
       if (copy_content) { output->copy(X); }
     }
     return output;
diff --git a/lib/dslash_gamma_helper.cu b/lib/dslash_gamma_helper.cu
index 4b7ef2458c..66c523df20 100644
--- a/lib/dslash_gamma_helper.cu
+++ b/lib/dslash_gamma_helper.cu
@@ -74,8 +74,14 @@ namespace quda {
       launch<TwistGamma>(tp, stream, GammaArg<Float, nColor>(out, in, d, kappa, mu, epsilon, dagger, type));
     }
 
-    void preTune() { if (out.V() == in.V()) out.backup(); }
-    void postTune() { if (out.V() == in.V()) out.restore(); }
+    void preTune()
+    {
+      if (out.data() == in.data()) out.backup();
+    }
+    void postTune()
+    {
+      if (out.data() == in.data()) out.restore();
+    }
     long long flops() const { return 0; }
     long long bytes() const { return out.Bytes() + in.Bytes(); }
   };
diff --git a/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu
index d169f4f0e1..ca8ce572d9 100644
--- a/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu
+++ b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu
@@ -140,16 +140,6 @@ namespace quda
                                                    const ColorSpinorField &x, int parity, bool dagger,
                                                    const int *comm_override, TimeProfile &profile)
   {
-    if (in.V() == out.V()) errorQuda("Aliasing pointers");
-    if (in.FieldOrder() != out.FieldOrder())
-      errorQuda("Field order mismatch in = %d, out = %d", in.FieldOrder(), out.FieldOrder());
-
-    // check all precisions match
-    checkPrecision(out, in, U, A);
-
-    // check all locations match
-    checkLocation(out, in, U, A);
-
     instantiate<WilsonCloverHasenbuschTwistPCNoClovInvApply>(out, in, U, A, a, b, x, parity, dagger, comm_override,
                                                              profile);
   }
diff --git a/lib/eigensolve_quda.cpp b/lib/eigensolve_quda.cpp
index 00c888dd88..710d6ac13a 100644
--- a/lib/eigensolve_quda.cpp
+++ b/lib/eigensolve_quda.cpp
@@ -259,8 +259,6 @@ namespace quda
       io.save(kSpace, save_prec, n_eig);
     }
 
-    mat.flops();
-
     logQuda(QUDA_SUMMARIZE, "********************************\n");
     logQuda(QUDA_SUMMARIZE, "***** END QUDA EIGENSOLVER *****\n");
     logQuda(QUDA_SUMMARIZE, "********************************\n");
diff --git a/lib/gauge_ape.cu b/lib/gauge_ape.cu
index 5ace8e5a29..248b7d1d6c 100644
--- a/lib/gauge_ape.cu
+++ b/lib/gauge_ape.cu
@@ -57,7 +57,9 @@ namespace quda {
 
     copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
     in.exchangeExtendedGhost(in.R(), false);
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<GaugeAPE>(out, in, alpha);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     out.exchangeExtendedGhost(out.R(), false);
   }
 
diff --git a/lib/gauge_covdev.cpp b/lib/gauge_covdev.cpp
index e3a9e751e4..0d458c14f3 100644
--- a/lib/gauge_covdev.cpp
+++ b/lib/gauge_covdev.cpp
@@ -26,7 +26,6 @@ namespace quda {
     // only switch on comms needed for mu derivative (FIXME - only communicate in the given direction)
     comm_dim[mu % 4] = comm_dim_partitioned(mu % 4);
     ApplyCovDev(out, in, *gauge, mu, parity, dagger, comm_dim, profile);
-    flops += 1320ll*in.Volume(); // FIXME
   }
 
   void GaugeCovDev::MCD(ColorSpinorField &out, const ColorSpinorField &in, const int mu) const
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 1181ecb733..9b2584ba26 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -5,64 +5,101 @@
 
 namespace quda {
 
-  GaugeFieldParam::GaugeFieldParam(const GaugeField &u) :
-    LatticeFieldParam(u),
-    nColor(u.Ncolor()),
-    nFace(u.Nface()),
-    reconstruct(u.Reconstruct()),
-    order(u.Order()),
-    fixed(u.GaugeFixed()),
-    link_type(u.LinkType()),
-    t_boundary(u.TBoundary()),
-    anisotropy(u.Anisotropy()),
-    tadpole(u.Tadpole()),
-    gauge(NULL),
-    create(QUDA_NULL_FIELD_CREATE),
-    geometry(u.Geometry()),
-    compute_fat_link_max(false),
-    staggeredPhaseType(u.StaggeredPhase()),
-    staggeredPhaseApplied(u.StaggeredPhaseApplied()),
-    i_mu(u.iMu()),
-    site_offset(u.SiteOffset()),
-    site_size(u.SiteSize())
-  { }
-
-  GaugeField::GaugeField(const GaugeFieldParam &param) :
-    LatticeField(param),
-    bytes(0),
-    phase_offset(0),
-    phase_bytes(0),
-    nColor(param.nColor),
-    nFace(param.nFace),
-    geometry(param.geometry),
-    reconstruct(param.reconstruct),
-    nInternal(reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : nColor * nColor * 2),
-    order(param.order),
-    fixed(param.fixed),
-    link_type(param.link_type),
-    t_boundary(param.t_boundary),
-    anisotropy(param.anisotropy),
-    tadpole(param.tadpole),
-    fat_link_max(link_type == QUDA_ASQTAD_FAT_LINKS ? 0.0 : 1.0),
-    create(param.create),
-    staggeredPhaseType(param.staggeredPhaseType),
-    staggeredPhaseApplied(param.staggeredPhaseApplied),
-    i_mu(param.i_mu),
-    site_offset(param.site_offset),
-    site_size(param.site_size)
-  {
-    if (siteSubset != QUDA_FULL_SITE_SUBSET) errorQuda("Unexpected siteSubset %d", siteSubset);
-    if (order == QUDA_NATIVE_GAUGE_ORDER) errorQuda("Invalid gauge order %d", order);
-    if (ghost_precision != precision) ghost_precision = precision; // gauge fields require matching precision
-
-    if (link_type != QUDA_COARSE_LINKS && nColor != 3)
-      errorQuda("nColor must be 3, not %d for this link type", nColor);
-    if (nDim != 4)
-      errorQuda("Number of dimensions must be 4 not %d", nDim);
-    if (link_type != QUDA_WILSON_LINKS && anisotropy != 1.0)
+  GaugeFieldParam::GaugeFieldParam(const GaugeField &u) : LatticeFieldParam(u) { u.fill(*this); }
+
+  GaugeField::GaugeField(const GaugeFieldParam &param) : LatticeField(param)
+  {
+    create(param); // validates param, records the attributes and allocates (or wraps) the storage
+
+    switch (param.create) { // then initialise the contents according to the requested create type
+    case QUDA_NULL_FIELD_CREATE:
+    case QUDA_REFERENCE_FIELD_CREATE: break; // do nothing
+    case QUDA_ZERO_FIELD_CREATE: zero(); break;
+    case QUDA_COPY_FIELD_CREATE: copy(*param.field); break; // note: param.field is dereferenced unchecked
+    default: errorQuda("ERROR: create type(%d) not supported yet", param.create);
+    }
+  }
+
+  GaugeField::GaugeField(const GaugeField &u) noexcept : LatticeField(u)
+  {
+    GaugeFieldParam param;
+    u.fill(param); // mirror u's attributes (fill() leaves param.gauge null)
+    param.create = QUDA_COPY_FIELD_CREATE; // fill() sets QUDA_NULL_FIELD_CREATE, so override it here
+    create(param); // non-reference create type, so create() allocates new storage
+    copy(u);
+  }
+
+  GaugeField::GaugeField(GaugeField &&u) noexcept : LatticeField(std::move(u)) { move(std::move(u)); }
+
+  GaugeField &GaugeField::operator=(const GaugeField &src)
+  {
+    if (src.empty()) errorQuda("Copying from empty field");
+    if (&src != this) { // nothing to do on self-assignment
+      if (!init) { // keep current attributes unless unset
+        LatticeField::operator=(src);
+        GaugeFieldParam param;
+        src.fill(param);
+        param.create = QUDA_COPY_FIELD_CREATE; // fill() sets QUDA_NULL_FIELD_CREATE, so override it here
+        create(param);
+      }
+
+      copy(src); // runs in both cases: freshly created above, or already initialised with its own attributes
+    }
+    return *this;
+  }
+
+  GaugeField &GaugeField::operator=(GaugeField &&src)
+  {
+    if (&src != this) {
+      // move if this field is not yet initialized, is compatible with src, or src is empty
+      if (!init || are_compatible(*this, src) || src.empty()) {
+        LatticeField::operator=(std::move(src));
+        move(std::move(src));
+      } else {
+        // we error if the field is not compatible with this
+        errorQuda("Moving to already created field");
+      }
+    }
+    return *this;
+  }
+
+  void GaugeField::create(const GaugeFieldParam &param)
+  {
+    if (param.siteSubset != QUDA_FULL_SITE_SUBSET) errorQuda("Unexpected siteSubset %d", param.siteSubset);
+    if (param.order == QUDA_NATIVE_GAUGE_ORDER) errorQuda("Invalid gauge order %d", param.order);
+    if (param.GhostPrecision() != param.Precision())
+      errorQuda("Ghost precision %d doesn't match field precision %d", param.GhostPrecision(), param.Precision());
+    if (param.link_type != QUDA_COARSE_LINKS && param.nColor != 3)
+      errorQuda("nColor must be 3, not %d for this link type", param.nColor);
+    if (param.nDim != 4) errorQuda("Number of dimensions must be 4 not %d", param.nDim);
+    if (param.link_type != QUDA_WILSON_LINKS && param.anisotropy != 1.0)
       errorQuda("Anisotropy only supported for Wilson links");
-    if (link_type != QUDA_WILSON_LINKS && fixed == QUDA_GAUGE_FIXED_YES)
+    if (param.link_type != QUDA_WILSON_LINKS && param.fixed == QUDA_GAUGE_FIXED_YES)
       errorQuda("Temporal gauge fixing only supported for Wilson links");
+    if ((param.reconstruct == QUDA_RECONSTRUCT_12 || param.reconstruct == QUDA_RECONSTRUCT_8)
+        && param.link_type != QUDA_SU3_LINKS)
+      errorQuda("Cannot request a 12/8 reconstruct type without SU(3) link type");
+    if (param.reconstruct == QUDA_RECONSTRUCT_10 && param.link_type != QUDA_ASQTAD_MOM_LINKS)
+      errorQuda("10-reconstruction only supported with momentum links");
+
+    nColor = param.nColor;
+    nFace = param.nFace;
+    geometry = param.geometry;
+    reconstruct = param.reconstruct;
+    nInternal = reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : nColor * nColor * 2;
+    order = param.order;
+    fixed = param.fixed;
+    link_type = param.link_type;
+    t_boundary = param.t_boundary;
+    anisotropy = param.anisotropy;
+    tadpole = param.tadpole;
+    fat_link_max = link_type == QUDA_ASQTAD_FAT_LINKS ? 0.0 : 1.0;
+    staggeredPhaseType = param.staggeredPhaseType;
+    staggeredPhaseApplied = param.staggeredPhaseApplied;
+    i_mu = param.i_mu;
+    site_offset = param.site_offset;
+    site_size = param.site_size;
+
     if (geometry == QUDA_SCALAR_GEOMETRY) {
       real_length = volume*nInternal;
       length = 2*stride*nInternal; // two comes from being full lattice
@@ -80,37 +117,185 @@ namespace quda {
       length = 2 * (1 << nDim) * nDim * stride * nInternal; // two comes from being full lattice
     }
 
-    if ((reconstruct == QUDA_RECONSTRUCT_12 || reconstruct == QUDA_RECONSTRUCT_8) && link_type != QUDA_SU3_LINKS) {
-      errorQuda("Cannot request a 12/8 reconstruct type without SU(3) link type");
+    switch (geometry) {
+    case QUDA_SCALAR_GEOMETRY: site_dim = 1; break;
+    case QUDA_VECTOR_GEOMETRY: site_dim = nDim; break;
+    case QUDA_TENSOR_GEOMETRY: site_dim = nDim * (nDim - 1) / 2; break;
+    case QUDA_COARSE_GEOMETRY: site_dim = 2 * nDim; break;
+    case QUDA_KDINVERSE_GEOMETRY: site_dim = 1 << nDim; break;
+    default: errorQuda("Unknown geometry type %d", geometry);
     }
 
-    if (reconstruct == QUDA_RECONSTRUCT_9 || reconstruct == QUDA_RECONSTRUCT_13) {
-      // Need to adjust the phase alignment as well.
-      int half_phase_bytes
-        = (length / (2 * reconstruct)) * precision; // number of bytes needed to store phases for a single parity
-      int half_gauge_bytes = (length / 2) * precision
-        - half_phase_bytes; // number of bytes needed to store the gauge field for a single parity excluding the phases
-      // Adjust the alignments for the gauge and phase separately
-      half_phase_bytes = ((half_phase_bytes + (512-1))/512)*512;
-      half_gauge_bytes = ((half_gauge_bytes + (512-1))/512)*512;
-    
-      phase_offset = half_gauge_bytes;
-      phase_bytes = half_phase_bytes*2;
-      bytes = (half_gauge_bytes + half_phase_bytes)*2;      
+    if (isNative()) {
+      if (reconstruct == QUDA_RECONSTRUCT_9 || reconstruct == QUDA_RECONSTRUCT_13) {
+        // Need to adjust the phase alignment as well.
+        size_t half_phase_bytes
+          = (length / (2 * reconstruct)) * precision; // bytes needed to store phases for a single parity
+        size_t half_gauge_bytes = (length / 2) * precision
+          - half_phase_bytes; // bytes needed to store the gauge field for a single parity excluding the phases
+        // Adjust the alignments for the gauge and phase separately
+        half_phase_bytes = ALIGNMENT_ADJUST(half_phase_bytes);
+        half_gauge_bytes = ALIGNMENT_ADJUST(half_gauge_bytes);
+        phase_offset = half_gauge_bytes;
+        phase_bytes = half_phase_bytes * 2;
+        bytes = (half_gauge_bytes + half_phase_bytes) * 2;
+      } else {
+        bytes = length * precision;
+        bytes = 2 * ALIGNMENT_ADJUST(bytes / 2);
+      }
     } else {
-      bytes = length * precision;
-      if (isNative()) bytes = 2*ALIGNMENT_ADJUST(bytes/2);
+      // compute the correct bytes size for these padded field orders
+      if (order == QUDA_TIFR_PADDED_GAUGE_ORDER) {
+        bytes = site_dim * (x[0] * x[1] * (x[2] + 4) * x[3]) * nInternal * precision;
+      } else if (order == QUDA_BQCD_GAUGE_ORDER) {
+        bytes = site_dim * (x[0] + 4) * (x[1] + 2) * (x[2] + 2) * (x[3] + 2) * nInternal * precision;
+      } else if (order == QUDA_MILC_SITE_GAUGE_ORDER) {
+        bytes = volume * site_size;
+      } else {
+        bytes = length * precision;
+      }
     }
+
     total_bytes = bytes;
 
+    if (isNative() && ghostExchange == QUDA_GHOST_EXCHANGE_PAD) {
+      bool pad_check = true;
+      for (int i = 0; i < nDim; i++) {
+        // when we have coarse links we need to double the pad since we're storing forwards and backwards links
+        int minimum_pad = comm_dim_partitioned(i) ? nFace * surfaceCB[i] * (geometry == QUDA_COARSE_GEOMETRY ? 2 : 1) : 0;
+        if (pad < minimum_pad) pad_check = false;
+        if (!pad_check)
+          errorQuda("GaugeField being constructed with insufficient padding in dim %d (%d < %d)", i, pad, minimum_pad);
+      }
+    }
+
+    if (isNative()) {
+      if (param.create != QUDA_REFERENCE_FIELD_CREATE) {
+        gauge = quda_ptr(mem_type, bytes);
+      } else {
+        gauge = quda_ptr(param.gauge, mem_type);
+      }
+    } else if (is_pointer_array(order)) {
+
+      size_t nbytes = volume * nInternal * precision;
+      for (int d = 0; d < site_dim; d++) {
+        if (param.create != QUDA_REFERENCE_FIELD_CREATE) {
+          gauge_array[d] = quda_ptr(mem_type, nbytes);
+        } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
+          if (param.gauge) gauge_array[d] = quda_ptr(static_cast<void **>(param.gauge)[d], mem_type);
+        } else {
+          errorQuda("Unsupported creation type %d", param.create);
+        }
+      }
+
+    } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER || order == QUDA_BQCD_GAUGE_ORDER
+               || order == QUDA_TIFR_GAUGE_ORDER || order == QUDA_TIFR_PADDED_GAUGE_ORDER
+               || order == QUDA_MILC_SITE_GAUGE_ORDER) {
+      // does not support device
+
+      if (order == QUDA_MILC_SITE_GAUGE_ORDER && param.create != QUDA_REFERENCE_FIELD_CREATE) {
+        errorQuda("MILC site gauge order only supported for reference fields");
+      }
+
+      if (param.create != QUDA_REFERENCE_FIELD_CREATE) {
+        gauge = quda_ptr(mem_type, bytes);
+      } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) {
+        gauge = quda_ptr(param.gauge, mem_type);
+      } else {
+        errorQuda("Unsupported creation type %d", param.create);
+      }
+
+    } else {
+      errorQuda("Unsupported gauge order type %d", order);
+    }
+
+    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) {
+      if (!isNative()) {
+        for (int i = 0; i < nDim; i++) {
+          size_t nbytes = nFace * surface[i] * nInternal * precision;
+          ghost[i] = quda_ptr(mem_type, nbytes);
+          if (geometry == QUDA_COARSE_GEOMETRY) ghost[i + 4] = quda_ptr(mem_type, nbytes);
+
+          qudaMemset(ghost[i], 0, nbytes);
+          if (geometry == QUDA_COARSE_GEOMETRY) qudaMemset(ghost[i + 4], 0, nbytes);
+        }
+      } else {
+        if (param.create != QUDA_ZERO_FIELD_CREATE) zeroPad();
+      }
+    }
+
+    init = true;
     setTuningString();
+
+    // exchange the boundaries if a non-trivial field
+    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD)
+      if (param.create == QUDA_REFERENCE_FIELD_CREATE
+          && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) {
+        exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
+      }
+
+    // compute the fat link max now in case it is needed later (i.e., for half precision)
+    if (param.compute_fat_link_max) fat_link_max = this->abs_max();
   }
 
-  GaugeField::~GaugeField() {
+  void GaugeField::move(GaugeField &&src)
+  { // take over src's members one by one; src is left with zero / empty / invalid-enum values
+    init = std::exchange(src.init, {}); // NOTE(review): src.init is cleared before the reference check below
+    if (src.gauge.is_reference()) errorQuda("Cannot move a reference allocation");
+    gauge.exchange(src.gauge, {});
+    for (auto i = 0; i < gauge_array.size(); i++) gauge_array[i].exchange(src.gauge_array[i], {});
+    bytes = std::exchange(src.bytes, 0);
+    phase_offset = std::exchange(src.phase_offset, 0);
+    phase_bytes = std::exchange(src.phase_bytes, 0);
+    length = std::exchange(src.length, 0);
+    real_length = std::exchange(src.real_length, 0);
+    nColor = std::exchange(src.nColor, 0);
+    nFace = std::exchange(src.nFace, 0);
+    geometry = std::exchange(src.geometry, QUDA_INVALID_GEOMETRY);
+    site_dim = std::exchange(src.site_dim, 0);
+    reconstruct = std::exchange(src.reconstruct, QUDA_RECONSTRUCT_INVALID);
+    nInternal = std::exchange(src.nInternal, 0);
+    order = std::exchange(src.order, QUDA_INVALID_GAUGE_ORDER);
+    fixed = std::exchange(src.fixed, QUDA_GAUGE_FIXED_INVALID);
+    link_type = std::exchange(src.link_type, QUDA_INVALID_LINKS);
+    t_boundary = std::exchange(src.t_boundary, QUDA_INVALID_T_BOUNDARY);
+    anisotropy = std::exchange(src.anisotropy, 0.0);
+    tadpole = std::exchange(src.tadpole, 0.0);
+    fat_link_max = std::exchange(src.fat_link_max, 0.0);
+    for (auto i = 0; i < ghost.size(); i++) ghost[i].exchange(src.ghost[i], {});
+    ghostFace = std::exchange(src.ghostFace, {});
+    staggeredPhaseType = std::exchange(src.staggeredPhaseType, QUDA_STAGGERED_PHASE_INVALID);
+    staggeredPhaseApplied = std::exchange(src.staggeredPhaseApplied, false);
+    i_mu = std::exchange(src.i_mu, 0.0);
+    site_offset = std::exchange(src.site_offset, 0);
+    site_size = std::exchange(src.site_size, 0);
+  }
 
+  void GaugeField::fill(GaugeFieldParam &param) const
+  { // export this field's attributes into param; the base-class part is handled by LatticeField::fill
+    LatticeField::fill(param);
+    param.gauge = nullptr; // the data pointer is deliberately not exported
+    param.nColor = nColor;
+    param.nFace = nFace;
+    param.reconstruct = reconstruct;
+    param.order = order;
+    param.fixed = fixed;
+    param.link_type = link_type;
+    param.t_boundary = t_boundary;
+    param.anisotropy = anisotropy;
+    param.tadpole = tadpole;
+    param.create = QUDA_NULL_FIELD_CREATE; // default; the copy constructor / copy assignment override this
+    param.geometry = geometry;
+    param.compute_fat_link_max = false; // never requested by default, independent of this field's state
+    param.staggeredPhaseType = staggeredPhaseType;
+    param.staggeredPhaseApplied = staggeredPhaseApplied;
+    param.i_mu = i_mu;
+    param.site_offset = site_offset;
+    param.site_size = site_size;
   }
 
-  void GaugeField::setTuningString() {
+  void GaugeField::setTuningString()
+  {
     LatticeField::setTuningString();
     std::stringstream aux_ss;
     aux_ss << "vol=" << volume << "stride=" << stride << "precision=" << precision << "geometry=" << geometry
@@ -120,9 +305,24 @@ namespace quda {
     if (aux_string.size() >= TuneKey::aux_n / 2) errorQuda("Aux string too large %lu", aux_string.size());
   }
 
+  void GaugeField::zeroPad()
+  {
+    if (!isNative()) return; // nothing is done unless the field is in a native order
+    // bytes between the end of the volumeCB sites and the stride, per row of `order` reals
+    const size_t n_pad_bytes = (stride - volumeCB) * precision * order;
+    if (!n_pad_bytes) return;
+
+    const int n_rows = (geometry * (reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : nColor * nColor * 2)) / order;
+    const size_t row_pitch = stride * order * precision;
+    const size_t pad_start = volumeCB * order * precision; // offset of the padding within each row
+    for (int parity = 0; parity < 2; parity++) // the second parity starts bytes / 2 into the allocation
+      qudaMemset2DAsync(gauge, parity * (bytes / 2) + pad_start, row_pitch, 0, n_pad_bytes, n_rows,
+                        device::get_default_stream());
+  }
+
   void GaugeField::createGhostZone(const lat_dim_t &R, bool no_comms_fill, bool bidir) const
   {
-    if (typeid(*this) == typeid(cpuGaugeField)) return;
+    if (location == QUDA_CPU_FIELD_LOCATION) return;
 
     // if this is not a bidirectional exchange then we are doing a
     // scalar exchange, e.g., only the link matrix in the direcion we
@@ -154,30 +354,467 @@ namespace quda {
 
     if (phase != QUDA_STAGGERED_PHASE_INVALID) staggeredPhaseType = phase;
     applyGaugePhase(*this);
-    if (ghostExchange==QUDA_GHOST_EXCHANGE_PAD) {
-      if (typeid(*this)==typeid(cudaGaugeField)) {
-	static_cast<cudaGaugeField&>(*this).exchangeGhost();
-      } else {
-	static_cast<cpuGaugeField&>(*this).exchangeGhost();
-      }
-    }
+    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) exchangeGhost();
     staggeredPhaseApplied = true;
   }
 
   void GaugeField::removeStaggeredPhase() {
     if (!staggeredPhaseApplied) errorQuda("No staggered phases to remove");
     applyGaugePhase(*this);
-    if (ghostExchange==QUDA_GHOST_EXCHANGE_PAD) {
-      if (typeid(*this)==typeid(cudaGaugeField)) {
-	static_cast<cudaGaugeField&>(*this).exchangeGhost();
+    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) exchangeGhost(); // refresh the padded ghost zone after the phase change
+    staggeredPhaseApplied = false; // the errorQuda guard above will now reject a second removal
+  }
+
+  void GaugeField::createComms(const lat_dim_t &R, bool no_comms_fill, bool bidir)
+  {
+    allocateGhostBuffer(R, no_comms_fill, bidir); // allocate the ghost buffer if not yet allocated
+
+    // ascertain if this instance needs its comms buffers to be updated
+    bool comms_reset = ghost_field_reset || // FIXME add send buffer check
+      (my_face_h[0] != ghost_pinned_send_buffer_h[0]) || (my_face_h[1] != ghost_pinned_send_buffer_h[1])
+      || (from_face_h[0] != ghost_pinned_recv_buffer_h[0]) || (from_face_h[1] != ghost_pinned_recv_buffer_h[1])
+      || ghost_bytes != ghost_bytes_old; // ghost buffer has been resized (e.g., bidir to unidir)
+
+    if (!initComms || comms_reset) LatticeField::createComms(no_comms_fill); // base-class comms are (re)built when missing or stale
+
+    if (ghost_field_reset) destroyIPCComms(); // IPC comms are torn down first when the ghost field was reset
+    createIPCComms();
+  }
+
+  void GaugeField::allocateGhostBuffer(const lat_dim_t &R, bool no_comms_fill, bool bidir) const
+  {
+    createGhostZone(R, no_comms_fill, bidir); // runs first; presumably this is what sets ghost_bytes - confirm
+    LatticeField::allocateGhostBuffer(ghost_bytes); // base class does the actual allocation for that size
+  }
+
+  void GaugeField::recvStart(int dim, int dir)
+  {
+    if (!comm_dim_partitioned(dim)) return; // nothing to post when this dimension is not partitioned
+
+    // receive from the neighboring processor: the receive handle used is the one for the
+    // direction opposite to dir, preferring peer-to-peer, then GDR (RDMA), then the plain handle
+    const int recv_dir = 1 - dir;
+    auto handle = comm_peer2peer_enabled(recv_dir, dim) ? mh_recv_p2p[bufferIndex][dim][recv_dir] :
+      comm_gdr_enabled()                                ? mh_recv_rdma[bufferIndex][dim][recv_dir] :
+                                                          mh_recv[bufferIndex][dim][recv_dir];
+
+    comm_start(handle);
+  }
+
+  void GaugeField::sendStart(int dim, int dir, const qudaStream_t &stream)
+  {
+    if (!comm_dim_partitioned(dim)) return;
+    // without peer-to-peer the send uses the RDMA handle when GDR is enabled, else the plain handle
+    if (!comm_peer2peer_enabled(dir, dim)) {
+      if (comm_gdr_enabled()) {
+        comm_start(mh_send_rdma[bufferIndex][dim][dir]);
       } else {
-	static_cast<cpuGaugeField&>(*this).exchangeGhost();
+        comm_start(mh_send[bufferIndex][dim][dir]);
       }
+    } else { // doing peer-to-peer
+      // destination is the remote send buffer, displaced by the ghost offset of the opposite direction
+      void *ghost_dst
+        = static_cast<char *>(ghost_remote_send_buffer_d[bufferIndex][dim][dir]) + ghost_offset[dim][(dir + 1) % 2];
+
+      qudaMemcpyP2PAsync(ghost_dst, my_face_dim_dir_d[bufferIndex][dim][dir], ghost_face_bytes[dim], stream);
+
+      // record the event on the same stream; commsComplete synchronizes on it
+      qudaEventRecord(ipcCopyEvent[bufferIndex][dim][dir], stream);
+      // send to the neighboring processor
+      comm_start(mh_send_p2p[bufferIndex][dim][dir]);
+    }
+  }
+
+  void GaugeField::commsComplete(int dim, int dir)
+  {
+    if (!comm_dim_partitioned(dim)) return;
+    // first wait on the receive, which uses the handles for the direction opposite to dir
+    if (comm_peer2peer_enabled(1 - dir, dim)) {
+      comm_wait(mh_recv_p2p[bufferIndex][dim][1 - dir]);
+      qudaEventSynchronize(ipcRemoteCopyEvent[bufferIndex][dim][1 - dir]);
+    } else if (comm_gdr_enabled()) {
+      comm_wait(mh_recv_rdma[bufferIndex][dim][1 - dir]);
+    } else {
+      comm_wait(mh_recv[bufferIndex][dim][1 - dir]);
+    }
+    // then wait on the matching send in direction dir
+    if (comm_peer2peer_enabled(dir, dim)) {
+      comm_wait(mh_send_p2p[bufferIndex][dim][dir]);
+      qudaEventSynchronize(ipcCopyEvent[bufferIndex][dim][dir]); // event recorded in sendStart
+    } else if (comm_gdr_enabled()) {
+      comm_wait(mh_send_rdma[bufferIndex][dim][dir]);
+    } else {
+      comm_wait(mh_send[bufferIndex][dim][dir]);
+    }
+  }
+
+  // This does the exchange of the forwards boundary gauge field ghost zone and places
+  // it into the ghost array of the next node
+  void GaugeField::exchangeGhost(QudaLinkDirection link_direction)
+  {
+    if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD)
+      errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange);
+    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY)
+      errorQuda("Invalid geometry=%d", geometry);
+    if ((link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == QUDA_LINK_FORWARDS)
+        && geometry != QUDA_COARSE_GEOMETRY)
+      errorQuda("Cannot request exchange of forward links on non-coarse geometry");
+    if (nFace == 0) errorQuda("nFace = 0");
+
+    if (location == QUDA_CUDA_FIELD_LOCATION) {
+      const int dir = 1; // sending forwards only
+      const lat_dim_t R = {nFace, nFace, nFace, nFace};
+      const bool no_comms_fill = true; // dslash kernels presently require this
+      const bool bidir = false;        // communication is only ever done in one direction at once
+      createComms(R, true, bidir); // always need to allocate space for non-partitioned dimension for copyGenericGauge
+
+      // loop over backwards and forwards links
+      const QudaLinkDirection directions[] = {QUDA_LINK_BACKWARDS, QUDA_LINK_FORWARDS};
+      for (int link_dir = 0; link_dir < 2; link_dir++) {
+        if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;
+
+        void *send_d[2 * QUDA_MAX_DIM] = {};
+        void *recv_d[2 * QUDA_MAX_DIM] = {};
+
+        size_t offset = 0; // running byte offset into the ghost send / receive buffers
+        for (int d = 0; d < nDim; d++) {
+          recv_d[d] = static_cast<char *>(ghost_recv_buffer_d[bufferIndex]) + offset;
+          if (bidir) offset += ghost_face_bytes_aligned[d];
+          send_d[d] = static_cast<char *>(ghost_send_buffer_d[bufferIndex]) + offset;
+          offset += ghost_face_bytes_aligned[d];
+        }
+
+        extractGaugeGhost(*this, send_d, true, link_dir * nDim); // get the links into contiguous buffers
+        qudaDeviceSynchronize(); // synchronize before issuing mem copies in different streams - could replace with event post and wait
+
+        // issue receive preposts and device-to-host copies if needed
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim)) continue;
+          recvStart(dim, dir); // prepost the receive
+          if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) {
+            qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
+                            ghost_face_bytes[dim], qudaMemcpyDeviceToHost, device::get_stream(2 * dim + dir));
+          }
+        }
+
+        // if gdr enabled then synchronize
+        if (comm_gdr_enabled()) qudaDeviceSynchronize();
+
+        // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim)) continue;
+          if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled())
+            qudaStreamSynchronize(device::get_stream(2 * dim + dir));
+          sendStart(dim, dir, device::get_stream(2 * dim + dir)); // start sending
+        }
+
+        // complete communication and issue host-to-device copies if needed
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim)) continue;
+          commsComplete(dim, dir);
+          if (!comm_peer2peer_enabled(1 - dir, dim) && !comm_gdr_enabled()) {
+            qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1 - dir],
+                            from_face_dim_dir_h[bufferIndex][dim][1 - dir], ghost_face_bytes[dim],
+                            qudaMemcpyHostToDevice, device::get_stream(2 * dim + dir));
+          }
+        }
+
+        qudaDeviceSynchronize(); // synchronize before issuing kernels / copies in default stream - could replace with event post and wait
+
+        // fill in the halos for non-partitioned dimensions
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim) && no_comms_fill) {
+            qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
+          }
+        }
+
+        if (isNative()) {
+          copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, 0, recv_d, 1 + 2 * link_dir); // 1, 3
+        } else {
+          // copy from receive buffer into ghost array
+          for (int dim = 0; dim < nDim; dim++)
+            qudaMemcpy(ghost[dim + link_dir * nDim].data(), recv_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
+        }
+
+        bufferIndex = 1 - bufferIndex; // flip to the other buffer index for the next pass
+      } // link_dir
+
+      qudaDeviceSynchronize();
+    } else { // cpu field
+      void *send[2 * QUDA_MAX_DIM];
+      for (int d = 0; d < nDim; d++) {
+        send[d] = safe_malloc(nFace * surface[d] * nInternal * precision);
+        // NOTE(review): literal 4 here, while the forwards-link code below indexes with nDim - assumes nDim == 4
+        if (geometry == QUDA_COARSE_GEOMETRY) send[d + 4] = safe_malloc(nFace * surface[d] * nInternal * precision);
+      }
+
+      void *ghost_[2 * QUDA_MAX_DIM]; // raw-pointer view of the ghost array, as taken by exchange()
+      for (auto i = 0; i < geometry; i++) ghost_[i] = ghost[i].data();
+
+      // get the links into contiguous buffers
+      if (link_direction == QUDA_LINK_BACKWARDS || link_direction == QUDA_LINK_BIDIRECTIONAL) {
+        extractGaugeGhost(*this, send, true);
+
+        // communicate between nodes
+        exchange(ghost_, send, QUDA_FORWARDS);
+      }
+
+      // repeat if requested and links are bi-directional
+      if (link_direction == QUDA_LINK_FORWARDS || link_direction == QUDA_LINK_BIDIRECTIONAL) {
+        extractGaugeGhost(*this, send, true, nDim);
+        exchange(ghost_ + nDim, send + nDim, QUDA_FORWARDS);
+      }
+
+      for (int d = 0; d < geometry; d++) host_free(send[d]); // geometry entries were allocated above (given nDim == 4)
+    }
+  }
+
+  // This does the opposite of exchangeGhost and sends back the ghost
+  // zone to the node from which it came and injects it back into the
+  // field
+  void GaugeField::injectGhost(QudaLinkDirection link_direction)
+  {
+    if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD)
+      errorQuda("Cannot call injectGhost with ghostExchange=%d", ghostExchange);
+    if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY)
+      errorQuda("Invalid geometry=%d", geometry);
+    if (link_direction != QUDA_LINK_BACKWARDS) errorQuda("Invalid link_direction = %d", link_direction);
+    if (nFace == 0) errorQuda("nFace = 0");
+
+    if (location == QUDA_CUDA_FIELD_LOCATION) {
+      const int dir = 0; // sending backwards only
+      const lat_dim_t R = {nFace, nFace, nFace, nFace};
+      const bool no_comms_fill = false; // injection never does no_comms_fill
+      const bool bidir = false;         // communication is only ever done in one direction at once
+      createComms(R, true, bidir); // always need to allocate space for non-partitioned dimension for copyGenericGauge
+
+      // loop over backwards and forwards links (forwards links never sent but leave here just in case)
+      const QudaLinkDirection directions[] = {QUDA_LINK_BACKWARDS, QUDA_LINK_FORWARDS};
+      for (int link_dir = 0; link_dir < 2; link_dir++) {
+        if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue;
+
+        void *send_d[2 * QUDA_MAX_DIM] = {};
+        void *recv_d[2 * QUDA_MAX_DIM] = {};
+
+        size_t offset = 0;
+        for (int d = 0; d < nDim; d++) {
+          // send backwards is first half of each ghost_send_buffer
+          send_d[d] = static_cast<char *>(ghost_send_buffer_d[bufferIndex]) + offset;
+          if (bidir) offset += ghost_face_bytes_aligned[d];
+          // receive from forwards is the second half of each ghost_recv_buffer
+          recv_d[d] = static_cast<char *>(ghost_recv_buffer_d[bufferIndex]) + offset;
+          offset += ghost_face_bytes_aligned[d];
+        }
+
+        if (isNative()) { // copy from padded region in gauge field into send buffer
+          copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, send_d, 0, 1 + 2 * link_dir);
+        } else { // copy from ghost array into send buffer
+          for (int dim = 0; dim < nDim; dim++)
+            qudaMemcpy(send_d[dim], ghost[dim + link_dir * nDim].data(), ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
+        }
+        qudaDeviceSynchronize(); // need to synchronize before issueing copies in different streams - could replace with event post and wait
+
+        // issue receive preposts and device-to-host copies if needed
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim)) continue;
+          recvStart(dim, dir); // prepost the receive
+          if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) {
+            qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
+                            ghost_face_bytes[dim], qudaMemcpyDeviceToHost, device::get_stream(2 * dim + dir));
+          }
+        }
+
+        // if gdr enabled then synchronize
+        if (comm_gdr_enabled()) qudaDeviceSynchronize();
+
+        // if the sending direction is not peer-to-peer then we need to synchronize before we start sending
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim)) continue;
+          if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled())
+            qudaStreamSynchronize(device::get_stream(2 * dim + dir));
+          sendStart(dim, dir, device::get_stream(2 * dim + dir)); // start sending
+        }
+
+        // complete communication and issue host-to-device copies if needed
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim)) continue;
+          commsComplete(dim, dir);
+          if (!comm_peer2peer_enabled(1 - dir, dim) && !comm_gdr_enabled()) {
+            qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1 - dir],
+                            from_face_dim_dir_h[bufferIndex][dim][1 - dir], ghost_face_bytes[dim],
+                            qudaMemcpyHostToDevice, device::get_stream(2 * dim + dir));
+          }
+        }
+
+        qudaDeviceSynchronize(); // synchronize before issuing kernel / copies in default stream - could replace with event post and wait
+
+        // fill in the halos for non-partitioned dimensions
+        for (int dim = 0; dim < nDim; dim++) {
+          if (!comm_dim_partitioned(dim) && no_comms_fill) {
+            qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice);
+          }
+        }
+
+        // inject the received links from the contiguous buffers back into the field
+        extractGaugeGhost(*this, recv_d, false, link_dir * nDim);
+
+        bufferIndex = 1 - bufferIndex;
+      } // link_dir
+
+      qudaDeviceSynchronize();
+    } else {
+      void *recv[QUDA_MAX_DIM];
+      for (int d = 0; d < nDim; d++) recv[d] = safe_malloc(nFace * surface[d] * nInternal * precision);
+
+      void *ghost_[] = {ghost[0].data(), ghost[1].data(), ghost[2].data(), ghost[3].data(),
+                        ghost[4].data(), ghost[5].data(), ghost[6].data(), ghost[7].data()};
+
+      // communicate between nodes
+      exchange(recv, ghost_, QUDA_BACKWARDS);
+
+      // inject the received links from the contiguous buffers back into the field
+      extractGaugeGhost(*this, recv, false);
+
+      for (int d = 0; d < nDim; d++) host_free(recv[d]); // free exactly the nDim entries allocated above
     }
-    staggeredPhaseApplied = false;
   }
 
-  void GaugeField::exchange(void **ghost_link, void **link_sendbuf, QudaDirection dir) const {
+  void GaugeField::exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill)
+  {
+    if (location == QUDA_CUDA_FIELD_LOCATION) {
+      const int b = bufferIndex; // bufferIndex is only flipped after the loop below, so b stays equal to it throughout
+      void *send_d[QUDA_MAX_DIM], *recv_d[QUDA_MAX_DIM];
+
+      createComms(R, no_comms_fill);
+
+      size_t offset = 0;
+      for (int dim = 0; dim < nDim; dim++) {
+        if (!(comm_dim_partitioned(dim) || (no_comms_fill && R[dim]))) continue;
+        send_d[dim] = static_cast<char *>(ghost_send_buffer_d[b]) + offset;
+        recv_d[dim] = static_cast<char *>(ghost_recv_buffer_d[b]) + offset;
+
+        // silence cuda-memcheck initcheck errors that arise since we
+        // have an oversized ghost buffer when doing the extended exchange
+        qudaMemsetAsync(send_d[dim], 0, 2 * ghost_face_bytes_aligned[dim], device::get_default_stream());
+        offset += 2 * ghost_face_bytes_aligned[dim]; // factor of two from fwd/back
+      }
+
+      for (int dim = 0; dim < nDim; dim++) {
+        if (!(comm_dim_partitioned(dim) || (no_comms_fill && R[dim]))) continue;
+
+        // extract into a contiguous buffer
+        extractExtendedGaugeGhost(*this, dim, R, send_d, true);
+
+        if (comm_dim_partitioned(dim)) {
+          qudaDeviceSynchronize(); // synchronize before issuing mem copies in different streams - could replace with event post and wait
+
+          for (int dir = 0; dir < 2; dir++) recvStart(dim, dir);
+
+          for (int dir = 0; dir < 2; dir++) {
+            // issue device-to-host copies if needed
+            if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) {
+              qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir],
+                              ghost_face_bytes[dim], qudaMemcpyDeviceToHost, device::get_stream(dir));
+            }
+          }
+
+          // if either direction is not peer-to-peer then we need to synchronize
+          if (!comm_peer2peer_enabled(0, dim) || !comm_peer2peer_enabled(1, dim)) qudaDeviceSynchronize();
+
+          for (int dir = 0; dir < 2; dir++) sendStart(dim, dir, device::get_stream(dir));
+          for (int dir = 0; dir < 2; dir++) commsComplete(dim, dir);
+
+          for (int dir = 0; dir < 2; dir++) {
+            // issue host-to-device copies if needed
+            if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) {
+              qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][dir], from_face_dim_dir_h[bufferIndex][dim][dir],
+                              ghost_face_bytes[dim], qudaMemcpyHostToDevice, device::get_stream(dir));
+            }
+          }
+
+        } else { // if just doing a local exchange to fill halo then need to swap faces
+          qudaMemcpy(from_face_dim_dir_d[b][dim][1], my_face_dim_dir_d[b][dim][0], ghost_face_bytes[dim],
+                     qudaMemcpyDeviceToDevice);
+          qudaMemcpy(from_face_dim_dir_d[b][dim][0], my_face_dim_dir_d[b][dim][1], ghost_face_bytes[dim],
+                     qudaMemcpyDeviceToDevice);
+        }
+
+        // inject back into the gauge field
+        // need to synchronize the copy streams before rejoining the compute stream - could replace with event post and wait
+        qudaDeviceSynchronize();
+        extractExtendedGaugeGhost(*this, dim, R, recv_d, false);
+      }
+
+      bufferIndex = 1 - bufferIndex;
+      qudaDeviceSynchronize();
+    } else {
+      void *send[QUDA_MAX_DIM];
+      void *recv[QUDA_MAX_DIM];
+      size_t bytes[QUDA_MAX_DIM]; // per-dimension buffer size; note this local shadows the member `bytes`
+      // store both parities and directions in each
+      for (int d = 0; d < nDim; d++) {
+        if (!(comm_dim_partitioned(d) || (no_comms_fill && R[d]))) continue;
+        bytes[d] = surface[d] * R[d] * geometry * nInternal * precision;
+        send[d] = safe_malloc(2 * bytes[d]);
+        recv[d] = safe_malloc(2 * bytes[d]);
+      }
+
+      for (int d = 0; d < nDim; d++) {
+        if (!(comm_dim_partitioned(d) || (no_comms_fill && R[d]))) continue;
+        // extract into a contiguous buffer
+        extractExtendedGaugeGhost(*this, d, R, send, true);
+
+        if (comm_dim_partitioned(d)) {
+          // do the exchange
+          MsgHandle *mh_recv_back;
+          MsgHandle *mh_recv_fwd;
+          MsgHandle *mh_send_fwd;
+          MsgHandle *mh_send_back;
+
+          mh_recv_back = comm_declare_receive_relative(recv[d], d, -1, bytes[d]);
+          mh_recv_fwd = comm_declare_receive_relative(((char *)recv[d]) + bytes[d], d, +1, bytes[d]);
+          mh_send_back = comm_declare_send_relative(send[d], d, -1, bytes[d]);
+          mh_send_fwd = comm_declare_send_relative(((char *)send[d]) + bytes[d], d, +1, bytes[d]);
+
+          comm_start(mh_recv_back);
+          comm_start(mh_recv_fwd);
+          comm_start(mh_send_fwd);
+          comm_start(mh_send_back);
+
+          comm_wait(mh_send_fwd);
+          comm_wait(mh_send_back);
+          comm_wait(mh_recv_back);
+          comm_wait(mh_recv_fwd);
+
+          comm_free(mh_send_fwd);
+          comm_free(mh_send_back);
+          comm_free(mh_recv_back);
+          comm_free(mh_recv_fwd);
+        } else { // local fill: swap the two halves of the send buffer into the receive buffer
+          memcpy(static_cast<char *>(recv[d]) + bytes[d], send[d], bytes[d]);
+          memcpy(recv[d], static_cast<char *>(send[d]) + bytes[d], bytes[d]);
+        }
+
+        // inject back into the gauge field
+        extractExtendedGaugeGhost(*this, d, R, recv, false);
+      }
+
+      for (int d = 0; d < nDim; d++) { // free with the same predicate that guarded the allocations
+        if (!(comm_dim_partitioned(d) || (no_comms_fill && R[d]))) continue;
+        host_free(send[d]);
+        host_free(recv[d]);
+      }
+    }
+  }
+
+  void GaugeField::exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill)
+  {
+    profile.TPSTART(QUDA_PROFILE_COMMS); // the whole exchange is bracketed by the COMMS profile entry
+    exchangeExtendedGhost(R, no_comms_fill); // forwards to the two-argument overload
+    profile.TPSTOP(QUDA_PROFILE_COMMS);
+  }
+
+  void GaugeField::exchange(void **ghost_link, void **link_sendbuf, QudaDirection dir) const
+  {
     MsgHandle *mh_send[4];
     MsgHandle *mh_recv[4];
     size_t bytes[4];
@@ -202,16 +839,8 @@ namespace quda {
 	  if (no_comms_fill) memcpy(ghost_link[i], link_sendbuf[i], bytes[i]);
 	}
       }
-    } else { // FIXME for CUDA field copy back to the CPU
-      for (int i=0; i<nDimComms; i++) {
-	if (comm_dim_partitioned(i)) {
-	  send[i] = pool_pinned_malloc(bytes[i]);
-	  receive[i] = pool_pinned_malloc(bytes[i]);
-          qudaMemcpy(send[i], link_sendbuf[i], bytes[i], qudaMemcpyDeviceToHost);
-        } else {
-          if (no_comms_fill) qudaMemcpy(ghost_link[i], link_sendbuf[i], bytes[i], qudaMemcpyDeviceToDevice);
-        }
-      }
+    } else {
+      errorQuda("Not supported");
     }
 
     for (int i=0; i<nDimComms; i++) {
@@ -240,21 +869,23 @@ namespace quda {
       comm_wait(mh_recv[i]);
     }
 
-    if (Location() == QUDA_CUDA_FIELD_LOCATION) {
-      for (int i=0; i<nDimComms; i++) {
-	if (!comm_dim_partitioned(i)) continue;
-        qudaMemcpy(ghost_link[i], receive[i], bytes[i], qudaMemcpyHostToDevice);
-        pool_pinned_free(send[i]);
-	pool_pinned_free(receive[i]);
-      }
-    }
-
     for (int i=0; i<nDimComms; i++) {
       if (!comm_dim_partitioned(i)) continue;
       comm_free(mh_send[i]);
       comm_free(mh_recv[i]);
     }
+  }
 
+  bool GaugeField::are_compatible_weak(const GaugeField &a, const GaugeField &b)
+  {
+    const bool same_links = a.LinkType() == b.LinkType() && a.Ncolor() == b.Ncolor() && a.Nface() == b.Nface();
+    const bool same_fix = a.GaugeFixed() == b.GaugeFixed() && a.TBoundary() == b.TBoundary(); // precision/order not compared
+    return same_links && same_fix && a.Anisotropy() == b.Anisotropy() && a.Tadpole() == b.Tadpole();
+  }
+
+  bool GaugeField::are_compatible(const GaugeField &a, const GaugeField &b)
+  {
+    return are_compatible_weak(a, b) && a.Order() == b.Order() && a.Precision() == b.Precision(); // weak test plus layout
   }
 
   void GaugeField::checkField(const LatticeField &l) const {
@@ -274,13 +905,232 @@ namespace quda {
     }
   }
 
-  std::ostream& operator<<(std::ostream& output, const GaugeFieldParam& param) {
+  void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
+  {
+    // every order except QDP gets one allocation of the full size
+    if (order != QUDA_QDP_GAUGE_ORDER) return pool_device_malloc(bytes);
+    // QDP order: a heap-allocated table of `geometry` pointers, each to an equal share of bytes
+    void **table = new void *[geometry];
+    const size_t share = bytes / geometry;
+    for (int i = 0; i < geometry; i++) table[i] = pool_device_malloc(share);
+    return static_cast<void *>(table);
+  }
+
+  void **create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
+  {
+    // NOTE(review): orders with enum value <= 4 get no ghost buffers; what the literal 4
+    // stands for is not visible here - confirm against the QudaGaugeFieldOrder definition
+    if (!(order > 4)) return nullptr;
+
+    void **table = new void *[geometry];
+    for (int i = 0; i < geometry; i++) table[i] = pool_device_malloc(bytes[i]);
+    return table;
+  }
+
+  void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
+  {
+    if (order != QUDA_QDP_GAUGE_ORDER) { // a single allocation
+      pool_device_free(buffer);
+    } else { // QDP order: buffer is a table of `geometry` allocations plus the table itself
+      for (int i = 0; i < geometry; i++) pool_device_free(static_cast<void **>(buffer)[i]);
+      delete[] static_cast<void **>(buffer);
+    }
+  }
+
+  void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry)
+  {
+    // same order test as create_ghost_buffer, which returns no table unless order > 4
+    if (!(order > 4)) return;
+    for (int i = 0; i < geometry; i++) pool_device_free(buffer[i]);
+    delete[] buffer;
+  }
+
+  void GaugeField::copy(const GaugeField &src)
+  {
+    if (this == &src) return; // self-copy is a no-op
+
+    if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
+      getProfile().TPSTART(QUDA_PROFILE_D2H);
+    } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
+      getProfile().TPSTART(QUDA_PROFILE_H2D);
+    }
+
+    checkField(src);
+
+    if (link_type == QUDA_ASQTAD_FAT_LINKS) {
+      fat_link_max = src.LinkMax();
+      if (fat_link_max == 0.0 && precision < QUDA_SINGLE_PRECISION) fat_link_max = src.abs_max();
+    } else {
+      fat_link_max = 1.0;
+    }
+
+    if (src.Location() == QUDA_CUDA_FIELD_LOCATION) {
+
+      if (location == QUDA_CUDA_FIELD_LOCATION) {
+        if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
+          // copy field and ghost zone into this field
+          copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION);
+
+          if (geometry == QUDA_COARSE_GEOMETRY)
+            copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, nullptr, nullptr, nullptr, 3);
+        } else {
+          copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, nullptr);
+          if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
+        }
+      } else { // CPU location
+        if (reorder_location() == QUDA_CPU_FIELD_LOCATION) {
+
+          if (!src.isNative()) errorQuda("Only native order is supported");
+          void *buffer = pool_pinned_malloc(src.Bytes()); // staging copy of the source data, reordered from below
+          qudaMemcpy(buffer, src.data(), src.Bytes(), qudaMemcpyDeviceToHost);
+
+          if (GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
+            copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, nullptr, buffer);
+          } else {
+            copyExtendedGauge(*this, src, QUDA_CPU_FIELD_LOCATION, nullptr, buffer);
+          }
+          pool_pinned_free(buffer);
+
+        } else { // else reorder on the GPU
+
+          if (order == QUDA_MILC_SITE_GAUGE_ORDER || order == QUDA_BQCD_GAUGE_ORDER
+              || order == QUDA_TIFR_PADDED_GAUGE_ORDER) {
+            // special case where we use zero-copy memory to read/write directly from application's array
+            void *data_d = get_mapped_device_pointer(data());
+            if (GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
+              copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, data_d, nullptr);
+            } else {
+              errorQuda("Ghost copy not supported here");
+            }
+            qudaDeviceSynchronize(); // synchronize to ensure visibility on the host
+          } else {
+            void *buffer = create_gauge_buffer(bytes, order, geometry);
+            size_t ghost_bytes[8]; // NOTE(review): filled for d < geometry below - assumes geometry <= 8
+            int dstNinternal = reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : 2 * nColor * nColor;
+            for (int d = 0; d < geometry; d++) ghost_bytes[d] = nFace * surface[d % 4] * dstNinternal * precision;
+            void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, order, geometry) : nullptr;
+
+            if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED) {
+              copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr);
+              if (geometry == QUDA_COARSE_GEOMETRY)
+                copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr,
+                                 3); // forwards links if bi-directional
+            } else {
+              copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0);
+            }
+
+            if (order == QUDA_QDP_GAUGE_ORDER) { // QDP: buffer is a table of per-geometry allocations
+              for (int d = 0; d < geometry; d++) {
+                qudaMemcpy(gauge_array[d].data(), ((void **)buffer)[d], bytes / geometry, qudaMemcpyDeviceToHost);
+              }
+            } else {
+              qudaMemcpy(gauge.data(), buffer, bytes, qudaMemcpyDeviceToHost);
+            }
+
+            if (order > 4 && ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD
+                && nFace)
+              for (int d = 0; d < geometry; d++)
+                qudaMemcpy(Ghost()[d].data(), ghost_buffer[d], ghost_bytes[d], qudaMemcpyDeviceToHost);
+
+            free_gauge_buffer(buffer, order, geometry);
+            if (nFace > 0) free_ghost_buffer(ghost_buffer, order, geometry);
+          } // order
+        }
+      }
+
+    } else if (src.Location() == QUDA_CPU_FIELD_LOCATION) {
+
+      if (location == QUDA_CPU_FIELD_LOCATION) {
+        // copy field and ghost zone directly
+        copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION);
+      } else {
+        if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do reorder on the CPU
+          void *buffer = pool_pinned_malloc(bytes); // staging buffer, copied into gauge below
+
+          if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
+            // copy field and ghost zone into buffer
+            copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr);
+
+            if (geometry == QUDA_COARSE_GEOMETRY)
+              copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr, 0, 0, 3);
+          } else {
+            copyExtendedGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr);
+            if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
+          }
+
+          qudaMemcpy(gauge.data(), buffer, bytes, qudaMemcpyDefault);
+          pool_pinned_free(buffer);
+        } else { // else on the GPU
+
+          if (src.Order() == QUDA_MILC_SITE_GAUGE_ORDER || src.Order() == QUDA_BQCD_GAUGE_ORDER
+              || src.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
+            // special case where we use zero-copy memory to read/write directly from application's array
+            void *src_d = get_mapped_device_pointer(src.data());
+
+            if (src.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) {
+              copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, data(), src_d);
+            } else {
+              errorQuda("Ghost copy not supported here");
+            }
+
+          } else {
+            void *buffer = create_gauge_buffer(src.Bytes(), src.Order(), src.Geometry());
+            size_t ghost_bytes[8]; // NOTE(review): filled for d < geometry below - assumes geometry <= 8
+            int srcNinternal = src.Reconstruct() != QUDA_RECONSTRUCT_NO ? src.Reconstruct() : 2 * nColor * nColor;
+            for (int d = 0; d < geometry; d++) ghost_bytes[d] = nFace * surface[d % 4] * srcNinternal * src.Precision();
+            void **ghost_buffer = (nFace > 0) ? create_ghost_buffer(ghost_bytes, src.Order(), geometry) : nullptr;
+
+            if (src.Order() == QUDA_QDP_GAUGE_ORDER) { // QDP: buffer is a table of per-geometry allocations
+              for (int d = 0; d < geometry; d++) {
+                qudaMemcpy(((void **)buffer)[d], src.data(d), src.Bytes() / geometry, qudaMemcpyDefault);
+              }
+            } else {
+              qudaMemcpy(buffer, src.data(), src.Bytes(), qudaMemcpyDefault);
+            }
+
+            if (src.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD
+                && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace)
+              for (int d = 0; d < geometry; d++)
+                qudaMemcpy(ghost_buffer[d], src.Ghost()[d].data(), ghost_bytes[d], qudaMemcpyDefault);
+
+            if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) {
+              copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, nullptr, ghost_buffer);
+              if (geometry == QUDA_COARSE_GEOMETRY)
+                copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, nullptr, ghost_buffer, 3);
+            } else {
+              copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer);
+              if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported");
+            }
+            free_gauge_buffer(buffer, src.Order(), src.Geometry());
+            if (nFace > 0) free_ghost_buffer(ghost_buffer, src.Order(), geometry);
+          }
+        } // reorder_location
+      }   // this location
+    } else {
+      errorQuda("Invalid gauge field type");
+    }
+
+    // if we have copied from a source without a pad then we need to exchange
+    if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() != QUDA_GHOST_EXCHANGE_PAD)
+      exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL);
+
+    staggeredPhaseApplied = src.StaggeredPhaseApplied(); // the staggered phase state is taken over from the source
+    staggeredPhaseType = src.StaggeredPhase();
+
+    if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) {
+      getProfile().TPSTOP(QUDA_PROFILE_D2H);
+    } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) {
+      getProfile().TPSTOP(QUDA_PROFILE_H2D);
+    }
+  }
+
+  std::ostream &operator<<(std::ostream &output, const GaugeFieldParam &param)
+  {
     output << static_cast<const LatticeFieldParam &>(param);
     output << "nColor = " << param.nColor << std::endl;
     output << "nFace = " << param.nFace << std::endl;
     output << "reconstruct = " << param.reconstruct << std::endl;
-    int nInternal = (param.reconstruct != QUDA_RECONSTRUCT_NO ? 
-		     param.reconstruct : param.nColor * param.nColor * 2);
+    int nInternal = (param.reconstruct != QUDA_RECONSTRUCT_NO ? param.reconstruct : param.nColor * param.nColor * 2);
     output << "nInternal = " << nInternal << std::endl;
     output << "order = " << param.order << std::endl;
     output << "fixed = " << param.fixed << std::endl;
@@ -296,6 +1146,49 @@ namespace quda {
     return output;  // for multiple << operators.
   }
 
+  std::ostream &operator<<(std::ostream &output, const GaugeField &field) // dump a GaugeField as "name = value" lines
+  {
+    output << static_cast<const LatticeField &>(field); // base-class part is printed first
+    output << "init = " << field.init << std::endl;
+    output << "gauge = " << field.gauge << std::endl;
+    output << "gauge_array = " << field.gauge_array << std::endl;
+    output << "bytes = " << field.bytes << std::endl;
+    output << "phase_offset = " << field.phase_offset << std::endl;
+    output << "phase_bytes = " << field.phase_bytes << std::endl;
+    output << "length = " << field.length << std::endl;
+    output << "real_length = " << field.real_length << std::endl;
+    output << "nColor = " << field.nColor << std::endl;
+    output << "nFace = " << field.nFace << std::endl;
+    output << "geometry = " << field.geometry << std::endl;
+    output << "site_dim = " << field.site_dim << std::endl; // was printing geometry a second time under this label
+    output << "reconstruct = " << field.reconstruct << std::endl;
+    output << "nInternal = " << field.nInternal << std::endl;
+    output << "order = " << field.order << std::endl;
+    output << "fixed = " << field.fixed << std::endl;
+    output << "link_type = " << field.link_type << std::endl;
+    output << "t_boundary = " << field.t_boundary << std::endl;
+    output << "anisotropy = " << field.anisotropy << std::endl;
+    output << "tadpole = " << field.tadpole << std::endl;
+    output << "fat_link_max = " << field.fat_link_max << std::endl;
+    output << "ghost = " << field.ghost << std::endl;
+    output << "ghostFace = " << field.ghostFace << std::endl;
+    output << "staggeredPhaseType = " << field.staggeredPhaseType << std::endl;
+    output << "staggeredPhaseApplied = " << field.staggeredPhaseApplied << std::endl;
+    output << "i_mu = " << field.i_mu << std::endl;
+    output << "site_offset = " << field.site_offset << std::endl;
+    output << "site_size = " << field.site_size << std::endl; // label previously misspelled as "size_size"
+    return output; // for multiple << operators.
+  }
+
+  void GaugeField::zero() // set the field's gauge data to zero bytes
+  {
+    if (order == QUDA_QDP_GAUGE_ORDER) { // QDP order: clear each gauge_array[] entry separately
+      for (int dim = 0; dim < geometry; dim++) qudaMemset(gauge_array[dim], 0, volume * nInternal * precision);
+    } else { // every other order: clear the whole gauge buffer in one call
+      qudaMemset(gauge, 0, bytes);
+    }
+  }
+
   ColorSpinorParam colorSpinorParam(const GaugeField &a) {
    if (a.FieldOrder() == QUDA_QDP_GAUGE_ORDER || a.FieldOrder() == QUDA_QDPJIT_GAUGE_ORDER)
      errorQuda("Not implemented for this order %d", a.FieldOrder());
@@ -318,59 +1211,54 @@ namespace quda {
     spinor_param.setPrecision(a.Precision(), a.Precision(), true);
     spinor_param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
     spinor_param.create = QUDA_REFERENCE_FIELD_CREATE;
-    spinor_param.v = (void*)a.Gauge_p();
+    spinor_param.v = a.data();
     spinor_param.location = a.Location();
     return spinor_param;
   }
 
   // Return the L2 norm squared of the gauge field
-  double norm2(const GaugeField &a) {
-    ColorSpinorField *b = ColorSpinorField::Create(colorSpinorParam(a));
-    double nrm2 = blas::norm2(*b);
-    delete b;
-    return nrm2;
+  double norm2(const GaugeField &a)
+  {
+    ColorSpinorField b(colorSpinorParam(a));
+    return blas::norm2(b);
   }
 
   // Return the L1 norm of the gauge field
-  double norm1(const GaugeField &a) {
-    ColorSpinorField *b = ColorSpinorField::Create(colorSpinorParam(a));
-    double nrm1 = blas::norm1(*b);
-    delete b;
-    return nrm1;
+  double norm1(const GaugeField &a)
+  {
+    ColorSpinorField b(colorSpinorParam(a));
+    return blas::norm1(b);
   }
 
   // Scale the gauge field by the constant a
-  void ax(const double &a, GaugeField &u) {
-    ColorSpinorField *b = ColorSpinorField::Create(colorSpinorParam(u));
-    blas::ax(a, *b);
-    delete b;
+  void ax(const double &a, GaugeField &u)
+  {
+    ColorSpinorField b(colorSpinorParam(u));
+    blas::ax(a, b);
   }
 
   uint64_t GaugeField::checksum(bool mini) const {
     return Checksum(*this, mini);
   }
 
-  GaugeField* GaugeField::Create(const GaugeFieldParam &param) {
+  GaugeField *GaugeField::Create(const GaugeFieldParam &param) { return new GaugeField(param); }
 
-    GaugeField *field = nullptr;
-    if (param.location == QUDA_CPU_FIELD_LOCATION) {
-      field = new cpuGaugeField(param);
-    } else if (param.location== QUDA_CUDA_FIELD_LOCATION) {
-      field = new cudaGaugeField(param);
-    } else {
-      errorQuda("Invalid field location %d", param.location);
-    }
-
-    return field;
+  GaugeField GaugeField::create_alias(const GaugeFieldParam &param_) // build a reference field over this field's gauge data
+  {
+    if (param_.init && param_.Precision() > precision) // an initialized param may not ask for more precision than this field has
+      errorQuda("Cannot create an alias to source with lower precision than the alias");
+    GaugeFieldParam param = param_.init ? param_ : GaugeFieldParam(*this); // uninitialized param_: derive the params from *this
+    param.create = QUDA_REFERENCE_FIELD_CREATE;
+    param.gauge = gauge.data(); // the alias is handed this field's own data pointer
+    return GaugeField(param);
   }
 
   // helper for creating extended gauge fields
-  cudaGaugeField *createExtendedGauge(cudaGaugeField &in, const lat_dim_t &R, TimeProfile &profile,
-                                      bool redundant_comms, QudaReconstructType recon)
+  GaugeField *createExtendedGauge(GaugeField &in, const lat_dim_t &R, TimeProfile &profile, bool redundant_comms,
+                                  QudaReconstructType recon)
   {
-    profile.TPSTART(QUDA_PROFILE_INIT);
     GaugeFieldParam gParamEx(in);
-    gParamEx.location = QUDA_CUDA_FIELD_LOCATION;
+    // gParamEx.location = QUDA_CUDA_FIELD_LOCATION;
     gParamEx.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
     gParamEx.pad = 0;
     gParamEx.nFace = 1;
@@ -383,12 +1271,10 @@ namespace quda {
     if (recon != QUDA_RECONSTRUCT_INVALID) gParamEx.reconstruct = recon;
     gParamEx.setPrecision(gParamEx.Precision(), true);
 
-    auto *out = new cudaGaugeField(gParamEx);
+    auto *out = new GaugeField(gParamEx);
 
     // copy input field into the extended device gauge field
-    copyExtendedGauge(*out, in, QUDA_CUDA_FIELD_LOCATION);
-
-    profile.TPSTOP(QUDA_PROFILE_INIT);
+    copyExtendedGauge(*out, in, QUDA_CUDA_FIELD_LOCATION); // wrong location if both fields cpu
 
     // now fill up the halos
     out->exchangeExtendedGhost(R, profile, redundant_comms);
@@ -397,10 +1283,10 @@ namespace quda {
   }
 
   // helper for creating extended (cpu) gauge fields
-  cpuGaugeField *createExtendedGauge(void **gauge, QudaGaugeParam &gauge_param, const lat_dim_t &R)
+  GaugeField *createExtendedGauge(void **gauge, QudaGaugeParam &gauge_param, const lat_dim_t &R)
   {
     GaugeFieldParam gauge_field_param(gauge_param, gauge);
-    cpuGaugeField cpu(gauge_field_param);
+    GaugeField cpu(gauge_field_param);
 
     gauge_field_param.location = QUDA_CPU_FIELD_LOCATION;
     gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
@@ -409,7 +1295,7 @@ namespace quda {
       gauge_field_param.x[d] += 2 * R[d];
       gauge_field_param.r[d] = R[d];
     }
-    cpuGaugeField *padded_cpu = new cpuGaugeField(gauge_field_param);
+    GaugeField *padded_cpu = new GaugeField(gauge_field_param);
 
     copyExtendedGauge(*padded_cpu, cpu, QUDA_CPU_FIELD_LOCATION);
     padded_cpu->exchangeExtendedGhost(R, true); // Do comm to fill halo = true
@@ -417,4 +1303,91 @@ namespace quda {
     return padded_cpu;
   }
 
+  void GaugeField::prefetch(QudaFieldLocation mem_space, qudaStream_t stream) const // prefetch field buffers to mem_space on stream
+  {
+    if (location == QUDA_CUDA_FIELD_LOCATION && is_prefetch_enabled() && mem_type == QUDA_MEMORY_DEVICE) { // otherwise a no-op
+      if (gauge.data()) qudaMemPrefetchAsync(gauge.data(), bytes, mem_space, stream); // main gauge buffer, if set
+      if (!isNative()) { // ghost buffers are only prefetched for non-native orders
+        for (int i = 0; i < nDim; i++) {
+          size_t nbytes = nFace * surface[i] * nInternal * precision; // byte count used for the ghost buffers of dimension i
+          if (ghost[i].data() && nbytes) qudaMemPrefetchAsync(ghost[i].data(), nbytes, mem_space, stream);
+          if (ghost[i + 4].data() && nbytes && geometry == QUDA_COARSE_GEOMETRY) // ghost[i + 4] is touched for coarse geometry only
+            qudaMemPrefetchAsync(ghost[i + 4].data(), nbytes, mem_space, stream);
+        }
+      }
+    }
+  }
+
+  void GaugeField::backup() const // save a copy of the gauge data into backup_h
+  {
+    if (backup_h.size()) errorQuda("Gauge field already backed up"); // only one outstanding backup is allowed
+
+    if (order == QUDA_QDP_GAUGE_ORDER) { // QDP order: one backup buffer per gauge_array[] entry
+      backup_h.resize(geometry);
+      for (int d = 0; d < geometry; d++) {
+        backup_h[d] = quda_ptr(QUDA_MEMORY_HOST, bytes / geometry); // each entry is given an equal share of bytes
+        qudaMemcpy(backup_h[d], gauge_array[d], bytes / geometry, qudaMemcpyDefault);
+      }
+    } else { // every other order: a single backup buffer covering all of bytes
+      backup_h.resize(1);
+      backup_h[0] = quda_ptr(QUDA_MEMORY_HOST, bytes);
+      qudaMemcpy(backup_h[0], gauge, bytes, qudaMemcpyDefault);
+    }
+  }
+
+  void GaugeField::restore() const // copy the data held in backup_h back into the field, then discard it
+  {
+    if (!backup_h.size()) errorQuda("Cannot restore since not backed up"); // requires non-empty backup_h
+
+    if (order == QUDA_QDP_GAUGE_ORDER) { // QDP order: one copy per gauge_array[] entry
+      for (int d = 0; d < geometry; d++) {
+        qudaMemcpy(gauge_array[d], backup_h[d], bytes / geometry, qudaMemcpyDefault);
+      }
+    } else { // every other order: a single copy of all bytes
+      qudaMemcpy(gauge, backup_h[0], bytes, qudaMemcpyDefault);
+    }
+
+    backup_h.resize(0); // empty backup_h so that backup() may be called again
+  }
+
+  void GaugeField::copy_to_buffer(void *buffer) const // copy this field's data into the host memory at buffer
+  {
+    if (location == QUDA_CUDA_FIELD_LOCATION) {
+      qudaMemcpy(buffer, data(), Bytes(), qudaMemcpyDeviceToHost); // device field: one device-to-host copy
+    } else {
+      if (is_pointer_array(order)) {
+        char *dst_buffer = reinterpret_cast<char *>(buffer); // byte view so slices can be addressed by offset
+        for (int d = 0; d < site_dim; d++) { // gauge_array[d] fills slice d of buffer, bytes / site_dim long
+          std::memcpy(&dst_buffer[d * bytes / site_dim], gauge_array[d].data(), bytes / site_dim);
+        }
+      } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER
+                 || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER
+                 || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
+        std::memcpy(buffer, data(), Bytes()); // these orders are copied out as one flat block
+      } else {
+        errorQuda("Unsupported order = %d", Order());
+      }
+    }
+  }
+
+  void GaugeField::copy_from_buffer(void *buffer) // fill this field's data from the host memory at buffer
+  {
+    if (location == QUDA_CUDA_FIELD_LOCATION) {
+      qudaMemcpy(data(), buffer, Bytes(), qudaMemcpyHostToDevice); // device field: one host-to-device copy
+    } else {
+      if (is_pointer_array(order)) {
+        const char *src_buffer = reinterpret_cast<const char *>(buffer); // buffer is the copy source in this direction
+        for (int d = 0; d < site_dim; d++) { // slice d of buffer, bytes / site_dim long, fills gauge_array[d]
+          std::memcpy(gauge_array[d].data(), &src_buffer[d * bytes / site_dim], bytes / site_dim);
+        }
+      } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER
+                 || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER
+                 || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) {
+        std::memcpy(data(), buffer, Bytes()); // these orders are copied in as one flat block
+      } else {
+        errorQuda("Unsupported order = %d", Order());
+      }
+    }
+  }
+
 } // namespace quda
diff --git a/lib/gauge_field_strength_tensor.cu b/lib/gauge_field_strength_tensor.cu
index d0ec026881..dc6b763b54 100644
--- a/lib/gauge_field_strength_tensor.cu
+++ b/lib/gauge_field_strength_tensor.cu
@@ -34,8 +34,10 @@ namespace quda
 
   void computeFmunu(GaugeField &f, const GaugeField &u)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     checkPrecision(f, u);
     instantiate2<Fmunu,ReconstructWilson>(u, f); // u must be first here for correct template instantiation
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 } // namespace quda
diff --git a/lib/gauge_fix_fft.cu b/lib/gauge_fix_fft.cu
index b85f5b4457..b72a3aef78 100644
--- a/lib/gauge_fix_fft.cu
+++ b/lib/gauge_fix_fft.cu
@@ -217,7 +217,7 @@ namespace quda {
     GaugeFixQuality<decltype(argQ)> gfixquality(argQ, data);
     gfixquality.apply(device::get_default_stream());
     double action0 = argQ.getAction();
-    if(getVerbosity() >= QUDA_SUMMARIZE) printf("Step: %d\tAction: %.16e\ttheta: %.16e\n", 0, argQ.getAction(), argQ.getTheta());
+    logQuda(QUDA_SUMMARIZE, "Step: %d\tAction: %.16e\ttheta: %.16e\n", 0, argQ.getAction(), argQ.getTheta());
 
     double diff = 0.0;
     int iter = 0;
@@ -289,7 +289,7 @@ namespace quda {
       if ( autotune && ((action - action0) < -1e-14) ) {
         if ( arg.alpha > 0.01 ) {
           arg.alpha = 0.95 * arg.alpha;
-          if(getVerbosity() >= QUDA_SUMMARIZE) printf(">>>>>>>>>>>>>> Warning: changing alpha down -> %.4e\n", arg.alpha);
+          logQuda(QUDA_SUMMARIZE, ">>>>>>>>>>>>>> Warning: changing alpha down -> %.4e\n", arg.alpha);
         }
       }
       //------------------------------------------------------------------------
@@ -356,7 +356,7 @@ namespace quda {
     
     gflops = (gflops * 1e-9) / (secs);
     gbytes = gbytes / (secs * 1e9);
-    if (getVerbosity() > QUDA_SUMMARIZE) printfQuda("Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
+    logQuda(QUDA_SUMMARIZE, "Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
 
     host_free(num_failures_h);
   }
@@ -366,10 +366,10 @@ namespace quda {
                    double alpha, int autotune, double tolerance, int stopWtheta)
     {
       if (gauge_dir != 3) {
-	if (getVerbosity() > QUDA_SUMMARIZE) printfQuda("Starting Landau gauge fixing with FFTs...\n");
+        logQuda(QUDA_SUMMARIZE, "Starting Landau gauge fixing with FFTs...\n");
         gaugeFixingFFT<Float, recon, 4>(data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
       } else {
-	if (getVerbosity() > QUDA_SUMMARIZE) printfQuda("Starting Coulomb gauge fixing with FFTs...\n");
+        logQuda(QUDA_SUMMARIZE, "Starting Coulomb gauge fixing with FFTs...\n");
         gaugeFixingFFT<Float, recon, 3>(data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
       }
     }
@@ -389,8 +389,10 @@ namespace quda {
   void gaugeFixingFFT(GaugeField& data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double alpha,
                       const int autotune, const double tolerance, const int stopWtheta)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (comm_partitioned()) errorQuda("Gauge Fixing with FFTs in multi-GPU support NOT implemented yet!");
     instantiate<GaugeFixingFFT>(data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 }
diff --git a/lib/gauge_fix_ovr.cu b/lib/gauge_fix_ovr.cu
index 814b65427d..064ed5b158 100644
--- a/lib/gauge_fix_ovr.cu
+++ b/lib/gauge_fix_ovr.cu
@@ -502,7 +502,9 @@ namespace quda {
   void gaugeFixingOVR(GaugeField& data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double relax_boost,
                       const double tolerance, const int reunit_interval, const int stopWtheta)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<GaugeFixingOVR>(data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 }   //namespace quda
diff --git a/lib/gauge_force.cu b/lib/gauge_force.cu
index 2558dadcac..5e43fa64e6 100644
--- a/lib/gauge_force.cu
+++ b/lib/gauge_force.cu
@@ -48,6 +48,7 @@ namespace quda {
   void gaugeForce(GaugeField& mom, const GaugeField& u, double epsilon, std::vector<int**>& input_path,
                   std::vector<int>& length, std::vector<double>& path_coeff, int num_paths, int path_max_length)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     checkPrecision(mom, u);
     checkLocation(mom, u);
     if (mom.Reconstruct() != QUDA_RECONSTRUCT_10) errorQuda("Reconstruction type %d not supported", mom.Reconstruct());
@@ -57,11 +58,13 @@ namespace quda {
     // gauge field must be passed as first argument so we peel off its reconstruct type
     instantiate<GaugeForce_>(u, mom, epsilon, p);
     p.free();
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
   
   void gaugePath(GaugeField& out, const GaugeField& u, double coeff, std::vector<int**>& input_path,
 		 std::vector<int>& length, std::vector<double>& path_coeff, int num_paths, int path_max_length)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     checkPrecision(out, u);
     checkLocation(out, u);
     if (out.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("Reconstruction type %d not supported", out.Reconstruct());
@@ -71,6 +74,7 @@ namespace quda {
     // gauge field must be passed as first argument so we peel off its reconstruct type
     instantiate<GaugePath>(u, out, coeff, p);
     p.free();
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 } // namespace quda
diff --git a/lib/gauge_laplace.cpp b/lib/gauge_laplace.cpp
index 18ab9e3802..1f7d8608e8 100644
--- a/lib/gauge_laplace.cpp
+++ b/lib/gauge_laplace.cpp
@@ -29,7 +29,6 @@ namespace quda {
       if (laplace3D == i) comm_dim[i] = 0;
     }
     ApplyLaplace(out, in, *gauge, laplace3D, 1.0, 1.0, in, parity, dagger, comm_dim, profile);
-    flops += 1320ll*in.Volume(); // FIXME
   }
 
   void GaugeLaplace::DslashXpay(ColorSpinorField &out, const ColorSpinorField &in, 
@@ -45,7 +44,6 @@ namespace quda {
       if (laplace3D == i) comm_dim[i] = 0;
     }
     ApplyLaplace(out, in, *gauge, laplace3D, k, 1.0, x, parity, dagger, comm_dim, profile);
-    flops += 1368ll*in.Volume(); // FIXME
   }
 
   void GaugeLaplace::M(ColorSpinorField &out, const ColorSpinorField &in) const
diff --git a/lib/gauge_loop_trace.cu b/lib/gauge_loop_trace.cu
index faaaa97d99..0b1af50ba4 100644
--- a/lib/gauge_loop_trace.cu
+++ b/lib/gauge_loop_trace.cu
@@ -55,6 +55,7 @@ namespace quda {
   void gaugeLoopTrace(const GaugeField& u, std::vector<Complex>& loop_traces, double factor, std::vector<int**>& input_path,
 		 std::vector<int>& length, std::vector<double>& path_coeff, int num_paths, int path_max_length)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     paths<1> p(input_path, length, path_coeff, num_paths, path_max_length);
 
     std::vector<array<double, 2>> tr_array(loop_traces.size());
@@ -65,6 +66,7 @@ namespace quda {
     for (auto i = 0u; i < tr_array.size(); i++) { loop_traces[i] = Complex(tr_array[i][0], tr_array[i][1]); }
 
     p.free();
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 } // namespace quda
diff --git a/lib/gauge_observable.cpp b/lib/gauge_observable.cpp
index 42d07e19cc..041dc6164d 100644
--- a/lib/gauge_observable.cpp
+++ b/lib/gauge_observable.cpp
@@ -5,9 +5,9 @@
 namespace quda
 {
 
-  void gaugeObservables(GaugeField &u, QudaGaugeObservableParam &param, TimeProfile &profile)
+  void gaugeObservables(GaugeField &u, QudaGaugeObservableParam &param)
   {
-    profile.TPSTART(QUDA_PROFILE_COMPUTE);
+    auto &profile = getProfile();
     if (param.su_project) {
       int *num_failures_h = static_cast<int *>(pool_pinned_malloc(sizeof(int)));
       int *num_failures_d = static_cast<int *>(get_mapped_device_pointer(num_failures_h));
@@ -24,7 +24,6 @@ namespace quda
       param.plaquette[1] = plaq.y;
       param.plaquette[2] = plaq.z;
     }
-    profile.TPSTOP(QUDA_PROFILE_COMPUTE);
 
     if (param.compute_polyakov_loop) { gaugePolyakovLoop(param.ploop, u, 3, profile); }
 
@@ -45,10 +44,8 @@ namespace quda
       std::vector<Complex> loop_traces(param.num_paths);
 
       // actually do the computation
-      profile.TPSTART(QUDA_PROFILE_COMPUTE);
       gaugeLoopTrace(u, loop_traces, param.factor, input_path_v, path_length_v, loop_coeff_v, param.num_paths,
                      param.max_length);
-      profile.TPSTOP(QUDA_PROFILE_COMPUTE);
 
       for (int i = 0; i < param.num_paths; i++) { memcpy(param.traces + i, &loop_traces[i], sizeof(Complex)); }
     }
@@ -57,7 +54,6 @@ namespace quda
     if (!param.compute_qcharge && !param.compute_qcharge_density) return;
 
     // create the Fmunu field
-    profile.TPSTART(QUDA_PROFILE_INIT);
     // u is an extended field we need to shrink for the Fmunu field
     lat_dim_t x;
     for (int i = 0; i < 4; i++) x[i] = u.X()[i] - 2 * u.R()[i];
@@ -66,16 +62,11 @@ namespace quda
     tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
     tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
     tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
-    cudaGaugeField gaugeFmunu(tensorParam);
-    profile.TPSTOP(QUDA_PROFILE_INIT);
+    GaugeField gaugeFmunu(tensorParam);
 
-    profile.TPSTART(QUDA_PROFILE_COMPUTE);
     computeFmunu(gaugeFmunu, u);
-    profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-    profile.TPSTOP(QUDA_PROFILE_TOTAL);
 
     if (param.compute_qcharge || param.compute_qcharge_density) {
-      profile.TPSTART(QUDA_PROFILE_TOTAL);
       profile.TPSTART(QUDA_PROFILE_INIT);
       if (param.compute_qcharge_density && !param.qcharge_density)
         errorQuda("Charge density requested, but destination field not defined");
@@ -83,23 +74,17 @@ namespace quda
       void *d_qDensity = param.compute_qcharge_density ? pool_device_malloc(size) : nullptr;
       profile.TPSTOP(QUDA_PROFILE_INIT);
 
-      profile.TPSTART(QUDA_PROFILE_COMPUTE);
-
       if (param.compute_qcharge_density)
         computeQChargeDensity(param.energy, param.qcharge, d_qDensity, gaugeFmunu);
       else
         computeQCharge(param.energy, param.qcharge, gaugeFmunu);
 
-      profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-
       if (param.compute_qcharge_density) {
         profile.TPSTART(QUDA_PROFILE_D2H);
         qudaMemcpy(param.qcharge_density, d_qDensity, size, qudaMemcpyDeviceToHost);
         profile.TPSTOP(QUDA_PROFILE_D2H);
 
-        profile.TPSTART(QUDA_PROFILE_FREE);
         pool_device_free(d_qDensity);
-        profile.TPSTOP(QUDA_PROFILE_FREE);
       }
     }
   }
diff --git a/lib/gauge_phase.cu b/lib/gauge_phase.cu
index ff959ef0b3..52e47dad71 100644
--- a/lib/gauge_phase.cu
+++ b/lib/gauge_phase.cu
@@ -25,8 +25,8 @@ namespace quda {
       if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC) {
         GaugePhaseArg<Float, nColor, recon, QUDA_STAGGERED_PHASE_MILC> arg(u);
         launch<GaugePhase>(tp, stream, arg);
-      } else if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_CPS) {
-        GaugePhaseArg<Float, nColor, recon, QUDA_STAGGERED_PHASE_CPS> arg(u);
+      } else if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_CHROMA) {
+        GaugePhaseArg<Float, nColor, recon, QUDA_STAGGERED_PHASE_CHROMA> arg(u);
         launch<GaugePhase>(tp, stream, arg);
       } else if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_TIFR) {
         GaugePhaseArg<Float, nColor, recon, QUDA_STAGGERED_PHASE_TIFR> arg(u);
@@ -45,9 +45,11 @@ namespace quda {
 
   void applyGaugePhase(GaugeField &u)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<GaugePhase_, ReconstructNone>(u);
     // ensure that ghosts are updated if needed
     if (u.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) u.exchangeGhost();
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 } // namespace quda
diff --git a/lib/gauge_plaq.cu b/lib/gauge_plaq.cu
index 7ad5c0399e..ee48d2e3d2 100644
--- a/lib/gauge_plaq.cu
+++ b/lib/gauge_plaq.cu
@@ -37,9 +37,11 @@ namespace quda {
 
   double3 plaquette(const GaugeField &U)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     array<double, 2> plq{0.0, 0.0};
     instantiate<GaugePlaq, ReconstructGauge>(U, plq);
     double3 plaq = make_double3(0.5*(plq[0] + plq[1]), plq[0], plq[1]);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     return plaq;
   }
 
diff --git a/lib/gauge_polyakov_loop.cu b/lib/gauge_polyakov_loop.cu
index a61027dc81..99ae5ea149 100644
--- a/lib/gauge_polyakov_loop.cu
+++ b/lib/gauge_polyakov_loop.cu
@@ -164,14 +164,14 @@ namespace quda {
       // as a function of the number of ranks in the `t` dimension
       gParam.setPrecision(QUDA_DOUBLE_PRECISION);
 
-      std::unique_ptr<GaugeField> product_field = std::make_unique<cudaGaugeField>(gParam);
+      std::unique_ptr<GaugeField> product_field = std::make_unique<GaugeField>(gParam);
       GaugeField& product_field_ref = reinterpret_cast<GaugeField&>(*product_field.get());
 
       // Create the field we reduce into
       x[3] = comm_dim(3);
       gParam.x = x;
       gParam.create = QUDA_NULL_FIELD_CREATE;
-      condensed_field = std::make_unique<cudaGaugeField>(gParam);
+      condensed_field = std::make_unique<GaugeField>(gParam);
       GaugeField& condensed_field_ref = reinterpret_cast<GaugeField&>(*condensed_field.get());
       profile.TPSTOP(QUDA_PROFILE_INIT);
 
diff --git a/lib/gauge_qcharge.cu b/lib/gauge_qcharge.cu
index d847b219ae..3b4e584b02 100644
--- a/lib/gauge_qcharge.cu
+++ b/lib/gauge_qcharge.cu
@@ -62,12 +62,16 @@ namespace quda
 
   void computeQCharge(double energy[3], double &qcharge, const GaugeField &Fmunu)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<QCharge, ReconstructNone>(Fmunu, energy, qcharge, nullptr, false);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
   void computeQChargeDensity(double energy[3], double &qcharge, void *qdensity, const GaugeField &Fmunu)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<QCharge, ReconstructNone>(Fmunu, energy, qcharge, qdensity, true);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 } // namespace quda
diff --git a/lib/gauge_random.cu b/lib/gauge_random.cu
index 0e056d305b..f3bfe8e22c 100644
--- a/lib/gauge_random.cu
+++ b/lib/gauge_random.cu
@@ -4,6 +4,7 @@
 #include <instantiate.h>
 #include <tunable_nd.h>
 #include <kernels/gauge_random.cuh>
+#include "timer.h"
 
 namespace quda {
 
@@ -55,19 +56,26 @@ namespace quda {
     if (U.LinkType() != QUDA_SU3_LINKS && U.LinkType() != QUDA_MOMENTUM_LINKS)
       errorQuda("Unexpected link type %d", U.LinkType());
 
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<GaugeGauss, ReconstructFull>(U, rng, sigma);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
 
     // ensure multi-gpu consistency if required
+    getProfile().TPSTART(QUDA_PROFILE_COMMS);
     if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_EXTENDED) {
       U.exchangeExtendedGhost(U.R());
     } else if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) {
       U.exchangeGhost();
     }
+    getProfile().TPSTOP(QUDA_PROFILE_COMMS);
   }
 
   void gaugeGauss(GaugeField &U, unsigned long long seed, double sigma)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMMS);
     RNG randstates(U, seed);
+    getProfile().TPSTOP(QUDA_PROFILE_COMMS);
+
     gaugeGauss(U, randstates, sigma);
   }
 
diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu
index 6177ef8170..c7f256f2ee 100644
--- a/lib/gauge_stout.cu
+++ b/lib/gauge_stout.cu
@@ -49,8 +49,14 @@ namespace quda {
       }
     }
 
-    void preTune() { if (out.Gauge_p() == in.Gauge_p()) out.backup(); }
-    void postTune() { if (out.Gauge_p() == in.Gauge_p()) out.restore(); }
+    void preTune()
+    {
+      if (out.data() == in.data()) out.backup();
+    }
+    void postTune()
+    {
+      if (out.data() == in.data()) out.restore();
+    }
 
     long long flops() const // just counts matrix multiplication
     {
@@ -72,7 +78,9 @@ namespace quda {
 
     copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
     in.exchangeExtendedGhost(in.R(), false);
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<GaugeSTOUT>(out, in, false, rho);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     out.exchangeExtendedGhost(out.R(), false);
   }
 
@@ -84,7 +92,9 @@ namespace quda {
 
     copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION);
     in.exchangeExtendedGhost(in.R(), false);
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     instantiate<GaugeSTOUT>(out, in, true, rho, epsilon);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     out.exchangeExtendedGhost(out.R(), false);
   }
 
diff --git a/lib/gauge_update_quda.cu b/lib/gauge_update_quda.cu
index 0fdcb17387..78c4b47f4a 100644
--- a/lib/gauge_update_quda.cu
+++ b/lib/gauge_update_quda.cu
@@ -2,6 +2,7 @@
 #include <tunable_nd.h>
 #include <instantiate.h>
 #include <kernels/gauge_update.cuh>
+#include "timer.h"
 
 namespace quda {
 
@@ -61,11 +62,13 @@ namespace quda {
 
   void updateGaugeField(GaugeField &out, double dt, const GaugeField& in, const GaugeField& mom, bool conj_mom, bool exact)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     checkPrecision(out, in, mom);
     checkLocation(out, in, mom);
     checkReconstruct(out, in);
     if (mom.Reconstruct() != QUDA_RECONSTRUCT_10) errorQuda("Reconstruction type %d not supported", mom.Reconstruct());
     instantiate<UpdateGaugeField,ReconstructNo12>(out, in, mom, dt, conj_mom, exact);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 } // namespace quda
diff --git a/lib/gauge_wilson_flow.cu b/lib/gauge_wilson_flow.cu
index a3ce38ba81..d92fb0a68c 100644
--- a/lib/gauge_wilson_flow.cu
+++ b/lib/gauge_wilson_flow.cu
@@ -38,6 +38,7 @@ namespace quda {
       wflow_type(wflow_type),
       step_type(step_type)
     {
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       strcat(aux, comm_dim_partitioned_string());
       switch (wflow_type) {
       case QUDA_GAUGE_SMEAR_WILSON_FLOW: strcat(aux,",computeWFlowStepWilson"); break;
@@ -52,6 +53,7 @@ namespace quda {
       }
 
       apply(device::get_default_stream());
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     }
 
     template <QudaGaugeSmearType wflow_type, WFlowStepType step_type> using Arg =
diff --git a/lib/hisq_paths_force_quda.cu b/lib/hisq_paths_force_quda.cu
index 320000dc75..f076ca0b5b 100644
--- a/lib/hisq_paths_force_quda.cu
+++ b/lib/hisq_paths_force_quda.cu
@@ -547,6 +547,7 @@ namespace quda {
 #ifdef GPU_STAGGERED_DIRAC
     void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff_array[6])
     {
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       checkNative(link, oprod, newOprod);
       checkLocation(newOprod, oprod, link);
       checkPrecision(oprod, link, newOprod);
@@ -557,32 +558,24 @@ namespace quda {
       gauge_param.geometry = QUDA_SCALAR_GEOMETRY;
       gauge_param.setPrecision(gauge_param.Precision(), true);
 
-      auto P3 = GaugeField::Create(gauge_param);
-
-      auto Pmu = GaugeField::Create(gauge_param);
-      auto P5 = GaugeField::Create(gauge_param);
-      auto Pnumu = GaugeField::Create(gauge_param);
-      auto Qnumu = GaugeField::Create(gauge_param);
+      auto P3 = GaugeField(gauge_param);
+      auto Pmu = GaugeField(gauge_param);
+      auto P5 = GaugeField(gauge_param);
+      auto Pnumu = GaugeField(gauge_param);
+      auto Qnumu = GaugeField(gauge_param);
 
       // need double buffers for these fields to fuse "side link" terms with
       // subsequent "middle link" terms in a different direction
-      auto Pmu_next = GaugeField::Create(gauge_param);
-      auto Pnumu_next = GaugeField::Create(gauge_param);
-      auto Qnumu_next = GaugeField::Create(gauge_param);
-
-      instantiateGaugeStaggered<HisqStaplesForce>(link, *P3, GaugeField_ref(*Pmu),
-        GaugeField_ref(*P5), GaugeField_ref(*Pnumu), GaugeField_ref(*Qnumu),
-        GaugeField_ref(*Pmu_next), GaugeField_ref(*Pnumu_next), GaugeField_ref(*Qnumu_next),
-        newOprod, oprod, path_coeff_array);
-
-      delete Pmu;
-      delete P3;
-      delete P5;
-      delete Pnumu;
-      delete Qnumu;
-      delete Pmu_next;
-      delete Pnumu_next;
-      delete Qnumu_next;
+      auto Pmu_next = GaugeField(gauge_param);
+      auto Pnumu_next = GaugeField(gauge_param);
+      auto Qnumu_next = GaugeField(gauge_param);
+
+      instantiateGaugeStaggered<HisqStaplesForce>(link, P3, GaugeField_ref(Pmu), GaugeField_ref(P5),
+                                                  GaugeField_ref(Pnumu), GaugeField_ref(Qnumu),
+                                                  GaugeField_ref(Pmu_next), GaugeField_ref(Pnumu_next),
+                                                  GaugeField_ref(Qnumu_next), newOprod, oprod, path_coeff_array);
+
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     }
 #else
     void hisqStaplesForce(GaugeField &, const GaugeField &, const GaugeField &, const double[6])
@@ -651,10 +644,12 @@ namespace quda {
 #ifdef GPU_STAGGERED_DIRAC
     void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oldOprod, const GaugeField &link, double coeff)
     {
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       checkNative(link, oldOprod, newOprod);
       checkLocation(newOprod, oldOprod, link);
       checkPrecision(newOprod, link, oldOprod);
       instantiateGaugeStaggered<HisqLongLinkForce>(link, newOprod, oldOprod, coeff);
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     }
 #else
     void hisqLongLinkForce(GaugeField &, const GaugeField &, const GaugeField &, double)
@@ -725,10 +720,12 @@ namespace quda {
 #ifdef GPU_STAGGERED_DIRAC
     void hisqCompleteForce(GaugeField &force, const GaugeField &link)
     {
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       checkNative(link, force);
       checkLocation(force, link);
       checkPrecision(link, force);
       instantiateGaugeStaggered<HisqCompleteForce>(link, force);
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     }
 #else
     void hisqCompleteForce(GaugeField &, const GaugeField &)
diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp
index 38f08d30cc..621a6bff5d 100644
--- a/lib/interface_quda.cpp
+++ b/lib/interface_quda.cpp
@@ -71,29 +71,28 @@ static bool redundant_comms = false;
 
 #include <blas_lapack.h>
 
-
-cudaGaugeField *gaugePrecise = nullptr;
-cudaGaugeField *gaugeSloppy = nullptr;
-cudaGaugeField *gaugePrecondition = nullptr;
-cudaGaugeField *gaugeRefinement = nullptr;
-cudaGaugeField *gaugeEigensolver = nullptr;
-cudaGaugeField *gaugeExtended = nullptr;
-
-cudaGaugeField *gaugeFatPrecise = nullptr;
-cudaGaugeField *gaugeFatSloppy = nullptr;
-cudaGaugeField *gaugeFatPrecondition = nullptr;
-cudaGaugeField *gaugeFatRefinement = nullptr;
-cudaGaugeField *gaugeFatEigensolver = nullptr;
-cudaGaugeField *gaugeFatExtended = nullptr;
-
-cudaGaugeField *gaugeLongPrecise = nullptr;
-cudaGaugeField *gaugeLongSloppy = nullptr;
-cudaGaugeField *gaugeLongPrecondition = nullptr;
-cudaGaugeField *gaugeLongRefinement = nullptr;
-cudaGaugeField *gaugeLongEigensolver = nullptr;
-cudaGaugeField *gaugeLongExtended = nullptr;
-
-cudaGaugeField *gaugeSmeared = nullptr;
+GaugeField *gaugePrecise = nullptr;
+GaugeField *gaugeSloppy = nullptr;
+GaugeField *gaugePrecondition = nullptr;
+GaugeField *gaugeRefinement = nullptr;
+GaugeField *gaugeEigensolver = nullptr;
+GaugeField *gaugeExtended = nullptr;
+
+GaugeField *gaugeFatPrecise = nullptr;
+GaugeField *gaugeFatSloppy = nullptr;
+GaugeField *gaugeFatPrecondition = nullptr;
+GaugeField *gaugeFatRefinement = nullptr;
+GaugeField *gaugeFatEigensolver = nullptr;
+GaugeField *gaugeFatExtended = nullptr;
+
+GaugeField *gaugeLongPrecise = nullptr;
+GaugeField *gaugeLongSloppy = nullptr;
+GaugeField *gaugeLongPrecondition = nullptr;
+GaugeField *gaugeLongRefinement = nullptr;
+GaugeField *gaugeLongEigensolver = nullptr;
+GaugeField *gaugeLongExtended = nullptr;
+
+GaugeField *gaugeSmeared = nullptr;
 
 CloverField *cloverPrecise = nullptr;
 CloverField *cloverSloppy = nullptr;
@@ -101,8 +100,8 @@ CloverField *cloverPrecondition = nullptr;
 CloverField *cloverRefinement = nullptr;
 CloverField *cloverEigensolver = nullptr;
 
-cudaGaugeField *momResident = nullptr;
-cudaGaugeField *extendedGaugeResident = nullptr;
+GaugeField momResident;
+GaugeField *extendedGaugeResident = nullptr;
 
 std::vector<ColorSpinorField> solutionResident;
 
@@ -432,16 +431,14 @@ void initQudaDevice(int dev)
   initialized = true;
 
   profileInit2End.TPSTART(QUDA_PROFILE_TOTAL);
-  profileInit.TPSTART(QUDA_PROFILE_TOTAL);
+  auto profile = pushProfile(profileInit);
   profileInit.TPSTART(QUDA_PROFILE_INIT);
 
-  if (getVerbosity() >= QUDA_SUMMARIZE) {
 #ifdef GITVERSION
-    printfQuda("QUDA %s (git %s)\n",quda_version.c_str(),gitversion);
+  logQuda(QUDA_SUMMARIZE, "QUDA %s (git %s)\n", quda_version.c_str(), gitversion);
 #else
-    printfQuda("QUDA %s\n",quda_version.c_str());
+  logQuda(QUDA_SUMMARIZE, "QUDA %s\n", quda_version.c_str());
 #endif
-  }
 
 #ifdef MULTI_GPU
   if (dev < 0) {
@@ -469,7 +466,6 @@ void initQudaDevice(int dev)
   }
 
   profileInit.TPSTOP(QUDA_PROFILE_INIT);
-  profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 /*
@@ -477,7 +473,7 @@ void initQudaDevice(int dev)
  */
 void initQudaMemory()
 {
-  profileInit.TPSTART(QUDA_PROFILE_TOTAL);
+  auto profile = pushProfile(profileInit);
   profileInit.TPSTART(QUDA_PROFILE_INIT);
 
   if (!comms_initialized) init_default_comms();
@@ -501,7 +497,6 @@ void initQudaMemory()
   for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d));
 
   profileInit.TPSTOP(QUDA_PROFILE_INIT);
-  profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 void updateR()
@@ -539,8 +534,8 @@ static bool invalidate_clover = true;
  * @param refinement[in/out] Reference the to pointer of a given "refinement" field.
  * @param eigensolver[in/out] Reference then to pointer of a given "eigensolver" field.
  */
-void freeUniqueSloppyGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&sloppy, cudaGaugeField *&precondition,
-                                  cudaGaugeField *&refinement, cudaGaugeField *&eigensolver);
+void freeUniqueSloppyGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition,
+                                  GaugeField *&refinement, GaugeField *&eigensolver);
 
 /**
  * Abstraction utility that cleans up the full set of sloppy fields, as well as
@@ -555,36 +550,28 @@ void freeUniqueSloppyGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&slo
  * @param extended[in/out] Reference to the pointer of a given "extended" field.
  * @param preserve_precise[in] Whether (true) or not (false) to preserve the precise field.
  */
-void freeUniqueGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&sloppy, cudaGaugeField *&precondition,
-                            cudaGaugeField *&refinement, cudaGaugeField *&eigensolver, cudaGaugeField *&extended,
-                            bool preserve_precise);
+void freeUniqueGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition, GaugeField *&refinement,
+                            GaugeField *&eigensolver, GaugeField *&extended, bool preserve_precise);
 
 void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
 {
-  profileGauge.TPSTART(QUDA_PROFILE_TOTAL);
+  auto profile = pushProfile(profileGauge);
+  checkGaugeParam(param);
 
   if (!initialized) errorQuda("QUDA not initialized");
   if (getVerbosity() == QUDA_DEBUG_VERBOSE) printQudaGaugeParam(param);
 
-  checkGaugeParam(param);
-
-  profileGauge.TPSTART(QUDA_PROFILE_INIT);
   // Set the specific input parameters and create the cpu gauge field
   GaugeFieldParam gauge_param(*param, h_gauge);
 
   if (gauge_param.order <= 4) gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
-  GaugeField *in = (param->location == QUDA_CPU_FIELD_LOCATION) ?
-    static_cast<GaugeField*>(new cpuGaugeField(gauge_param)) :
-    static_cast<GaugeField*>(new cudaGaugeField(gauge_param));
+  GaugeField *in = GaugeField::Create(gauge_param);
 
   if (in->Order() == QUDA_BQCD_GAUGE_ORDER) {
     static size_t checksum = SIZE_MAX;
     size_t in_checksum = in->checksum(true);
     if (in_checksum == checksum) {
-      if (getVerbosity() >= QUDA_VERBOSE)
-        printfQuda("Gauge field unchanged - using cached gauge field %lu\n", checksum);
-      profileGauge.TPSTOP(QUDA_PROFILE_INIT);
-      profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
+      logQuda(QUDA_VERBOSE, "Gauge field unchanged - using cached gauge field %lu\n", checksum);
       delete in;
       invalidate_clover = false;
       return;
@@ -613,7 +600,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
   }
 
   // if not preserving then copy the gauge field passed in
-  cudaGaugeField *precise = nullptr;
+  GaugeField *precise = nullptr;
 
   // switch the parameters for creating the mirror precise cuda gauge field
   gauge_param.create = QUDA_NULL_FIELD_CREATE;
@@ -623,7 +610,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
   gauge_param.pad = param->ga_pad;
   gauge_param.location = QUDA_CUDA_FIELD_LOCATION;
 
-  precise = new cudaGaugeField(gauge_param);
+  precise = new GaugeField(gauge_param);
 
   if (param->use_resident_gauge) {
     if(gaugePrecise == nullptr) errorQuda("No resident gauge field");
@@ -631,24 +618,17 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
     precise->copy(*gaugePrecise);
     precise->exchangeGhost();
     freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-    profileGauge.TPSTOP(QUDA_PROFILE_INIT);
   } else {
-    profileGauge.TPSTOP(QUDA_PROFILE_INIT);
-    profileGauge.TPSTART(QUDA_PROFILE_H2D);
     precise->copy(*in);
-    profileGauge.TPSTOP(QUDA_PROFILE_H2D);
   }
 
   // for gaugeSmeared we are interested only in the precise version
   if (param->type == QUDA_SMEARED_LINKS) {
     gaugeSmeared = createExtendedGauge(*precise, R, profileGauge);
 
-    profileGauge.TPSTART(QUDA_PROFILE_FREE);
     delete precise;
     delete in;
-    profileGauge.TPSTOP(QUDA_PROFILE_FREE);
 
-    profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
     return;
   }
 
@@ -658,44 +638,44 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
   // switch the parameters for creating the mirror sloppy cuda gauge field
   gauge_param.reconstruct = param->reconstruct_sloppy;
   gauge_param.setPrecision(param->cuda_prec_sloppy, true);
-  cudaGaugeField *sloppy = nullptr;
+  GaugeField *sloppy = nullptr;
   if (param->cuda_prec == param->cuda_prec_sloppy && param->reconstruct == param->reconstruct_sloppy) {
     sloppy = precise;
   } else {
-    sloppy = new cudaGaugeField(gauge_param);
+    sloppy = new GaugeField(gauge_param);
     sloppy->copy(*precise);
   }
 
   // switch the parameters for creating the mirror preconditioner cuda gauge field
   gauge_param.reconstruct = param->reconstruct_precondition;
   gauge_param.setPrecision(param->cuda_prec_precondition, true);
-  cudaGaugeField *precondition = nullptr;
+  GaugeField *precondition = nullptr;
   if (param->cuda_prec == param->cuda_prec_precondition && param->reconstruct == param->reconstruct_precondition) {
     precondition = precise;
   } else if (param->cuda_prec_sloppy == param->cuda_prec_precondition
              && param->reconstruct_sloppy == param->reconstruct_precondition) {
     precondition = sloppy;
   } else {
-    precondition = new cudaGaugeField(gauge_param);
+    precondition = new GaugeField(gauge_param);
     precondition->copy(*precise);
   }
 
   // switch the parameters for creating the refinement cuda gauge field
   gauge_param.reconstruct = param->reconstruct_refinement_sloppy;
   gauge_param.setPrecision(param->cuda_prec_refinement_sloppy, true);
-  cudaGaugeField *refinement = nullptr;
+  GaugeField *refinement = nullptr;
   if (param->cuda_prec_sloppy == param->cuda_prec_refinement_sloppy
       && param->reconstruct_sloppy == param->reconstruct_refinement_sloppy) {
     refinement = sloppy;
   } else {
-    refinement = new cudaGaugeField(gauge_param);
+    refinement = new GaugeField(gauge_param);
     refinement->copy(*sloppy);
   }
 
   // switch the parameters for creating the eigensolver cuda gauge field
   gauge_param.reconstruct = param->reconstruct_eigensolver;
   gauge_param.setPrecision(param->cuda_prec_eigensolver, true);
-  cudaGaugeField *eigensolver = nullptr;
+  GaugeField *eigensolver = nullptr;
   if (param->cuda_prec == param->cuda_prec_eigensolver && param->reconstruct == param->reconstruct_eigensolver) {
     eigensolver = precise;
   } else if (param->cuda_prec_precondition == param->cuda_prec_eigensolver
@@ -705,14 +685,14 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
              && param->reconstruct_sloppy == param->reconstruct_eigensolver) {
     eigensolver = sloppy;
   } else {
-    eigensolver = new cudaGaugeField(gauge_param);
+    eigensolver = new GaugeField(gauge_param);
     eigensolver->copy(*precise);
   }
 
   profileGauge.TPSTOP(QUDA_PROFILE_COMPUTE);
 
   // create an extended preconditioning field
-  cudaGaugeField* extended = nullptr;
+  GaugeField *extended = nullptr;
   if (param->overlap){
     lat_dim_t R; // domain-overlap widths in different directions
     for (int i=0; i<4; ++i) R[i] = param->overlap*commDimPartitioned(i);
@@ -757,9 +737,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
       errorQuda("Invalid gauge type %d", param->type);
   }
 
-  profileGauge.TPSTART(QUDA_PROFILE_FREE);
   delete in;
-  profileGauge.TPSTOP(QUDA_PROFILE_FREE);
 
   if (extendedGaugeResident) {
     // updated the resident gauge field if needed
@@ -768,13 +746,11 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param)
     // Use the static R (which is defined at the very beginning of lib/interface_quda.cpp) here
     extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileGauge, false, recon);
   }
-
-  profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
 {
-  profileGauge.TPSTART(QUDA_PROFILE_TOTAL);
+  auto profile = pushProfile(profileGauge);
 
   if (param->location != QUDA_CPU_FIELD_LOCATION) errorQuda("Non-cpu output location not yet supported");
 
@@ -783,31 +759,28 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param)
 
   // Set the specific cpu parameters and create the cpu gauge field
   GaugeFieldParam gauge_param(*param, h_gauge);
-  cpuGaugeField cpuGauge(gauge_param);
-  cudaGaugeField *cudaGauge = nullptr;
+  GaugeField cpuGauge(gauge_param);
+  GaugeField *cudaGauge = nullptr;
   switch (param->type) {
   case QUDA_WILSON_LINKS: cudaGauge = gaugePrecise; break;
   case QUDA_ASQTAD_FAT_LINKS: cudaGauge = gaugeFatPrecise; break;
   case QUDA_ASQTAD_LONG_LINKS: cudaGauge = gaugeLongPrecise; break;
   case QUDA_SMEARED_LINKS:
+    gauge_param.location = QUDA_CUDA_FIELD_LOCATION;
     gauge_param.create = QUDA_NULL_FIELD_CREATE;
     gauge_param.reconstruct = param->reconstruct;
     gauge_param.setPrecision(param->cuda_prec, true);
     gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
     gauge_param.pad = param->ga_pad;
-    cudaGauge = new cudaGaugeField(gauge_param);
+    cudaGauge = new GaugeField(gauge_param);
     copyExtendedGauge(*cudaGauge, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION);
     break;
   default: errorQuda("Invalid gauge type");
   }
 
-  profileGauge.TPSTART(QUDA_PROFILE_D2H);
-  cudaGauge->saveCPUField(cpuGauge);
-  profileGauge.TPSTOP(QUDA_PROFILE_D2H);
+  cpuGauge.copy(*cudaGauge);
 
   if (param->type == QUDA_SMEARED_LINKS) { delete cudaGauge; }
-
-  profileGauge.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 void loadSloppyCloverQuda(const QudaPrecision prec[]);
@@ -815,9 +788,8 @@ void freeSloppyCloverQuda();
 
 void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
 {
+  auto profile = pushProfile(profileClover);
   pushVerbosity(inv_param->verbosity);
-  profileClover.TPSTART(QUDA_PROFILE_TOTAL);
-  profileClover.TPSTART(QUDA_PROFILE_INIT);
 
   checkCloverParam(inv_param);
   bool device_calc = false; // calculate clover and inverse on the device?
@@ -855,8 +827,6 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
 
   CloverField *in = nullptr;
 
-  profileClover.TPSTOP(QUDA_PROFILE_INIT);
-
   bool clover_update = false;
   // If either of the clover params have changed, trigger a recompute
   double csw_old = cloverPrecise ? cloverPrecise->Csw() : 0.0;
@@ -870,11 +840,10 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
 
   // compute or download clover field only if gauge field has been updated or clover field doesn't exist
   if (clover_update) {
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating new clover field\n");
+    logQuda(QUDA_VERBOSE, "Creating new clover field\n");
     freeSloppyCloverQuda();
     if (cloverPrecise) delete cloverPrecise;
 
-    profileClover.TPSTART(QUDA_PROFILE_INIT);
     cloverPrecise = new CloverField(clover_param);
 
     if (!device_calc || inv_param->return_clover || inv_param->return_clover_inverse) {
@@ -890,47 +859,36 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
       inParam.reconstruct = false;
       in = new CloverField(inParam);
     }
-    profileClover.TPSTOP(QUDA_PROFILE_INIT);
 
     if (!device_calc) {
-      profileClover.TPSTART(QUDA_PROFILE_H2D);
       cloverPrecise->copy(*in, false);
       if ((h_clovinv && !inv_param->compute_clover_inverse) && !clover::dynamic_inverse())
         cloverPrecise->copy(*in, true);
-      profileClover.TPSTOP(QUDA_PROFILE_H2D);
     } else {
-      profileClover.TPSTOP(QUDA_PROFILE_TOTAL);
       createCloverQuda(inv_param);
-      profileClover.TPSTART(QUDA_PROFILE_TOTAL);
     }
 
     if ((!h_clovinv || inv_param->compute_clover_inverse) && !clover::dynamic_inverse()) {
-      profileClover.TPSTART(QUDA_PROFILE_COMPUTE);
       cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog);
       if (inv_param->compute_clover_trlog) {
         inv_param->trlogA[0] = cloverPrecise->TrLog()[0];
         inv_param->trlogA[1] = cloverPrecise->TrLog()[1];
       }
-      profileClover.TPSTOP(QUDA_PROFILE_COMPUTE);
     }
   } else {
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gauge field unchanged - using cached clover field\n");
+    logQuda(QUDA_VERBOSE, "Gauge field unchanged - using cached clover field\n");
   }
 
   // if requested, copy back the clover / inverse field
   if (inv_param->return_clover || inv_param->return_clover_inverse) {
     if (inv_param->return_clover) {
       if (!h_clover) errorQuda("Requested clover field return but no clover host pointer set");
-      profileClover.TPSTART(QUDA_PROFILE_D2H);
       in->copy(*cloverPrecise, false);
-      profileClover.TPSTOP(QUDA_PROFILE_D2H);
     }
 
     if (inv_param->return_clover_inverse) {
       if (!h_clovinv) errorQuda("Requested clover field inverse return but no clover host pointer set");
-      profileClover.TPSTART(QUDA_PROFILE_D2H);
       in->copy(*cloverPrecise, true);
-      profileClover.TPSTOP(QUDA_PROFILE_D2H);
     }
   }
 
@@ -945,15 +903,12 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
     delete tmp;
   }
 
-  profileClover.TPSTART(QUDA_PROFILE_FREE);
   if (in) delete in; // delete object referencing input field
-  profileClover.TPSTOP(QUDA_PROFILE_FREE);
 
   QudaPrecision prec[] = {inv_param->clover_cuda_prec_sloppy, inv_param->clover_cuda_prec_precondition,
                           inv_param->clover_cuda_prec_refinement_sloppy, inv_param->clover_cuda_prec_eigensolver};
   loadSloppyCloverQuda(prec);
 
-  profileClover.TPSTOP(QUDA_PROFILE_TOTAL);
   popVerbosity();
 }
 
@@ -1050,8 +1005,8 @@ void freeGaugeQuda(void)
 }
 
 // These utility functions are declared w/doxygen above
-void freeUniqueSloppyGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&sloppy, cudaGaugeField *&precondition,
-                                  cudaGaugeField *&refinement, cudaGaugeField *&eigensolver)
+void freeUniqueSloppyGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition,
+                                  GaugeField *&refinement, GaugeField *&eigensolver)
 {
   // In theory, we're checking for aliasing and freeing fields in the opposite order
   // from which they were allocated... but in any case, we're doing an all-to-all
@@ -1076,9 +1031,8 @@ void freeUniqueSloppyGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&slo
   sloppy = nullptr;
 }
 
-void freeUniqueGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&sloppy, cudaGaugeField *&precondition,
-                            cudaGaugeField *&refinement, cudaGaugeField *&eigensolver, cudaGaugeField *&extended,
-                            bool preserve_precise)
+void freeUniqueGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition, GaugeField *&refinement,
+                            GaugeField *&eigensolver, GaugeField *&extended, bool preserve_precise)
 {
   freeUniqueSloppyGaugeUtility(precise, sloppy, precondition, refinement, eigensolver);
 
@@ -1138,7 +1092,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
     if (gauge_param.Precision() == gaugePrecise->Precision() && gauge_param.reconstruct == gaugePrecise->Reconstruct()) {
       gaugeSloppy = gaugePrecise;
     } else {
-      gaugeSloppy = new cudaGaugeField(gauge_param);
+      gaugeSloppy = new GaugeField(gauge_param);
       gaugeSloppy->copy(*gaugePrecise);
     }
 
@@ -1154,7 +1108,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
                && gauge_param.reconstruct == gaugeSloppy->Reconstruct()) {
       gaugePrecondition = gaugeSloppy;
     } else {
-      gaugePrecondition = new cudaGaugeField(gauge_param);
+      gaugePrecondition = new GaugeField(gauge_param);
       gaugePrecondition->copy(*gaugePrecise);
     }
 
@@ -1167,7 +1121,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
     if (gauge_param.Precision() == gaugeSloppy->Precision() && gauge_param.reconstruct == gaugeSloppy->Reconstruct()) {
       gaugeRefinement = gaugeSloppy;
     } else {
-      gaugeRefinement = new cudaGaugeField(gauge_param);
+      gaugeRefinement = new GaugeField(gauge_param);
       gaugeRefinement->copy(*gaugeSloppy);
     }
 
@@ -1186,7 +1140,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
                && gauge_param.reconstruct == gaugePrecondition->Reconstruct()) {
       gaugeEigensolver = gaugePrecondition;
     } else {
-      gaugeEigensolver = new cudaGaugeField(gauge_param);
+      gaugeEigensolver = new GaugeField(gauge_param);
       gaugeEigensolver->copy(*gaugePrecise);
     }
   }
@@ -1204,7 +1158,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
         && gauge_param.reconstruct == gaugeFatPrecise->Reconstruct()) {
       gaugeFatSloppy = gaugeFatPrecise;
     } else {
-      gaugeFatSloppy = new cudaGaugeField(gauge_param);
+      gaugeFatSloppy = new GaugeField(gauge_param);
       gaugeFatSloppy->copy(*gaugeFatPrecise);
     }
 
@@ -1220,7 +1174,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
                && gauge_param.reconstruct == gaugeFatSloppy->Reconstruct()) {
       gaugeFatPrecondition = gaugeFatSloppy;
     } else {
-      gaugeFatPrecondition = new cudaGaugeField(gauge_param);
+      gaugeFatPrecondition = new GaugeField(gauge_param);
       gaugeFatPrecondition->copy(*gaugeFatPrecise);
     }
 
@@ -1233,7 +1187,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
         && gauge_param.reconstruct == gaugeFatSloppy->Reconstruct()) {
       gaugeFatRefinement = gaugeFatSloppy;
     } else {
-      gaugeFatRefinement = new cudaGaugeField(gauge_param);
+      gaugeFatRefinement = new GaugeField(gauge_param);
       gaugeFatRefinement->copy(*gaugeFatSloppy);
     }
 
@@ -1252,7 +1206,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
                && gauge_param.reconstruct == gaugeFatPrecondition->Reconstruct()) {
       gaugeFatEigensolver = gaugeFatPrecondition;
     } else {
-      gaugeFatEigensolver = new cudaGaugeField(gauge_param);
+      gaugeFatEigensolver = new GaugeField(gauge_param);
       gaugeFatEigensolver->copy(*gaugeFatPrecise);
     }
   }
@@ -1271,7 +1225,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
         && gauge_param.reconstruct == gaugeLongPrecise->Reconstruct()) {
       gaugeLongSloppy = gaugeLongPrecise;
     } else {
-      gaugeLongSloppy = new cudaGaugeField(gauge_param);
+      gaugeLongSloppy = new GaugeField(gauge_param);
       gaugeLongSloppy->copy(*gaugeLongPrecise);
     }
 
@@ -1288,7 +1242,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
                && gauge_param.reconstruct == gaugeLongSloppy->Reconstruct()) {
       gaugeLongPrecondition = gaugeLongSloppy;
     } else {
-      gaugeLongPrecondition = new cudaGaugeField(gauge_param);
+      gaugeLongPrecondition = new GaugeField(gauge_param);
       gaugeLongPrecondition->copy(*gaugeLongPrecise);
     }
 
@@ -1302,7 +1256,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
         && gauge_param.reconstruct == gaugeLongSloppy->Reconstruct()) {
       gaugeLongRefinement = gaugeLongSloppy;
     } else {
-      gaugeLongRefinement = new cudaGaugeField(gauge_param);
+      gaugeLongRefinement = new GaugeField(gauge_param);
       gaugeLongRefinement->copy(*gaugeLongSloppy);
     }
 
@@ -1322,7 +1276,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r
                && gauge_param.reconstruct == gaugeLongPrecondition->Reconstruct()) {
       gaugeLongEigensolver = gaugeLongPrecondition;
     } else {
-      gaugeLongEigensolver = new cudaGaugeField(gauge_param);
+      gaugeLongEigensolver = new GaugeField(gauge_param);
       gaugeLongEigensolver->copy(*gaugeLongPrecise);
     }
   }
@@ -1371,48 +1325,48 @@ void flushChronoQuda(int i)
 
 void endQuda(void)
 {
-  profileEnd.TPSTART(QUDA_PROFILE_TOTAL);
-
   if (!initialized) return;
 
-  freeGaugeQuda();
-  freeCloverQuda();
+  {
+    auto profile = pushProfile(profileEnd);
 
-  for (int i = 0; i < QUDA_MAX_CHRONO; i++) flushChronoQuda(i);
+    freeGaugeQuda();
+    freeCloverQuda();
 
-  solutionResident.clear();
+    for (int i = 0; i < QUDA_MAX_CHRONO; i++) flushChronoQuda(i);
 
-  if(momResident) delete momResident;
+    solutionResident.clear();
+    momResident = GaugeField();
 
-  LatticeField::freeGhostBuffer();
-  ColorSpinorField::freeGhostBuffer();
-  FieldTmp<ColorSpinorField>::destroy();
+    LatticeField::freeGhostBuffer();
+    ColorSpinorField::freeGhostBuffer();
+    FieldTmp<ColorSpinorField>::destroy();
 
-  blas_lapack::generic::destroy();
-  blas_lapack::native::destroy();
-  reducer::destroy();
+    blas_lapack::generic::destroy();
+    blas_lapack::native::destroy();
+    reducer::destroy();
 
-  pool::flush_pinned();
-  pool::flush_device();
+    pool::flush_pinned();
+    pool::flush_device();
 
-  host_free(num_failures_h);
-  num_failures_h = nullptr;
-  num_failures_d = nullptr;
+    host_free(num_failures_h);
+    num_failures_h = nullptr;
+    num_failures_d = nullptr;
 
-  destroyDslashEvents();
+    destroyDslashEvents();
 
-  saveTuneCache();
-  saveProfile();
+    saveTuneCache();
+    saveProfile();
 
-  // flush any outstanding force monitoring (if enabled)
-  flushForceMonitor();
+    // flush any outstanding force monitoring (if enabled)
+    flushForceMonitor();
 
-  initialized = false;
+    initialized = false;
 
-  comm_finalize();
-  comms_initialized = false;
+    comm_finalize();
+    comms_initialized = false;
+  }
 
-  profileEnd.TPSTOP(QUDA_PROFILE_TOTAL);
   profileInit2End.TPSTOP(QUDA_PROFILE_TOTAL);
 
   // print out the profile information of the lifetime of the library
@@ -1430,6 +1384,7 @@ void endQuda(void)
     profileGaugeUpdate.Print();
     profileExtendedGauge.Print();
     profileCloverForce.Print();
+    profileTMCloverForce.Print();
     profileStaggeredForce.Print();
     profileHISQForce.Print();
     profileContract.Print();
@@ -1516,15 +1471,11 @@ namespace quda {
       }
       memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls);
       memcpy(diracParam.c_5, inv_param->c_5, sizeof(Complex) * inv_param->Ls);
-      if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
-        printfQuda("Printing b_5 and c_5 values\n");
-        for (int i = 0; i < diracParam.Ls; i++) {
-          printfQuda("fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n", i, diracParam.b_5[i].real(),
-              diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(), diracParam.c_5[i].imag());
-          // printfQuda("fromQUDA inv_param: b5[%d] = %f %f c5[%d] = %f %f\n", i, inv_param->b_5[i], i,
-          // inv_param->c_5[i] ); printfQuda("fromQUDA creal: b5[%d] = %f %f c5[%d] = %f %f \n", i,
-          // creal(inv_param->b_5[i]), cimag(inv_param->b_5[i]), i, creal(inv_param->c_5[i]), cimag(inv_param->c_5[i]) );
-        }
+      logQuda(QUDA_DEBUG_VERBOSE, "Printing b_5 and c_5 values\n");
+      for (int i = 0; i < diracParam.Ls; i++) {
+        logQuda(QUDA_DEBUG_VERBOSE, "fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n", i,
+                diracParam.b_5[i].real(), diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(),
+                diracParam.c_5[i].imag());
       }
       break;
     case QUDA_STAGGERED_DSLASH:
@@ -1824,8 +1775,7 @@ namespace quda {
 
 void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity)
 {
-  profileDslash.TPSTART(QUDA_PROFILE_TOTAL);
-  profileDslash.TPSTART(QUDA_PROFILE_INIT);
+  auto profile = pushProfile(profileDslash, inv_param->secs, inv_param->gflops);
 
   const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise;
 
@@ -1853,15 +1803,11 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
   DiracParam diracParam;
   setDiracParam(diracParam, inv_param, pc);
 
-  profileDslash.TPSTOP(QUDA_PROFILE_INIT);
-
-  profileDslash.TPSTART(QUDA_PROFILE_H2D);
   in = in_h;
-  profileDslash.TPSTOP(QUDA_PROFILE_H2D);
 
   profileDslash.TPSTART(QUDA_PROFILE_COMPUTE);
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
+  logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
 
   if (inv_param->mass_normalization == QUDA_KAPPA_NORMALIZATION &&
       (inv_param->dslash_type == QUDA_STAGGERED_DSLASH ||
@@ -1891,19 +1837,13 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
   }
   profileDslash.TPSTOP(QUDA_PROFILE_COMPUTE);
 
-  profileDslash.TPSTART(QUDA_PROFILE_D2H);
   out_h = out;
-  profileDslash.TPSTOP(QUDA_PROFILE_D2H);
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
+  logQuda(QUDA_DEBUG_VERBOSE, "Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
 
-  profileDslash.TPSTART(QUDA_PROFILE_FREE);
   delete dirac; // clean up
 
-  profileDslash.TPSTOP(QUDA_PROFILE_FREE);
-
   popVerbosity();
-  profileDslash.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
@@ -1929,7 +1869,7 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
   ColorSpinorField in(cudaParam);
   in = in_h;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
+  logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
 
   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   cudaParam.location = QUDA_CUDA_FIELD_LOCATION;
@@ -1961,8 +1901,7 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
   ColorSpinorField out_h(cpuParam);
   out_h = out;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
-
+  logQuda(QUDA_DEBUG_VERBOSE, "Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
   popVerbosity();
 }
 
@@ -1990,7 +1929,7 @@ void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
   ColorSpinorField in(cudaParam);
   in = in_h;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
+  logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
 
   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   ColorSpinorField out(cudaParam);
@@ -2024,8 +1963,7 @@ void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param)
   ColorSpinorField out_h(cpuParam);
   out_h = out;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
-
+  logQuda(QUDA_DEBUG_VERBOSE, "Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
   popVerbosity();
 }
 
@@ -2071,9 +2009,9 @@ void checkClover(QudaInvertParam *param) {
   if (cloverEigensolver == nullptr) errorQuda("Eigensolver clover field doesn't exist");
 }
 
-quda::cudaGaugeField *checkGauge(QudaInvertParam *param)
+quda::GaugeField *checkGauge(QudaInvertParam *param)
 {
-  quda::cudaGaugeField *cudaGauge = nullptr;
+  quda::GaugeField *cudaGauge = nullptr;
   if (param->dslash_type != QUDA_ASQTAD_DSLASH) {
     if (gaugePrecise == nullptr) errorQuda("Precise gauge field doesn't exist");
 
@@ -2171,7 +2109,7 @@ void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
   ColorSpinorField in(cudaParam);
   in = in_h;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
+  logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
 
   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   ColorSpinorField out(cudaParam);
@@ -2198,8 +2136,7 @@ void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity
   ColorSpinorField out_h(cpuParam);
   out_h = out;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
-
+  logQuda(QUDA_DEBUG_VERBOSE, "Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
   popVerbosity();
 }
 
@@ -2207,13 +2144,12 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
 {
   if (!initialized) errorQuda("QUDA not initialized");
 
-  profileEigensolve.TPSTART(QUDA_PROFILE_TOTAL);
-  profileEigensolve.TPSTART(QUDA_PROFILE_INIT);
-
   // Transfer the inv param structure contained in eig_param.
   // This will define the operator to be eigensolved.
   QudaInvertParam *inv_param = eig_param->invert_param;
 
+  auto profile = pushProfile(profileEigensolve, inv_param->secs, inv_param->gflops);
+
   // QUDA can employ even-odd preconditioning to an operator.
   // For the eigensolver the solution type must match
   // the solve type, i.e., there is no full solution reconstruction
@@ -2244,11 +2180,9 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
   checkEigParam(eig_param);
 
   // Check that the gauge field is valid
-  cudaGaugeField *cudaGauge = checkGauge(inv_param);
+  GaugeField *cudaGauge = checkGauge(inv_param);
 
-  // Set all timing statistics to zero
-  inv_param->secs = 0;
-  inv_param->gflops = 0;
+  // Set iter statistics to zero
   inv_param->iter = 0;
 
   // Dump all eigensolver and invert param variables to stdout if requested.
@@ -2330,8 +2264,6 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
     }
   }
   //------------------------------------------------------
-  profileEigensolve.TPSTOP(QUDA_PROFILE_INIT);
-
   // We must construct the correct Dirac operator type based on the three
   // options: The normal operator, the daggered operator, and if we pre
   // multiply by gamma5. Each combination requires a unique Dirac operator
@@ -2367,34 +2299,27 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam
   // host side gamma basis.
   for (int i = 0; i < eig_param->n_conv; i++) { memcpy(host_evals + i, &evals[i], sizeof(Complex)); }
   if (!(eig_param->arpack_check)) {
-    profileEigensolve.TPSTART(QUDA_PROFILE_D2H);
     for (int i = 0; i < n_eig; i++) host_evecs_[i] = kSpace[i];
-    profileEigensolve.TPSTOP(QUDA_PROFILE_D2H);
   }
 
-  profileEigensolve.TPSTART(QUDA_PROFILE_FREE);
   delete d;
   delete dSloppy;
   delete dPre;
-  profileEigensolve.TPSTOP(QUDA_PROFILE_FREE);
 
   popVerbosity();
 
   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();
-
-  profileEigensolve.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile)
   : profile(profile) {
-  profile.TPSTART(QUDA_PROFILE_INIT);
   QudaInvertParam *param = mg_param.invert_param;
   // set whether we are going use native or generic blas
   blas_lapack::set_native(param->native_blas_lapack);
 
   checkMultigridParam(&mg_param);
-  cudaGaugeField *cudaGauge = checkGauge(param);
+  GaugeField *cudaGauge = checkGauge(param);
 
   // check MG params (needs to go somewhere else)
   if (mg_param.n_level > QUDA_MAX_MG_LEVEL)
@@ -2407,8 +2332,6 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr
     errorQuda("Outer MG solver can only use QUDA_DIRECT_SOLVE at present");
 
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaMultigridParam(&mg_param);
-  mg_param.secs = 0;
-  mg_param.gflops = 0;
 
   bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) ||
     (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);
@@ -2467,22 +2390,18 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr
 
   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();
-  profile.TPSTOP(QUDA_PROFILE_INIT);
 }
 
 void* newMultigridQuda(QudaMultigridParam *mg_param) {
   profilerStart(__func__);
-
+  auto profile = pushProfile(profileInvert, mg_param->secs, mg_param->gflops);
   pushVerbosity(mg_param->invert_param->verbosity);
 
-  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
   auto *mg = new multigrid_solver(*mg_param, profileInvert);
-  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
 
   saveTuneCache();
 
   popVerbosity();
-
   profilerStop(__func__);
   return static_cast<void*>(mg);
 }
@@ -2494,10 +2413,9 @@ void destroyMultigridQuda(void *mg) {
 void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
 {
   profilerStart(__func__);
-
+  auto profile = pushProfile(profileInvert, mg_param->secs, mg_param->gflops);
   pushVerbosity(mg_param->invert_param->verbosity);
 
-  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
   profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE);
 
   auto *mg = static_cast<multigrid_solver*>(mg_);
@@ -2599,18 +2517,16 @@ void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
   saveTuneCache();
 
   profileInvert.TPSTOP(QUDA_PROFILE_PREAMBLE);
-  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
 
   popVerbosity();
-
   profilerStop(__func__);
 }
 
 void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
 {
   profilerStart(__func__);
+  auto profile = pushProfile(profileInvert, mg_param->secs, mg_param->gflops);
   pushVerbosity(mg_param->invert_param->verbosity);
-  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
 
   auto *mg = static_cast<multigrid_solver*>(mg_);
   checkMultigridParam(mg_param);
@@ -2618,7 +2534,6 @@ void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param)
 
   mg->mg->dumpNullVectors();
 
-  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
   popVerbosity();
   profilerStop(__func__);
 }
@@ -2630,11 +2545,7 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
 
   if (param->inv_type != QUDA_EIGCG_INVERTER && param->inv_type != QUDA_INC_EIGCG_INVERTER) return;
 
-  profile.TPSTART(QUDA_PROFILE_INIT);
-
-  cudaGaugeField *cudaGauge = checkGauge(param);
-  eig_param.secs   = 0;
-  eig_param.gflops = 0;
+  GaugeField *cudaGauge = checkGauge(param);
 
   DiracParam diracParam;
   if(eig_param.cuda_prec_ritz == param->cuda_prec)
@@ -2665,7 +2576,7 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
     //but if not sufficient device memory, then the user may choose mapped type of memory
     ritzParam.mem_type = eig_param.mem_type_ritz;
   } else { //host location
-    ritzParam.mem_type = QUDA_MEMORY_PINNED;
+    ritzParam.mem_type = QUDA_MEMORY_HOST_PINNED;
   }
 
   int ritzVolume = 1;
@@ -2685,16 +2596,11 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile)
   deflParam = new DeflationParam(eig_param, RV, *m);
 
   defl = new Deflation(*deflParam, profile);
-
-  profile.TPSTOP(QUDA_PROFILE_INIT);
 }
 
 void* newDeflationQuda(QudaEigParam *eig_param) {
-  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
+  auto profile = pushProfile(profileInvert, eig_param->secs, eig_param->gflops);
   auto *defl = new deflated_solver(*eig_param, profileInvert);
-
-  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
-
   saveProfile(__func__);
   flushProfile();
   return static_cast<void*>(defl);
@@ -2706,10 +2612,9 @@ void destroyDeflationQuda(void *df) {
 
 void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
 {
+  auto profile = pushProfile(profileInvert, param->secs, param->gflops);
   profilerStart(__func__);
 
-  profileInvert.TPSTART(QUDA_PROFILE_TOTAL);
-
   if (!initialized) errorQuda("QUDA not initialized");
 
   pushVerbosity(param->verbosity);
@@ -2718,7 +2623,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
   checkInvertParam(param, hp_x, hp_b);
 
   // check the gauge fields have been created
-  cudaGaugeField *cudaGauge = checkGauge(param);
+  GaugeField *cudaGauge = checkGauge(param);
 
   // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in
   // solve_type and solution_type, rather than in separate members of QudaInvertParam.  We're stuck with it
@@ -2735,8 +2640,6 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
   bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) ||
     (param->solve_type == QUDA_NORMERR_PC_SOLVE);
 
-  param->secs = 0;
-  param->gflops = 0;
   param->iter = 0;
 
   Dirac *d = nullptr;
@@ -2753,8 +2656,6 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
   Dirac &diracPre = *dPre;
   Dirac &diracEig = *dEig;
 
-  profileInvert.TPSTART(QUDA_PROFILE_H2D);
-
   ColorSpinorField *in = nullptr;
   ColorSpinorField *out = nullptr;
 
@@ -2815,7 +2716,6 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
     diracPre.prefetch(QUDA_CUDA_FIELD_LOCATION);
   }
 
-  profileInvert.TPSTOP(QUDA_PROFILE_H2D);
   profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE);
 
   double nb = blas::norm2(b);
@@ -2841,17 +2741,8 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
 
   dirac.prepare(in, out, x, b, param->solution_type);
 
-  if (getVerbosity() >= QUDA_VERBOSE) {
-    double nin = blas::norm2(*in);
-    double nout = blas::norm2(*out);
-    printfQuda("Prepared source = %g\n", nin);
-    printfQuda("Prepared solution = %g\n", nout);
-  }
-
-  if (getVerbosity() >= QUDA_VERBOSE) {
-    double nin = blas::norm2(*in);
-    printfQuda("Prepared source post mass rescale = %g\n", nin);
-  }
+  logQuda(QUDA_VERBOSE, "Prepared source = %g\n", blas::norm2(*in));
+  logQuda(QUDA_VERBOSE, "Prepared solution = %g\n", blas::norm2(*out));
 
   // solution_type specifies *what* system is to be solved.
   // solve_type specifies *how* the system is to be solved.
@@ -2933,7 +2824,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
       bool orthogonal = true;
       bool apply_mat = false;
       bool hermitian = false;
-      MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert);
+      MinResExt mre(m, orthogonal, apply_mat, hermitian);
       mre(*out, *in, basis, Ap);
 
       profileInvert.TPSTOP(QUDA_PROFILE_CHRONO);
@@ -2968,7 +2859,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
       bool orthogonal = true;
       bool apply_mat = false;
       bool hermitian = true;
-      MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert);
+      MinResExt mre(m, orthogonal, apply_mat, hermitian);
       mre(*out, *in, basis, Ap);
 
       profileInvert.TPSTOP(QUDA_PROFILE_CHRONO);
@@ -2998,7 +2889,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
     solverParam.updateInvertParam(*param);
   }
 
-  if (getVerbosity() >= QUDA_VERBOSE) { printfQuda("Solution = %g\n", blas::norm2(x)); }
+  logQuda(QUDA_VERBOSE, "Solution = %g\n", blas::norm2(x));
 
   profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE);
   if (param->chrono_make_resident) {
@@ -3037,11 +2928,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
   }
   profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE);
 
-  if (!param->make_resident_solution) {
-    profileInvert.TPSTART(QUDA_PROFILE_D2H);
-    h_x = x;
-    profileInvert.TPSTOP(QUDA_PROFILE_D2H);
-  }
+  if (!param->make_resident_solution) h_x = x;
 
   profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE);
 
@@ -3058,8 +2945,6 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
   }
   profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE);
 
-  profileInvert.TPSTART(QUDA_PROFILE_FREE);
-
   if (param->use_resident_solution && !param->make_resident_solution) solutionResident.clear();
 
   delete d;
@@ -3067,16 +2952,11 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param)
   delete dPre;
   delete dEig;
 
-  profileInvert.TPSTOP(QUDA_PROFILE_FREE);
-
-  popVerbosity();
-
   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();
 
-  profileInvert.TPSTOP(QUDA_PROFILE_TOTAL);
-
   profilerStop(__func__);
+  popVerbosity();
 }
 
 void loadFatLongGaugeQuda(QudaInvertParam *inv_param, QudaGaugeParam *gauge_param, void *milc_fatlinks,
@@ -3145,12 +3025,13 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
   */
 
   profilerStart(__func__);
+  auto profile = pushProfile(profileInvertMultiSrc, param->secs, param->gflops);
 
   CommKey split_key = {param->split_grid[0], param->split_grid[1], param->split_grid[2], param->split_grid[3]};
   int num_sub_partition = quda::product(split_key);
 
   if (!split_key.is_valid()) {
-    errorQuda("split_key = [%d,%d,%d,%d] is not valid.\n", split_key[0], split_key[1], split_key[2], split_key[3]);
+    errorQuda("split_key = [%d,%d,%d,%d] is not valid", split_key[0], split_key[1], split_key[2], split_key[3]);
   }
 
   if (num_sub_partition == 1) { // In this case we don't split the grid.
@@ -3159,10 +3040,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
 
   } else {
 
-    profileInvertMultiSrc.TPSTART(QUDA_PROFILE_TOTAL);
-    profileInvertMultiSrc.TPSTART(QUDA_PROFILE_INIT);
-
-    if (gauge_param == nullptr) { errorQuda("gauge_param == nullptr.\n"); }
+    if (gauge_param == nullptr) { errorQuda("gauge_param == nullptr"); }
 
     // Doing the sub-partition arithmatics
     if (param->num_src_per_sub_partition * num_sub_partition != param->num_src) {
@@ -3176,7 +3054,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH) { pc_type = QUDA_5D_PC; }
 
     // Doesn't work for MG yet.
-    if (param->inv_type_precondition == QUDA_MG_INVERTER) { errorQuda("Split Grid does NOT work with MG yet."); }
+    if (param->inv_type_precondition == QUDA_MG_INVERTER) errorQuda("Split Grid does NOT work with MG yet");
 
     checkInvertParam(param, _hp_x[0], _hp_b[0]);
 
@@ -3202,14 +3080,14 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     // set up the gauge field params.
     if (!is_staggered) { // not staggered
       gf_param = new GaugeFieldParam(*gauge_param, h_gauge);
-      if (gf_param->order <= 4) { gf_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; }
+      if (gf_param->order <= 4) gf_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO;
       in = GaugeField::Create(*gf_param);
     } else { // staggered
       milc_fatlink_param = new GaugeFieldParam(*gauge_param, milc_fatlinks);
-      if (milc_fatlink_param->order <= 4) { milc_fatlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; }
+      if (milc_fatlink_param->order <= 4) milc_fatlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO;
       milc_fatlink_field = GaugeField::Create(*milc_fatlink_param);
       milc_longlink_param = new GaugeFieldParam(*gauge_param, milc_longlinks);
-      if (milc_longlink_param->order <= 4) { milc_longlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; }
+      if (milc_longlink_param->order <= 4) milc_longlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO;
       milc_longlink_field = GaugeField::Create(*milc_longlink_param);
     }
 
@@ -3233,13 +3111,12 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     }
 
     // Make the gauge param dimensions larger
-    if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
-      printfQuda("Spliting the grid into sub-partitions: (%2d,%2d,%2d,%2d) / (%2d,%2d,%2d,%2d).\n", comm_dim(0),
-                 comm_dim(1), comm_dim(2), comm_dim(3), split_key[0], split_key[1], split_key[2], split_key[3]);
-    }
+    logQuda(QUDA_DEBUG_VERBOSE, "Splitting the grid into sub-partitions: (%2d,%2d,%2d,%2d) / (%2d,%2d,%2d,%2d)\n",
+            comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3), split_key[0], split_key[1], split_key[2], split_key[3]);
+
     for (int d = 0; d < CommKey::n_dim; d++) {
       if (comm_dim(d) % split_key[d] != 0) {
-        errorQuda("Split not possible: %2d %% %2d != 0.", comm_dim(d), split_key[d]);
+        errorQuda("Split not possible: %2d %% %2d != 0", comm_dim(d), split_key[d]);
       }
       if (!is_staggered) {
         gf_param->x[d] *= split_key[d];
@@ -3300,15 +3177,15 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
 
     if (!is_staggered) {
       gf_param->create = QUDA_NULL_FIELD_CREATE;
-      collected_gauge = new quda::cpuGaugeField(*gf_param);
+      collected_gauge = new quda::GaugeField(*gf_param);
       std::vector<quda::GaugeField *> v_g(1);
       v_g[0] = in;
       quda::split_field(*collected_gauge, v_g, split_key);
     } else {
       milc_fatlink_param->create = QUDA_NULL_FIELD_CREATE;
       milc_longlink_param->create = QUDA_NULL_FIELD_CREATE;
-      collected_milc_fatlink_field = new quda::cpuGaugeField(*milc_fatlink_param);
-      collected_milc_longlink_field = new quda::cpuGaugeField(*milc_longlink_param);
+      collected_milc_fatlink_field = new quda::GaugeField(*milc_fatlink_param);
+      collected_milc_longlink_field = new quda::GaugeField(*milc_longlink_param);
       std::vector<quda::GaugeField *> v_g(1);
       v_g[0] = milc_fatlink_field;
       quda::split_field(*collected_milc_fatlink_field, v_g, split_key);
@@ -3316,7 +3193,6 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
       quda::split_field(*collected_milc_longlink_field, v_g, split_key);
     }
 
-    profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_INIT);
     profileInvertMultiSrc.TPSTART(QUDA_PROFILE_PREAMBLE);
 
     comm_barrier();
@@ -3342,36 +3218,33 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     comm_barrier();
 
     profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_PREAMBLE);
-    profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_TOTAL);
 
     // Load gauge field after pushing the split communicator so the comm buffers, etc are setup according to
     // the split topology.
-    if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loading gauge field...\n"); }
+    logQuda(QUDA_DEBUG_VERBOSE, "Split grid loading gauge field...\n");
     if (!is_staggered) {
-      loadGaugeQuda(collected_gauge->Gauge_p(), gauge_param);
+      loadGaugeQuda(collected_gauge->raw_pointer(), gauge_param);
     } else {
-      // freeGaugeQuda();
-      loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->Gauge_p(),
-                           collected_milc_longlink_field->Gauge_p());
+      loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->raw_pointer(),
+                           collected_milc_longlink_field->raw_pointer());
     }
-    if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loaded gauge field...\n"); }
+    logQuda(QUDA_DEBUG_VERBOSE, "Split grid loaded gauge field...\n");
 
     if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH || param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH
         || param->dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) {
-      if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loading clover field...\n"); }
+      logQuda(QUDA_DEBUG_VERBOSE, "Split grid loading clover field...\n");
       if (collected_clover) {
-        loadCloverQuda(collected_clover->V(false), collected_clover->V(true), param);
+        loadCloverQuda(collected_clover->data(false), collected_clover->data(true), param);
       } else {
         loadCloverQuda(nullptr, nullptr, param);
       }
-      if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loaded clover field...\n"); }
+      logQuda(QUDA_DEBUG_VERBOSE, "Split grid loaded clover field...\n");
     }
 
     for (int n = 0; n < param->num_src_per_sub_partition; n++) {
-      op(_collect_x[n]->V(), _collect_b[n]->V(), param, args...);
+      op(_collect_x[n]->data(), _collect_b[n]->data(), param, args...);
     }
 
-    profileInvertMultiSrc.TPSTART(QUDA_PROFILE_TOTAL);
     profileInvertMultiSrc.TPSTART(QUDA_PROFILE_EPILOGUE);
     push_communicator(default_comm_key);
     updateR();
@@ -3409,7 +3282,6 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col
     if (collected_clover) { delete collected_clover; }
 
     profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_EPILOGUE);
-    profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_TOTAL);
 
     // Restore the gauge field
     if (!is_staggered) {
@@ -3483,11 +3355,9 @@ void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param
  */
 void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
 {
+  auto profile = pushProfile(profileMulti, param->secs, param->gflops);
   profilerStart(__func__);
 
-  profileMulti.TPSTART(QUDA_PROFILE_TOTAL);
-  profileMulti.TPSTART(QUDA_PROFILE_INIT);
-
   if (!initialized) errorQuda("QUDA not initialized");
 
   checkInvertParam(param, hp_x[0], hp_b);
@@ -3533,9 +3403,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
     }
   }
 
-  // Timing and FLOP counters
-  param->secs = 0;
-  param->gflops = 0;
   param->iter = 0;
 
   for (int i=0; i<param->num_offset-1; i++) {
@@ -3591,8 +3458,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
     h_x[i] = std::make_unique<ColorSpinorField>(cpuParam);
   }
 
-  profileMulti.TPSTOP(QUDA_PROFILE_INIT);
-  profileMulti.TPSTART(QUDA_PROFILE_H2D);
   // Now I need a colorSpinorParam for the device
   ColorSpinorParam cudaParam(cpuParam, *param, QUDA_CUDA_FIELD_LOCATION);
   // This setting will download a host vector
@@ -3600,9 +3465,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
   cudaParam.field = &h_b;
   ColorSpinorField b(cudaParam); // Creates b and downloads h_b to it
 
-  profileMulti.TPSTOP(QUDA_PROFILE_H2D);
-
-  profileMulti.TPSTART(QUDA_PROFILE_INIT);
   // Create the solution fields filled with zero
   cudaParam.create = QUDA_ZERO_FIELD_CREATE;
 
@@ -3622,8 +3484,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
   std::vector<ColorSpinorField> &x = solutionResident;
   std::vector<ColorSpinorField> p;
 
-  profileMulti.TPSTOP(QUDA_PROFILE_INIT);
-
   profileMulti.TPSTART(QUDA_PROFILE_PREAMBLE);
 
   // Check source norms
@@ -3670,10 +3530,8 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
 
   if (param->compute_true_res) {
     // check each shift has the desired tolerance and use sequential CG to refine
-    profileMulti.TPSTART(QUDA_PROFILE_INIT);
     cudaParam.create = QUDA_ZERO_FIELD_CREATE;
     ColorSpinorField r(cudaParam);
-    profileMulti.TPSTOP(QUDA_PROFILE_INIT);
     QudaInvertParam refineparam = *param;
     refineparam.cuda_prec_sloppy = param->cuda_prec_refinement_sloppy;
     Dirac &dirac = *d;
@@ -3703,9 +3561,8 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
       const double refine_tol = (param->tol_offset[i] == 0.0 ? iter_tol : param->tol_offset[i]);
       // refine if either L2 or heavy quark residual tolerances have not been met, only if desired residual is > 0
       if (param->true_res_offset[i] > refine_tol || rsd_hq > tol_hq) {
-	if (getVerbosity() >= QUDA_SUMMARIZE)
-	  printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n",
-		     i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);
+        logQuda(QUDA_SUMMARIZE, "Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n", i,
+                param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq);
 
         // for staggered the shift is just a change in mass term (FIXME: for twisted mass also)
         if (param->dslash_type == QUDA_ASQTAD_DSLASH ||
@@ -3753,7 +3610,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
           bool orthogonal = false;
           bool apply_mat = true;
           bool hermitian = true;
-	  MinResExt mre(*m, orthogonal, apply_mat, hermitian, profileMulti);
+          MinResExt mre(*m, orthogonal, apply_mat, hermitian);
           mre(x[i], b, z, q);
         }
 
@@ -3791,8 +3648,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
   // restore shifts
   for (int i = 0; i < param->num_offset; i++) param->offset[i] = unscaled_shifts[i];
 
-  profileMulti.TPSTART(QUDA_PROFILE_D2H);
-
   if (param->compute_action) {
     Complex action(0);
     for (int i = 0; i < param->num_offset; i++) action += param->residue[i] * blas::cDotProduct(b, x[i]);
@@ -3805,11 +3660,9 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
       blas::ax(sqrt(nb), x[i]);
     }
 
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Solution %d = %g\n", i, blas::norm2(x[i]));
-
+    logQuda(QUDA_VERBOSE, "Solution %d = %g\n", i, blas::norm2(x[i]));
     if (!param->make_resident_solution) *h_x[i] = x[i];
   }
-  profileMulti.TPSTOP(QUDA_PROFILE_D2H);
 
   profileMulti.TPSTART(QUDA_PROFILE_EPILOGUE);
 
@@ -3817,55 +3670,45 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param)
 
   profileMulti.TPSTOP(QUDA_PROFILE_EPILOGUE);
 
-  profileMulti.TPSTART(QUDA_PROFILE_FREE);
   delete d;
   delete dSloppy;
   delete dPre;
   delete dRefine;
-  profileMulti.TPSTOP(QUDA_PROFILE_FREE);
-
-  popVerbosity();
 
   // cache is written out even if a long benchmarking job gets interrupted
   saveTuneCache();
 
-  profileMulti.TPSTOP(QUDA_PROFILE_TOTAL);
-
   profilerStop(__func__);
+  popVerbosity();
 }
 
 void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param)
 {
-  profileFatLink.TPSTART(QUDA_PROFILE_TOTAL);
-  profileFatLink.TPSTART(QUDA_PROFILE_INIT);
-
+  auto profile = pushProfile(profileFatLink);
   checkGaugeParam(param);
 
   GaugeFieldParam gParam(*param, fatlink, QUDA_GENERAL_LINKS);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  cpuGaugeField cpuFatLink(gParam);   // create the host fatlink
+  GaugeField cpuFatLink(gParam); // create the host fatlink
   gParam.gauge = longlink;
-  cpuGaugeField cpuLongLink(gParam);  // create the host longlink
+  GaugeField cpuLongLink(gParam); // create the host longlink
   gParam.gauge = ulink;
-  cpuGaugeField cpuUnitarizedLink(gParam);
+  GaugeField cpuUnitarizedLink(gParam);
   gParam.link_type = param->type;
   gParam.gauge = inlink;
-  cpuGaugeField cpuInLink(gParam);    // create the host sitelink
+  GaugeField cpuInLink(gParam); // create the host sitelink
 
   // create the device fields
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.reconstruct = param->reconstruct;
   gParam.setPrecision(param->cuda_prec, true);
   gParam.create = QUDA_NULL_FIELD_CREATE;
-  cudaGaugeField *cudaInLink = new cudaGaugeField(gParam);
-  profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
+  GaugeField *cudaInLink = new GaugeField(gParam);
 
-  cudaInLink->loadCPUField(cpuInLink, profileFatLink);
-  cudaGaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink);
+  cudaInLink->copy(cpuInLink);
+  GaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink);
 
-  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
   delete cudaInLink;
-  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
 
   gParam.create = QUDA_ZERO_FIELD_CREATE;
   gParam.link_type = QUDA_GENERAL_LINKS;
@@ -3874,34 +3717,14 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
 
   if (longlink) {
-    profileFatLink.TPSTART(QUDA_PROFILE_INIT);
-    cudaGaugeField *cudaLongLink = new cudaGaugeField(gParam);
-    profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
-
-    profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
-    longKSLink(cudaLongLink, *cudaInLinkEx, path_coeff);
-    profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
-
-    cudaLongLink->saveCPUField(cpuLongLink, profileFatLink);
-
-    profileFatLink.TPSTART(QUDA_PROFILE_FREE);
-    delete cudaLongLink;
-    profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
+    GaugeField longLink(gParam);
+    longKSLink(longLink, *cudaInLinkEx, path_coeff);
+    cpuLongLink.copy(longLink);
   }
 
-  profileFatLink.TPSTART(QUDA_PROFILE_INIT);
-  cudaGaugeField *cudaFatLink = new cudaGaugeField(gParam);
-  profileFatLink.TPSTOP(QUDA_PROFILE_INIT);
-
-  profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
-  fatKSLink(cudaFatLink, *cudaInLinkEx, path_coeff);
-  profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
-
-  if (fatlink) cudaFatLink->saveCPUField(cpuFatLink, profileFatLink);
-
-  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
-  delete cudaInLinkEx;
-  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
+  GaugeField fatLink(gParam);
+  fatKSLink(fatLink, *cudaInLinkEx, path_coeff);
+  if (fatlink) cpuFatLink.copy(fatLink);
 
   if (ulink) {
     const double unitarize_eps = 1e-14;
@@ -3913,63 +3736,54 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink,
     quda::setUnitarizeLinksConstants(unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error,
                                      svd_abs_error);
 
-    cudaGaugeField *cudaUnitarizedLink = new cudaGaugeField(gParam);
+    GaugeField unitarizedLink(gParam);
 
-    profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE);
     *num_failures_h = 0;
-    quda::unitarizeLinks(*cudaUnitarizedLink, *cudaFatLink, num_failures_d); // unitarize on the gpu
+    quda::unitarizeLinks(unitarizedLink, fatLink, num_failures_d); // unitarize on the gpu
     if (*num_failures_h > 0)
       errorQuda("Error in unitarization component of the hisq fattening: %d failures", *num_failures_h);
-    profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE);
 
-    cudaUnitarizedLink->saveCPUField(cpuUnitarizedLink, profileFatLink);
+    // project onto SU(3) if using the Chroma convention
+    if (param->staggered_phase_type == QUDA_STAGGERED_PHASE_CHROMA) {
+      *num_failures_h = 0;
+      const double tol = unitarizedLink.Precision() == QUDA_DOUBLE_PRECISION ? 1e-15 : 2e-6;
+      if (unitarizedLink.StaggeredPhaseApplied()) unitarizedLink.removeStaggeredPhase();
+      projectSU3(unitarizedLink, tol, num_failures_d);
+      if (!unitarizedLink.StaggeredPhaseApplied() && param->staggered_phase_applied)
+        unitarizedLink.applyStaggeredPhase();
+      if (*num_failures_h > 0) errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
+    }
 
-    profileFatLink.TPSTART(QUDA_PROFILE_FREE);
-    delete cudaUnitarizedLink;
-    profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
+    cpuUnitarizedLink.copy(unitarizedLink);
   }
 
-  profileFatLink.TPSTART(QUDA_PROFILE_FREE);
-  delete cudaFatLink;
-  profileFatLink.TPSTOP(QUDA_PROFILE_FREE);
-
-  profileFatLink.TPSTOP(QUDA_PROFILE_TOTAL);
+  delete cudaInLinkEx;
 }
 
 void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
 {
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_TOTAL);
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_INIT);
-
+  auto profile = pushProfile(profileGaussianSmear);
   checkGaugeParam(param);
 
-  GaugeFieldParam gParam(*param, inlink, QUDA_GENERAL_LINKS);
-  gParam.gauge     = twolink;
-  cpuGaugeField cpuTwoLink(gParam);  // create the host twolink
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
+  GaugeFieldParam gParam(*param, inlink, QUDA_ASQTAD_LONG_LINKS);
+  gParam.gauge = twolink;
+  GaugeField cpuTwoLink(gParam); // create the host twolink
 
-  cudaGaugeField *cudaInLinkEx = nullptr;
+  GaugeField *cudaInLinkEx = nullptr;
 
-  if(inlink) {
+  if (inlink) {
     gParam.link_type = param->type;
     gParam.gauge     = inlink;
-    cpuGaugeField cpuInLink(gParam);    // create the host sitelink
+    GaugeField cpuInLink(gParam); // create the host sitelink
 
     // create the device fields
     gParam.reconstruct = param->reconstruct;
     gParam.setPrecision(param->cuda_prec, true);
     gParam.create = QUDA_NULL_FIELD_CREATE;
-    cudaGaugeField *cudaInLink = new cudaGaugeField(gParam);
-    profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
-
-    cudaInLink->loadCPUField(cpuInLink, profileGaussianSmear);
-    //
-    cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileGaussianSmear);
-    //
-    profileGaussianSmear.TPSTART(QUDA_PROFILE_FREE);
-    delete cudaInLink;
-    profileGaussianSmear.TPSTOP(QUDA_PROFILE_FREE);
+    GaugeField cudaInLink(gParam);
 
+    cudaInLink.copy(cpuInLink);
+    cudaInLinkEx = createExtendedGauge(cudaInLink, R, profileGaussianSmear);
   } else {
     cudaInLinkEx = createExtendedGauge(*gaugePrecise, R, profileGaussianSmear);
   }
@@ -3984,101 +3798,53 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param)
   gsParam.nFace         = 3;
   gsParam.pad           = gsParam.pad*gsParam.nFace;
 
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_INIT);
-
   freeUniqueGaugeQuda(QUDA_SMEARED_LINKS);
-  gaugeSmeared = new cudaGaugeField(gsParam);
-
-  
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
-
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_COMPUTE);
+  gaugeSmeared = new GaugeField(gsParam);
 
   computeTwoLink(*gaugeSmeared, *cudaInLinkEx);
   gaugeSmeared->exchangeGhost();
 
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_COMPUTE);
-  //
-  gaugeSmeared->saveCPUField(cpuTwoLink, profileGaussianSmear);
-
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_FREE);
+  cpuTwoLink.copy(*gaugeSmeared);
 
   freeUniqueGaugeQuda(QUDA_SMEARED_LINKS);
   delete cudaInLinkEx;
-
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_FREE);
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int* path_length,
 			  double* loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam* qudaGaugeParam)
 {
-  profileGaugeForce.TPSTART(QUDA_PROFILE_TOTAL);
-  profileGaugeForce.TPSTART(QUDA_PROFILE_INIT);
-
+  auto profile = pushProfile(profileGaugeForce);
   checkGaugeParam(qudaGaugeParam);
 
   GaugeFieldParam gParam(*qudaGaugeParam, siteLink);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  gParam.site_offset = qudaGaugeParam->gauge_offset;
-  gParam.site_size = qudaGaugeParam->site_size;
-  cpuGaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new cpuGaugeField(gParam) : nullptr;
-
-  cudaGaugeField* cudaSiteLink = nullptr;
-
-  if (qudaGaugeParam->use_resident_gauge) {
-    if (!gaugePrecise) errorQuda("No resident gauge field to use");
-    cudaSiteLink = gaugePrecise;
-  } else {
-    gParam.create = QUDA_NULL_FIELD_CREATE;
-    gParam.reconstruct = qudaGaugeParam->reconstruct;
-    gParam.setPrecision(qudaGaugeParam->cuda_prec, true);
-    gParam.location = QUDA_CUDA_FIELD_LOCATION;
+  GaugeField cpuSiteLink = !qudaGaugeParam->use_resident_gauge ? GaugeField(gParam) : GaugeField();
 
-    cudaSiteLink = new cudaGaugeField(gParam);
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
-
-    profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
-    cudaSiteLink->loadCPUField(*cpuSiteLink);
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
-
-    profileGaugeForce.TPSTART(QUDA_PROFILE_INIT);
-  }
+  if (qudaGaugeParam->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field to use");
+  gParam.location = QUDA_CUDA_FIELD_LOCATION;
+  gParam.create = QUDA_COPY_FIELD_CREATE;
+  gParam.field = &cpuSiteLink;
+  gParam.reconstruct = qudaGaugeParam->reconstruct;
+  gParam.setPrecision(qudaGaugeParam->cuda_prec, true);
+  GaugeField cudaSiteLink = qudaGaugeParam->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
 
   GaugeFieldParam gParamMom(*qudaGaugeParam, mom, QUDA_ASQTAD_MOM_LINKS);
   gParamMom.location = QUDA_CPU_FIELD_LOCATION;
-  if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER)
-    gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
-  else
-    gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
 
-  gParamMom.site_offset = qudaGaugeParam->mom_offset;
-  gParamMom.site_size = qudaGaugeParam->site_size;
-  cpuGaugeField* cpuMom = (!qudaGaugeParam->use_resident_mom) ? new cpuGaugeField(gParamMom) : nullptr;
+  GaugeField cpuMom = !qudaGaugeParam->use_resident_mom ? GaugeField(gParamMom) : GaugeField();
 
-  cudaGaugeField* cudaMom = nullptr;
-  if (qudaGaugeParam->use_resident_mom) {
-    if (!momResident) errorQuda("No resident momentum field to use");
-    cudaMom = momResident;
-    if (qudaGaugeParam->overwrite_mom) cudaMom->zero();
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
-  } else {
-    gParamMom.location = QUDA_CUDA_FIELD_LOCATION;
-    gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
-    gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
-    gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
-    gParamMom.setPrecision(qudaGaugeParam->cuda_prec, true);
-    gParamMom.create = QUDA_ZERO_FIELD_CREATE;
-    cudaMom = new cudaGaugeField(gParamMom);
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
-    if (!qudaGaugeParam->overwrite_mom) {
-      profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
-      cudaMom->loadCPUField(*cpuMom);
-      profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
-    }
-  }
+  if (qudaGaugeParam->use_resident_mom && momResident.empty()) errorQuda("No resident momentum field to use");
+  gParamMom.location = QUDA_CUDA_FIELD_LOCATION;
+  gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_COPY_FIELD_CREATE;
+  gParamMom.field = &cpuMom;
+  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
+  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
+  gParamMom.setPrecision(qudaGaugeParam->cuda_prec, true);
 
-  cudaGaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugeForce);
+  GaugeField cudaMom = qudaGaugeParam->use_resident_mom ? momResident.create_alias() : GaugeField(gParamMom);
+  if (qudaGaugeParam->use_resident_mom && qudaGaugeParam->overwrite_mom) cudaMom.zero();
+
+  GaugeField *cudaGauge = createExtendedGauge(cudaSiteLink, R, profileGaugeForce);
   // apply / remove phase as appropriate
   if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase();
 
@@ -4095,43 +3861,29 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
   for (int d = 0; d < 4; d++) { input_path_v[d] = input_path_buf[d]; }
 
   // actually do the computation
-  profileGaugeForce.TPSTART(QUDA_PROFILE_COMPUTE);
   if (!forceMonitor()) {
-    gaugeForce(*cudaMom, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
+    gaugeForce(cudaMom, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
   } else {
     // if we are monitoring the force, separate the force computation from the momentum update
-    GaugeFieldParam gParam(*cudaMom);
+    GaugeFieldParam gParam(cudaMom);
     gParam.create = QUDA_ZERO_FIELD_CREATE;
-    GaugeField *force = GaugeField::Create(gParam);
-    gaugeForce(*force, *cudaGauge, 1.0, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
-    updateMomentum(*cudaMom, eb3, *force, "gauge");
-    delete force;
+    GaugeField force(gParam);
+    gaugeForce(force, *cudaGauge, 1.0, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
+    updateMomentum(cudaMom, eb3, force, "gauge");
   }
-  profileGaugeForce.TPSTOP(QUDA_PROFILE_COMPUTE);
 
-  if (qudaGaugeParam->return_result_mom) {
-    profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
-    cudaMom->saveCPUField(*cpuMom);
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
-  }
+  if (qudaGaugeParam->return_result_mom) cpuMom.copy(cudaMom);
 
-  profileGaugeForce.TPSTART(QUDA_PROFILE_FREE);
-  if (qudaGaugeParam->make_resident_gauge) {
-    if (gaugePrecise && gaugePrecise != cudaSiteLink) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-    gaugePrecise = cudaSiteLink;
-  } else {
-    delete cudaSiteLink;
+  if (qudaGaugeParam->make_resident_gauge && !qudaGaugeParam->use_resident_gauge) {
+    if (gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    gaugePrecise = new GaugeField();
+    std::exchange(*gaugePrecise, cudaSiteLink);
   }
 
-  if (qudaGaugeParam->make_resident_mom) {
-    if (momResident && momResident != cudaMom) delete momResident;
-    momResident = cudaMom;
-  } else {
-    delete cudaMom;
-  }
-
-  if (cpuSiteLink) delete cpuSiteLink;
-  if (cpuMom) delete cpuMom;
+  if (qudaGaugeParam->make_resident_mom && !qudaGaugeParam->use_resident_mom)
+    std::exchange(momResident, cudaMom);
+  else if (!qudaGaugeParam->make_resident_mom)
+    momResident = GaugeField();
 
   if (qudaGaugeParam->make_resident_gauge) {
     if (extendedGaugeResident) delete extendedGaugeResident;
@@ -4139,65 +3891,39 @@ int computeGaugeForceQuda(void* mom, void* siteLink,  int*** input_path_buf, int
   } else {
     delete cudaGauge;
   }
-  profileGaugeForce.TPSTOP(QUDA_PROFILE_FREE);
 
-  profileGaugeForce.TPSTOP(QUDA_PROFILE_TOTAL);
   return 0;
 }
 
 int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff,
                          int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam)
 {
-  profileGaugePath.TPSTART(QUDA_PROFILE_TOTAL);
-  profileGaugePath.TPSTART(QUDA_PROFILE_INIT);
-
+  auto profile = pushProfile(profileGaugePath);
   checkGaugeParam(qudaGaugeParam);
 
   GaugeFieldParam gParam(*qudaGaugeParam, siteLink);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  gParam.site_offset = qudaGaugeParam->gauge_offset;
-  gParam.site_size = qudaGaugeParam->site_size;
-  cpuGaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new cpuGaugeField(gParam) : nullptr;
-
-  cudaGaugeField *cudaSiteLink = nullptr;
-
-  if (qudaGaugeParam->use_resident_gauge) {
-    if (!gaugePrecise) errorQuda("No resident gauge field to use");
-    cudaSiteLink = gaugePrecise;
-  } else {
-    gParam.location = QUDA_CUDA_FIELD_LOCATION;
-    gParam.create = QUDA_NULL_FIELD_CREATE;
-    gParam.reconstruct = qudaGaugeParam->reconstruct;
-    gParam.setPrecision(qudaGaugeParam->cuda_prec, true);
-
-    cudaSiteLink = new cudaGaugeField(gParam);
-    profileGaugePath.TPSTOP(QUDA_PROFILE_INIT);
-
-    profileGaugePath.TPSTART(QUDA_PROFILE_H2D);
-    cudaSiteLink->loadCPUField(*cpuSiteLink);
-    profileGaugePath.TPSTOP(QUDA_PROFILE_H2D);
+  GaugeField cpuSiteLink = !qudaGaugeParam->use_resident_gauge ? GaugeField(gParam) : GaugeField();
 
-    profileGaugePath.TPSTART(QUDA_PROFILE_INIT);
-  }
+  if (qudaGaugeParam->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field to use");
+  gParam.location = QUDA_CUDA_FIELD_LOCATION;
+  gParam.create = QUDA_COPY_FIELD_CREATE;
+  gParam.field = &cpuSiteLink;
+  gParam.reconstruct = qudaGaugeParam->reconstruct;
+  gParam.setPrecision(qudaGaugeParam->cuda_prec, true);
+  GaugeField cudaSiteLink = qudaGaugeParam->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
 
   GaugeFieldParam gParamOut(*qudaGaugeParam, out);
   gParamOut.location = QUDA_CPU_FIELD_LOCATION;
-  gParamOut.site_offset = qudaGaugeParam->gauge_offset;
-  gParamOut.site_size = qudaGaugeParam->site_size;
-  cpuGaugeField *cpuOut = new cpuGaugeField(gParamOut);
+  GaugeField cpuOut = GaugeField(gParamOut);
   gParamOut.location = QUDA_CUDA_FIELD_LOCATION;
-  gParamOut.create = qudaGaugeParam->overwrite_gauge ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
+  gParamOut.create = qudaGaugeParam->overwrite_gauge ? QUDA_ZERO_FIELD_CREATE : QUDA_COPY_FIELD_CREATE;
+  gParamOut.field = &cpuOut;
   gParamOut.reconstruct = QUDA_RECONSTRUCT_NO;
   gParamOut.setPrecision(qudaGaugeParam->cuda_prec, true);
-  cudaGaugeField *cudaOut = new cudaGaugeField(gParamOut);
-  profileGaugePath.TPSTOP(QUDA_PROFILE_INIT);
-  if (!qudaGaugeParam->overwrite_gauge) {
-    profileGaugePath.TPSTART(QUDA_PROFILE_H2D);
-    cudaOut->loadCPUField(*cpuOut);
-    profileGaugePath.TPSTOP(QUDA_PROFILE_H2D);
-  }
+  GaugeField cudaOut(gParamOut);
 
-  cudaGaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugePath);
+  GaugeField *cudaGauge = createExtendedGauge(cudaSiteLink, R, profileGaugePath);
   // apply / remove phase as appropriate
   if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase();
 
@@ -4214,103 +3940,72 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *
   for (int d = 0; d < 4; d++) { input_path_v[d] = input_path_buf[d]; }
 
   // actually do the computation
-  profileGaugePath.TPSTART(QUDA_PROFILE_COMPUTE);
-  gaugePath(*cudaOut, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
-  profileGaugePath.TPSTOP(QUDA_PROFILE_COMPUTE);
+  gaugePath(cudaOut, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length);
 
-  profileGaugePath.TPSTART(QUDA_PROFILE_D2H);
-  cudaOut->saveCPUField(*cpuOut);
-  profileGaugePath.TPSTOP(QUDA_PROFILE_D2H);
+  cpuOut.copy(cudaOut);
+
+  if (qudaGaugeParam->make_resident_gauge && !qudaGaugeParam->use_resident_gauge) {
+    if (gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    gaugePrecise = new GaugeField();
+    std::exchange(*gaugePrecise, cudaSiteLink);
+  }
 
-  profileGaugePath.TPSTART(QUDA_PROFILE_FREE);
   if (qudaGaugeParam->make_resident_gauge) {
-    if (gaugePrecise && gaugePrecise != cudaSiteLink) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-    gaugePrecise = cudaSiteLink;
     if (extendedGaugeResident) delete extendedGaugeResident;
     extendedGaugeResident = cudaGauge;
   } else {
-    delete cudaSiteLink;
     delete cudaGauge;
   }
 
-  delete cudaOut;
-
-  if (cpuSiteLink) delete cpuSiteLink;
-  if (cpuOut) delete cpuOut;
-  profileGaugePath.TPSTOP(QUDA_PROFILE_FREE);
-
-  profileGaugePath.TPSTOP(QUDA_PROFILE_TOTAL);
   return 0;
 }
 
 void momResidentQuda(void *mom, QudaGaugeParam *param)
 {
-  profileGaugeForce.TPSTART(QUDA_PROFILE_TOTAL);
-  profileGaugeForce.TPSTART(QUDA_PROFILE_INIT);
-
+  auto profile = pushProfile(profileGaugeForce); // presumably scope-based timing of the whole call -- confirm
   checkGaugeParam(param);
 
   GaugeFieldParam gParamMom(*param, mom, QUDA_ASQTAD_MOM_LINKS);
   gParamMom.location = QUDA_CPU_FIELD_LOCATION;
-  if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER)
-    gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
-  else
-    gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
-  gParamMom.site_offset = param->mom_offset;
-  gParamMom.site_size = param->site_size;
 
-  cpuGaugeField cpuMom(gParamMom);
+  GaugeField cpuMom(gParamMom); // host-location field set up from param and mom
 
   if (param->make_resident_mom && !param->return_result_mom) {
-    if (momResident) delete momResident;
     gParamMom.location = QUDA_CUDA_FIELD_LOCATION;
     gParamMom.create = QUDA_NULL_FIELD_CREATE;
     gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
     gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
     gParamMom.setPrecision(param->cuda_prec, true);
     gParamMom.create = QUDA_ZERO_FIELD_CREATE;
-    momResident = new cudaGaugeField(gParamMom);
+    momResident = GaugeField(gParamMom); // device-location, zero-created field that receives cpuMom below
   } else if (param->return_result_mom && !param->make_resident_mom) {
-    if (!momResident) errorQuda("No resident momentum to return");
+    if (momResident.empty()) errorQuda("No resident momentum to return");
   } else {
     errorQuda("Unexpected combination make_resident_mom = %d return_result_mom = %d", param->make_resident_mom,
               param->return_result_mom);
   }
 
-  profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
-
   if (param->make_resident_mom) {
     // we are downloading the momentum from the host
-    profileGaugeForce.TPSTART(QUDA_PROFILE_H2D);
-    momResident->loadCPUField(cpuMom);
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D);
+    momResident.copy(cpuMom);
   } else if (param->return_result_mom) {
     // we are uploading the momentum to the host
-    profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
-    momResident->saveCPUField(cpuMom);
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
-
-    profileGaugeForce.TPSTART(QUDA_PROFILE_FREE);
-    delete momResident;
-    momResident = nullptr;
-    profileGaugeForce.TPSTOP(QUDA_PROFILE_FREE);
+    cpuMom.copy(momResident);
+    momResident = GaugeField(); // default-constructed field; presumably the state empty() reports -- confirm
   }
-
-  profileGaugeForce.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 void createCloverQuda(QudaInvertParam* invertParam)
 {
-  profileClover.TPSTART(QUDA_PROFILE_TOTAL);
+  auto profile = pushProfile(profileClover);
   if (!cloverPrecise) errorQuda("Clover field not allocated");
 
   QudaReconstructType recon = (gaugePrecise->Reconstruct() == QUDA_RECONSTRUCT_8) ? QUDA_RECONSTRUCT_12 : gaugePrecise->Reconstruct();
   // for clover we optimize to only send depth 1 halos in y/z/t (FIXME - make work for x, make robust in general)
   lat_dim_t R;
   for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d));
-  cudaGaugeField *gauge = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profileClover, false, recon);
-
-  profileClover.TPSTART(QUDA_PROFILE_INIT);
+  GaugeField *gauge = extendedGaugeResident ? extendedGaugeResident :
+                                              createExtendedGauge(*gaugePrecise, R, profileClover, false, recon);
 
   GaugeField *ex = gauge;
   if (gauge->Precision() < cloverPrecise->Precision()) {
@@ -4325,15 +4020,11 @@ void createCloverQuda(QudaInvertParam* invertParam)
   GaugeFieldParam tensorParam(gaugePrecise->X(), ex->Precision(), QUDA_RECONSTRUCT_NO, 0, QUDA_TENSOR_GEOMETRY);
   tensorParam.location = QUDA_CUDA_FIELD_LOCATION;
   tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET;
-  tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER;
+  tensorParam.setPrecision(tensorParam.Precision(), true);
   tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
-  cudaGaugeField Fmunu(tensorParam);
-  profileClover.TPSTOP(QUDA_PROFILE_INIT);
-  profileClover.TPSTART(QUDA_PROFILE_COMPUTE);
+  GaugeField Fmunu(tensorParam);
   computeFmunu(Fmunu, *ex);
   computeClover(*cloverPrecise, Fmunu, invertParam->clover_coeff);
-  profileClover.TPSTOP(QUDA_PROFILE_COMPUTE);
-  profileClover.TPSTOP(QUDA_PROFILE_TOTAL);
 
   if (ex != gauge) delete ex;
 
@@ -4348,15 +4039,15 @@ void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
   if (geometry != QUDA_SCALAR_GEOMETRY && geometry != QUDA_VECTOR_GEOMETRY)
     errorQuda("Only scalar and vector geometries are supported\n");
 
-  cpuGaugeField *cpuGauge = nullptr;
-  if (gauge) cpuGauge = new cpuGaugeField(gParam);
+  GaugeField *cpuGauge = nullptr;
+  if (gauge) cpuGauge = new GaugeField(gParam);
 
-  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
+  gParam.setPrecision(gParam.Precision(), true);
   gParam.create = QUDA_ZERO_FIELD_CREATE;
-  auto* cudaGauge = new cudaGaugeField(gParam);
+  auto *cudaGauge = new GaugeField(gParam);
 
   if (gauge) {
-    cudaGauge->loadCPUField(*cpuGauge);
+    cudaGauge->copy(*cpuGauge);
     delete cpuGauge;
   }
 
@@ -4365,48 +4056,49 @@ void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param)
 
 void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param)
 {
-  auto* cudaGauge = reinterpret_cast<cudaGaugeField*>(inGauge);
+  auto *cudaGauge = reinterpret_cast<GaugeField *>(inGauge); // opaque handle; assumed to point at a live GaugeField
 
-  GaugeFieldParam gParam(*param, gauge, QUDA_GENERAL_LINKS);
+  GaugeFieldParam gParam(*param, gauge); // NOTE(review): link type no longer forced to QUDA_GENERAL_LINKS -- confirm
   gParam.geometry = cudaGauge->Geometry();
 
-  cpuGaugeField cpuGauge(gParam);
-  cudaGauge->saveCPUField(cpuGauge);
+  GaugeField cpuGauge(gParam); // field described by gParam, which was built from param and gauge
+  cpuGauge.copy(*cudaGauge); // copy the handle's field contents into cpuGauge
 }
 
 void destroyGaugeFieldQuda(void *gauge)
 {
-  auto* g = reinterpret_cast<cudaGaugeField*>(gauge);
+  auto *g = reinterpret_cast<GaugeField *>(gauge); // assumes gauge was allocated with new as a GaugeField -- confirm
   delete g;
 }
 
 void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, void **, QudaGaugeParam *gauge_param,
                                QudaInvertParam *inv_param)
 {
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_TOTAL);
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT);
+  auto profile = pushProfile(profileStaggeredForce);
 
   GaugeFieldParam gParam(*gauge_param, h_mom, QUDA_ASQTAD_MOM_LINKS);
 
   // create the host momentum field
   gParam.location = QUDA_CPU_FIELD_LOCATION;
   gParam.reconstruct = gauge_param->reconstruct;
-  gParam.t_boundary = QUDA_PERIODIC_T;
-  cpuGaugeField cpuMom(gParam);
+  GaugeField cpuMom(gParam);
 
   // create the device momentum field
+  if (gauge_param->use_resident_mom && momResident.empty())
+    errorQuda("Cannot use resident momentum field since none appears resident");
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
-  gParam.create = QUDA_ZERO_FIELD_CREATE; // FIXME
-  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
+  gParam.create = QUDA_COPY_FIELD_CREATE;
+  gParam.field = &cpuMom;
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
-  cudaGaugeField *cudaMom = !gauge_param->use_resident_mom ? new cudaGaugeField(gParam) : nullptr;
+  gParam.setPrecision(gParam.Precision(), true);
+  GaugeField cudaMom = gauge_param->use_resident_mom ? momResident.create_alias() : GaugeField(gParam);
 
   // create temporary field for quark-field outer product
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.create = QUDA_ZERO_FIELD_CREATE;
-  cudaGaugeField cudaForce(gParam);
+  GaugeField cudaForce(gParam);
   GaugeField *cudaForce_[2] = {&cudaForce};
 
   ColorSpinorParam qParam;
@@ -4424,24 +4116,10 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   qParam.create = QUDA_NULL_FIELD_CREATE;
   qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
 
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_INIT);
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_H2D);
-
-  if (gauge_param->use_resident_mom) {
-    if (!momResident) errorQuda("Cannot use resident momentum field since none appears resident");
-    cudaMom = momResident;
-  } else {
-    // download the initial momentum (FIXME make an option just to return?)
-    cudaMom->loadCPUField(cpuMom);
-  }
-
   // resident gauge field is required
-  if (!gauge_param->use_resident_gauge || !gaugePrecise)
-    errorQuda("Resident gauge field is required");
-
-  if (!gaugePrecise->StaggeredPhaseApplied()) {
+  if (!gauge_param->use_resident_gauge || !gaugePrecise) errorQuda("Resident gauge field is required");
+  if (!gaugePrecise->StaggeredPhaseApplied())
     errorQuda("Gauge field requires the staggered phase factors to be applied");
-  }
 
   // check if staggered phase is the desired one
   if (gauge_param->staggered_phase_type != gaugePrecise->StaggeredPhase()) {
@@ -4449,12 +4127,9 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
               gauge_param->staggered_phase_type, gaugePrecise->StaggeredPhase());
   }
 
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_H2D);
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT);
-
   const int nvector = inv_param->num_offset;
   std::vector<ColorSpinorField*> X(nvector);
-  for ( int i=0; i<nvector; i++) X[i] = ColorSpinorField::Create(qParam);
+  for (int i = 0; i < nvector; i++) X[i] = ColorSpinorField::Create(qParam);
 
   if (inv_param->use_resident_solution) {
     if (solutionResident.size() < (unsigned int)nvector)
@@ -4471,7 +4146,6 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   setDiracParam(diracParam, inv_param, pc_solve);
   Dirac *dirac = Dirac::create(diracParam);
 
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_INIT);
   profileStaggeredForce.TPSTART(QUDA_PROFILE_PREAMBLE);
 
   for (int i=0; i<nvector; i++) {
@@ -4486,16 +4160,12 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
   }
 
   profileStaggeredForce.TPSTOP(QUDA_PROFILE_PREAMBLE);
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE);
 
 #if 0
   if (inv_param->use_resident_solution) solutionResident.clear();
 #endif
   delete dirac;
 
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_FREE);
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_COMPUTE);
-
   // compute quark-field outer product
   for (int i=0; i<nvector; i++) {
     ColorSpinorField &x = *(X[i]);
@@ -4508,31 +4178,17 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi
 
   // mom += delta * [U * force]TA
   applyU(cudaForce, *gaugePrecise);
-  updateMomentum(*cudaMom, dt * delta, cudaForce, "staggered");
-  qudaDeviceSynchronize();
-
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_COMPUTE);
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_D2H);
+  updateMomentum(cudaMom, dt * delta, cudaForce, "staggered");
 
-  if (gauge_param->return_result_mom) {
-    // copy the momentum field back to the host
-    cudaMom->saveCPUField(cpuMom);
-  }
-
-  if (gauge_param->make_resident_mom) {
-    // make the momentum field resident
-    momResident = cudaMom;
-  } else {
-    delete cudaMom;
-  }
+  // copy the momentum field back to the host
+  if (gauge_param->return_result_mom) cpuMom.copy(cudaMom);
 
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_D2H);
-  profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE);
+  if (gauge_param->make_resident_mom && !gauge_param->use_resident_mom)
+    std::exchange(momResident, cudaMom);
+  else if (!gauge_param->make_resident_mom)
+    momResident = GaugeField();
 
   for (int i=0; i<nvector; i++) delete X[i];
-
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_FREE);
-  profileStaggeredForce.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 void computeHISQForceQuda(void* const milc_momentum,
@@ -4548,15 +4204,13 @@ void computeHISQForceQuda(void* const milc_momentum,
                           double **coeff,
                           QudaGaugeParam* gParam)
 {
+  auto profile = pushProfile(profileHISQForce);
+  checkGaugeParam(gParam);
+
   using namespace quda;
   using namespace quda::fermion_force;
-  profileHISQForce.TPSTART(QUDA_PROFILE_TOTAL);
   if (gParam->gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported input field order %d", gParam->gauge_order);
 
-  checkGaugeParam(gParam);
-
-  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
-
   {
     // default settings for the unitarization
     const double unitarize_eps = 1e-14;
@@ -4600,16 +4254,14 @@ void computeHISQForceQuda(void* const milc_momentum,
   oParam.setPrecision(gParam->cpu_prec, true);
   oParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
 
-  cudaGaugeField *stapleOprod = new cudaGaugeField(oParam);
-  cudaGaugeField *oneLinkOprod = new cudaGaugeField(oParam);
-  cudaGaugeField *naikOprod = new cudaGaugeField(oParam);
+  GaugeField stapleOprod(oParam);
+  GaugeField oneLinkOprod(oParam);
+  GaugeField naikOprod(oParam);
 
   double act_path_coeff[6] = {0, 1, level2_coeff[2], level2_coeff[3], level2_coeff[4], level2_coeff[5]};
   // You have to look at the MILC routine to understand the following
   // Basically, I have already absorbed the one-link coefficient
 
-  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
-
   { // do outer-product computation
     ColorSpinorParam qParam;
     qParam.nColor = 3;
@@ -4634,56 +4286,38 @@ void computeHISQForceQuda(void* const milc_momentum,
     qParam.v = fermion[0];
 
     { // regular terms
-      GaugeField *oprod[2] = {stapleOprod, naikOprod};
+      GaugeField *oprod[2] = {&stapleOprod, &naikOprod};
 
       // loop over different quark fields
-      for(int i=0; i<num_terms; ++i){
+      for (int i = 0; i < num_terms; ++i) {
 
         // Wrap the MILC quark field
-        profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
         qParam.v = fermion[i];
         ColorSpinorField cpuQuark(qParam); // create host quark field
-        profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
-        profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
         cudaQuark = cpuQuark;
-        profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
-
-        profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
         computeStaggeredOprod(oprod, cudaQuark, coeff[i], 3);
-        qudaDeviceSynchronize();
-        profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
       }
     }
 
     { // naik terms
-      oneLinkOprod->copy(*stapleOprod);
-      ax(level2_coeff[0], *oneLinkOprod);
-      GaugeField *oprod[2] = {oneLinkOprod, naikOprod};
+      oneLinkOprod.copy(stapleOprod);
+      ax(level2_coeff[0], oneLinkOprod);
+      GaugeField *oprod[2] = {&oneLinkOprod, &naikOprod};
 
       // loop over different quark fields
-      for(int i=0; i<num_naik_terms; ++i){
+      for (int i = 0; i < num_naik_terms; ++i) {
 
         // Wrap the MILC quark field
-        profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
         qParam.v = fermion[i + num_terms - num_naik_terms];
         ColorSpinorField cpuQuark(qParam); // create host quark field
-        profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
 
-        profileHISQForce.TPSTART(QUDA_PROFILE_H2D);
         cudaQuark = cpuQuark;
-        profileHISQForce.TPSTOP(QUDA_PROFILE_H2D);
-
-        profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
         computeStaggeredOprod(oprod, cudaQuark, coeff[i + num_terms], 3);
-        qudaDeviceSynchronize();
-        profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
       }
     }
   }
 
-  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
-
   // Compute the pad size
   int pad_size = 0;
 #ifdef MULTI_GPU
@@ -4706,13 +4340,13 @@ void computeHISQForceQuda(void* const milc_momentum,
     oParam.r[dir] = R[dir];
   }
 
-  cudaGaugeField *cudaInForce = new cudaGaugeField(oParam);
-  copyExtendedGauge(*cudaInForce, *stapleOprod, QUDA_CUDA_FIELD_LOCATION);
-  delete stapleOprod;
+  GaugeField cudaInForce(oParam);
+  copyExtendedGauge(cudaInForce, stapleOprod, QUDA_CUDA_FIELD_LOCATION);
+  stapleOprod = GaugeField();
 
-  cudaGaugeField *cudaOutForce = new cudaGaugeField(oParam);
-  copyExtendedGauge(*cudaOutForce, *oneLinkOprod, QUDA_CUDA_FIELD_LOCATION);
-  delete oneLinkOprod;
+  GaugeField cudaOutForce(oParam);
+  copyExtendedGauge(cudaOutForce, oneLinkOprod, QUDA_CUDA_FIELD_LOCATION);
+  oneLinkOprod = GaugeField();
 
   // Create CPU momentum fields, prepare GPU momentum param
   GaugeFieldParam param(*gParam);
@@ -4723,11 +4357,11 @@ void computeHISQForceQuda(void* const milc_momentum,
   param.reconstruct = QUDA_RECONSTRUCT_10;
   param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   param.gauge = milc_momentum;
-  cpuGaugeField *cpuMom = (!gParam->use_resident_mom) ? new cpuGaugeField(param) : nullptr;
+  GaugeField cpuMom = (!gParam->use_resident_mom) ? GaugeField(param) : GaugeField();
 
   param.location = QUDA_CUDA_FIELD_LOCATION;
   param.create = QUDA_ZERO_FIELD_CREATE;
-  param.order = QUDA_FLOAT2_GAUGE_ORDER;
+  param.setPrecision(param.Precision(), true);
   GaugeFieldParam momParam(param);
 
   // Create CPU W, V, and U fields
@@ -4744,15 +4378,15 @@ void computeHISQForceQuda(void* const milc_momentum,
   wParam.link_type = QUDA_GENERAL_LINKS;
   wParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   wParam.gauge = (void *)w_link;
-  cpuGaugeField cpuWLink(wParam);
+  GaugeField cpuWLink(wParam);
 
   GaugeFieldParam vParam(wParam);
   vParam.gauge = (void *)v_link;
-  cpuGaugeField cpuVLink(vParam);
+  GaugeField cpuVLink(vParam);
 
   GaugeFieldParam uParam(vParam);
   uParam.gauge = (void *)u_link;
-  cpuGaugeField cpuULink(uParam);
+  GaugeField cpuULink(uParam);
 
   // Load the W field, which contains U(3) matrices, to the device
   gParam_field.ga_pad = 3 * pad_size;
@@ -4767,40 +4401,32 @@ void computeHISQForceQuda(void* const milc_momentum,
   wParam.create = QUDA_NULL_FIELD_CREATE;
   wParam.setPrecision(gParam->cpu_prec, true);
 
-  cudaGaugeField *cudaWLink = new cudaGaugeField(wParam);
-  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
+  GaugeField cudaWLink(wParam);
 
-  cudaWLink->loadCPUField(cpuWLink, profileHISQForce);
-  cudaWLink->exchangeExtendedGhost(cudaWLink->R(), profileHISQForce);
+  cudaWLink.copy(cpuWLink);
 
-  cudaInForce->exchangeExtendedGhost(R, profileHISQForce);
-  cudaWLink->exchangeExtendedGhost(cudaWLink->R(), profileHISQForce);
-  cudaOutForce->exchangeExtendedGhost(R, profileHISQForce);
+  cudaWLink.exchangeExtendedGhost(cudaWLink.R(), profileHISQForce);
+
+  cudaInForce.exchangeExtendedGhost(R, profileHISQForce);
+  cudaWLink.exchangeExtendedGhost(cudaWLink.R(), profileHISQForce);
+  cudaOutForce.exchangeExtendedGhost(R, profileHISQForce);
 
   // Compute level two term
-  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
-  hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaWLink, act_path_coeff);
-  qudaDeviceSynchronize();
-  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
+  hisqStaplesForce(cudaOutForce, cudaInForce, cudaWLink, act_path_coeff);
 
   // Load naik outer product
-  copyExtendedGauge(*cudaInForce, *naikOprod, QUDA_CUDA_FIELD_LOCATION);
-  cudaInForce->exchangeExtendedGhost(cudaWLink->R(), profileHISQForce);
-  delete naikOprod;
+  copyExtendedGauge(cudaInForce, naikOprod, QUDA_CUDA_FIELD_LOCATION);
+  cudaInForce.exchangeExtendedGhost(cudaWLink.R(), profileHISQForce);
+  naikOprod = GaugeField();
 
   // Compute Naik three-link term contribution
-  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
-  hisqLongLinkForce(*cudaOutForce, *cudaInForce, *cudaWLink, act_path_coeff[1]);
-  qudaDeviceSynchronize();
-  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
+  hisqLongLinkForce(cudaOutForce, cudaInForce, cudaWLink, act_path_coeff[1]);
 
-  cudaOutForce->exchangeExtendedGhost(R, profileHISQForce);
+  cudaOutForce.exchangeExtendedGhost(R, profileHISQForce);
 
   // Load the V field, which contains general matrices, to the device
-  profileHISQForce.TPSTART(QUDA_PROFILE_FREE);
-  delete cudaWLink;
-  profileHISQForce.TPSTOP(QUDA_PROFILE_FREE);
-  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
+  cudaWLink = GaugeField();
+
   for (int dir = 0; dir < 4; ++dir) {
     vParam.x[dir] += 2 * R[dir];
     vParam.r[dir] = R[dir];
@@ -4812,28 +4438,20 @@ void computeHISQForceQuda(void* const milc_momentum,
   vParam.setPrecision(gParam->cpu_prec, true);
   vParam.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
   vParam.pad = 3 * pad_size;
-  cudaGaugeField *cudaVLink = new cudaGaugeField(vParam);
-  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
+  GaugeField cudaVLink(vParam);
 
-  cudaVLink->loadCPUField(cpuVLink, profileHISQForce);
-  cudaVLink->exchangeExtendedGhost(cudaVLink->R(), profileHISQForce);
+  cudaVLink.copy(cpuVLink);
+  cudaVLink.exchangeExtendedGhost(cudaVLink.R(), profileHISQForce);
 
-  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
   *num_failures_h = 0;
-  unitarizeForce(*cudaInForce, *cudaOutForce, *cudaVLink, num_failures_d);
+  unitarizeForce(cudaInForce, cudaOutForce, cudaVLink, num_failures_d);
 
   if (*num_failures_h>0) errorQuda("Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h);
 
-  cudaOutForce->zero();
-  qudaDeviceSynchronize();
-  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
-
   // Load the U field, which contains U(3) matrices, to the device
   // TODO: in theory these should just be SU(3) matrices with MILC phases?
-  profileHISQForce.TPSTART(QUDA_PROFILE_FREE);
-  delete cudaVLink;
-  profileHISQForce.TPSTOP(QUDA_PROFILE_FREE);
-  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
+  cudaVLink = GaugeField();
+
   for (int dir = 0; dir < 4; ++dir) {
     uParam.x[dir] += 2 * R[dir];
     uParam.r[dir] = R[dir];
@@ -4845,56 +4463,30 @@ void computeHISQForceQuda(void* const milc_momentum,
   uParam.setPrecision(gParam->cpu_prec, true);
   uParam.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED;
   uParam.pad = 3 * pad_size;
-  cudaGaugeField *cudaULink = new cudaGaugeField(uParam);
-  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
+  GaugeField cudaULink(uParam);
 
-  cudaULink->loadCPUField(cpuULink, profileHISQForce);
-  cudaULink->exchangeExtendedGhost(cudaULink->R(), profileHISQForce);
+  cudaULink.copy(cpuULink);
+  cudaULink.exchangeExtendedGhost(cudaULink.R(), profileHISQForce);
 
   // Compute Fat7-staple term
-  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
-  hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaULink, fat7_coeff);
-  qudaDeviceSynchronize();
-  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
-
-  profileHISQForce.TPSTART(QUDA_PROFILE_FREE);
-  delete cudaInForce;
-  profileHISQForce.TPSTOP(QUDA_PROFILE_FREE);
-  profileHISQForce.TPSTART(QUDA_PROFILE_INIT);
-  cudaGaugeField* cudaMom = new cudaGaugeField(momParam);
-  profileHISQForce.TPSTOP(QUDA_PROFILE_INIT);
-
-  profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE);
-  hisqCompleteForce(*cudaOutForce, *cudaULink);
-
-  if (gParam->use_resident_mom) {
-    if (!momResident) errorQuda("No resident momentum field to use");
-    updateMomentum(*momResident, dt, *cudaOutForce, "hisq");
-  } else {
-    updateMomentum(*cudaMom, dt, *cudaOutForce, "hisq");
-  }
-  qudaDeviceSynchronize();
-  profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE);
+  cudaOutForce.zero();
+  hisqStaplesForce(cudaOutForce, cudaInForce, cudaULink, fat7_coeff);
 
-  if (gParam->return_result_mom) {
-    // Close the paths, make anti-hermitian, and store in compressed format
-    if (gParam->return_result_mom) cudaMom->saveCPUField(*cpuMom, profileHISQForce);
-  }
+  cudaInForce = GaugeField();
 
-  profileHISQForce.TPSTART(QUDA_PROFILE_FREE);
+  hisqCompleteForce(cudaOutForce, cudaULink);
 
-  if (cpuMom) delete cpuMom;
+  if (gParam->use_resident_mom && !momResident.Length()) errorQuda("No resident momentum field to use");
+  GaugeField mom = gParam->use_resident_mom ? momResident.create_alias() : GaugeField(momParam);
+  updateMomentum(mom, dt, cudaOutForce, "hisq");
 
-  if (!gParam->make_resident_mom) {
-    delete momResident;
-    momResident = nullptr;
-  }
-  if (cudaMom) delete cudaMom;
-  delete cudaOutForce;
-  delete cudaULink;
-  profileHISQForce.TPSTOP(QUDA_PROFILE_FREE);
+  // Close the paths, make anti-hermitian, and store in compressed format
+  if (gParam->return_result_mom) cpuMom.copy(mom);
 
-  profileHISQForce.TPSTOP(QUDA_PROFILE_TOTAL);
+  if (gParam->make_resident_mom && !gParam->use_resident_mom)
+    std::exchange(momResident, mom);
+  else if (!gParam->make_resident_mom)
+    momResident = GaugeField();
 }
 
 void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double *coeff, double kappa2, double ck,
@@ -4902,8 +4494,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
                             QudaInvertParam *inv_param)
 {
   using namespace quda;
-  profileCloverForce.TPSTART(QUDA_PROFILE_TOTAL);
-  profileCloverForce.TPSTART(QUDA_PROFILE_INIT);
+  auto profile = pushProfile(profileCloverForce);
 
   checkGaugeParam(gauge_param);
   if (!gaugePrecise) errorQuda("No resident gauge field");
@@ -4911,30 +4502,23 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
   GaugeFieldParam fParam(*gauge_param, h_mom, QUDA_ASQTAD_MOM_LINKS);
   // create the host momentum field
   fParam.location = QUDA_CPU_FIELD_LOCATION;
-  fParam.reconstruct = QUDA_RECONSTRUCT_10;
-  fParam.order = gauge_param->gauge_order;
-  cpuGaugeField *cpuMom = !gauge_param->use_resident_mom ? new cpuGaugeField(fParam) : nullptr;
+  GaugeField cpuMom = !gauge_param->use_resident_mom ? GaugeField(fParam) : GaugeField();
 
   // create the device momentum field
   fParam.location = QUDA_CUDA_FIELD_LOCATION;
-  fParam.create = QUDA_ZERO_FIELD_CREATE;
-  fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
-  cudaGaugeField *cudaMom = !gauge_param->use_resident_mom ? new cudaGaugeField(fParam) : nullptr;
-
-  if (gauge_param->use_resident_mom) {
-    if (!momResident) errorQuda("No resident mom field allocated");
-    cudaMom = momResident;
-    momResident = nullptr;
-  } else {
-    cudaMom->loadCPUField(*cpuMom);
-  }
+  fParam.create = QUDA_COPY_FIELD_CREATE;
+  fParam.field = &cpuMom;
+  fParam.setPrecision(gauge_param->cuda_prec, true);
 
+  if (gauge_param->use_resident_mom && !momResident.Length()) errorQuda("No resident momentum field to use");
+  GaugeField cudaMom = gauge_param->use_resident_mom ? momResident.create_alias() : GaugeField(fParam);
+  
   // create the device force field
   fParam.link_type = QUDA_GENERAL_LINKS;
   fParam.create = QUDA_ZERO_FIELD_CREATE;
-  fParam.order = QUDA_FLOAT2_GAUGE_ORDER;
   fParam.reconstruct = QUDA_RECONSTRUCT_NO;
-  cudaGaugeField cudaForce(fParam);
+  fParam.setPrecision(fParam.Precision(), true);
+  GaugeField cudaForce(fParam);
 
   ColorSpinorParam qParam;
   qParam.location = QUDA_CUDA_FIELD_LOCATION;
@@ -4944,7 +4528,6 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
   qParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
   qParam.nDim = 4;
   qParam.setPrecision(fParam.Precision(), fParam.Precision(), true);
-  qParam.pad = 0;
   for(int dir=0; dir<4; ++dir) qParam.x[dir] = fParam.x[dir];
 
   // create the device quark field
@@ -4981,12 +4564,11 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
 
   // create oprod and trace fields
   fParam.geometry = QUDA_TENSOR_GEOMETRY;
-  cudaGaugeField oprod(fParam);
-
-  profileCloverForce.TPSTOP(QUDA_PROFILE_INIT);
-  profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);
+  GaugeField oprod(fParam);
 
   std::vector<double> force_coeff(nvector);
+
+  profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE); // time on the profile this function pushed
   // loop over different quark fields
   for(int i=0; i<nvector; i++){
     ColorSpinorField &x = *(quarkX[i]);
@@ -4994,17 +4576,11 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
 
     if (!inv_param->use_resident_solution) {
       // Wrap the even-parity MILC quark field
-      profileCloverForce.TPSTOP(QUDA_PROFILE_COMPUTE);
-      profileCloverForce.TPSTART(QUDA_PROFILE_INIT);
       qParam.v = h_x[i];
       ColorSpinorField cpuQuarkX(qParam); // create host quark field
-      profileCloverForce.TPSTOP(QUDA_PROFILE_INIT);
 
-      profileCloverForce.TPSTART(QUDA_PROFILE_H2D);
       x.Even() = cpuQuarkX;
-      profileCloverForce.TPSTOP(QUDA_PROFILE_H2D);
 
-      profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);
       gamma5(x.Even(), x.Even());
     } else {
       x.Even() = solutionResident[i];
@@ -5028,14 +4604,14 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
   // TODO: In most situation, deallocation is unnecessery
   if (extendedGaugeResident) delete extendedGaugeResident;
   extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileGaugeForce);
-  cudaGaugeField &gaugeEx = *extendedGaugeResident;
+  GaugeField &gaugeEx = *extendedGaugeResident;
 
   // In double precision the clover derivative is faster with no reconstruct
-  cudaGaugeField *u = &gaugeEx;
+  GaugeField *u = &gaugeEx;
   if (gaugeEx.Reconstruct() == QUDA_RECONSTRUCT_12 && gaugeEx.Precision() == QUDA_DOUBLE_PRECISION) {
     GaugeFieldParam param(gaugeEx);
     param.reconstruct = QUDA_RECONSTRUCT_NO;
-    u = new cudaGaugeField(param);
+    u = new GaugeField(param);
     u -> copy(gaugeEx);
   }
   computeCloverSigmaTrace(oprod, *cloverPrecise, 2.0*ck*multiplicity*dt, 1);
@@ -5050,33 +4626,25 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
 
   computeCloverSigmaOprod(oprod, quarkX, quarkP, ferm_epsilon);
 
-  cudaGaugeField *oprodEx = createExtendedGauge(oprod, R, profileCloverForce);
+  GaugeField *oprodEx = createExtendedGauge(oprod, R, profileCloverForce);
 
   cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_ODD_PARITY);
   cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_EVEN_PARITY);
 
-  updateMomentum(*cudaMom, -1.0, cudaForce, "clover");
-  profileCloverForce.TPSTOP(QUDA_PROFILE_COMPUTE);
+  if (u != &gaugeEx) delete u;
+  updateMomentum(cudaMom, -1.0, cudaForce, "clover");
 
-  if (gauge_param->return_result_mom) {
-    cudaMom->saveCPUField(*cpuMom, profileGaugeForce);
-  }
+  profileCloverForce.TPSTOP(QUDA_PROFILE_COMPUTE); // matches the TPSTART issued before the quark loop
 
-  profileCloverForce.TPSTART(QUDA_PROFILE_FREE);
+  // copy the outer product field back to the host
+  if (gauge_param->return_result_mom) cpuMom.copy(cudaMom);
 
-  if (u != &gaugeEx) delete u;
   delete oprodEx;
 
-  if (gauge_param->make_resident_mom) {
-    if (momResident != nullptr && momResident != cudaMom) delete momResident;
-    momResident = cudaMom;
-  } else {
-    delete cudaMom;
-    momResident = nullptr;
-  }
-  if (cpuMom) {
-    delete cpuMom;
-  }
+  if (gauge_param->make_resident_mom && !gauge_param->use_resident_mom) // alias case already updated momResident
+    std::exchange(momResident, cudaMom); // hand the newly created device momentum to the resident slot
+  else if (!gauge_param->make_resident_mom)
+    momResident = GaugeField(); // caller did not ask to keep it: drop any resident momentum
 
   for (int i=0; i<nvector; i++) {
     delete quarkX[i];
@@ -5087,17 +4655,13 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double
   if (inv_param->use_resident_solution) solutionResident.clear();
 #endif
   delete dirac;
-  profileCloverForce.TPSTOP(QUDA_PROFILE_FREE);
-
-  profileCloverForce.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 void computeTMCloverForceQuda(void *h_mom, void **h_x, void **h_x0, double *coeff, int nvector, 
-     QudaGaugeParam *gauge_param, QudaInvertParam *inv_param, int detratio)
+                              QudaGaugeParam *gauge_param, QudaInvertParam *inv_param, int detratio)
 {
   using namespace quda;
-  profileTMCloverForce.TPSTART(QUDA_PROFILE_TOTAL);
-  profileTMCloverForce.TPSTART(QUDA_PROFILE_INIT);
+  auto profile = pushProfile(profileTMCloverForce);
 
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaGaugeParam(gauge_param);
@@ -5110,30 +4674,24 @@ void computeTMCloverForceQuda(void *h_mom, void **h_x, void **h_x0, double *coef
   if (!cloverPrecise) errorQuda("No resident clover field");
 
   GaugeFieldParam gParamMom(*gauge_param, h_mom, QUDA_ASQTAD_MOM_LINKS);
-  if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER)
-    gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
-  else
-    gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
 
-  gParamMom.site_offset = gauge_param->mom_offset;
-  gParamMom.site_size = gauge_param->site_size;
-  cpuGaugeField cpuMom(gParamMom);
+  GaugeField cpuMom(gParamMom);
 
   //create the device momentum field
   gParamMom.location = QUDA_CUDA_FIELD_LOCATION;
-  gParamMom.create =  QUDA_ZERO_FIELD_CREATE;
-  gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
-  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
+  gParamMom.create =  QUDA_COPY_FIELD_CREATE;
+  gParamMom.field = &cpuMom;
   gParamMom.setPrecision(gauge_param->cuda_prec, true);
-  gParamMom.create = QUDA_ZERO_FIELD_CREATE;
-  cudaGaugeField gpuMom(gParamMom);
+
+  if (gauge_param->use_resident_mom && !momResident.Length()) errorQuda("No resident momentum field to use");
+  GaugeField gpuMom = gauge_param->use_resident_mom ? momResident.create_alias() : GaugeField(gParamMom);
 
   // create the device force field
   gParamMom.link_type = QUDA_GENERAL_LINKS;
   gParamMom.create = QUDA_ZERO_FIELD_CREATE;
-  gParamMom.order = QUDA_FLOAT2_GAUGE_ORDER;
   gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
-  cudaGaugeField cudaForce(gParamMom);
+  gParamMom.setPrecision(gParamMom.Precision(), true);
+  GaugeField cudaForce(gParamMom);
 
   ColorSpinorParam qParam;
   qParam.location = QUDA_CUDA_FIELD_LOCATION;
@@ -5142,16 +4700,14 @@ void computeTMCloverForceQuda(void *h_mom, void **h_x, void **h_x0, double *coef
   qParam.siteSubset = QUDA_FULL_SITE_SUBSET;
   qParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
   qParam.nDim = 4;
-  qParam.setPrecision(gauge_param->cuda_prec,gauge_param->cuda_prec,true);
-  qParam.pad = 0;
+  qParam.setPrecision(gauge_param->cuda_prec, gauge_param->cuda_prec, true);
   qParam.twistFlavor = inv_param->twist_flavor;
-  qParam.pc_type = QUDA_4D_PC;
-  if (inv_param->dslash_type == QUDA_DOMAIN_WALL_DSLASH) { qParam.pc_type = QUDA_5D_PC; }
+  qParam.pc_type = inv_param->dslash_type == QUDA_DOMAIN_WALL_DSLASH ? QUDA_5D_PC : QUDA_4D_PC;
   for(int dir = 0; dir<4; ++dir) qParam.x[dir] = gParamMom.x[dir];
 
   // create the device quark field
   qParam.create = QUDA_NULL_FIELD_CREATE;
-   qParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
+  qParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
 
   std::vector<ColorSpinorField*> quarkX, quarkP, quarkX0;
   for (int i=0; i<nvector; i++){
@@ -5164,28 +4720,27 @@ void computeTMCloverForceQuda(void *h_mom, void **h_x, void **h_x0, double *coef
   qParam.x[0] /= 2;
   ColorSpinorField tmp(qParam);
 
+  // create the host quark field
+  qParam.location = QUDA_CPU_FIELD_LOCATION;
+  qParam.create = QUDA_REFERENCE_FIELD_CREATE;
+  qParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
+  qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
+
   bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) || 
     (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE);
-
   DiracParam diracParam;
   setDiracParam(diracParam, inv_param, pc_solve);
   Dirac *dirac = Dirac::create(diracParam);
-  
+
   // Make sure extendedGaugeResident has the correct R
   // TODO: In most situation, deallocation is unnecessery
   if (extendedGaugeResident) delete extendedGaugeResident;
   extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileGaugeForce);
-  cudaGaugeField &gaugeEx = *extendedGaugeResident;
+  GaugeField &gaugeEx = *extendedGaugeResident;
 
   // create oprod and trace field
   gParamMom.geometry = QUDA_TENSOR_GEOMETRY;
-  cudaGaugeField oprod(gParamMom);
-
-  // create the host quark field
-  qParam.location = QUDA_CPU_FIELD_LOCATION;
-  qParam.create = QUDA_REFERENCE_FIELD_CREATE;
-  qParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
-  qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
+  GaugeField oprod(gParamMom);
 
   std::vector<double> force_coeff(nvector);
   for(int i=0; i<nvector; i++){
@@ -5198,14 +4753,11 @@ void computeTMCloverForceQuda(void *h_mom, void **h_x, void **h_x0, double *coef
     ColorSpinorParam cpuParam(h_x[i], *inv_param, gauge.X(), true, inv_param->input_location);
     ColorSpinorField cpuQuarkX(cpuParam);
  
-    profileTMCloverForce.TPSTOP(QUDA_PROFILE_INIT);
-    profileTMCloverForce.TPSTART(QUDA_PROFILE_H2D);
     x.Odd() = cpuQuarkX; // in tmLQCD-parlance this is the odd part of X
-    profileTMCloverForce.TPSTOP(QUDA_PROFILE_H2D);
+
     profileTMCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);
 
     if (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || inv_param->matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
-
       dirac->Dagger(QUDA_DAG_YES);
       gamma5(tmp, x.Odd());
       dirac->Dslash(x.Even(), tmp, QUDA_EVEN_PARITY);
@@ -5216,13 +4768,12 @@ void computeTMCloverForceQuda(void *h_mom, void **h_x, void **h_x0, double *coef
       dirac->M(p.Odd(), tmp); // this is the odd part of Y 
       dirac->Dagger(QUDA_DAG_NO);
 
-
       if (detratio){
         ColorSpinorParam cpuParam0(h_x0[i], *inv_param, gauge.X(), true, inv_param->input_location);
         ColorSpinorField cpuQuarkX0(cpuParam0);
         ColorSpinorField &x0 = *(quarkX0[i]);
         x0.Odd()=cpuQuarkX0;
-        blas::axpbyz(1,p.Odd(),1,x0.Odd(),p.Odd());
+        blas::axpbyz(1, p.Odd(), 1, x0.Odd(), p.Odd());
       }
       dirac->Dslash(p.Even(), p.Odd(), QUDA_EVEN_PARITY); // and now the even part of Y
       // up to here x.odd match X.odd in tmLQCD and p.odd=-Y.odd of tmLQCD
@@ -5231,16 +4782,13 @@ void computeTMCloverForceQuda(void *h_mom, void **h_x, void **h_x0, double *coef
       // the gamma5 application in tmLQCD is done  inside deriv_Sb
       gamma5(p.Even(), p.Even());
       gamma5(p.Odd(), p.Odd());
+    } else {
+      errorQuda("computeTMCloverForceQuda: MATPC type not supported");
     }
-    else errorQuda("computeTMCloverForceQuda: MATPC type not supported\n"); 
-
     profileTMCloverForce.TPSTOP(QUDA_PROFILE_COMPUTE);
-    profileTMCloverForce.TPSTART(QUDA_PROFILE_INIT);
   }
 
-  profileTMCloverForce.TPSTOP(QUDA_PROFILE_INIT);
   profileTMCloverForce.TPSTART(QUDA_PROFILE_COMPUTE);
-  
   // derivative of the wilson operator it correspond to deriv_Sb(OE,...) plus  deriv_Sb(EO,...) in tmLQCD
   computeCloverForce(cudaForce, *gaugePrecise, quarkX, quarkP, force_coeff);
   // derivative of the determinant of the sw term, second term of (A12) in hep-lat/0112051,  sw_deriv(EE, mnl->mu) in tmLQCD
@@ -5256,7 +4802,7 @@ void computeTMCloverForceQuda(void *h_mom, void **h_x, void **h_x0, double *coef
   // derivative of pseudofermion sw term, first term term of (A12) in hep-lat/0112051,  sw_spinor_eo(EE,..) plus sw_spinor_eo(OO,..)  in tmLQCD
   computeCloverSigmaOprod(oprod, quarkP,  quarkX, ferm_epsilon);
 
-  cudaGaugeField *oprodEx = createExtendedGauge(oprod, R, profileTMCloverForce);
+  GaugeField *oprodEx = createExtendedGauge(oprod, R, profileTMCloverForce);
 
   // oprod = (A12) of hep-lat/0112051 
   // compute the insertion of oprod in Fig.27 of hep-lat/0112051 
@@ -5264,13 +4810,16 @@ void computeTMCloverForceQuda(void *h_mom, void **h_x, void **h_x0, double *coef
   cloverDerivative(cudaForce, gaugeEx, *oprodEx, 1.0, QUDA_EVEN_PARITY);
 
   updateMomentum(gpuMom, -1.0, cudaForce, "tmclover");
+
   profileTMCloverForce.TPSTOP(QUDA_PROFILE_COMPUTE);
 
-  profileTMCloverForce.TPSTART(QUDA_PROFILE_D2H);
-  gpuMom.saveCPUField(cpuMom);
-  profileTMCloverForce.TPSTOP(QUDA_PROFILE_D2H);
+  if (gauge_param->return_result_mom) cpuMom.copy(gpuMom);
+
+  if (gauge_param->make_resident_mom && !gauge_param->use_resident_mom) // alias case already updated momResident
+    std::exchange(momResident, gpuMom); // hand the newly created device momentum to the resident slot
+  else if (!gauge_param->make_resident_mom)
+    momResident = GaugeField(); // caller did not ask to keep it: drop any resident momentum
 
-  profileTMCloverForce.TPSTART(QUDA_PROFILE_FREE);
   for (int i = 0; i < nvector; i++){
     delete quarkX[i];
     delete quarkP[i];
@@ -5278,325 +4827,186 @@ void computeTMCloverForceQuda(void *h_mom, void **h_x, void **h_x0, double *coef
 
   delete oprodEx;
   delete dirac;
-
-  profileTMCloverForce.TPSTOP(QUDA_PROFILE_FREE);
-  profileTMCloverForce.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
-void updateGaugeFieldQuda(void* gauge,
-			  void* momentum,
-			  double dt,
-			  int conj_mom,
-			  int exact,
-			  QudaGaugeParam* param)
+void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param)
 {
-  profileGaugeUpdate.TPSTART(QUDA_PROFILE_TOTAL);
-
+  auto profile = pushProfile(profileGaugeUpdate);
   checkGaugeParam(param);
 
-  profileGaugeUpdate.TPSTART(QUDA_PROFILE_INIT);
-
   // create the host fields
   GaugeFieldParam gParam(*param, gauge, QUDA_SU3_LINKS);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  gParam.site_offset = param->gauge_offset;
-  gParam.site_size = param->site_size;
   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
-  cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;
+  GaugeField cpuGauge = need_cpu ? GaugeField(gParam) : GaugeField();
 
-  GaugeFieldParam gParamMom(*param, momentum);
-  gParamMom.reconstruct = (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ?
-   QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10;
-  gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
-  gParamMom.site_offset = param->mom_offset;
-  gParamMom.site_size = param->site_size;
-  cpuGaugeField *cpuMom = !param->use_resident_mom ? new cpuGaugeField(gParamMom) : nullptr;
+  GaugeFieldParam gParamMom(*param, momentum, QUDA_ASQTAD_MOM_LINKS);
+  GaugeField cpuMom = !param->use_resident_mom ? GaugeField(gParamMom) : GaugeField();
 
   // create the device fields
+  if (param->use_resident_mom && momResident.empty()) errorQuda("No resident mom field allocated");
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
-  gParam.create = QUDA_NULL_FIELD_CREATE;
-  gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
+  gParam.create = QUDA_COPY_FIELD_CREATE;
+  gParam.field = &cpuMom;
   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
+  gParam.setPrecision(gParam.Precision(), true);
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   gParam.pad = 0;
-  cudaGaugeField *cudaMom = !param->use_resident_mom ? new cudaGaugeField(gParam) : nullptr;
+  GaugeField cudaMom = param->use_resident_mom ? momResident.create_alias() : GaugeField(gParam);
 
+  if (param->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field allocated");
   gParam.link_type = QUDA_SU3_LINKS;
   gParam.reconstruct = param->reconstruct;
-  cudaGaugeField *cudaInGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr;
-  auto *cudaOutGauge = new cudaGaugeField(gParam);
-
-  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_INIT);
-
-  profileGaugeUpdate.TPSTART(QUDA_PROFILE_H2D);
-
-  if (!param->use_resident_gauge) {   // load fields onto the device
-    cudaInGauge->loadCPUField(*cpuGauge);
-  } else { // or use resident fields already present
-    if (!gaugePrecise) errorQuda("No resident gauge field allocated");
-    cudaInGauge = gaugePrecise;
-    gaugePrecise = nullptr;
-  }
-
-  if (!param->use_resident_mom) {
-    cudaMom->loadCPUField(*cpuMom);
-  } else {
-    if (!momResident) errorQuda("No resident mom field allocated");
-    cudaMom = momResident;
-    momResident = nullptr;
-  }
-
-  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_H2D);
+  gParam.setPrecision(gParam.Precision(), true);
+  gParam.field = &cpuGauge;
+  GaugeField u_in = param->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
+  gParam.create = QUDA_NULL_FIELD_CREATE;
+  GaugeField u_out(gParam);
 
   // perform the update
-  profileGaugeUpdate.TPSTART(QUDA_PROFILE_COMPUTE);
-  updateGaugeField(*cudaOutGauge, dt, *cudaInGauge, *cudaMom,
-      (bool)conj_mom, (bool)exact);
-  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_COMPUTE);
+  updateGaugeField(u_out, dt, u_in, cudaMom, (bool)conj_mom, (bool)exact);
 
-  if (param->return_result_gauge) {
-    // copy the gauge field back to the host
-    profileGaugeUpdate.TPSTART(QUDA_PROFILE_D2H);
-    cudaOutGauge->saveCPUField(*cpuGauge);
-    profileGaugeUpdate.TPSTOP(QUDA_PROFILE_D2H);
-  }
+  // copy the gauge field back to the host
+  if (param->return_result_gauge) cpuGauge.copy(u_out);
 
-  profileGaugeUpdate.TPSTART(QUDA_PROFILE_FREE);
   if (param->make_resident_gauge) {
-    if (gaugePrecise != nullptr) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-    gaugePrecise = cudaOutGauge;
-  } else {
-    delete cudaOutGauge;
+    if (gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    gaugePrecise = new GaugeField();
+    std::exchange(*gaugePrecise, u_out);
   }
 
-  if (param->make_resident_mom) {
-    if (momResident != nullptr && momResident != cudaMom) delete momResident;
-    momResident = cudaMom;
-  } else {
-    delete cudaMom;
-  }
-
-  delete cudaInGauge;
-  if (cpuMom) delete cpuMom;
-  if (cpuGauge) delete cpuGauge;
-
-  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_FREE);
-  profileGaugeUpdate.TPSTOP(QUDA_PROFILE_TOTAL);
+  if (param->make_resident_mom && !param->use_resident_mom)
+    std::exchange(momResident, cudaMom);
+  else if (!param->make_resident_mom)
+    momResident = GaugeField();
 }
 
- void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) {
-   profileProject.TPSTART(QUDA_PROFILE_TOTAL);
-
-   profileProject.TPSTART(QUDA_PROFILE_INIT);
-   checkGaugeParam(param);
-
-   // create the gauge field
-   GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS);
-   gParam.location = QUDA_CPU_FIELD_LOCATION;
-   gParam.site_offset = param->gauge_offset;
-   gParam.site_size = param->site_size;
-   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
-   cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;
-
-   // create the device fields
-   gParam.location = QUDA_CUDA_FIELD_LOCATION;
-   gParam.create = QUDA_NULL_FIELD_CREATE;
-   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
-   gParam.reconstruct = param->reconstruct;
-   cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr;
-   profileProject.TPSTOP(QUDA_PROFILE_INIT);
-
-   if (param->use_resident_gauge) {
-     if (!gaugePrecise) errorQuda("No resident gauge field to use");
-     cudaGauge = gaugePrecise;
-     gaugePrecise = nullptr;
-   } else {
-     profileProject.TPSTART(QUDA_PROFILE_H2D);
-     cudaGauge->loadCPUField(*cpuGauge);
-     profileProject.TPSTOP(QUDA_PROFILE_H2D);
-   }
+void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param)
+{ // projects a gauge field (built from gauge_h, or the resident gaugePrecise) onto SU(3) via projectSU3, passing tol through
+  auto profile = pushProfile(profileProject); // replaces the explicit profileProject TPSTART/TPSTOP pairs (presumably stopped when `profile` is destroyed - confirm)
+  checkGaugeParam(param);
 
-   profileProject.TPSTART(QUDA_PROFILE_COMPUTE);
-   *num_failures_h = 0;
+  // create the host-located gauge field from gauge_h; left default-constructed when it is neither uploaded nor written back
+  GaugeFieldParam gParam(*param, gauge_h, QUDA_SU3_LINKS);
+  gParam.location = QUDA_CPU_FIELD_LOCATION;
+  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
+  GaugeField cpuGauge = need_cpu ? GaugeField(gParam) : GaugeField();
 
-   // project onto SU(3)
-   if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase();
-   projectSU3(*cudaGauge, tol, num_failures_d);
-   if (!cudaGauge->StaggeredPhaseApplied() && param->staggered_phase_applied) cudaGauge->applyStaggeredPhase();
+  // create the device field: an alias of the resident gaugePrecise, or a new field copy-created from cpuGauge
+  if (param->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field to use");
+  gParam.location = QUDA_CUDA_FIELD_LOCATION;
+  gParam.create = QUDA_COPY_FIELD_CREATE;
+  gParam.field = &cpuGauge;
+  gParam.reconstruct = param->reconstruct;
+  gParam.setPrecision(gParam.Precision(), true);
+  GaugeField cudaGauge = param->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
 
-   profileProject.TPSTOP(QUDA_PROFILE_COMPUTE);
+  *num_failures_h = 0; // reset the failure count that is tested after the projection
 
-   if(*num_failures_h>0)
-     errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
+  // project onto SU(3): staggered phases are removed first and re-applied afterwards if param->staggered_phase_applied is set
+  if (cudaGauge.StaggeredPhaseApplied()) cudaGauge.removeStaggeredPhase();
+  projectSU3(cudaGauge, tol, num_failures_d);
+  if (!cudaGauge.StaggeredPhaseApplied() && param->staggered_phase_applied) cudaGauge.applyStaggeredPhase();
 
-   profileProject.TPSTART(QUDA_PROFILE_D2H);
-   if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge);
-   profileProject.TPSTOP(QUDA_PROFILE_D2H);
+  if (*num_failures_h > 0) errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);
 
-   if (param->make_resident_gauge) {
-     if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-     gaugePrecise = cudaGauge;
-   } else {
-     delete cudaGauge;
-   }
+  if (param->return_result_gauge) cpuGauge.copy(cudaGauge); // copy the projected field into the host-located field on request
 
-   profileProject.TPSTART(QUDA_PROFILE_FREE);
-   if (cpuGauge) delete cpuGauge;
-   profileProject.TPSTOP(QUDA_PROFILE_FREE);
-
-   profileProject.TPSTOP(QUDA_PROFILE_TOTAL);
- }
-
- void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) {
-   profilePhase.TPSTART(QUDA_PROFILE_TOTAL);
-
-   profilePhase.TPSTART(QUDA_PROFILE_INIT);
-   checkGaugeParam(param);
-
-   // create the gauge field
-   GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS);
-   bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
-   gParam.location = QUDA_CPU_FIELD_LOCATION;
-   cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr;
-
-   // create the device fields
-   gParam.location = QUDA_CUDA_FIELD_LOCATION;
-   gParam.create = QUDA_NULL_FIELD_CREATE;
-   gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
-   gParam.reconstruct = param->reconstruct;
-   cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr;
-   profilePhase.TPSTOP(QUDA_PROFILE_INIT);
-
-   if (param->use_resident_gauge) {
-     if (!gaugePrecise) errorQuda("No resident gauge field to use");
-     cudaGauge = gaugePrecise;
-   } else {
-     profilePhase.TPSTART(QUDA_PROFILE_H2D);
-     cudaGauge->loadCPUField(*cpuGauge);
-     profilePhase.TPSTOP(QUDA_PROFILE_H2D);
-   }
+  if (param->make_resident_gauge && !param->use_resident_gauge) { // only a locally created field can become the new resident field
+    if (gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    gaugePrecise = new GaugeField();
+    *gaugePrecise = std::move(cudaGauge); // cudaGauge is not used after this point: move it into the resident field instead of copying it
+  }
+}
 
-   profilePhase.TPSTART(QUDA_PROFILE_COMPUTE);
-   *num_failures_h = 0;
+void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param)
+{ // toggles the staggered phases of a gauge field (built from gauge_h, or the resident gaugePrecise): applied if absent, removed if present
+  auto profile = pushProfile(profilePhase); // replaces the explicit profilePhase TPSTART/TPSTOP pairs (presumably stopped when `profile` is destroyed - confirm)
+  checkGaugeParam(param);
 
-   // apply / remove phase as appropriate
-   if (!cudaGauge->StaggeredPhaseApplied()) cudaGauge->applyStaggeredPhase();
-   else cudaGauge->removeStaggeredPhase();
+  // create the host-located gauge field from gauge_h; left default-constructed when it is neither uploaded nor written back
+  GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS);
+  bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
+  gParam.location = QUDA_CPU_FIELD_LOCATION;
+  GaugeField cpuGauge = need_cpu ? GaugeField(gParam) : GaugeField();
 
-   profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE);
+  // create the device field: an alias of the resident gaugePrecise, or a new field copy-created from cpuGauge
+  if (param->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field to use");
+  gParam.location = QUDA_CUDA_FIELD_LOCATION;
+  gParam.create = QUDA_COPY_FIELD_CREATE;
+  gParam.field = &cpuGauge;
+  gParam.reconstruct = param->reconstruct;
+  gParam.setPrecision(gParam.Precision(), true);
+  GaugeField cudaGauge = param->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam);
 
-   profilePhase.TPSTART(QUDA_PROFILE_D2H);
-   if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge);
-   profilePhase.TPSTOP(QUDA_PROFILE_D2H);
+  *num_failures_h = 0; // NOTE(review): nothing in this function reads num_failures_h afterwards - confirm this reset is still needed
 
-   if (param->make_resident_gauge) {
-     if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-     gaugePrecise = cudaGauge;
-   } else {
-     delete cudaGauge;
-   }
+  // apply / remove phase as appropriate
+  if (!cudaGauge.StaggeredPhaseApplied())
+    cudaGauge.applyStaggeredPhase();
+  else
+    cudaGauge.removeStaggeredPhase();
 
-   profilePhase.TPSTART(QUDA_PROFILE_FREE);
-   if (cpuGauge) delete cpuGauge;
-   profilePhase.TPSTOP(QUDA_PROFILE_FREE);
+  if (param->return_result_gauge) cpuGauge.copy(cudaGauge); // copy the updated field into the host-located field on request
 
-   profilePhase.TPSTOP(QUDA_PROFILE_TOTAL);
- }
+  if (param->make_resident_gauge && !param->use_resident_gauge) { // only a locally created field can become the new resident field
+    if (gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    gaugePrecise = new GaugeField();
+    *gaugePrecise = std::move(cudaGauge); // cudaGauge is not used after this point: move it into the resident field instead of copying it
+  }
+}
 
 // evaluate the momentum action
 double momActionQuda(void* momentum, QudaGaugeParam* param)
 {
-  profileMomAction.TPSTART(QUDA_PROFILE_TOTAL);
-
-  profileMomAction.TPSTART(QUDA_PROFILE_INIT);
+  auto profile = pushProfile(profileMomAction); // replaces the explicit profileMomAction TPSTART/TPSTOP pairs (presumably stopped when `profile` is destroyed - confirm)
   checkGaugeParam(param);
 
   // create the momentum fields
   GaugeFieldParam gParam(*param, momentum, QUDA_ASQTAD_MOM_LINKS);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  gParam.reconstruct = (gParam.order == QUDA_TIFR_GAUGE_ORDER || gParam.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ?
-    QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10;
-  gParam.site_offset = param->mom_offset;
-  gParam.site_size = param->site_size;
-
-  cpuGaugeField *cpuMom = !param->use_resident_mom ? new cpuGaugeField(gParam) : nullptr;
+  GaugeField cpuMom = !param->use_resident_mom ? GaugeField(gParam) : GaugeField(); // host-located field built from `momentum`; default-constructed when the resident momentum is used
 
   // create the device fields
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
-  gParam.create = QUDA_NULL_FIELD_CREATE;
+  gParam.field = &cpuMom; // source field for the QUDA_COPY_FIELD_CREATE below
+  gParam.create = QUDA_COPY_FIELD_CREATE;
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
   gParam.setPrecision(param->cuda_prec, true);
 
-  cudaGaugeField *cudaMom = !param->use_resident_mom ? new cudaGaugeField(gParam) : nullptr;
-
-  profileMomAction.TPSTOP(QUDA_PROFILE_INIT);
-
-  profileMomAction.TPSTART(QUDA_PROFILE_H2D);
-  if (!param->use_resident_mom) {
-    cudaMom->loadCPUField(*cpuMom);
-  } else {
-    if (!momResident) errorQuda("No resident mom field allocated");
-    cudaMom = momResident;
-  }
-  profileMomAction.TPSTOP(QUDA_PROFILE_H2D);
+  if (param->use_resident_mom && momResident.empty()) errorQuda("No resident mom field allocated");
+  GaugeField cudaMom = param->use_resident_mom ? momResident.create_alias() : GaugeField(gParam); // alias of the resident momentum, or a new device field created from gParam
 
   // perform the update
-  profileMomAction.TPSTART(QUDA_PROFILE_COMPUTE);
-  double action = computeMomAction(*cudaMom);
-  profileMomAction.TPSTOP(QUDA_PROFILE_COMPUTE);
-
-  profileMomAction.TPSTART(QUDA_PROFILE_FREE);
-  if (param->make_resident_mom) {
-    if (momResident != nullptr && momResident != cudaMom) delete momResident;
-    momResident = cudaMom;
-  } else {
-    delete cudaMom;
-    momResident = nullptr;
-  }
-  if (cpuMom) {
-    delete cpuMom;
-  }
+  double action = computeMomAction(cudaMom); // the value returned to the caller
 
-  profileMomAction.TPSTOP(QUDA_PROFILE_FREE);
-  profileMomAction.TPSTOP(QUDA_PROFILE_TOTAL);
+  if (param->make_resident_mom && !param->use_resident_mom)
+    std::exchange(momResident, cudaMom); // NOTE(review): the return value is discarded, so this copy-assigns cudaMom into momResident - confirm a copy rather than a move is intended
+  else if (!param->make_resident_mom)
+    momResident = GaugeField(); // not asked to keep a resident momentum: reset momResident to a default-constructed field
 
   return action;
 }
 
 void gaussGaugeQuda(unsigned long long seed, double sigma)
 {
-  profileGauss.TPSTART(QUDA_PROFILE_TOTAL);
+  auto profile = pushProfile(profileGauss); // replaces the explicit profileGauss TPSTART/TPSTOP pairs (presumably stopped when `profile` is destroyed - confirm)
 
   if (!gaugePrecise) errorQuda("Cannot generate Gauss GaugeField as there is no resident gauge field");
-
-  cudaGaugeField *data = gaugePrecise;
-
-  profileGauss.TPSTART(QUDA_PROFILE_COMPUTE);
-  quda::gaugeGauss(*data, seed, sigma);
-  profileGauss.TPSTOP(QUDA_PROFILE_COMPUTE);
+  quda::gaugeGauss(*gaugePrecise, seed, sigma); // run gaugeGauss directly on the resident gauge field with the caller's seed and sigma
 
   if (extendedGaugeResident) {
     extendedGaugeResident->copy(*gaugePrecise);
     extendedGaugeResident->exchangeExtendedGhost(R, profileGauss, redundant_comms);
   }
-
-  profileGauss.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 void gaussMomQuda(unsigned long long seed, double sigma)
 {
-  profileGauss.TPSTART(QUDA_PROFILE_TOTAL);
-
-  if (!momResident) errorQuda("Cannot generate Gauss GaugeField as there is no resident momentum field");
-
-  cudaGaugeField *data = momResident;
-
-  profileGauss.TPSTART(QUDA_PROFILE_COMPUTE);
-  quda::gaugeGauss(*data, seed, sigma);
-  profileGauss.TPSTOP(QUDA_PROFILE_COMPUTE);
-
-  profileGauss.TPSTOP(QUDA_PROFILE_TOTAL);
+  auto profile = pushProfile(profileGauss); // replaces the explicit profileGauss TPSTART/TPSTOP pairs (presumably stopped when `profile` is destroyed - confirm)
+  if (momResident.empty()) errorQuda("Cannot generate Gauss GaugeField as there is no resident momentum field"); // momResident is now an object tested with empty(), no longer a pointer
+  quda::gaugeGauss(momResident, seed, sigma); // run gaugeGauss directly on the resident momentum field with the caller's seed and sigma
 }
 
 /*
@@ -5604,21 +5014,17 @@ void gaussMomQuda(unsigned long long seed, double sigma)
  */
 void plaqQuda(double plaq[3])
 {
-  profilePlaq.TPSTART(QUDA_PROFILE_TOTAL);
+  auto profile = pushProfile(profilePlaq); // replaces the explicit profilePlaq TPSTART/TPSTOP pairs (presumably stopped when `profile` is destroyed - confirm)
 
   if (!gaugePrecise) errorQuda("Cannot compute plaquette as there is no resident gauge field");
 
-  cudaGaugeField *data = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq);
+  GaugeField *data = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq); // reuse the resident extended field if present, else create one from gaugePrecise (stored in extendedGaugeResident on the next line)
   extendedGaugeResident = data;
 
-  profilePlaq.TPSTART(QUDA_PROFILE_COMPUTE);
   double3 plaq3 = quda::plaquette(*data);
   plaq[0] = plaq3.x;
   plaq[1] = plaq3.y;
   plaq[2] = plaq3.z;
-  profilePlaq.TPSTOP(QUDA_PROFILE_COMPUTE);
-
-  profilePlaq.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 /*
@@ -5672,23 +5078,23 @@ void copyExtendedResidentGaugeQuda(void *resident_gauge)
 
 void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int n_steps, double alpha)
 {
+  auto profile = pushProfile(profileWuppertal);
+  pushVerbosity(inv_param->verbosity);
   if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
 
-  pushVerbosity(inv_param->verbosity);
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
 
-  cudaGaugeField *precise = nullptr;
+  GaugeField *precise = nullptr;
 
   if (gaugeSmeared != nullptr) {
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Wuppertal smearing done with gaugeSmeared\n");
+    logQuda(QUDA_VERBOSE, "Wuppertal smearing done with gaugeSmeared\n");
     GaugeFieldParam gParam(*gaugePrecise);
     gParam.create = QUDA_NULL_FIELD_CREATE;
-    precise = new cudaGaugeField(gParam);
+    precise = new GaugeField(gParam);
     copyExtendedGauge(*precise, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION);
     precise->exchangeGhost();
   } else {
-    if (getVerbosity() >= QUDA_VERBOSE)
-      printfQuda("Wuppertal smearing done with gaugePrecise\n");
+    logQuda(QUDA_VERBOSE, "Wuppertal smearing done with gaugePrecise\n");
     precise = gaugePrecise;
   }
 
@@ -5699,11 +5105,7 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param,
   ColorSpinorField in(cudaParam);
   in = in_h;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
-    double cpu = blas::norm2(in_h);
-    double gpu = blas::norm2(in);
-    printfQuda("In CPU %e CUDA %e\n", cpu, gpu);
-  }
+  logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in));
 
   cudaParam.create = QUDA_NULL_FIELD_CREATE;
   ColorSpinorField out(cudaParam);
@@ -5723,10 +5125,7 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param,
   for (unsigned int i = 0; i < n_steps; i++) {
     if (i) in = out;
     ApplyLaplace(out, in, *precise, 3, a, b, in, parity, false, comm_dim, profileWuppertal);
-    if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
-      double norm = blas::norm2(out);
-      printfQuda("Step %d, vector norm %e\n", i, norm);
-    }
+    logQuda(QUDA_DEBUG_VERBOSE, "Step %d, vector norm %e\n", i, blas::norm2(out));
   }
 
   cpuParam.v = h_out;
@@ -5734,35 +5133,27 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param,
   ColorSpinorField out_h(cpuParam);
   out_h = out;
 
-  if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
-    double cpu = blas::norm2(out_h);
-    double gpu = blas::norm2(out);
-    printfQuda("Out CPU %e CUDA %e\n", cpu, gpu);
-  }
+  logQuda(QUDA_DEBUG_VERBOSE, "Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out));
 
-  if (gaugeSmeared != nullptr)
-    delete precise;
+  if (gaugeSmeared != nullptr) delete precise;
 
   popVerbosity();
 }
- 
 
 void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_param)
 {
-  if(smear_param->n_steps == 0) return;
-  
+  if (smear_param->n_steps == 0) return;
+  auto profile = pushProfile(profileGaussianSmear, smear_param->secs, smear_param->gflops);
+
   QudaInvertParam *inv_param = smear_param->inv_param;
-  
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_TOTAL);
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_INIT);
 
   if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
-    
+
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param);
 
   if ( gaugeSmeared == nullptr || smear_param->compute_2link != 0 ) {
-  
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gaussian smearing done with gaugeSmeared\n");
+
+    logQuda(QUDA_VERBOSE, "Gaussian smearing done with gaugeSmeared\n");
     freeUniqueGaugeQuda(QUDA_SMEARED_LINKS);
 
     GaugeFieldParam gParam(*gaugePrecise);
@@ -5775,15 +5166,15 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
     gParam.nFace = 3; // FIXME: need a QudaLinkType with nFace=2.
     gParam.pad = gParam.pad*gParam.nFace;
     //
-    gaugeSmeared = new cudaGaugeField(gParam);
-    
-    cudaGaugeField *two_link_ext = createExtendedGauge(*gaugePrecise, R, profileGauge);//aux field
-    
+    gaugeSmeared = new GaugeField(gParam);
+
+    GaugeField *two_link_ext = createExtendedGauge(*gaugePrecise, R, profileGauge); // aux field
+
     computeTwoLink(*gaugeSmeared, *two_link_ext);
-    
+
     gaugeSmeared->exchangeGhost();
-    
-    delete two_link_ext;   
+
+    delete two_link_ext;
   }
 
   if (!initialized) errorQuda("QUDA not initialized");
@@ -5791,13 +5182,13 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
   if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printQudaInvertParam(inv_param); }
 
   checkInvertParam(inv_param);
-  
+
   // Create device side ColorSpinorField vectors and to pass to the
   // compute function.
   const lat_dim_t X = gaugeSmeared->X();
-  
+
   inv_param->dslash_type = QUDA_ASQTAD_DSLASH;
-  
+
   ColorSpinorParam cpuParam(h_in, *inv_param, X, QUDA_MAT_SOLUTION, QUDA_CPU_FIELD_LOCATION);
   cpuParam.nSpin = 1;
   // QUDA style pointer for host data.
@@ -5811,7 +5202,6 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
   ColorSpinorField in(cudaParam);
   ColorSpinorField out(cudaParam);
   ColorSpinorField temp1(cudaParam);
- 
 
   // Create the smearing operator
   //------------------------------------------------------
@@ -5837,33 +5227,27 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
     errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(), inv_param->cuda_prec);
   //
   d = Dirac::create(diracParam); // create the Dirac operator
-  
+
   Dirac &dirac = *d;
   DiracM qsmear_op(dirac);
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT);
 
   // Copy host data to device
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_H2D);
   in = in_h;
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_H2D);
 
   const double ftmp    = -(smear_param->width*smear_param->width)/(4.0*smear_param->n_steps*4.0);  /* Extra 4 to compensate for stride 2 */
   // Scale up the source to prevent underflow
   profileGaussianSmear.TPSTART(QUDA_PROFILE_COMPUTE);
-  
-  const double msq     = 1. / ftmp;  
+
+  const double msq = 1. / ftmp;
   const double a       = inv_param->laplace3D * 2.0 + msq;
   const QudaParity  parity   = QUDA_INVALID_PARITY;
   for (int i = 0; i < smear_param->n_steps; i++) {
     if (i > 0) std::swap(in, out);
     blas::ax(ftmp, in);
     blas::axpy(a, in, temp1);
-    
+
     qsmear_op.Expose()->SmearOp(out, in, a, 0.0, smear_param->t0, parity);
-    if (getVerbosity() >= QUDA_DEBUG_VERBOSE) {
-      double norm = blas::norm2(out);
-      printfQuda("Step %d, vector norm %e\n", i, norm);
-    }
+    logQuda(QUDA_DEBUG_VERBOSE, "Step %d, vector norm %e\n", i, blas::norm2(out));
     blas::xpay(temp1, -1.0, out);
     blas::zero(temp1);
   }
@@ -5871,30 +5255,21 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par
   profileGaussianSmear.TPSTOP(QUDA_PROFILE_COMPUTE);
 
   // Copy device data to host.
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_D2H);
   in_h = out;
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_D2H);
-
-  profileGaussianSmear.TPSTART(QUDA_PROFILE_FREE);
-
-  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Finished 2link Gaussian smearing.\n");
 
+  logQuda(QUDA_VERBOSE, "Finished 2link Gaussian smearing.\n");
   delete d;
 
-  smear_param->gflops = dirac.Flops();
-
   if (smear_param->delete_2link != 0) { freeUniqueGaugeQuda(QUDA_SMEARED_LINKS); }
 
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_FREE);
-  profileGaussianSmear.TPSTOP(QUDA_PROFILE_TOTAL);
   saveTuneCache();
 }
 
 
 void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param)
 {
+  auto profile = pushProfile(profileGaugeSmear);
   pushOutputPrefix("performGaugeSmearQuda: ");
-  profileGaugeSmear.TPSTART(QUDA_PROFILE_TOTAL);
   checkGaugeSmearParam(smear_param);
 
   if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
@@ -5903,45 +5278,36 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable
 
   GaugeFieldParam gParam(*gaugeSmeared);
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
-  auto *cudaGaugeTemp = new cudaGaugeField(gParam);
+  GaugeField tmp(gParam);
 
   int measurement_n = 0; // The nth measurement to take
   gaugeObservablesQuda(&obs_param[measurement_n]);
-  if (getVerbosity() >= QUDA_SUMMARIZE) {
-    printfQuda("Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge);
-  }
+  logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge);
 
   for (unsigned int i = 0; i < smear_param->n_steps; i++) {
-    profileGaugeSmear.TPSTART(QUDA_PROFILE_COMPUTE);
-
     switch (smear_param->smear_type) {
-    case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, *cudaGaugeTemp, smear_param->alpha); break;
-    case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, *cudaGaugeTemp, smear_param->rho); break;
+    case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, tmp, smear_param->alpha); break;
+    case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, tmp, smear_param->rho); break;
     case QUDA_GAUGE_SMEAR_OVRIMP_STOUT:
-      OvrImpSTOUTStep(*gaugeSmeared, *cudaGaugeTemp, smear_param->rho, smear_param->epsilon);
+      OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon);
       break;
     default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type);
     }
 
-    profileGaugeSmear.TPSTOP(QUDA_PROFILE_COMPUTE);
     if ((i + 1) % smear_param->meas_interval == 0) {
       measurement_n++;
       gaugeObservablesQuda(&obs_param[measurement_n]);
-      if (getVerbosity() >= QUDA_SUMMARIZE) {
-        printfQuda("Q charge at step %03d = %+.16e\n", i + 1, obs_param[measurement_n].qcharge);
-      }
+      logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", i + 1, obs_param[measurement_n].qcharge);
     }
   }
 
-  delete cudaGaugeTemp;
-  profileGaugeSmear.TPSTOP(QUDA_PROFILE_TOTAL);
   popOutputPrefix();
 }
 
 void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param)
 {
+  auto profile = pushProfile(profileWFlow);
   pushOutputPrefix("performWFlowQuda: ");
-  profileWFlow.TPSTART(QUDA_PROFILE_TOTAL);
   checkGaugeSmearParam(smear_param);
 
   if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded");
@@ -5949,186 +5315,115 @@ void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam
   gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileWFlow);
 
   GaugeFieldParam gParamEx(*gaugeSmeared);
-  auto *gaugeAux = GaugeField::Create(gParamEx);
+  GaugeField gaugeAux(gParamEx);
 
   GaugeFieldParam gParam(*gaugePrecise);
   gParam.reconstruct = QUDA_RECONSTRUCT_NO; // temporary field is not on manifold so cannot use reconstruct
-  auto *gaugeTemp = GaugeField::Create(gParam);
+  GaugeField gaugeTemp(gParam);
 
-  GaugeField *in = gaugeSmeared;
-  GaugeField *out = gaugeAux;
+  GaugeField &in = *gaugeSmeared;
+  GaugeField &out = gaugeAux;
 
   int measurement_n = 0; // The nth measurement to take
 
-  gaugeObservables(*in, obs_param[measurement_n], profileWFlow);
+  gaugeObservables(in, obs_param[measurement_n]);
 
-  if (getVerbosity() >= QUDA_SUMMARIZE) {
-    printfQuda("flow t, plaquette, E_tot, E_spatial, E_temporal, Q charge\n");
-    printfQuda("%le %.16e %+.16e %+.16e %+.16e %+.16e\n", 0.0, obs_param[0].plaquette[0], obs_param[0].energy[0],
-               obs_param[0].energy[1], obs_param[0].energy[2], obs_param[0].qcharge);
-  }
+  logQuda(QUDA_SUMMARIZE, "flow t, plaquette, E_tot, E_spatial, E_temporal, Q charge\n");
+  logQuda(QUDA_SUMMARIZE, "%le %.16e %+.16e %+.16e %+.16e %+.16e\n", 0.0, obs_param[0].plaquette[0],
+          obs_param[0].energy[0], obs_param[0].energy[1], obs_param[0].energy[2], obs_param[0].qcharge);
 
   for (unsigned int i = 0; i < smear_param->n_steps; i++) {
     // Perform W1, W2, and Vt Wilson Flow steps as defined in
     // https://arxiv.org/abs/1006.4518v3
-    profileWFlow.TPSTART(QUDA_PROFILE_COMPUTE);
     if (i > 0) std::swap(in, out); // output from prior step becomes input for next step
-    WFlowStep(*out, *gaugeTemp, *in, smear_param->epsilon, smear_param->smear_type);
-    profileWFlow.TPSTOP(QUDA_PROFILE_COMPUTE);
+    WFlowStep(out, gaugeTemp, in, smear_param->epsilon, smear_param->smear_type);
 
     if ((i + 1) % smear_param->meas_interval == 0) {
       measurement_n++; // increment measurements.
-      gaugeObservables(*out, obs_param[measurement_n], profileWFlow);
-      if (getVerbosity() >= QUDA_SUMMARIZE) {
-        printfQuda("%le %.16e %+.16e %+.16e %+.16e %+.16e\n", smear_param->epsilon * (i + 1),
-                   obs_param[measurement_n].plaquette[0], obs_param[measurement_n].energy[0],
-                   obs_param[measurement_n].energy[1], obs_param[measurement_n].energy[2],
-                   obs_param[measurement_n].qcharge);
-      }
+      gaugeObservables(out, obs_param[measurement_n]);
+      logQuda(QUDA_SUMMARIZE, "%le %.16e %+.16e %+.16e %+.16e %+.16e\n", smear_param->epsilon * (i + 1),
+              obs_param[measurement_n].plaquette[0], obs_param[measurement_n].energy[0],
+              obs_param[measurement_n].energy[1], obs_param[measurement_n].energy[2], obs_param[measurement_n].qcharge);
     }
   }
 
-  delete gaugeTemp;
-  delete gaugeAux;
-  profileWFlow.TPSTOP(QUDA_PROFILE_TOTAL);
   popOutputPrefix();
 }
 
 int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps,
                               const unsigned int verbose_interval, const double relax_boost, const double tolerance,
-                              const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param,
-                              double *timeinfo)
+                              const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param)
 {
-  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_TOTAL);
-
+  auto profile = pushProfile(GaugeFixOVRQuda); // replaces the explicit GaugeFixOVRQuda TPSTART/TPSTOP pairs (presumably stopped when `profile` is destroyed - confirm)
   checkGaugeParam(param);
 
-  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_INIT);
-
   GaugeFieldParam gParam(*param, gauge);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  gParam.site_offset = param->gauge_offset;
-  gParam.site_size = param->site_size;
-  auto *cpuGauge = new cpuGaugeField(gParam);
+  GaugeField cpuGauge(gParam); // host-located field built from the caller's `gauge` pointer
 
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.link_type = param->type;
   gParam.reconstruct = param->reconstruct;
   gParam.setPrecision(gParam.Precision(), true);
-  auto *cudaInGauge = new cudaGaugeField(gParam);
-
-  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_INIT);
-  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_H2D);
-
-  cudaInGauge->loadCPUField(*cpuGauge);
+  GaugeField cudaInGauge(gParam); // device-located field created with QUDA_NULL_FIELD_CREATE; filled by the copy below
 
-  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_H2D);
+  cudaInGauge.copy(cpuGauge); // bring the host field contents into the device field
 
-  cudaGaugeField *cudaInGaugeEx = nullptr;
+  GaugeField *cudaInGaugeEx = createExtendedGauge(cudaInGauge, R, GaugeFixOVRQuda); // extended field the gauge fixing operates on; now created unconditionally
 
-  if (comm_size() == 1) {
-    // perform the update
-    GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_COMPUTE);
-    gaugeFixingOVR(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval,
-                   stopWtheta);
-    GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_COMPUTE);
-  } else {
-    cudaInGaugeEx = createExtendedGauge(*cudaInGauge, R, GaugeFixOVRQuda);
-
-    // perform the update
-    GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_COMPUTE);
-    gaugeFixingOVR(*cudaInGaugeEx, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval,
-                   stopWtheta);
-    GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_COMPUTE);
+  // perform the update
+  gaugeFixingOVR(*cudaInGaugeEx, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval,
+                 stopWtheta);
 
-    copyExtendedGauge(*cudaInGauge, *cudaInGaugeEx, QUDA_CUDA_FIELD_LOCATION);
-  }
+  copyExtendedGauge(cudaInGauge, *cudaInGaugeEx, QUDA_CUDA_FIELD_LOCATION); // presumably copies the fixed links from the extended field back into cudaInGauge - confirm argument order
 
   // copy the gauge field back to the host
-  GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_D2H);
-  cudaInGauge->saveCPUField(*cpuGauge);
-  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_D2H);
-
-  GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_TOTAL);
+  cpuGauge.copy(cudaInGauge);
 
   if (param->make_resident_gauge) {
-    if (gaugePrecise != nullptr) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-    gaugePrecise = cudaInGauge;
+    freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    gaugePrecise = new GaugeField();
+    *gaugePrecise = std::move(cudaInGauge); // cudaInGauge is not used after this point: move it into the resident field instead of copying it
     if (extendedGaugeResident) delete extendedGaugeResident;
     extendedGaugeResident = cudaInGaugeEx;
   } else {
-    delete cudaInGauge;
-    if (cudaInGaugeEx) delete cudaInGaugeEx;
-  }
-
-  delete cpuGauge;
-
-  if(timeinfo){
-    timeinfo[0] = GaugeFixOVRQuda.Last(QUDA_PROFILE_H2D);
-    timeinfo[1] = GaugeFixOVRQuda.Last(QUDA_PROFILE_COMPUTE);
-    timeinfo[2] = GaugeFixOVRQuda.Last(QUDA_PROFILE_D2H);
+    delete cudaInGaugeEx; // the extended field is not kept resident, so release it here
   }
 
   return 0;
 }
 
-int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir,  const unsigned int Nsteps, \
-  const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, \
-  const unsigned int  stopWtheta, QudaGaugeParam* param , double* timeinfo)
+int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps,
+                              const unsigned int verbose_interval, const double alpha, const unsigned int autotune,
+                              const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param)
 {
-  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_TOTAL);
-
+  auto profile = pushProfile(GaugeFixFFTQuda); // replaces the explicit GaugeFixFFTQuda TPSTART/TPSTOP pairs (presumably stopped when `profile` is destroyed - confirm)
   checkGaugeParam(param);
 
-  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_INIT);
-
   GaugeFieldParam gParam(*param, gauge);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  gParam.site_offset = param->gauge_offset;
-  gParam.site_size = param->site_size;
-  auto *cpuGauge = new cpuGaugeField(gParam);
+  GaugeField cpuGauge(gParam); // host-located field built from the caller's `gauge` pointer
 
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.link_type = param->type;
   gParam.reconstruct = param->reconstruct;
   gParam.setPrecision(gParam.Precision(), true);
-  auto *cudaInGauge = new cudaGaugeField(gParam);
+  GaugeField cudaInGauge(gParam); // device-located field created with QUDA_NULL_FIELD_CREATE; filled by the copy below
 
-  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_INIT);
-
-  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_H2D);
-
-  cudaInGauge->loadCPUField(*cpuGauge);
-
-  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_H2D);
+  cudaInGauge.copy(cpuGauge); // bring the host field contents into the device field
 
   // perform the update
-  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_COMPUTE);
-
-  gaugeFixingFFT(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
-
-  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_COMPUTE);
+  gaugeFixingFFT(cudaInGauge, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
 
   // copy the gauge field back to the host
-  GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_D2H);
-  cudaInGauge->saveCPUField(*cpuGauge);
-  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_D2H);
-
-  GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_TOTAL);
+  cpuGauge.copy(cudaInGauge);
 
   if (param->make_resident_gauge) {
-    if (gaugePrecise != nullptr) freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
-    gaugePrecise = cudaInGauge;
-  } else {
-    delete cudaInGauge;
-  }
-
-  if (timeinfo) {
-    timeinfo[0] = GaugeFixFFTQuda.Last(QUDA_PROFILE_H2D);
-    timeinfo[1] = GaugeFixFFTQuda.Last(QUDA_PROFILE_COMPUTE);
-    timeinfo[2] = GaugeFixFFTQuda.Last(QUDA_PROFILE_D2H);
+    freeUniqueGaugeQuda(QUDA_WILSON_LINKS);
+    gaugePrecise = new GaugeField();
+    *gaugePrecise = std::move(cudaInGauge); // cudaInGauge is not used after this point: move it into the resident field instead of copying it
   }
 
   return 0;
@@ -6137,12 +5432,10 @@ int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir,  const
 void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const QudaContractType cType,
                   QudaInvertParam *param, const int *X)
 {
+  auto profile = pushProfile(profileContract);
   // DMH: Easiest way to construct ColorSpinorField? Do we require the user
   //     to declare and fill and invert_param, or can it just be hacked?.
 
-  profileContract.TPSTART(QUDA_PROFILE_TOTAL);
-  profileContract.TPSTART(QUDA_PROFILE_INIT);
-
   // wrap CPU host side pointers
   lat_dim_t X_ = {X[0], X[1], X[2], X[3]};
   ColorSpinorParam cpuParam((void *)hp_x, *param, X_, false, param->input_location);
@@ -6165,33 +5458,27 @@ void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const Quda
 
   size_t data_bytes = x[0].Volume() * x[0].Nspin() * x[0].Nspin() * 2 * x[0].Precision();
   void *d_result = pool_device_malloc(data_bytes);
-  profileContract.TPSTOP(QUDA_PROFILE_INIT);
 
-  profileContract.TPSTART(QUDA_PROFILE_H2D);
   x[0] = h_x;
   y[0] = h_y;
-  profileContract.TPSTOP(QUDA_PROFILE_H2D);
 
-  profileContract.TPSTART(QUDA_PROFILE_COMPUTE);
   contractQuda(x[0], y[0], d_result, cType);
-  profileContract.TPSTOP(QUDA_PROFILE_COMPUTE);
 
   profileContract.TPSTART(QUDA_PROFILE_D2H);
   qudaMemcpy(h_result, d_result, data_bytes, qudaMemcpyDeviceToHost);
   profileContract.TPSTOP(QUDA_PROFILE_D2H);
 
   pool_device_free(d_result);
-  profileContract.TPSTOP(QUDA_PROFILE_TOTAL);
 }
 
 void gaugeObservablesQuda(QudaGaugeObservableParam *param)
 {
-  profileGaugeObs.TPSTART(QUDA_PROFILE_TOTAL);
+  auto profile = pushProfile(profileGaugeObs);
   checkGaugeObservableParam(param);
 
   if (!gaugePrecise) errorQuda("Cannot compute Polyakov loop as there is no resident gauge field");
 
-  cudaGaugeField *gauge = nullptr;
+  GaugeField *gauge = nullptr;
   if (!gaugeSmeared) {
     if (!extendedGaugeResident) extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileGaugeObs);
     gauge = extendedGaugeResident;
@@ -6207,6 +5494,5 @@ void gaugeObservablesQuda(QudaGaugeObservableParam *param)
       errorQuda("Removing staggered phases was requested, however staggered phases aren't already applied");
   }
 
-  gaugeObservables(*gauge, *param, profileGaugeObs);
-  profileGaugeObs.TPSTOP(QUDA_PROFILE_TOTAL);
+  gaugeObservables(*gauge, *param);
 }
diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp
index c5d3bf90a1..4fdf08020a 100644
--- a/lib/inv_bicgstab_quda.cpp
+++ b/lib/inv_bicgstab_quda.cpp
@@ -214,10 +214,6 @@ namespace quda {
 
     PrintStats("BiCGstab", k, r2, b2, heavy_quark_res);
 
-    if (!param.is_preconditioner) { // do not do the below if we this is an inner solver
-      blas::flops = 0;
-    }
-
     profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
 
@@ -344,10 +340,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops())*1e-9;
-
-    param.gflops += gflops;
     param.iter += k;
 
     if (k==param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
@@ -363,12 +355,6 @@ namespace quda {
       PrintSummary("BiCGstab", k, r2, b2, stop, param.tol_hq);
     }
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-    matPrecon.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
 
     profile.TPSTART(QUDA_PROFILE_FREE);
diff --git a/lib/inv_bicgstabl_quda.cpp b/lib/inv_bicgstabl_quda.cpp
index 0393fe308c..b0e00d9ff5 100644
--- a/lib/inv_bicgstabl_quda.cpp
+++ b/lib/inv_bicgstabl_quda.cpp
@@ -50,7 +50,6 @@ namespace quda {
 
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-      param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
       profile.TPSTART(QUDA_PROFILE_EIGEN);
     }
 
@@ -61,7 +60,6 @@ namespace quda {
 
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_EIGEN);
-      param.secs += profile.Last(QUDA_PROFILE_EIGEN);
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
     }
 
@@ -562,7 +560,6 @@ namespace quda {
     double heavy_quark_res = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x, r_full).z) : 0.0;
     const int heavy_quark_check = param.heavy_quark_check; // how often to check the heavy quark residual
 
-    blas::flops = 0;
     //bool l2_converge = false;
     //double r2_old = r2;
 
@@ -706,9 +703,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matEig.flops()) * 1e-9;
-    param.gflops = gflops;
     param.iter += total_iter;
 
     if (total_iter >= param.maxiter) // >= if n_krylov doesn't divide max iter.
@@ -726,12 +720,7 @@ namespace quda {
       param.true_res_hq = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x, r[0]).z) : 0.0;
     }
 
-    // Reset flops counters.
-    blas::flops = 0;
-    mat.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
-    param.secs += profile.Last(QUDA_PROFILE_EPILOGUE);
 
     PrintSummary(solver_name.c_str(), total_iter, r2, b2, stop, param.tol_hq);
   }
diff --git a/lib/inv_ca_cg.cpp b/lib/inv_ca_cg.cpp
index ec95bf3ffe..445b2acaf3 100644
--- a/lib/inv_ca_cg.cpp
+++ b/lib/inv_ca_cg.cpp
@@ -184,10 +184,7 @@ namespace quda
   {
     Solver::create(x, b);
     if (!init) {
-      if (!param.is_preconditioner) {
-        blas::flops = 0;
-        profile.TPSTART(QUDA_PROFILE_INIT);
-      }
+      if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT);
 
       Q_AQandg.resize(param.Nkrylov * (param.Nkrylov + 1));
       Q_AS.resize(param.Nkrylov * param.Nkrylov);
@@ -248,7 +245,6 @@ namespace quda
   {
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-      param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
       profile.TPSTART(QUDA_PROFILE_EIGEN);
     }
 
@@ -290,7 +286,6 @@ namespace quda
 
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_EIGEN);
-      param.secs += profile.Last(QUDA_PROFILE_EIGEN);
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
     }
   }
@@ -318,7 +313,6 @@ namespace quda
   {
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-      param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
       profile.TPSTART(QUDA_PROFILE_EIGEN);
     }
 
@@ -357,7 +351,6 @@ namespace quda
 
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_EIGEN);
-      param.secs += profile.Last(QUDA_PROFILE_EIGEN);
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
     }
   }
@@ -522,7 +515,6 @@ namespace quda
     int resIncreaseTotal = 0;
 
     if (!param.is_preconditioner) {
-      blas::flops = 0;
       profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
     }
@@ -675,25 +667,8 @@ namespace quda
     }
 
     if (!param.is_preconditioner) {
-      qudaDeviceSynchronize(); // ensure solver is complete before ending timing
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-      profile.TPSTART(QUDA_PROFILE_EPILOGUE);
-      param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
-
-      // store flops and reset counters
-      double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matEig.flops()) * 1e-9;
-
-      param.gflops += gflops;
       param.iter += total_iter;
-
-      // reset the flops counters
-      blas::flops = 0;
-      mat.flops();
-      matSloppy.flops();
-      matPrecon.flops();
-      matEig.flops();
-
-      profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     }
 
     PrintSummary("CA-CG", total_iter, r2, b2, stop, param.tol_hq);
diff --git a/lib/inv_ca_gcr.cpp b/lib/inv_ca_gcr.cpp
index f9e605ea86..5b893bd3fc 100644
--- a/lib/inv_ca_gcr.cpp
+++ b/lib/inv_ca_gcr.cpp
@@ -28,10 +28,7 @@ namespace quda
     Solver::create(x, b);
 
     if (!init) {
-      if (!param.is_preconditioner) {
-        blas::flops = 0;
-        profile.TPSTART(QUDA_PROFILE_INIT);
-      }
+      if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT);
 
       alpha.resize(param.Nkrylov);
 
@@ -103,7 +100,6 @@ namespace quda
 
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-      param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
       profile.TPSTART(QUDA_PROFILE_EIGEN);
     }
 
@@ -115,7 +111,6 @@ namespace quda
 
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_EIGEN);
-      param.secs += profile.Last(QUDA_PROFILE_EIGEN);
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
     }
   }
@@ -268,7 +263,6 @@ namespace quda
     int resIncreaseTotal = 0;
 
     if (!param.is_preconditioner) {
-      blas::flops = 0;
       profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
     }
@@ -375,25 +369,8 @@ namespace quda
     }
 
     if (!param.is_preconditioner) {
-      qudaDeviceSynchronize(); // ensure solver is complete before ending timing
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-      profile.TPSTART(QUDA_PROFILE_EPILOGUE);
-      param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
-
-      // store flops and reset counters
-      double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matMdagM.flops()) * 1e-9;
-
-      param.gflops += gflops;
       param.iter += total_iter;
-
-      // reset the flops counters
-      blas::flops = 0;
-      mat.flops();
-      matSloppy.flops();
-      matPrecon.flops();
-      matMdagM.flops();
-
-      profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     }
 
     PrintSummary("CA-GCR", total_iter, r2, b2, stop, param.tol_hq);
diff --git a/lib/inv_cg3_quda.cpp b/lib/inv_cg3_quda.cpp
index 42ab22fcab..9ac9f85b9f 100644
--- a/lib/inv_cg3_quda.cpp
+++ b/lib/inv_cg3_quda.cpp
@@ -268,8 +268,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_INIT);
     profile.TPSTART(QUDA_PROFILE_PREAMBLE);
 
-    blas::flops = 0;
-
     // compute initial residual depending on whether we have an initial guess or not
     double r2;
     if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) {
@@ -474,13 +472,9 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops())*1e-9;
-    param.gflops = gflops;
     param.iter += k;
 
-    if (k == param.maxiter)
-      warningQuda("Exceeded maximum iterations %d", param.maxiter);
+    if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
 
     // compute the true residuals
     if (!mixed_precision && param.compute_true_res) {
@@ -491,11 +485,6 @@ namespace quda {
 
     PrintSummary("CG3", k, r2, b2, stop, param.tol_hq);
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
   }
 
diff --git a/lib/inv_cg_quda.cpp b/lib/inv_cg_quda.cpp
index 15bdf23002..40cd15ea1c 100644
--- a/lib/inv_cg_quda.cpp
+++ b/lib/inv_cg_quda.cpp
@@ -369,7 +369,6 @@ namespace quda {
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
       profile.TPSTART(QUDA_PROFILE_COMPUTE);
-      blas::flops = 0;
     }
 
     int k = 0;
@@ -544,9 +543,6 @@ namespace quda {
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
       profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-      param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-      double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matEig.flops()) * 1e-9;
-      param.gflops = gflops;
       param.iter += k;
 
       if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
@@ -563,15 +559,7 @@ namespace quda {
 
     PrintSummary("CG", k, r2, b2, stop, 0.0);
 
-    if (!param.is_preconditioner) {
-      // reset the flops counters
-      blas::flops = 0;
-      mat.flops();
-      matSloppy.flops();
-      matPrecon.flops();
-
-      profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
-    }
+    if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
 
     if (param.is_preconditioner) commGlobalReductionPop();
   }
@@ -692,7 +680,6 @@ namespace quda {
 
     profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
-    blas::flops = 0;
 
     int k = 0;
 
@@ -988,9 +975,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matEig.flops()) * 1e-9;
-    param.gflops = gflops;
     param.iter += k;
 
     if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
@@ -1006,12 +990,6 @@ namespace quda {
 
     PrintSummary("CG", k, r2, b2, stop, param.tol_hq);
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-    matPrecon.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
   }
 
@@ -1163,7 +1141,6 @@ namespace quda {
 
     profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
-    blas::flops = 0;
 
     int k = 0;
 
@@ -1311,9 +1288,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops()) * 1e-9;
-    param.gflops = gflops;
     param.iter += k;
 
     if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
@@ -1332,11 +1306,6 @@ namespace quda {
       PrintSummary("CG", k, r2(i, i).real(), b2[i], stop[i], 0.0);
     }
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     profile.TPSTART(QUDA_PROFILE_FREE);
 
@@ -1533,7 +1502,6 @@ void CG::solve(ColorSpinorField& x, ColorSpinorField& b) {
 
   profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
   profile.TPSTART(QUDA_PROFILE_COMPUTE);
-  blas::flops = 0;
 
   int k = 0;
 
@@ -1879,9 +1847,6 @@ void CG::solve(ColorSpinorField& x, ColorSpinorField& b) {
   profile.TPSTOP(QUDA_PROFILE_COMPUTE);
   profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-  param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-  double gflops = (blas::flops + mat.flops() + matSloppy.flops())*1e-9;
-  param.gflops = gflops;
   param.iter += k;
 
   if (k == param.maxiter)
@@ -1901,11 +1866,6 @@ void CG::solve(ColorSpinorField& x, ColorSpinorField& b) {
     PrintSummary("CG", k, r2(i,i).real(), b2[i], stop[i], 0.0);
   }
 
-  // reset the flops counters
-  blas::flops = 0;
-  mat.flops();
-  matSloppy.flops();
-
   profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
   profile.TPSTART(QUDA_PROFILE_FREE);
 
diff --git a/lib/inv_eigcg_quda.cpp b/lib/inv_eigcg_quda.cpp
index 57a963a20e..decb31af7e 100644
--- a/lib/inv_eigcg_quda.cpp
+++ b/lib/inv_eigcg_quda.cpp
@@ -179,11 +179,7 @@ namespace quda {
     inner.delta = 1e-20; // no reliable updates within the inner solver
     inner.precision = outer.precision_precondition; // preconditioners are uni-precision solvers
     inner.precision_sloppy = outer.precision_precondition;
-
     inner.iter   = 0;
-    inner.gflops = 0;
-    inner.secs   = 0;
-
     inner.inv_type_precondition = QUDA_INVALID_INVERTER;
     inner.is_preconditioner = true; // used to tell the inner solver it is an inner solver
 
@@ -193,9 +189,6 @@ namespace quda {
   // set the required parameters for the initCG solver
   static void fillInitCGSolverParam(SolverParam &inner, const SolverParam &outer) {
     inner.iter   = 0;
-    inner.gflops = 0;
-    inner.secs   = 0;
-
     inner.tol              = outer.tol;
     inner.tol_restart      = outer.tol_restart;
     inner.maxiter          = outer.maxiter;
@@ -460,7 +453,6 @@ namespace quda {
 
     profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
-    blas::flops = 0;
 
     double rMinvr = blas::reDotProduct(r,*z);
     //Begin EigCG iterations:
@@ -517,9 +509,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + matSloppy.flops())*1e-9;
-    param.gflops = gflops;
     param.iter += k;
 
     if (k == param.maxiter)
@@ -532,10 +521,6 @@ namespace quda {
 
     PrintSummary("eigCG", k, r2, b2, args.global_stop, param.tol_hq);
 
-    // reset the flops counters
-    blas::flops = 0;
-    matSloppy.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     profile.TPSTART(QUDA_PROFILE_FREE);
 
@@ -588,20 +573,11 @@ namespace quda {
       xProj = x;
       rProj = r; 
 
-      if(getVerbosity() >= QUDA_VERBOSE) printfQuda("\ninitCG stat: %i iter / %g secs = %g Gflops. \n", Kparam.iter, Kparam.secs, Kparam.gflops);
-
       Kparam.tol *= param.inc_tol;
 
       if(restart_idx == (param.max_restart_num-1)) Kparam.tol = full_tol;//do the last solve in the next cycle to full tolerance
-
-      param.secs   += Kparam.secs;
     }
 
-    if(getVerbosity() >= QUDA_VERBOSE) printfQuda("\ninitCG stat: %i iter / %g secs = %g Gflops. \n", Kparam.iter, Kparam.secs, Kparam.gflops);
-    //
-    param.secs   += Kparam.secs;
-    param.gflops += Kparam.gflops;
-
     k   += Kparam.iter;
 
     delete rp;
diff --git a/lib/inv_gcr_quda.cpp b/lib/inv_gcr_quda.cpp
index 9227f573b3..3caf952929 100644
--- a/lib/inv_gcr_quda.cpp
+++ b/lib/inv_gcr_quda.cpp
@@ -276,8 +276,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_INIT);
     profile.TPSTART(QUDA_PROFILE_PREAMBLE);
 
-    blas::flops = 0;
-
     blas::copy(r_sloppy, r);
 
     int total_iter = 0;
@@ -386,11 +384,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
-
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matMdagM.flops()) * 1e-9;
-    if (K) gflops += K->flops()*1e-9;
-
     if (k >= param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
 
     logQuda(QUDA_VERBOSE, "GCR: number of restarts = %d\n", restart);
@@ -410,16 +403,8 @@ namespace quda {
       if (0) blas::copy(b, K ? r_sloppy : p[k_break]);
     }
 
-    param.gflops += gflops;
     param.iter += total_iter;
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-    matPrecon.flops();
-    matMdagM.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     profile.TPSTART(QUDA_PROFILE_FREE);
 
diff --git a/lib/inv_gmresdr_quda.cpp b/lib/inv_gmresdr_quda.cpp
index 62d7685eda..ab56176a2a 100644
--- a/lib/inv_gmresdr_quda.cpp
+++ b/lib/inv_gmresdr_quda.cpp
@@ -143,8 +143,6 @@ namespace quda {
     inner.precision_sloppy = outer.precision_precondition;
 
     inner.iter = 0;
-    inner.gflops = 0;
-    inner.secs = 0;
 
     inner.inv_type_precondition = QUDA_INVALID_INVERTER;
     inner.is_preconditioner = true;
@@ -282,7 +280,7 @@ namespace quda {
         blas::zero(Vm->Component(i));
     }
 
-    if (Zm->V() != Vm->V()) {
+    if (Zm->data() != Vm->data()) {
       std::vector<ColorSpinorField *> z(Zm->Components());
       std::vector<ColorSpinorField *> vk(args.Vkp1->Components().begin(), args.Vkp1->Components().begin() + args.k);
 
@@ -469,7 +467,6 @@ namespace quda {
 
     profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
-    blas::flops = 0;
 
     const bool use_heavy_quark_res = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;
 
@@ -549,9 +546,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops()) * 1e-9;
-    param.gflops = gflops;
     param.iter += tot_iters;
 
     mat(r, x);
@@ -560,9 +554,6 @@ namespace quda {
 
     PrintSummary("FGMResDR:", tot_iters, r2, b2, stop, param.tol_hq);
 
-    blas::flops = 0;
-    mat.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
 
     param.rhs_idx += 1;
diff --git a/lib/inv_mr_quda.cpp b/lib/inv_mr_quda.cpp
index 44078ce783..cc69c3cd14 100644
--- a/lib/inv_mr_quda.cpp
+++ b/lib/inv_mr_quda.cpp
@@ -38,7 +38,7 @@ namespace quda
       bool mixed = param.precision != param.precision_sloppy;
 
       if (!mixed) csParam.create = QUDA_REFERENCE_FIELD_CREATE;
-      csParam.v = r.V();
+      csParam.v = r.data();
       r_sloppy = ColorSpinorField(csParam);
 
       init = true;
@@ -62,10 +62,7 @@ namespace quda
 
     create(x, b); // allocate fields
 
-    if (!param.is_preconditioner) {
-      blas::flops = 0;
-      profile.TPSTART(QUDA_PROFILE_COMPUTE);
-    }
+    if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_COMPUTE);
 
     double b2 = blas::norm2(b); // Save norm of b
     double r2 = 0.0;            // if zero source then we will exit immediately doing no work
@@ -160,17 +157,7 @@ namespace quda
 
     if (!param.is_preconditioner) {
       profile.TPSTOP(QUDA_PROFILE_COMPUTE);
-      profile.TPSTART(QUDA_PROFILE_EPILOGUE);
-      param.secs += profile.Last(QUDA_PROFILE_COMPUTE);
-
-      // store flops and reset counters
-      double gflops = (blas::flops + mat.flops() + matSloppy.flops()) * 1e-9;
-
-      param.gflops += gflops;
       param.iter += iter;
-      blas::flops = 0;
-
-      profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     }
   }
 
diff --git a/lib/inv_mre.cpp b/lib/inv_mre.cpp
index 91a79bab55..10733a6aaa 100644
--- a/lib/inv_mre.cpp
+++ b/lib/inv_mre.cpp
@@ -5,8 +5,8 @@
 namespace quda
 {
 
-  MinResExt::MinResExt(const DiracMatrix &mat, bool orthogonal, bool apply_mat, bool hermitian, TimeProfile &profile) :
-    mat(mat), orthogonal(orthogonal), apply_mat(apply_mat), hermitian(hermitian), profile(profile)
+  MinResExt::MinResExt(const DiracMatrix &mat, bool orthogonal, bool apply_mat, bool hermitian) :
+    mat(mat), orthogonal(orthogonal), apply_mat(apply_mat), hermitian(hermitian)
   {
   }
 
@@ -44,14 +44,14 @@ namespace quda
       for (int j = 0; j < N; j++) { A(i, j) = A_[i * (N + 1) + j]; }
     }
 
-    profile.TPSTOP(QUDA_PROFILE_CHRONO);
-    profile.TPSTART(QUDA_PROFILE_EIGEN);
+    getProfile().TPSTOP(QUDA_PROFILE_CHRONO);
+    getProfile().TPSTART(QUDA_PROFILE_EIGEN);
 
     LDLT<matrix> cholesky(A);
     psi = cholesky.solve(phi);
 
-    profile.TPSTOP(QUDA_PROFILE_EIGEN);
-    profile.TPSTART(QUDA_PROFILE_CHRONO);
+    getProfile().TPSTOP(QUDA_PROFILE_EIGEN);
+    getProfile().TPSTART(QUDA_PROFILE_CHRONO);
 
     for (int i = 0; i < N; i++) psi_[i] = psi(i);
   }
@@ -70,8 +70,8 @@ namespace quda
   void MinResExt::operator()(ColorSpinorField &x, const ColorSpinorField &b, std::vector<ColorSpinorField> &p,
                              std::vector<ColorSpinorField> &q)
   {
-    bool running = profile.isRunning(QUDA_PROFILE_CHRONO);
-    if (!running) profile.TPSTART(QUDA_PROFILE_CHRONO);
+    bool running = getProfile().isRunning(QUDA_PROFILE_CHRONO);
+    if (!running) getProfile().TPSTART(QUDA_PROFILE_CHRONO);
 
     const int N = p.size();
     logQuda(QUDA_VERBOSE, "Constructing minimum residual extrapolation with basis size %d\n", N);
@@ -81,7 +81,7 @@ namespace quda
         blas::zero(x);
       else
         blas::copy(x, p[0]);
-      if (!running) profile.TPSTOP(QUDA_PROFILE_CHRONO);
+      if (!running) getProfile().TPSTOP(QUDA_PROFILE_CHRONO);
       return;
     }
 
@@ -133,7 +133,7 @@ namespace quda
       printfQuda("MinResExt: N = %d, |res| / |src| = %e\n", N, sqrt(blas::norm2(r) / blas::norm2(b)));
     }
 
-    if (!running) profile.TPSTOP(QUDA_PROFILE_CHRONO);
+    if (!running) getProfile().TPSTOP(QUDA_PROFILE_CHRONO);
   }
 
 } // namespace quda
diff --git a/lib/inv_msrc_cg_quda.cpp b/lib/inv_msrc_cg_quda.cpp
index 9a64386095..70bcb9a089 100644
--- a/lib/inv_msrc_cg_quda.cpp
+++ b/lib/inv_msrc_cg_quda.cpp
@@ -146,7 +146,6 @@ namespace quda {
 
     profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
-    blas::flops = 0;
 
     int k=0;
 
@@ -315,10 +314,6 @@ namespace quda {
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);
     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops())*1e-9;
-    reduceDouble(gflops);
-    param.gflops = gflops;
     param.iter += k;
 
     if (k==param.maxiter)
@@ -334,11 +329,6 @@ namespace quda {
 
     PrintSummary("CG", k, r2, b2, stop, inv.tol_hq);
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     profile.TPSTART(QUDA_PROFILE_FREE);
 
diff --git a/lib/inv_multi_cg_quda.cpp b/lib/inv_multi_cg_quda.cpp
index b6757440f4..ada5a3326a 100644
--- a/lib/inv_multi_cg_quda.cpp
+++ b/lib/inv_multi_cg_quda.cpp
@@ -262,7 +262,6 @@ namespace quda {
 
     int k = 0;
     int rUpdate = 0;
-    blas::flops = 0;
 
     // now create the worker class for updating the shifted solutions and gradient vectors
     bool aux_update = false;
@@ -443,9 +442,6 @@ namespace quda {
     logQuda(QUDA_VERBOSE, "Reliable updates = %d\n", rUpdate);
     if (k==param.maxiter) warningQuda("Exceeded maximum iterations %d\n", param.maxiter);
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops())*1e-9;
-    param.gflops = gflops;
     param.iter += k;
 
     if (param.compute_true_res) {
@@ -490,11 +486,6 @@ namespace quda {
       }
     }
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     popOutputPrefix();
   }
diff --git a/lib/inv_pcg_quda.cpp b/lib/inv_pcg_quda.cpp
index 24d9259ef8..2fe62692de 100644
--- a/lib/inv_pcg_quda.cpp
+++ b/lib/inv_pcg_quda.cpp
@@ -203,8 +203,6 @@ namespace quda
     profile.TPSTOP(QUDA_PROFILE_PREAMBLE);
     profile.TPSTART(QUDA_PROFILE_COMPUTE);
 
-    blas::flops = 0;
-
     int k = 0;
     PrintStats("PCG", k, r2, b2, heavy_quark_res);
 
@@ -378,10 +376,6 @@ namespace quda
     if (mixed()) copy(x, x_sloppy);
     xpy(y, x); // x += y
 
-    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
-    double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matEig.flops()) * 1e-9;
-    if (K) gflops += K->flops() * 1e-9;
-    param.gflops = gflops;
     param.iter += k;
 
     if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);
@@ -393,13 +387,6 @@ namespace quda
     double true_res = xmyNorm(b, r);
     param.true_res = sqrt(true_res / b2);
 
-    // reset the flops counters
-    blas::flops = 0;
-    mat.flops();
-    matSloppy.flops();
-    matPrecon.flops();
-    matEig.flops();
-
     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
   }
 
diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp
index 503dfe712b..29528e0829 100644
--- a/lib/lattice_field.cpp
+++ b/lib/lattice_field.cpp
@@ -29,17 +29,13 @@ namespace quda {
     volume(1),
     localVolume(1),
     pad(param.pad),
-    total_bytes(0),
     nDim(param.nDim),
     location(param.location),
     precision(param.Precision()),
     ghost_precision(param.GhostPrecision()),
-    ghost_precision_reset(false),
     scale(param.scale),
     siteSubset(param.siteSubset),
     ghostExchange(param.ghostExchange),
-    ghost_bytes(0),
-    ghost_bytes_old(0),
     ghost_face_bytes {},
     ghost_face_bytes_aligned {},
     ghost_offset(),
@@ -59,11 +55,7 @@ namespace quda {
     mh_send {},
     mh_recv_rdma {},
     mh_send_rdma {},
-    initComms(false),
-    mem_type(param.mem_type),
-    backup_h(nullptr),
-    backup_norm_h(nullptr),
-    backed_up(false)
+    mem_type(param.mem_type)
   {
     create(param);
   }
@@ -75,18 +67,14 @@ namespace quda {
     localVolumeCB(field.localVolumeCB),
     stride(field.stride),
     pad(field.pad),
-    total_bytes(0),
     nDim(field.nDim),
     location(field.location),
     precision(field.precision),
     ghost_precision(field.ghost_precision),
-    ghost_precision_reset(false),
     scale(field.scale),
     siteSubset(field.siteSubset),
     ghostExchange(field.ghostExchange),
     nDimComms(field.nDimComms),
-    ghost_bytes(0),
-    ghost_bytes_old(0),
     ghost_face_bytes {},
     ghost_face_bytes_aligned {},
     ghost_offset(),
@@ -106,11 +94,7 @@ namespace quda {
     mh_send {},
     mh_recv_rdma {},
     mh_send_rdma {},
-    initComms(false),
-    mem_type(field.mem_type),
-    backup_h(nullptr),
-    backup_norm_h(nullptr),
-    backed_up(false)
+    mem_type(field.mem_type)
   {
     LatticeFieldParam param;
     field.fill(param);
@@ -184,8 +168,16 @@ namespace quda {
     // for 5-dimensional fields, we only communicate in the space-time dimensions
     nDimComms = nDim == 5 ? 4 : nDim;
 
+    // if the memory location isn't set, use field location to set it
     mem_type = param.mem_type;
-
+    if (mem_type == QUDA_MEMORY_INVALID) {
+      mem_type = location == QUDA_CUDA_FIELD_LOCATION ? QUDA_MEMORY_DEVICE : QUDA_MEMORY_HOST;
+      logQuda(QUDA_DEBUG_VERBOSE, "setting default memory type mem_type %d\n", mem_type);
+    } else if (mem_type == QUDA_MEMORY_DEVICE && location == QUDA_CPU_FIELD_LOCATION) {
+      mem_type = QUDA_MEMORY_HOST;
+    } else if (mem_type == QUDA_MEMORY_HOST && location == QUDA_CUDA_FIELD_LOCATION) {
+      mem_type = QUDA_MEMORY_DEVICE;
+    }
     setTuningString();
   }
 
@@ -239,9 +231,7 @@ namespace quda {
     vol_string = std::exchange(src.vol_string, {});
     aux_string = std::exchange(src.aux_string, {});
     mem_type = std::exchange(src.mem_type, QUDA_MEMORY_INVALID);
-    backup_h = std::exchange(src.backup_h, nullptr);
-    backup_norm_h = std::exchange(src.backup_norm_h, nullptr);
-    backed_up = std::exchange(src.backed_up, false);
+    backup_h = std::exchange(src.backup_h, {});
   }
 
   void LatticeField::fill(LatticeFieldParam &param) const
@@ -558,7 +548,9 @@ namespace quda {
     vol_ss << x[0];
     for (int d = 1; d < nDim; d++) vol_ss << "x" << x[d];
     vol_string = vol_ss.str();
-    if (vol_string.size() >= TuneKey::volume_n) errorQuda("Vol string too large %lu", vol_string.size());
+    if (vol_string.size() >= TuneKey::volume_n)
+      errorQuda("Vol string %s (size = %lu) larger than maximum %d", vol_string.c_str(), vol_string.size(),
+                TuneKey::volume_n);
   }
 
   void LatticeField::checkField(const LatticeField &a) const {
@@ -604,7 +596,7 @@ namespace quda {
       const ColorSpinorField &csField = static_cast<const ColorSpinorField&>(*this);
       if (csField.FieldOrder() == 2 || csField.FieldOrder() == 4)
 	return static_cast<int>(csField.FieldOrder());
-    } else if (typeid(*this) == typeid(const cudaGaugeField)) {
+    } else if (typeid(*this) == typeid(const GaugeField)) {
       const GaugeField &gField = static_cast<const GaugeField&>(*this);
       if (gField.Order() == 2 || gField.Order() == 4)
 	return static_cast<int>(gField.Order());
@@ -622,20 +614,70 @@ namespace quda {
   std::ostream& operator<<(std::ostream& output, const LatticeFieldParam& param)
   {
     output << "nDim = " << param.nDim << std::endl;
-    for (int i = 0; i < param.nDim; i++) { output << "x[" << i << "] = " << param.x[i] << std::endl; }
+    output << "x = " << param.x << std::endl;
     output << "pad = " << param.pad << std::endl;
     output << "precision = " << param.Precision() << std::endl;
     output << "ghost_precision = " << param.GhostPrecision() << std::endl;
     output << "scale = " << param.scale << std::endl;
-
     output << "ghostExchange = " << param.ghostExchange << std::endl;
-    for (int i=0; i<param.nDim; i++) {
-      output << "r[" << i << "] = " << param.r[i] << std::endl;
-    }
-
+    output << "r = " << param.r << std::endl;
     return output;  // for multiple << operators.
   }
 
+  std::ostream &operator<<(std::ostream &output, const LatticeField &field)
+  {
+    output << "volume = " << field.volume << std::endl;
+    output << "volumeCB = " << field.volumeCB << std::endl;
+    output << "localVolume = " << field.localVolume << std::endl;
+    output << "localVolumeCB = " << field.localVolumeCB << std::endl;
+    output << "stride = " << field.stride << std::endl;
+    output << "pad = " << field.stride << std::endl;
+    output << "total_bytes = " << field.total_bytes << std::endl;
+    output << "nDim = " << field.nDim << std::endl;
+    output << "x = " << field.x << std::endl;
+    output << "r = " << field.r << std::endl;
+    output << "local_x = " << field.local_x << std::endl;
+    output << "surface = " << field.surface << std::endl;
+    output << "surfaceCB = " << field.surfaceCB << std::endl;
+    output << "local_surface = " << field.local_surface << std::endl;
+    output << "local_surfaceCB = " << field.local_surfaceCB << std::endl;
+    output << "location = " << field.location << std::endl;
+    output << "precision = " << field.precision << std::endl;
+    output << "ghost_precision = " << field.ghost_precision_reset << std::endl;
+    output << "scale = " << field.scale << std::endl;
+    output << "siteSubset = " << field.siteSubset << std::endl;
+    output << "ghostExchange = " << field.ghostExchange << std::endl;
+    output << "nDimComms = " << field.nDimComms << std::endl;
+    output << "ghost_bytes = " << field.ghost_bytes_old << std::endl;
+    output << "ghost_bytes_old = " << field.ghost_bytes_old << std::endl;
+    output << "ghost_face_bytes = " << field.ghost_face_bytes << std::endl;
+    output << "ghost_face_bytes_aligned = " << field.ghost_face_bytes_aligned << std::endl;
+    output << "ghost_offset = " << field.ghost_offset << std::endl;
+    output << "my_face_h = " << field.my_face_h << std::endl;
+    output << "my_face_hd = " << field.my_face_hd << std::endl;
+    output << "my_face_d = " << field.my_face_d << std::endl;
+    output << "my_face_dim_dir_h = " << field.my_face_dim_dir_h << std::endl;
+    output << "my_face_dim_dir_hd = " << field.my_face_dim_dir_hd << std::endl;
+    output << "my_face_dim_dir_d = " << field.my_face_dim_dir_d << std::endl;
+    output << "from_face_h = " << field.from_face_h << std::endl;
+    output << "from_face_hd = " << field.from_face_hd << std::endl;
+    output << "from_face_d = " << field.from_face_d << std::endl;
+    output << "from_face_dim_dir_h = " << field.from_face_dim_dir_h << std::endl;
+    output << "from_face_dim_dir_hd = " << field.from_face_dim_dir_hd << std::endl;
+    output << "from_face_dim_dir_d = " << field.from_face_dim_dir_d << std::endl;
+    output << "mh_recv = " << field.mh_recv << std::endl;
+    output << "mh_send = " << field.mh_send << std::endl;
+    output << "mh_recv_rdma = " << field.mh_recv_rdma << std::endl;
+    output << "mh_send_rdma = " << field.mh_send_rdma << std::endl;
+    output << "initComms = " << field.initComms << std::endl;
+    output << "vol_string = " << field.vol_string << std::endl;
+    output << "aux_string = " << field.aux_string << std::endl;
+    output << "mem_type = " << field.mem_type << std::endl;
+    for (auto i = 0u; i < field.backup_h.size(); i++)
+      output << "backup_h[" << i << "] = " << field.backup_h[i] << std::endl;
+    return output;
+  }
+
   static QudaFieldLocation reorder_location_ = QUDA_CUDA_FIELD_LOCATION;
 
   QudaFieldLocation reorder_location() { return reorder_location_; }
diff --git a/lib/llfat_quda.cu b/lib/llfat_quda.cu
index f39233aeea..face33527a 100644
--- a/lib/llfat_quda.cu
+++ b/lib/llfat_quda.cu
@@ -166,47 +166,48 @@ namespace quda {
   }
 
 #ifdef GPU_STAGGERED_DIRAC
-  void longKSLink(GaugeField *lng, const GaugeField &u, const double *coeff)
+  void longKSLink(GaugeField &lng, const GaugeField &u, const double *coeff)
   {
-    computeLongLink(*lng, u, coeff[1]);
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE); // bracket the long-link computation with the COMPUTE profile region
+    computeLongLink(lng, u, coeff[1]); // only coeff[1] is read here; the result is written into lng
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
-  void fatKSLink(GaugeField *fat, const GaugeField& u, const double *coeff)
+  void fatKSLink(GaugeField &fat, const GaugeField &u, const double *coeff)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
+
     GaugeFieldParam gParam(u);
     gParam.reconstruct = QUDA_RECONSTRUCT_NO;
     gParam.setPrecision(gParam.Precision());
     gParam.create = QUDA_NULL_FIELD_CREATE;
-    auto staple = GaugeField::Create(gParam);
-    auto staple1 = GaugeField::Create(gParam);
+    GaugeField staple(gParam);
+    GaugeField staple1(gParam);
 
-    if ( ((fat->X()[0] % 2 != 0) || (fat->X()[1] % 2 != 0) || (fat->X()[2] % 2 != 0) || (fat->X()[3] % 2 != 0))
-	&& (u.Reconstruct()  != QUDA_RECONSTRUCT_NO)){
-      errorQuda("Reconstruct %d and odd dimensionsize is not supported by link fattening code (yet)\n",
-		u.Reconstruct());
+    if (((fat.X()[0] % 2 != 0) || (fat.X()[1] % 2 != 0) || (fat.X()[2] % 2 != 0) || (fat.X()[3] % 2 != 0))
+        && (u.Reconstruct() != QUDA_RECONSTRUCT_NO)) {
+      errorQuda("Reconstruct %d and odd dimension size is not supported by link fattening code (yet)", u.Reconstruct());
     }
 
-    computeOneLink(*fat, u, coeff[0]-6.0*coeff[5]);
+    computeOneLink(fat, u, coeff[0] - 6.0 * coeff[5]);
 
     // Check the coefficients. If all of the following are zero, return.
     if (fabs(coeff[2]) >= MIN_COEFF || fabs(coeff[3]) >= MIN_COEFF ||
 	fabs(coeff[4]) >= MIN_COEFF || fabs(coeff[5]) >= MIN_COEFF) {
 
       for (int nu = 0; nu < 4; nu++) {
-        computeStaple(*fat, *staple, u, u, nu, -1, -1, coeff[2], 1);
+        computeStaple(fat, staple, u, u, nu, -1, -1, coeff[2], 1);
 
-        if (coeff[5] != 0.0) computeStaple(*fat, *staple, *staple, u, nu, -1, -1, coeff[5], 0);
+        if (coeff[5] != 0.0) computeStaple(fat, staple, staple, u, nu, -1, -1, coeff[5], 0);
 
         for (int rho = 0; rho < 4; rho++) {
           if (rho != nu) {
 
-            computeStaple(*fat, *staple1, *staple, u, rho, nu, -1, coeff[3], 1);
+            computeStaple(fat, staple1, staple, u, rho, nu, -1, coeff[3], 1);
 
             if (fabs(coeff[4]) > MIN_COEFF) {
               for (int sig = 0; sig < 4; sig++) {
-                if (sig != nu && sig != rho) {
-                  computeStaple(*fat, *staple, *staple1, u, sig, nu, rho, coeff[4], 0);
-                }
+                if (sig != nu && sig != rho) { computeStaple(fat, staple, staple1, u, sig, nu, rho, coeff[4], 0); }
               } //sig
             } // MIN_COEFF
           }
@@ -214,19 +215,12 @@ namespace quda {
       } //nu
     }
 
-    delete staple;
-    delete staple1;
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else
-  void longKSLink(GaugeField *, const GaugeField&, const double *)
-  {
-    errorQuda("Long-link computation not enabled");
-  }
+  void longKSLink(GaugeField &, const GaugeField &, const double *) { errorQuda("Long-link computation not enabled"); } // stub for builds without GPU_STAGGERED_DIRAC
 
-  void fatKSLink(GaugeField *, const GaugeField&, const double *)
-  {
-    errorQuda("Fat-link computation not enabled");
-  }
+  void fatKSLink(GaugeField &, const GaugeField &, const double *) { errorQuda("Fat-link computation not enabled"); } // stub for builds without GPU_STAGGERED_DIRAC
 #endif
 
 #undef MIN_COEFF
diff --git a/lib/max_clover.cu b/lib/max_clover.cu
index 18c84ca7a3..48e9630421 100644
--- a/lib/max_clover.cu
+++ b/lib/max_clover.cu
@@ -50,7 +50,7 @@ namespace quda {
 #ifdef GPU_CLOVER_DIRAC
   double _norm(const CloverField &u, bool inverse, norm_type_ type)
   {
-    if (!u.V(inverse)) errorQuda("reqeusted clover is_inverse=%d, but not allocated", inverse);
+    if (!u.data(inverse)) errorQuda("reqeusted clover is_inverse=%d, but not allocated", inverse);
     double nrm = 0.0;
     switch(u.Precision()) {
     case QUDA_DOUBLE_PRECISION: nrm = _norm<double>(u, inverse, type); break;
diff --git a/lib/milc_interface.cpp b/lib/milc_interface.cpp
index 8315ff6ce1..fed48a17f3 100644
--- a/lib/milc_interface.cpp
+++ b/lib/milc_interface.cpp
@@ -2610,7 +2610,7 @@ void* qudaCreateGaugeField(void* gauge, int geometry, int precision)
 void qudaSaveGaugeField(void* gauge, void* inGauge)
 {
   qudamilc_called<true>(__func__);
-  cudaGaugeField* cudaGauge = reinterpret_cast<cudaGaugeField*>(inGauge);
+  auto cudaGauge = reinterpret_cast<GaugeField *>(inGauge);
   QudaGaugeParam qudaGaugeParam = newMILCGaugeParam(localDim, cudaGauge->Precision(), QUDA_GENERAL_LINKS);
   saveGaugeFieldQuda(gauge, inGauge, &qudaGaugeParam);
   qudamilc_called<false>(__func__);
@@ -3044,14 +3044,8 @@ void qudaGaugeFixingOVR(int precision, unsigned int gauge_dir, int Nsteps, int v
   qudaGaugeParam.site_size = arg->size;
   qudaGaugeParam.gauge_order = arg->site ? QUDA_MILC_SITE_GAUGE_ORDER : QUDA_MILC_GAUGE_ORDER;
 
-  double timeinfo[3];
   computeGaugeFixingOVRQuda(gauge, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval,
-                            stopWtheta, &qudaGaugeParam, timeinfo);
-
-  printfQuda("Time H2D: %lf\n", timeinfo[0]);
-  printfQuda("Time to Compute: %lf\n", timeinfo[1]);
-  printfQuda("Time D2H: %lf\n", timeinfo[2]);
-  printfQuda("Time all: %lf\n", timeinfo[0]+timeinfo[1]+timeinfo[2]);
+                            stopWtheta, &qudaGaugeParam);
 
   qudamilc_called<false>(__func__, verbosity);
 }
@@ -3073,15 +3067,8 @@ void qudaGaugeFixingFFT( int precision,
   qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
   //qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_12;
 
-
-  double timeinfo[3];
-  computeGaugeFixingFFTQuda(milc_sitelink, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta, \
-    &qudaGaugeParam, timeinfo);
-
-  printfQuda("Time H2D: %lf\n", timeinfo[0]);
-  printfQuda("Time to Compute: %lf\n", timeinfo[1]);
-  printfQuda("Time D2H: %lf\n", timeinfo[2]);
-  printfQuda("Time all: %lf\n", timeinfo[0]+timeinfo[1]+timeinfo[2]);
+  computeGaugeFixingFFTQuda(milc_sitelink, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta,
+                            &qudaGaugeParam);
 }
 
 void qudaTwoLinkGaussianSmear( int external_precision, int quda_precision, void * h_gauge, void * source, QudaTwoLinkQuarkSmearArgs_t qsmear_args )
diff --git a/lib/momentum.cu b/lib/momentum.cu
index 78a981e509..7a574687ca 100644
--- a/lib/momentum.cu
+++ b/lib/momentum.cu
@@ -9,6 +9,7 @@
 #include <tunable_reduction.h>
 #include <tunable_nd.h>
 #include <kernels/momentum.cuh>
+#include "timer.h"
 
 namespace quda {
 
@@ -92,9 +93,11 @@ namespace quda {
   };
 
   double computeMomAction(const GaugeField& mom) {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE); // COMPUTE profile region covers the ordering check and the reduction
     if (!mom.isNative()) errorQuda("Unsupported output ordering: %d\n", mom.Order());
     double action = 0.0;
     instantiate<ActionMom, Reconstruct10>(mom, action);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); // close the region before handing the action back
     return action;
   }
 
@@ -132,11 +135,13 @@ namespace quda {
 
   void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE); // COMPUTE profile region covers the checks and the update
     if (mom.Reconstruct() != QUDA_RECONSTRUCT_10)
       errorQuda("Momentum field with reconstruct %d not supported", mom.Reconstruct());
 
     checkPrecision(mom, force);
     instantiate<UpdateMom, ReconstructMom>(force, mom, coeff, fname);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); // close the region opened at the top of the function
   }
 
   template <typename Float, int nColor, QudaReconstructType recon>
@@ -173,9 +178,11 @@ namespace quda {
 
   void applyU(GaugeField &force, GaugeField &U)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE); // COMPUTE profile region covers the checks and the UApply instantiation
     if (!force.isNative()) errorQuda("Unsupported output ordering: %d\n", force.Order());
     checkPrecision(force, U);
     instantiate<UApply, ReconstructNo12>(U, force);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); // close the region opened at the top of the function
   }
 
 } // namespace quda
diff --git a/lib/multi_blas_quda.cu b/lib/multi_blas_quda.cu
index 5020f7b109..002bb48235 100644
--- a/lib/multi_blas_quda.cu
+++ b/lib/multi_blas_quda.cu
@@ -81,9 +81,6 @@ namespace quda {
 #endif
 
         apply(device::get_default_stream());
-
-        blas::bytes += bytes();
-        blas::flops += flops();
       }
 
       TuneKey tuneKey() const override { return TuneKey(vol, typeid(f).name(), aux); }
diff --git a/lib/multi_reduce_quda.cu b/lib/multi_reduce_quda.cu
index 6af44e8107..8fcf9d080b 100644
--- a/lib/multi_reduce_quda.cu
+++ b/lib/multi_reduce_quda.cu
@@ -88,7 +88,7 @@ namespace quda {
         if (NXZ == NYW) {
           is_norm = true;
           for (int i = 0; i < NXZ; i++) {
-            if (x[i].V() != y[i].V() || x[i].V() != z[i].V() || x[i].V() != w[i].V()) {
+            if (x[i].data() != y[i].data() || x[i].data() != z[i].data() || x[i].data() != w[i].data()) {
               is_norm = false;
               break;
             }
@@ -97,9 +97,6 @@ namespace quda {
         if (is_norm) strcat(aux, ",norm");
 
         apply(device::get_default_stream());
-
-        blas::bytes += bytes();
-        blas::flops += flops();
       }
 
       TuneKey tuneKey() const override { return TuneKey(vol, typeid(r).name(), aux); }
diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp
index 06e48bf8e1..475a1d9b3b 100644
--- a/lib/multigrid.cpp
+++ b/lib/multigrid.cpp
@@ -245,9 +245,9 @@ namespace quda
     popLevel();
   }
 
-  void MG::resetStaggeredKD(cudaGaugeField *gauge_in, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in,
-                            cudaGaugeField *gauge_sloppy_in, cudaGaugeField *fat_gauge_sloppy_in,
-                            cudaGaugeField *long_gauge_sloppy_in, double mass)
+  void MG::resetStaggeredKD(GaugeField *gauge_in, GaugeField *fat_gauge_in, GaugeField *long_gauge_in,
+                            GaugeField *gauge_sloppy_in, GaugeField *fat_gauge_sloppy_in,
+                            GaugeField *long_gauge_sloppy_in, double mass)
   {
     if (param.level != 0) errorQuda("The staggered KD operator can only be updated from level 0");
 
@@ -510,8 +510,8 @@ namespace quda
     bool is_coarse_naive_staggered = is_naive_staggered
       || (is_improved_staggered && param.mg_global.transfer_type[param.level] == QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG);
 
-    cudaGaugeField *fine_gauge = diracSmoother->getStaggeredShortLinkField();
-    cudaGaugeField *sloppy_gauge = mixed_precision_setup ? diracSmootherSloppy->getStaggeredShortLinkField() : fine_gauge;
+    auto fine_gauge = diracSmoother->getStaggeredShortLinkField();
+    auto sloppy_gauge = mixed_precision_setup ? diracSmootherSloppy->getStaggeredShortLinkField() : fine_gauge;
 
     xInvKD = AllocateAndBuildStaggeredKahlerDiracInverse(
       *fine_gauge, diracSmoother->Mass(), param.mg_global.staggered_kd_dagger_approximation == QUDA_BOOLEAN_TRUE);
@@ -524,7 +524,7 @@ namespace quda
       // true is to force FLOAT2
       xinv_param.setPrecision(param.mg_global.invert_param->cuda_prec_precondition, true);
 
-      xInvKD_sloppy = std::shared_ptr<GaugeField>(reinterpret_cast<GaugeField *>(new cudaGaugeField(xinv_param)));
+      xInvKD_sloppy = std::shared_ptr<GaugeField>(reinterpret_cast<GaugeField *>(new GaugeField(xinv_param)));
       xInvKD_sloppy->copy(*xInvKD);
 
       ColorSpinorParam sloppy_tmp_param(*tmp_coarse);
@@ -545,7 +545,7 @@ namespace quda
     diracParamKD.mu_factor = 1.0;          // doesn't matter
     diracParamKD.dagger = QUDA_DAG_NO;
     diracParamKD.matpcType = QUDA_MATPC_EVEN_EVEN; // We can use this to track left vs right block jacobi in the future
-    diracParamKD.gauge = const_cast<cudaGaugeField *>(fine_gauge);
+    diracParamKD.gauge = fine_gauge;
     diracParamKD.xInvKD = xInvKD.get(); // FIXME: pulling a raw unmanaged pointer out of a unique_ptr...
     diracParamKD.dirac
       = const_cast<Dirac *>(diracSmoother); // used to determine if the outer solve is preconditioned or not
@@ -804,34 +804,6 @@ namespace quda
     popLevel();
   }
 
-  // FIXME need to make this more robust (implement Solver::flops() for all solvers)
-  double MG::flops() const {
-    double flops = 0;
-
-    if (param_coarse_solver) {
-      flops += param_coarse_solver->gflops * 1e9;
-      param_coarse_solver->gflops = 0;
-    } else if (param.level < param.Nlevel-1) {
-      flops += coarse->flops();
-    }
-
-    if (param_presmooth) {
-      flops += param_presmooth->gflops * 1e9;
-      param_presmooth->gflops = 0;
-    }
-
-    if (param_postsmooth) {
-      flops += param_postsmooth->gflops * 1e9;
-      param_postsmooth->gflops = 0;
-    }
-
-    if (transfer) {
-      flops += transfer->flops();
-    }
-
-    return flops;
-  }
-
   bool check_deviation(double deviation, double tol)
   {
     return (deviation > tol || std::isnan(deviation) || std::isinf(deviation));
diff --git a/lib/quda_ptr.cpp b/lib/quda_ptr.cpp
new file mode 100644
index 0000000000..c4e6197850
--- /dev/null
+++ b/lib/quda_ptr.cpp
@@ -0,0 +1,185 @@
+#include <utility>
+#include "quda_ptr.h"
+#include "util_quda.h"
+#include "timer.h"
+
+namespace quda
+{
+
+  // Allocating constructor: obtains `size` bytes of the requested memory type, optionally from the
+  // memory pool (pooling is only accepted for device, pinned-host and host memory). A zero size
+  // performs no allocation. The work is bracketed by the QUDA_PROFILE_INIT profile region.
+  quda_ptr::quda_ptr(QudaMemoryType type, size_t size, bool pool) : type(type), size(size), pool(pool)
+  {
+    getProfile().TPSTART(QUDA_PROFILE_INIT);
+    if (pool && (type != QUDA_MEMORY_DEVICE && type != QUDA_MEMORY_HOST_PINNED && type != QUDA_MEMORY_HOST))
+      errorQuda("Memory pool not available for memory type %d", type);
+
+    if (size > 0) {
+      switch (type) {
+      case QUDA_MEMORY_DEVICE: device = pool ? pool_device_malloc(size) : device_malloc(size); break;
+      case QUDA_MEMORY_DEVICE_PINNED: device = device_pinned_malloc(size); break;
+      case QUDA_MEMORY_HOST: host = safe_malloc(size); break;
+      case QUDA_MEMORY_HOST_PINNED: host = pool ? pool_pinned_malloc(size) : pinned_malloc(size); break;
+      case QUDA_MEMORY_MAPPED:
+        host = mapped_malloc(size);
+        device = get_mapped_device_pointer(host); // device-side pointer derived from the host allocation
+        break;
+      case QUDA_MEMORY_MANAGED:
+        host = managed_malloc(size);
+        device = host; // a single pointer serves as both views
+        break;
+      default: errorQuda("Unknown memory type %d", type);
+      }
+    }
+    getProfile().TPSTOP(QUDA_PROFILE_INIT);
+  }
+
+  // Reference constructor: wraps an existing pointer of the given memory type and marks the object
+  // as a reference; nothing is allocated. Only device, host and managed types are accepted.
+  quda_ptr::quda_ptr(void *ptr, QudaMemoryType type) : type(type), reference(true)
+  {
+    getProfile().TPSTART(QUDA_PROFILE_INIT);
+    switch (type) {
+    case QUDA_MEMORY_DEVICE:
+    case QUDA_MEMORY_DEVICE_PINNED:
+      device = ptr;
+      host = nullptr;
+      break;
+    case QUDA_MEMORY_HOST:
+    case QUDA_MEMORY_HOST_PINNED:
+      device = nullptr;
+      host = ptr;
+      break;
+    case QUDA_MEMORY_MANAGED:
+      device = ptr;
+      host = ptr;
+      break;
+    default: errorQuda("Unsupported memory type %d", type);
+    }
+    getProfile().TPSTOP(QUDA_PROFILE_INIT);
+  }
+
+  // Move assignment: takes over other's state and leaves other empty. The destination must not
+  // already own memory (size > 0 is an error). Self-assignment is a no-op.
+  quda_ptr &quda_ptr::operator=(quda_ptr &&other)
+  {
+    if (&other != this) {
+      if (size > 0) errorQuda("Cannot move to already initialized quda_ptr");
+      type = std::exchange(other.type, QUDA_MEMORY_INVALID);
+      size = std::exchange(other.size, 0);
+      pool = std::exchange(other.pool, false);
+      device = std::exchange(other.device, nullptr);
+      host = std::exchange(other.host, nullptr);
+      // carry the reference flag across too, so is_reference() stays correct after a move
+      reference = std::exchange(other.reference, false);
+    }
+    return *this;
+  }
+
+  // Releases any owned allocation with the deallocator matching how it was obtained, then clears
+  // the size and both pointers. Objects with size == 0 own nothing and are only reset.
+  void quda_ptr::destroy()
+  {
+    if (size > 0) {
+      switch (type) {
+      case QUDA_MEMORY_DEVICE: pool ? pool_device_free(device) : device_free(device); break;
+      case QUDA_MEMORY_DEVICE_PINNED: device_pinned_free(device); break;
+      case QUDA_MEMORY_HOST: host_free(host); break;
+      case QUDA_MEMORY_HOST_PINNED: pool ? pool_pinned_free(host) : host_free(host); break;
+      case QUDA_MEMORY_MAPPED: host_free(host); break;
+      case QUDA_MEMORY_MANAGED: managed_free(host); break; // pairs with managed_malloc in the constructor
+      default: errorQuda("Unknown memory type %d", type);
+      }
+    }
+
+    size = 0;
+    device = nullptr;
+    host = nullptr;
+  }
+
+  // Destructor: frees any owned memory inside the QUDA_PROFILE_FREE profile region.
+  quda_ptr::~quda_ptr()
+  {
+    getProfile().TPSTART(QUDA_PROFILE_FREE);
+    destroy();
+    getProfile().TPSTOP(QUDA_PROFILE_FREE);
+  }
+
+  // Three-way hand-over: frees this object's memory, takes ownership of obj's allocation, and
+  // finally moves new_value into obj.
+  void quda_ptr::exchange(quda_ptr &obj, quda_ptr &&new_value)
+  {
+    destroy();
+    *this = std::move(obj);
+    obj = std::move(new_value);
+  }
+
+  // Returns true for the device, device-pinned, mapped and managed memory types.
+  bool quda_ptr::is_device() const
+  {
+    switch (type) {
+    case QUDA_MEMORY_DEVICE:
+    case QUDA_MEMORY_DEVICE_PINNED:
+    case QUDA_MEMORY_MAPPED:
+    case QUDA_MEMORY_MANAGED: return true;
+    default: return false;
+    }
+  }
+
+  // Returns true for the host, pinned-host and managed memory types (mapped memory is not included).
+  bool quda_ptr::is_host() const
+  {
+    switch (type) {
+    case QUDA_MEMORY_HOST:
+    case QUDA_MEMORY_HOST_PINNED:
+    case QUDA_MEMORY_MANAGED: return true;
+    default: return false;
+    }
+  }
+
+  // Returns the pointer that matches the memory type: the device pointer for device, device-pinned,
+  // mapped and managed memory, the host pointer for host and pinned-host memory.
+  void *quda_ptr::data() const
+  {
+    void *ptr = nullptr;
+
+    switch (type) {
+    case QUDA_MEMORY_DEVICE:
+    case QUDA_MEMORY_DEVICE_PINNED:
+    case QUDA_MEMORY_MAPPED:
+    case QUDA_MEMORY_MANAGED: ptr = device; break;
+    case QUDA_MEMORY_HOST:
+    case QUDA_MEMORY_HOST_PINNED: ptr = host; break;
+    default: errorQuda("Unknown memory type %d", type);
+    }
+
+    return ptr;
+  }
+
+  // Returns the device pointer; it is an error if none is set.
+  void *quda_ptr::data_device() const
+  {
+    if (!device) errorQuda("Device view not defined");
+    return device;
+  }
+
+  // Returns the host pointer; it is an error if none is set.
+  void *quda_ptr::data_host() const
+  {
+    if (!host) errorQuda("Host view not defined");
+    return host;
+  }
+
+  // Reports whether this object is flagged as a reference (set by the pointer-wrapping constructor).
+  bool quda_ptr::is_reference() const { return reference; }
+
+  // Streams the type, size, pool flag, both pointers and the reference flag as "{name = value, ...}".
+  std::ostream &operator<<(std::ostream &output, const quda_ptr &ptr)
+  {
+    output << "{type = " << ptr.type << ", size = " << ptr.size << ", pool = " << ptr.pool
+           << ", device = " << ptr.device << ", host = " << ptr.host << ", reference = " << ptr.reference << "}";
+    return output;
+  }
+
+} // namespace quda
diff --git a/lib/reduce_quda.cu b/lib/reduce_quda.cu
index 7b61c78b6e..4a77ec79ee 100644
--- a/lib/reduce_quda.cu
+++ b/lib/reduce_quda.cu
@@ -62,9 +62,6 @@ namespace quda {
         }
 
         apply(device::get_default_stream());
-
-        blas::bytes += bytes();
-        blas::flops += flops();
       }
 
       TuneKey tuneKey() const override { return TuneKey(vol, typeid(r).name(), aux); }
diff --git a/lib/solver.cpp b/lib/solver.cpp
index 8b734e8bc5..12cce8f532 100644
--- a/lib/solver.cpp
+++ b/lib/solver.cpp
@@ -223,9 +223,6 @@ namespace quda {
       = (outer.inv_type_precondition == QUDA_MR_INVERTER) ? QUDA_INVALID_RESIDUAL : QUDA_L2_RELATIVE_RESIDUAL;
 
     inner.iter = 0;
-    inner.gflops = 0;
-    inner.secs = 0;
-
     inner.inv_type_precondition = QUDA_INVALID_INVERTER;
     inner.is_preconditioner = true; // tell inner solver it is a preconditioner
     inner.pipeline = true;
diff --git a/lib/spinor_dilute.cu b/lib/spinor_dilute.cu
deleted file mode 100644
index f95011d2ea..0000000000
--- a/lib/spinor_dilute.cu
+++ /dev/null
@@ -1,62 +0,0 @@
-#include <color_spinor_field.h>
-#include <kernels/spinor_dilute.cuh>
-#include <tunable_nd.h>
-#include <instantiate.h>
-
-namespace quda {
-
-  template <typename real, int Ns, int Nc>
-  class SpinorDilute : TunableKernel2D {
-    std::vector<ColorSpinorField> &v;
-    const ColorSpinorField &src;
-    QudaDilutionType type;
-    unsigned int minThreads() const { return src.VolumeCB(); }
-
-  public:
-    SpinorDilute(const ColorSpinorField &src, std::vector<ColorSpinorField> &v, QudaDilutionType type) :
-      TunableKernel2D(src, src.SiteSubset()),
-      v(v),
-      src(src),
-      type(type)
-    {
-      switch (type) {
-      case QUDA_DILUTION_SPIN: strcat(aux, ",spin_dilution"); break;
-      case QUDA_DILUTION_COLOR: strcat(aux, ",color_dilution"); break;
-      case QUDA_DILUTION_SPIN_COLOR: strcat(aux, ",spin_color_dilution"); break;
-      case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: strcat(aux, ",spin_color_even_odd_dilution"); break;
-      default: errorQuda("Unsupported dilution type %d", type);
-      }
-      if (v.size() != static_cast<unsigned int>(get_size<Ns, Nc>(type)))
-        errorQuda("Input container size %lu does not match expected size %d for dilution type", v.size(), get_size<Ns, Nc>(type));
-      apply(device::get_default_stream());
-    }
-
-    template <QudaDilutionType type> using Arg = SpinorDiluteArg<real, Ns, Nc, type>;
-
-    template <QudaDilutionType type>
-    auto constexpr sequence() { return std::make_index_sequence<get_size<Ns, Nc>(type)>(); }
-
-    template <QudaDilutionType type>
-    void apply(TuneParam &tp, const qudaStream_t &stream) { launch<DiluteSpinor>(tp, stream, Arg<type>(v, src, sequence<type>())); }
-
-    void apply(const qudaStream_t &stream)
-    {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      switch (type) {
-      case QUDA_DILUTION_SPIN: apply<QUDA_DILUTION_SPIN>(tp, stream); break;
-      case QUDA_DILUTION_COLOR: apply<QUDA_DILUTION_COLOR>(tp, stream); break;
-      case QUDA_DILUTION_SPIN_COLOR: apply<QUDA_DILUTION_SPIN_COLOR>(tp, stream); break;
-      case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: apply<QUDA_DILUTION_SPIN_COLOR_EVEN_ODD>(tp, stream); break;
-      default: errorQuda("Dilution type %d not supported", type);
-      }
-    }
-
-    long long bytes() const { return v.size() * v[0].Bytes() + src.Bytes(); }
-  };
-
-  void spinorDilute(std::vector<ColorSpinorField> &v, const ColorSpinorField &src, QudaDilutionType type)
-  {
-    instantiateSpinor<SpinorDilute>(src, v, type);
-  }
-
-} // namespace quda
diff --git a/lib/spinor_dilute.in.cu b/lib/spinor_dilute.in.cu
new file mode 100644
index 0000000000..9b57458e8a
--- /dev/null
+++ b/lib/spinor_dilute.in.cu
@@ -0,0 +1,145 @@
+#include <color_spinor_field.h>
+#include <kernels/spinor_dilute.cuh>
+#include <tunable_nd.h>
+#include <instantiate.h>
+
+namespace quda
+{
+
+  // Tunable kernel wrapper that dilutes the source field src into the set of fields v according to
+  // the requested dilution type. All validation and the kernel launch happen in the constructor.
+  template <typename real, int Ns, int Nc> class SpinorDilute : TunableKernel2D
+  {
+    std::vector<ColorSpinorField> &v;
+    const ColorSpinorField &src;
+    QudaDilutionType type;
+    const lat_dim_t &local_block;
+    unsigned int minThreads() const { return src.VolumeCB(); }
+    template <QudaDilutionType type> using Arg = SpinorDiluteArg<real, Ns, Nc, type>;
+
+  public:
+    // Checks the dilution type, container size and (for block dilution) the block geometry, then
+    // launches the kernel on the default stream. Any violation is reported through errorQuda.
+    SpinorDilute(const ColorSpinorField &src, std::vector<ColorSpinorField> &v, QudaDilutionType type,
+                 const lat_dim_t &local_block) :
+      TunableKernel2D(src, src.SiteSubset()), v(v), src(src), type(type), local_block(local_block)
+    {
+      switch (type) {
+      case QUDA_DILUTION_SPIN: strcat(aux, ",spin_dilution"); break;
+      case QUDA_DILUTION_COLOR: strcat(aux, ",color_dilution"); break;
+      case QUDA_DILUTION_SPIN_COLOR: strcat(aux, ",spin_color_dilution"); break;
+      case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: strcat(aux, ",spin_color_even_odd_dilution"); break;
+      case QUDA_DILUTION_BLOCK: strcat(aux, ",block_dilution"); break;
+      default: errorQuda("Unsupported dilution type %d", type);
+      }
+      if (type != QUDA_DILUTION_BLOCK && v.size() != static_cast<unsigned int>(get_size<Ns, Nc>(type)))
+        errorQuda("Input container size %lu does not match expected size %d for dilution type", v.size(),
+                  get_size<Ns, Nc>(type));
+
+      if (type == QUDA_DILUTION_BLOCK) {
+        // validate each block extent before it is used as a divisor; computing the block volume only
+        // for block dilution keeps a zero-filled local_block from triggering a division by zero
+        size_t block_volume = 1;
+        for (auto i = 0; i < src.Ndim(); i++) {
+          if (local_block[i] == 0) errorQuda("Dim %d: Dilution block size = 0", i);
+          if ((src.X(i) * comm_dim(i)) % local_block[i] != 0)
+            errorQuda("Dim %d: Invalid dilution block size %d for global lattice dim = %d", i, local_block[i],
+                      src.X(i) * comm_dim(i));
+          block_volume *= local_block[i];
+        }
+
+        size_t n_blocks = comm_size() * src.Volume() / block_volume;
+        if (v.size() != n_blocks)
+          errorQuda("Input container size %lu does not match expected size %lu for dilution block size (%d,%d,%d,%d)",
+                    v.size(), n_blocks, local_block[0], local_block[1], local_block[2], local_block[3]);
+        if (v.size() > Arg<QUDA_DILUTION_BLOCK>::max_dilution_size)
+          errorQuda("Container size %lu exceeds maximum size %d", v.size(), Arg<QUDA_DILUTION_BLOCK>::max_dilution_size);
+      }
+
+      apply(device::get_default_stream());
+    }
+
+    // Index sequence of length get_size<Ns, Nc>(type) for the compile-time dilution type.
+    template <QudaDilutionType type> auto constexpr sequence()
+    {
+      return std::make_index_sequence<get_size<Ns, Nc>(type)>();
+    }
+
+    // Launches the DiluteSpinor kernel with the argument struct specialised on the dilution type.
+    template <QudaDilutionType type> void apply(TuneParam &tp, const qudaStream_t &stream)
+    {
+      launch<DiluteSpinor>(tp, stream, Arg<type>(v, src, local_block, sequence<type>()));
+    }
+
+    // Obtains the tuned launch parameters and dispatches on the run-time dilution type.
+    void apply(const qudaStream_t &stream)
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      switch (type) {
+      case QUDA_DILUTION_SPIN: apply<QUDA_DILUTION_SPIN>(tp, stream); break;
+      case QUDA_DILUTION_COLOR: apply<QUDA_DILUTION_COLOR>(tp, stream); break;
+      case QUDA_DILUTION_SPIN_COLOR: apply<QUDA_DILUTION_SPIN_COLOR>(tp, stream); break;
+      case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: apply<QUDA_DILUTION_SPIN_COLOR_EVEN_ODD>(tp, stream); break;
+      case QUDA_DILUTION_BLOCK: apply<QUDA_DILUTION_BLOCK>(tp, stream); break;
+      default: errorQuda("Dilution type %d not supported", type);
+      }
+    }
+
+    // Byte count: v.size() fields of the size of v[0], plus the source field.
+    long long bytes() const { return v.size() * v[0].Bytes() + src.Bytes(); }
+  };
+
+  // Compile-time list of integers, used below to enumerate the candidate colour counts.
+  template <int...> struct IntList {
+  };
+
+  // Colour dispatch: instantiates SpinorDilute for the first IntList entry equal to src.Ncolor(),
+  // recursing through the remaining entries and calling errorQuda if none matches.
+  template <typename real, int Ns, int Nc, int... N>
+  void spinorDilute(const ColorSpinorField &src, std::vector<ColorSpinorField> &v, QudaDilutionType type,
+                    const lat_dim_t &local_block, IntList<Nc, N...>)
+  {
+    if (src.Ncolor() == Nc) {
+      SpinorDilute<real, Ns, Nc>(src, v, type, local_block);
+    } else {
+      if constexpr (sizeof...(N) > 0)
+        spinorDilute<real, Ns>(src, v, type, local_block, IntList<N...>());
+      else
+        errorQuda("nColor = %d not implemented", src.Ncolor());
+    }
+  }
+
+  // Spin dispatch for a given precision: runs checkNative on src, rejects nSpin values that are
+  // not enabled in this build, then forwards to the colour dispatch with that spin's colour list.
+  template <typename real>
+  void spinorDilute(const ColorSpinorField &src, std::vector<ColorSpinorField> &v, QudaDilutionType type,
+                    const lat_dim_t &local_block)
+  {
+    checkNative(src);
+    if (!is_enabled_spin(src.Nspin())) errorQuda("spinorDilute has not been built for nSpin=%d fields", src.Nspin());
+
+    if (src.Nspin() == 4) {
+      if constexpr (is_enabled_spin(4)) spinorDilute<real, 4>(src, v, type, local_block, IntList<3>());
+    } else if (src.Nspin() == 2) {
+      if constexpr (is_enabled_spin(2))
+        spinorDilute<real, 2>(src, v, type, local_block, IntList<3, @QUDA_MULTIGRID_NVEC_LIST@>());
+    } else if (src.Nspin() == 1) {
+      if constexpr (is_enabled_spin(1)) spinorDilute<real, 1>(src, v, type, local_block, IntList<3>());
+    } else {
+      errorQuda("Nspin = %d not implemented", src.Nspin());
+    }
+  }
+
+  // Public entry point: dilutes src into v, dispatching on the precision of src (double and single
+  // precision only). local_block supplies the block extents checked for QUDA_DILUTION_BLOCK.
+  void spinorDilute(std::vector<ColorSpinorField> &v, const ColorSpinorField &src, QudaDilutionType type,
+                    const lat_dim_t &local_block)
+  {
+    switch (src.Precision()) {
+    case QUDA_DOUBLE_PRECISION: spinorDilute<double>(src, v, type, local_block); break;
+    case QUDA_SINGLE_PRECISION: spinorDilute<float>(src, v, type, local_block); break;
+    default: errorQuda("Not instantiated %d\n", src.Precision());
+    }
+  }
+
+} // namespace quda
diff --git a/lib/staggered_coarse_op.in.cpp b/lib/staggered_coarse_op.in.cpp
index 9560f79b58..bab3e1ffba 100644
--- a/lib/staggered_coarse_op.in.cpp
+++ b/lib/staggered_coarse_op.in.cpp
@@ -7,8 +7,8 @@ namespace quda
   };
 
   template <int fineColor, int coarseColor, int... N>
-  void StaggeredCoarseOp2(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge,
-                          const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
+  void StaggeredCoarseOp2(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge,
+                          const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
                           QudaDiracType dirac, QudaMatPCType matpc, IntList<coarseColor, N...>)
   {
     if (Y.Ncolor() / 2 == coarseColor) {
@@ -24,8 +24,8 @@ namespace quda
   }
 
   template <int fineColor, int... N>
-  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge,
-                         const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
+  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge,
+                         const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
                          QudaDiracType dirac, QudaMatPCType matpc, IntList<fineColor, N...>)
   {
     if (gauge.Ncolor() == fineColor) {
@@ -43,8 +43,8 @@ namespace quda
     }
   }
 
-  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge,
-                         const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
+  void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge,
+                         const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation,
                          QudaDiracType dirac, QudaMatPCType matpc)
   {
     if constexpr (is_enabled_spin(1) && is_enabled_multigrid()) {
diff --git a/lib/staggered_coarse_op.in.cu b/lib/staggered_coarse_op.in.cu
index 103b242655..cec28e4f2e 100644
--- a/lib/staggered_coarse_op.in.cu
+++ b/lib/staggered_coarse_op.in.cu
@@ -306,8 +306,8 @@ namespace quda {
   constexpr int coarseColor = @QUDA_MULTIGRID_NVEC@;
 
   template <>
-  void StaggeredCoarseOp<fineColor, coarseColor>(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge,
-                                                 const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass,
+  void StaggeredCoarseOp<fineColor, coarseColor>(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge,
+                                                 const GaugeField &longGauge, const GaugeField &XinvKD, double mass,
                                                  bool allow_truncation, QudaDiracType dirac, QudaMatPCType matpc)
   {
     QudaPrecision precision = checkPrecision(T.Vectors(X.Location()), X, Y);
@@ -351,13 +351,13 @@ namespace quda {
       gf_param.nFace = 1;
       gf_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
 
-      tmp_U = std::make_unique<cpuGaugeField>(gf_param);
+      tmp_U = std::make_unique<GaugeField>(gf_param);
       need_tmp_U = true;
 
       //Copy the cuda gauge field to the cpu
-      gauge.saveCPUField(reinterpret_cast<cpuGaugeField&>(*tmp_U));
+      tmp_U.get()->copy(gauge);
 
-            // Create either a real or a dummy L field
+      // Create either a real or a dummy L field
       GaugeFieldParam lgf_param(longGauge.X(), precision, QUDA_RECONSTRUCT_NO, pad, longGauge.Geometry());
       if (!(dirac == QUDA_ASQTAD_DIRAC || dirac == QUDA_ASQTADKD_DIRAC))
         for (int i = 0; i < lgf_param.nDim; i++) lgf_param.x[i] = 0;
@@ -373,12 +373,12 @@ namespace quda {
       lgf_param.nFace = 3;
       lgf_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
 
-      tmp_L = std::make_unique<cpuGaugeField>(lgf_param);
+      tmp_L = std::make_unique<GaugeField>(lgf_param);
       need_tmp_L = true;
 
       //Copy the cuda gauge field to the cpu
       if (dirac == QUDA_ASQTAD_DIRAC || dirac == QUDA_ASQTADPC_DIRAC || dirac == QUDA_ASQTADKD_DIRAC)
-        longGauge.saveCPUField(reinterpret_cast<cpuGaugeField&>(*tmp_L));
+        tmp_L.get()->copy(longGauge);
 
       // Create either a real or a dummy Xinv field
       GaugeFieldParam xgf_param(XinvKD.X(), precision, QUDA_RECONSTRUCT_NO, pad, XinvKD.Geometry());
@@ -400,7 +400,7 @@ namespace quda {
       xgf_param.nFace = 0;
       xgf_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
 
-      tmp_Xinv = std::make_unique<cpuGaugeField>(xgf_param);
+      tmp_Xinv = std::make_unique<GaugeField>(xgf_param);
       need_tmp_Xinv = true;
 
       //Copy the cuda gauge field to the cpu
@@ -419,7 +419,7 @@ namespace quda {
         lgf_param.order = QUDA_FLOAT2_GAUGE_ORDER;
         lgf_param.setPrecision(lgf_param.Precision());
         lgf_param.create = QUDA_NULL_FIELD_CREATE;
-        tmp_L = std::make_unique<cudaGaugeField>(lgf_param);
+        tmp_L = std::make_unique<GaugeField>(lgf_param);
         need_tmp_L = true;
       } else if ((dirac == QUDA_ASQTAD_DIRAC || dirac == QUDA_ASQTADPC_DIRAC || dirac == QUDA_ASQTADKD_DIRAC) && longGauge.Reconstruct() != QUDA_RECONSTRUCT_NO) {
         // create a copy of the gauge field with no reconstruction
@@ -427,7 +427,7 @@ namespace quda {
         lgf_param.reconstruct = QUDA_RECONSTRUCT_NO;
         lgf_param.order = QUDA_FLOAT2_GAUGE_ORDER;
         lgf_param.setPrecision(lgf_param.Precision());
-        tmp_L = std::make_unique<cudaGaugeField>(lgf_param);
+        tmp_L = std::make_unique<GaugeField>(lgf_param);
 
         tmp_L->copy(longGauge);
         tmp_L->exchangeGhost();
@@ -443,7 +443,7 @@ namespace quda {
         xgf_param.order = QUDA_FLOAT2_GAUGE_ORDER;
         xgf_param.setPrecision(xgf_param.Precision());
         xgf_param.create = QUDA_NULL_FIELD_CREATE;
-        tmp_Xinv = std::make_unique<cudaGaugeField>(xgf_param);
+        tmp_Xinv = std::make_unique<GaugeField>(xgf_param);
         need_tmp_Xinv = true;
       }
       // no need to worry about XinvKD's reconstruct
@@ -454,7 +454,7 @@ namespace quda {
         gf_param.reconstruct = QUDA_RECONSTRUCT_NO;
         gf_param.order = QUDA_FLOAT2_GAUGE_ORDER;
         gf_param.setPrecision(gf_param.Precision());
-        tmp_U = std::make_unique<cudaGaugeField>(gf_param);
+        tmp_U = std::make_unique<GaugeField>(gf_param);
         need_tmp_U = true;
 
         tmp_U->copy(gauge);
diff --git a/lib/staggered_kd_apply_xinv.cu b/lib/staggered_kd_apply_xinv.cu
index 60e9034663..247668cb1c 100644
--- a/lib/staggered_kd_apply_xinv.cu
+++ b/lib/staggered_kd_apply_xinv.cu
@@ -22,7 +22,7 @@ namespace quda {
       Xinv(Xinv),
       dagger(dagger)
     {
-      if (out.V() == in.V()) errorQuda("Spinor fields cannot alias");
+      if (out.data() == in.data()) errorQuda("Spinor fields cannot alias");
       if (in.Nspin() != 1 || out.Nspin() != 1) errorQuda("Unsupported nSpin=%d %d", out.Nspin(), in.Nspin());
       if (Xinv.Geometry() != QUDA_KDINVERSE_GEOMETRY)
         errorQuda("Unsupported gauge geometry %d , expected %d for Xinv", Xinv.Geometry(), QUDA_KDINVERSE_GEOMETRY);
diff --git a/lib/staggered_kd_build_xinv.cu b/lib/staggered_kd_build_xinv.cu
index b109bc2388..4494b75e1b 100644
--- a/lib/staggered_kd_build_xinv.cu
+++ b/lib/staggered_kd_build_xinv.cu
@@ -58,7 +58,7 @@ namespace quda {
       // reset scales as appropriate
       if constexpr (sizeof(Float) < QUDA_SINGLE_PRECISION) {
         double max_scale = g.abs_max();
-        if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Global U_max = %e\n", max_scale);
+        logQuda(QUDA_VERBOSE, "Global U_max = %e\n", max_scale);
         X.Scale(max_scale > 2.0*mass ? max_scale : 2.0*mass);
       }
 
@@ -113,7 +113,8 @@ namespace quda {
      @param mass[in] Mass of staggered fermion
      @param dagger_approximation[in] Whether or not to use the dagger approximation, using the dagger of X instead of Xinv
    */
-  void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const cudaGaugeField &gauge, const double mass, const bool dagger_approximation)
+  void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const GaugeField &gauge, const double mass,
+                                        const bool dagger_approximation)
   {
     using namespace blas_lapack;
     auto invert = use_native() ? native::BatchInvertMatrix : generic::BatchInvertMatrix;
@@ -154,13 +155,7 @@ namespace quda {
       gParam.geometry = QUDA_SCALAR_GEOMETRY;
       gParam.pad = 0;
 
-      if (location == QUDA_CUDA_FIELD_LOCATION)
-        xInvMilcOrder = std::make_unique<cudaGaugeField>(gParam);
-      else if (location == QUDA_CPU_FIELD_LOCATION)
-        xInvMilcOrder = std::make_unique<cpuGaugeField>(gParam);
-      else
-        errorQuda("Invalid field location %d", location);
-
+      xInvMilcOrder = std::make_unique<GaugeField>(gParam);
     }
 
     // Step 2: build a host or device gauge field as appropriate, but
@@ -190,10 +185,10 @@ namespace quda {
         gf_param.nFace = 1;
         gf_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
 
-        tmp_U = std::make_unique<cpuGaugeField>(gf_param);
+        tmp_U = std::make_unique<GaugeField>(gf_param);
 
         //Copy the cuda gauge field to the cpu
-        gauge.saveCPUField(reinterpret_cast<cpuGaugeField&>(*tmp_U));
+        tmp_U.get()->copy(gauge);
 
       } else if (location == QUDA_CUDA_FIELD_LOCATION) {
 
@@ -202,7 +197,7 @@ namespace quda {
         gf_param.reconstruct = QUDA_RECONSTRUCT_NO;
         gf_param.order = QUDA_FLOAT2_GAUGE_ORDER; // guaranteed for no recon
         gf_param.setPrecision( QUDA_SINGLE_PRECISION );
-        tmp_U = std::make_unique<cudaGaugeField>(gf_param);
+        tmp_U = std::make_unique<GaugeField>(gf_param);
 
         tmp_U->copy(gauge);
       }
@@ -216,18 +211,16 @@ namespace quda {
     if (location == QUDA_CUDA_FIELD_LOCATION) {
       x_param.order = QUDA_FLOAT2_GAUGE_ORDER;
       x_param.setPrecision(x_param.Precision());
-      tmp_X = std::make_unique<cudaGaugeField>(x_param);
-    } else {
-      tmp_X = std::make_unique<cpuGaugeField>(x_param);
     }
+    tmp_X = std::make_unique<GaugeField>(x_param);
     GaugeField& X = *tmp_X;
 
     // Step 4: Calculate X from U
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Computing the KD block on the %s\n", location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU");
+    logQuda(QUDA_VERBOSE, "Computing the KD block on the %s\n", location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU");
 
     calculateStaggeredKDBlock(X, U, mass);
 
-    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("X2 = %e\n", X.norm2(0));
+    logQuda(QUDA_VERBOSE, "X2 = %e\n", X.norm2(0));
 
     // Step 5: Calculate Xinv
     if (dagger_approximation) {
@@ -241,34 +234,33 @@ namespace quda {
         GaugeFieldParam param(*xInvMilcOrder);
         param.order = QUDA_MILC_GAUGE_ORDER; // MILC order == QDP order for Xinv
         param.setPrecision(QUDA_SINGLE_PRECISION);
-        cudaGaugeField X_(param);
-        
+        GaugeField X_(param);
+
         X_.copy(X);
 
-        blas::flops += invert((void*)xInvMilcOrder->Gauge_p(), (void*)X_.Gauge_p(), n, X_.Volume(), X_.Precision(), X.Location());
+        Tunable::flops_global(invert(xInvMilcOrder->data(), X_.data(), n, X_.Volume(), X_.Precision(), X.Location())
+                              + Tunable::flops_global());
 
       } else if (location == QUDA_CPU_FIELD_LOCATION) {
-
-        blas::flops += invert((void*)xInvMilcOrder->Gauge_p(), (void*)X.Gauge_p(), n, X.Volume(), X.Precision(), X.Location());
+        Tunable::flops_global(invert(xInvMilcOrder->data(), X.data(), n, X.Volume(), X.Precision(), X.Location())
+                              + Tunable::flops_global());
       }
 
-      if (getVerbosity() >= QUDA_VERBOSE) printfQuda("xInvMilcOrder = %e\n", xInvMilcOrder->norm2(0));
-
+      logQuda(QUDA_VERBOSE, "xInvMilcOrder = %e\n", xInvMilcOrder->norm2(0));
     }
 
     // Step 6: reorder the KD inverse into a "gauge field" with a QUDA_KDINVERSE_GEOMETRY
     // last two parameters: dagger approximation, mass (which becomes a scale in the dagger approx)
     ReorderStaggeredKahlerDiracInverse(Xinv, *xInvMilcOrder, dagger_approximation, mass);
 
-    if (getVerbosity() >= QUDA_VERBOSE) {
-      if (dagger_approximation) printfQuda("Using the dagger approximation to Xinv\n");
-      printfQuda("xInvKdGeometry = %e\n", Xinv.norm2());
-    }
+    if (dagger_approximation) logQuda(QUDA_VERBOSE, "Using the dagger approximation to Xinv\n");
+    logQuda(QUDA_VERBOSE, "xInvKdGeometry = %e\n", Xinv.norm2());
   }
 
 
   // Allocates and calculates the inverse KD block, returning Xinv
-  std::shared_ptr<GaugeField> AllocateAndBuildStaggeredKahlerDiracInverse(const cudaGaugeField &gauge, const double mass, const bool dagger_approximation)
+  std::shared_ptr<GaugeField> AllocateAndBuildStaggeredKahlerDiracInverse(const GaugeField &gauge, const double mass,
+                                                                          const bool dagger_approximation)
   {
     GaugeFieldParam gParam(gauge);
     gParam.reconstruct = QUDA_RECONSTRUCT_NO;
@@ -282,7 +274,7 @@ namespace quda {
     // latter true is to force FLOAT2
     gParam.setPrecision(gauge.Precision(), true);
 
-    std::shared_ptr<GaugeField> Xinv(reinterpret_cast<GaugeField*>(new cudaGaugeField(gParam)));
+    std::shared_ptr<GaugeField> Xinv(reinterpret_cast<GaugeField *>(new GaugeField(gParam)));
 
     BuildStaggeredKahlerDiracInverse(*Xinv, gauge, mass, dagger_approximation);
 
diff --git a/lib/staggered_oprod.cu b/lib/staggered_oprod.cu
index e1b6040568..5899b73916 100644
--- a/lib/staggered_oprod.cu
+++ b/lib/staggered_oprod.cu
@@ -83,8 +83,16 @@ namespace quda {
       }
     } // apply
 
-    void preTune() { U.backup(); if (U.Gauge_p() != L.Gauge_p()) L.backup(); }
-    void postTune() { U.restore(); if (U.Gauge_p() != L.Gauge_p()) L.restore(); }
+    void preTune()
+    { // back up U (and L) before tuning; presumably called by the autotuner ahead of its trial launches
+      U.backup();
+      if (U.data() != L.data()) L.backup(); // U and L may share one buffer; then the U backup already covers it
+    }
+    void postTune()
+    { // counterpart of preTune(): restore the backed-up contents of U (and L)
+      U.restore();
+      if (U.data() != L.data()) L.restore(); // same aliasing test as preTune(), so backup/restore calls stay paired
+    }
 
     long long flops() const { return 0; } // FIXME
     long long bytes() const { return 0; } // FIXME
@@ -106,6 +114,7 @@ namespace quda {
 #ifdef GPU_STAGGERED_DIRAC
   void computeStaggeredOprod(GaugeField *out[], ColorSpinorField& in, const double coeff[], int nFace)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
     if (nFace == 1) {
       computeStaggeredOprod(*out[0], *out[0], in.Even(), in.Odd(), 0, coeff, nFace);
       double coeff_[2] = {-coeff[0],0.0}; // need to multiply by -1 on odd sites
@@ -116,6 +125,7 @@ namespace quda {
     } else {
       errorQuda("Invalid nFace=%d", nFace);
     }
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 #else // GPU_STAGGERED_DIRAC not defined
   void computeStaggeredOprod(GaugeField *[], ColorSpinorField &, const double [], int)
diff --git a/lib/staggered_two_link_quda.cu b/lib/staggered_two_link_quda.cu
index 8dce83c997..87182fbcab 100644
--- a/lib/staggered_two_link_quda.cu
+++ b/lib/staggered_two_link_quda.cu
@@ -53,10 +53,12 @@ namespace quda
 #if defined(GPU_STAGGERED_DIRAC) && defined(GPU_TWOLINK_GSMEAR)
   void computeTwoLink(GaugeField &newTwoLink, const GaugeField &link)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE); // time the whole call under the currently active profile
     checkNative(newTwoLink, link);
     checkLocation(newTwoLink, link);
     checkPrecision(newTwoLink, link);
-    instantiate<ComputeTwoLink, ReconstructNone>(link, newTwoLink);//FIXME : enable link-12/8 reconstruction  
+    instantiate<ComputeTwoLink, ReconstructNone>(link, newTwoLink); // FIXME: enable link-12/8 reconstruction
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); // matches the TPSTART at the top of the function
   }
 #else
   void computeTwoLink(GaugeField &, const GaugeField &)
diff --git a/lib/targets/cuda/malloc.cpp b/lib/targets/cuda/malloc.cpp
index 7988dc6479..2b0d3c97ba 100644
--- a/lib/targets/cuda/malloc.cpp
+++ b/lib/targets/cuda/malloc.cpp
@@ -7,6 +7,7 @@
 #include <quda_internal.h>
 #include <device.h>
 #include <shmem_helper.cuh>
+#include "timer.h"
 
 #ifdef USE_QDPJIT
 #include "qdp_cache.h"
diff --git a/lib/targets/cuda/quda_api.cpp b/lib/targets/cuda/quda_api.cpp
index 85094480f2..b0a4b7cc4f 100644
--- a/lib/targets/cuda/quda_api.cpp
+++ b/lib/targets/cuda/quda_api.cpp
@@ -325,6 +325,14 @@ namespace quda
     QudaMem copy(dst, src, count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line);
   }
 
+  void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func,
+                   const char *file, const char *line)
+  { // quda_ptr overload: copy count bytes from src to dst using the raw pointers returned by data()
+    if (count == 0) return; // zero-byte copy: nothing to do
+    QudaMem copy(dst.data(), src.data(), count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func,
+                 file, line); // default stream; `false` is presumably the async flag (the *Async_ variants pass true)
+  }
+
   void qudaMemcpyAsync_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const qudaStream_t &stream,
                         const char *func, const char *file, const char *line)
   {
@@ -372,6 +380,16 @@ namespace quda
     QudaMem set(ptr, value, count, device::get_default_stream(), false, func, file, line);
   }
 
+  void qudaMemset_(quda_ptr &ptr, int value, size_t count, const char *func, const char *file, const char *line)
+  { // quda_ptr overload: set count bytes starting at ptr to value
+    if (count == 0) return; // nothing to set
+    if (ptr.is_device()) { // device memory: go through QudaMem on the default stream
+      QudaMem set(ptr.data(), value, count, device::get_default_stream(), false, func, file, line);
+    } else { // not device memory: plain memset
+      memset(ptr.data(), value, count);
+    }
+  }
+
   void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
                         const char *file, const char *line)
   {
@@ -379,18 +397,27 @@ namespace quda
     QudaMem copy(ptr, value, count, stream, true, func, file, line);
   }
 
-  void qudaMemset2D_(void *ptr, size_t pitch, int value, size_t width, size_t height, const char *func,
-                     const char *file, const char *line)
+  void qudaMemsetAsync_(quda_ptr &ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
+                        const char *file, const char *line) // quda_ptr overload: set count bytes at ptr to value
   {
-    cudaError_t error = cudaMemset2D(ptr, pitch, value, width, height);
-    set_runtime_error(error, __func__, func, file, line);
+    if (count == 0) return; // nothing to set
+    if (ptr.is_device()) { // device memory: hand the memset to QudaMem together with the caller's stream
+      QudaMem set(ptr.data(), value, count, stream, true, func, file, line);
+    } else { // not device memory: plain memset, which runs immediately and does not use stream
+      memset(ptr.data(), value, count);
+    }
   }
 
-  void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream,
-                          const char *func, const char *file, const char *line)
+  void qudaMemset2DAsync_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height,
+                          const qudaStream_t &stream, const char *func, const char *file, const char *line)
   {
-    cudaError_t error = cudaMemset2DAsync(ptr, pitch, value, width, height, get_stream(stream));
-    set_runtime_error(error, __func__, func, file, line);
+    if (ptr.is_device()) { // device memory: 2-d memset starting offset bytes past ptr, issued on stream
+      cudaError_t error
+        = cudaMemset2DAsync(static_cast<char *>(ptr.data()) + offset, pitch, value, width, height, get_stream(stream));
+      set_runtime_error(error, __func__, func, file, line);
+    } else { // not device memory: set width bytes in each of height rows, pitch bytes apart; size_t index matches height
+      for (size_t i = 0; i < height; i++) memset(static_cast<char *>(ptr.data()) + offset + i * pitch, value, width);
+    }
   }
 
   void qudaMemPrefetchAsync_(void *ptr, size_t count, QudaFieldLocation mem_space, const qudaStream_t &stream,
diff --git a/lib/targets/hip/blas_lapack_hipblas.cpp b/lib/targets/hip/blas_lapack_hipblas.cpp
index d3bff08720..573d5396c1 100644
--- a/lib/targets/hip/blas_lapack_hipblas.cpp
+++ b/lib/targets/hip/blas_lapack_hipblas.cpp
@@ -2,7 +2,7 @@
 #include <blas_lapack.h>
 #include <timer.h>
 #ifdef NATIVE_LAPACK_LIB
-#include <hipblas.h>
+#include <hipblas/hipblas.h>
 #include <malloc_quda.h>
 #endif
 
diff --git a/lib/targets/hip/malloc.cpp b/lib/targets/hip/malloc.cpp
index acda672d85..6fd0986b94 100644
--- a/lib/targets/hip/malloc.cpp
+++ b/lib/targets/hip/malloc.cpp
@@ -528,12 +528,15 @@ namespace quda
       errorQuda("hipPointerGetAttributes returned error: %s\n", hipGetErrorString(error));
     }
 
-    switch (attr.memoryType) {
+    switch (attr.type) {
+#if HIP_VERSION_MAJOR >= 6
+    case hipMemoryTypeUnregistered: return QUDA_CPU_FIELD_LOCATION;
+#endif  // HIP_VERSION_MAJOR >= 6
     case hipMemoryTypeHost: return QUDA_CPU_FIELD_LOCATION;
     case hipMemoryTypeDevice: return QUDA_CUDA_FIELD_LOCATION;
     case hipMemoryTypeArray: return QUDA_CUDA_FIELD_LOCATION;
     case hipMemoryTypeUnified: return QUDA_CUDA_FIELD_LOCATION; ///< Not used currently
-    default: errorQuda("Unknown memory type %d\n", attr.memoryType); return QUDA_INVALID_FIELD_LOCATION;
+    default: errorQuda("Unknown memory type %d\n", attr.type); return QUDA_INVALID_FIELD_LOCATION;
     }
   }
 
diff --git a/lib/targets/hip/quda_api.cpp b/lib/targets/hip/quda_api.cpp
index 9191ec16a3..8fff9a75a5 100644
--- a/lib/targets/hip/quda_api.cpp
+++ b/lib/targets/hip/quda_api.cpp
@@ -261,6 +261,14 @@ namespace quda
     QudaMem copy(dst, src, count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line);
   }
 
+  void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func,
+                   const char *file, const char *line)
+  { // quda_ptr overload: copy count bytes from src to dst using the raw pointers returned by data()
+    if (count == 0) return; // zero-byte copy: nothing to do
+    QudaMem copy(dst.data(), src.data(), count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func,
+                 file, line); // default stream; `false` is presumably the async flag (the *Async_ variants pass true)
+  }
+
   void qudaMemcpyAsync_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const qudaStream_t &stream,
                         const char *func, const char *file, const char *line)
   {
@@ -288,6 +296,16 @@ namespace quda
     QudaMem set(ptr, value, count, device::get_default_stream(), false, func, file, line);
   }
 
+  void qudaMemset_(quda_ptr &ptr, int value, size_t count, const char *func, const char *file, const char *line)
+  { // quda_ptr overload: set count bytes starting at ptr to value
+    if (count == 0) return; // nothing to set
+    if (ptr.is_device()) { // device memory: go through QudaMem on the default stream
+      QudaMem set(ptr.data(), value, count, device::get_default_stream(), false, func, file, line);
+    } else { // not device memory: plain memset
+      memset(ptr.data(), value, count);
+    }
+  }
+
   void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
                         const char *file, const char *line)
   {
@@ -295,18 +313,27 @@ namespace quda
     QudaMem copy(ptr, value, count, stream, true, func, file, line);
   }
 
-  void qudaMemset2D_(void *ptr, size_t pitch, int value, size_t width, size_t height, const char *func,
-                     const char *file, const char *line)
+  void qudaMemsetAsync_(quda_ptr &ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
+                        const char *file, const char *line) // quda_ptr overload: set count bytes at ptr to value
   {
-    hipError_t error = hipMemset2D(ptr, pitch, value, width, height);
-    set_runtime_error(error, __func__, func, file, line);
+    if (count == 0) return; // nothing to set
+    if (ptr.is_device()) { // device memory: hand the memset to QudaMem together with the caller's stream
+      QudaMem set(ptr.data(), value, count, stream, true, func, file, line);
+    } else { // not device memory: plain memset, which runs immediately and does not use stream
+      memset(ptr.data(), value, count);
+    }
   }
 
-  void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream,
-                          const char *func, const char *file, const char *line)
+  void qudaMemset2DAsync_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height,
+                          const qudaStream_t &stream, const char *func, const char *file, const char *line)
   {
-    hipError_t error = hipMemset2DAsync(ptr, pitch, value, width, height, get_stream(stream));
-    set_runtime_error(error, __func__, func, file, line);
+    if (ptr.is_device()) { // device memory: 2-d memset starting offset bytes past ptr, issued on stream
+      hipError_t error
+        = hipMemset2DAsync(static_cast<char *>(ptr.data()) + offset, pitch, value, width, height, get_stream(stream));
+      set_runtime_error(error, __func__, func, file, line);
+    } else { // not device memory: set width bytes in each of height rows, pitch bytes apart; size_t index matches height
+      for (size_t i = 0; i < height; i++) memset(static_cast<char *>(ptr.data()) + offset + i * pitch, value, width);
+    }
   }
 
   void qudaMemPrefetchAsync_(void *, size_t, QudaFieldLocation, const qudaStream_t &, const char *, const char *,
@@ -315,41 +342,6 @@ namespace quda
     // No prefetch
   }
 
-#if 0
-  bool qudaEventQuery_(qudaEvent_t &quda_event, const char *func, const char *file, const char *line)
-  {
-    cudaEvent_t &event = reinterpret_cast<cudaEvent_t&>(quda_event.event);
-#ifdef USE_DRIVER_API
-    PROFILE(CUresult error = cuEventQuery(event), QUDA_PROFILE_EVENT_QUERY);
-    switch (error) {
-    case CUDA_SUCCESS: return true;
-    case CUDA_ERROR_NOT_READY: return false;
-    default: set_driver_error(error, __func__, func, file, line);
-    }
-#else
-    PROFILE(cudaError_t error = cudaEventQuery(event), QUDA_PROFILE_EVENT_QUERY);
-    switch (error) {
-    case cudaSuccess: return true;
-    case cudaErrorNotReady: return false;
-    default: set_runtime_error(error, __func__, func, file, line);
-    }
-#endif
-    return false;
-  }
-
-  void qudaEventRecord_(qudaEvent_t &quda_event, qudaStream_t stream, const char *func, const char *file, const char *line)
-  {
-    cudaEvent_t &event = reinterpret_cast<cudaEvent_t&>(quda_event.event);
-#ifdef USE_DRIVER_API
-    PROFILE(CUresult error = cuEventRecord(event, get_stream(stream)), QUDA_PROFILE_EVENT_RECORD);
-    set_driver_error(error, __func__, func, file, line);
-#else
-    PROFILE(cudaError_t error = cudaEventRecord(event, get_stream(stream)), QUDA_PROFILE_EVENT_RECORD);
-    set_runtime_error(error, __func__, func, file, line);
-#endif
-  }
-#endif
-
   bool qudaEventQuery_(qudaEvent_t &quda_event, const char *func, const char *file, const char *line)
   {
     hipEvent_t &event = reinterpret_cast<hipEvent_t &>(quda_event.event);
diff --git a/lib/timer.cpp b/lib/timer.cpp
index e8e427fd74..d77365ec34 100644
--- a/lib/timer.cpp
+++ b/lib/timer.cpp
@@ -1,10 +1,17 @@
+#include <stack>
 #include <quda_internal.h>
 #include <timer.h>
+#include <tune_quda.h>
+
+#ifdef INTERFACE_NVTX
+#include "nvtx3/nvToolsExt.h"
+#endif
 
 namespace quda {
 
   /**< Print out the profile information */
-  void TimeProfile::Print() {
+  void TimeProfile::Print()
+  {
     if (profile[QUDA_PROFILE_TOTAL].time > 0.0) {
       printfQuda("\n   %20s Total time = %9.3f secs\n", fname.c_str(), profile[QUDA_PROFILE_TOTAL].time);
     }
@@ -30,7 +37,6 @@ namespace quda {
       warningQuda("Accounted time %9.3f secs in %s is greater than total time %9.3f secs", accounted,
                   (const char *)&fname[0], profile[QUDA_PROFILE_TOTAL].time);
     }
-
   }
 
   std::string TimeProfile::pname[] = {"download",
@@ -78,9 +84,112 @@ namespace quda {
   const int TimeProfile::nvtx_num_colors = sizeof(nvtx_colors)/sizeof(uint32_t);
 #endif
 
-  Timer<> TimeProfile::global_profile[QUDA_PROFILE_COUNT];
-  bool TimeProfile::global_switchOff[QUDA_PROFILE_COUNT] = {};
-  int TimeProfile::global_total_level[QUDA_PROFILE_COUNT] = {};
+  // global timer
+  host_timer_t global_profile[QUDA_PROFILE_COUNT] = {};
+  static bool global_switchOff[QUDA_PROFILE_COUNT] = {};
+  static int global_total_level[QUDA_PROFILE_COUNT] = {};
+
+  void TimeProfile::StopGlobal(const char *func, const char *file, int line, QudaProfileType idx)
+  { // leave one nesting level of global timer idx; the timer itself stops when the level count reaches zero
+    global_total_level[idx]--;
+    if (global_total_level[idx] == 0) global_profile[idx].stop(func, file, line);
+
+    // StartGlobal sets global_switchOff[idx] when it takes an extra level on an idle timer; release that level too
+    if (global_switchOff[idx]) {
+      global_total_level[idx]--;
+      if (global_total_level[idx] == 0) global_profile[idx].stop(func, file, line);
+      global_switchOff[idx] = false;
+    }
+  }
+
+  void TimeProfile::StartGlobal(const char *func, const char *file, int line, QudaProfileType idx)
+  { // enter one nesting level of global timer idx, starting the timer if it is not already running
+    // if this timer isn't running, start it and take an extra level that StopGlobal releases via global_switchOff
+    if (!global_profile[idx].running) {
+      global_profile[idx].start(func, file, line);
+      global_total_level[idx]++;
+      global_switchOff[idx] = true;
+    }
+
+    if (global_total_level[idx] == 0) global_profile[idx].start(func, file, line); // level is >= 1 if the branch above ran
+    global_total_level[idx]++;
+  }
+
+#ifdef INTERFACE_NVTX
+
+#define PUSH_RANGE(name, cid)                                                                                          \
+  {                                                                                                                    \
+    int color_id = cid;                                                                                                \
+    color_id = color_id % nvtx_num_colors;                                                                             \
+    nvtxEventAttributes_t eventAttrib = {};                                                                            \
+    eventAttrib.version = NVTX_VERSION;                                                                                \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;                                                                  \
+    eventAttrib.colorType = NVTX_COLOR_ARGB;                                                                           \
+    eventAttrib.color = nvtx_colors[color_id];                                                                         \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;                                                                 \
+    eventAttrib.message.ascii = name;                                                                                  \
+    eventAttrib.category = cid;                                                                                        \
+    nvtxRangePushEx(&eventAttrib);                                                                                     \
+  }
+#define POP_RANGE nvtxRangePop();
+#else
+#define PUSH_RANGE(name, cid)
+#define POP_RANGE
+#endif
+
+  static std::stack<QudaProfileType> pt_stack;
+
+  void TimeProfile::Start_(const char *func, const char *file, int line, QudaProfileType idx)
+  { // start timer idx, suspending any other running timer of this profile onto pt_stack until Stop_ resumes it
+    // if total timer isn't running, then start it running (switchOff tells Stop_ to stop it again)
+    if (!profile[QUDA_PROFILE_TOTAL].running && idx != QUDA_PROFILE_TOTAL) {
+      profile[QUDA_PROFILE_TOTAL].start(func, file, line);
+      switchOff = true;
+    }
+
+    // if a timer is already running, stop it and push to stack (the last profile entry is not scanned)
+    for (auto i = 0; i < QUDA_PROFILE_COUNT - 1; i++) {
+      if (i == static_cast<int>(idx)) continue; // never suspend the timer we are about to start
+      if (profile[i].running) {
+        if (i == QUDA_PROFILE_COMPUTE || i == QUDA_PROFILE_H2D || i == QUDA_PROFILE_D2H) qudaDeviceSynchronize();
+        profile[i].stop(func, file, line); // argument order (func, file, line), as in every other start/stop call
+        if (use_global) StopGlobal(func, file, line, static_cast<QudaProfileType>(i));
+        pt_stack.push(static_cast<QudaProfileType>(i)); // remembered so that Stop_ can restart it
+      }
+    }
+
+    profile[idx].start(func, file, line);
+    PUSH_RANGE(fname.c_str(), idx) // expands to nothing unless INTERFACE_NVTX is defined
+    if (use_global) StartGlobal(func, file, line, idx);
+  }
+
+  void TimeProfile::Stop_(const char *func, const char *file, int line, QudaProfileType idx)
+  { // stop timer idx, then resume the most recently suspended timer from pt_stack, if there is one
+    if (idx == QUDA_PROFILE_COMPUTE || idx == QUDA_PROFILE_H2D || idx == QUDA_PROFILE_D2H)
+      qudaDeviceSynchronize(); // ensure accurate profiling
+    profile[idx].stop(func, file, line);
+    POP_RANGE
+
+    if (pt_stack.empty()) { // NOTE(review): pt_stack is a single file-level stack, not one per TimeProfile instance
+      // switch off total timer if we need to (only if no timer being popped)
+      if (switchOff && idx != QUDA_PROFILE_TOTAL) {
+        profile[QUDA_PROFILE_TOTAL].stop(func, file, line);
+        switchOff = false;
+      }
+      if (use_global) StopGlobal(func, file, line, idx); // NOTE(review): skipped when a timer is popped below - confirm intended
+    }
+
+    // restore any pre-existing timers if needed (one stack entry is popped per call)
+    if (!pt_stack.empty()) {
+      auto i = pt_stack.top();
+      pt_stack.pop();
+      profile[i].start(func, file, line);
+      if (use_global) StartGlobal(func, file, line, i);
+    }
+  }
+
+#undef PUSH_RANGE
+#undef POP_RANGE
 
   void TimeProfile::PrintGlobal() {
     if (global_profile[QUDA_PROFILE_TOTAL].time > 0.0) {
@@ -113,4 +222,33 @@ namespace quda {
     }
   }
 
+  TimeProfile dummy("default", false);
+
+  static std::stack<TimeProfile *> tp_stack;
+
+  pushProfile::pushProfile(TimeProfile &profile, double &secs, double &gflops) :
+    profile(profile), secs(secs), gflops(gflops), flops(Tunable::flops_global()) // flops: counter value at entry
+
+  { // start the profile's total timer and make this profile the one returned by getProfile()
+    profile.TPSTART(QUDA_PROFILE_TOTAL);
+    tp_stack.push(&profile);
+  }
+
+  pushProfile::~pushProfile()
+  { // pop this profile off tp_stack, stop its total timer, and write the elapsed time and flop count to secs/gflops
+    if (tp_stack.empty()) errorQuda("popProfile() called with empty stack");
+    auto &profile = *(tp_stack.top());
+    if (&(this->profile) != &profile) errorQuda("Popped profile is not the expected one"); // pushes and pops must nest
+    tp_stack.pop();
+    profile.TPSTOP(QUDA_PROFILE_TOTAL);
+    secs = profile.Last(QUDA_PROFILE_TOTAL);
+    gflops = (Tunable::flops_global() - flops) * 1e-9; // global flop counter growth since construction, scaled by 1e-9
+    if (&gflops != &gflops_dummy) comm_allreduce_sum(gflops); // gflops_dummy: presumably the placeholder default target
+  }
+
+  TimeProfile &getProfile()
+  { // return the profile on top of tp_stack (the most recently pushed pushProfile that is still alive)
+    if (tp_stack.empty()) return dummy; // nothing pushed: fall back to the file-level "default" profile
+    return *(tp_stack.top());
+  }
 }
diff --git a/lib/tune.cpp b/lib/tune.cpp
index 57134ec3d4..7a7b551be5 100644
--- a/lib/tune.cpp
+++ b/lib/tune.cpp
@@ -644,21 +644,65 @@ namespace quda
     }
   }
 
-  TuneParam::TuneParam() :
-    block(device::warp_size(), 1, 1),
-    grid(1, 1, 1),
-    shared_bytes(0),
-    set_max_shared_bytes(false),
-    aux(),
-    time(FLT_MAX),
-    n_calls(0)
+  TuneParam::TuneParam() : block(device::warp_size(), 1, 1) { } // only block is set here; NOTE(review): other members presumably use in-class defaults - confirm
+
+  std::ostream &operator<<(std::ostream &output, const TuneParam &param) // print block, grid, shared_bytes and aux
   {
-    aux = make_int4(1, 1, 1, 1);
+    output << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), ";
+    output << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), ";
+    output << "shared_bytes=" << param.shared_bytes;
+    output << ", aux=(" << param.aux.x << "," << param.aux.y << "," << param.aux.z << "," << param.aux.w << ")";
+    return output; // return the stream so insertions can be chained
+  }
+
+  bool Tunable::tuneSharedBytes() const
+  { // true unless the environment variable QUDA_ENABLE_TUNING_SHARED is exactly "0"
+    static bool tune_shared = true;
+    static bool init = false; // the environment is read only on the first call; the result is cached in tune_shared
+
+    if (!init) {
+      char *enable_shared_env = getenv("QUDA_ENABLE_TUNING_SHARED");
+      if (enable_shared_env) {
+        if (strcmp(enable_shared_env, "0") == 0) { tune_shared = false; }
+      }
+      init = true;
+    }
+    return tune_shared;
   }
 
   int Tunable::blockStep() const { return device::warp_size(); }
   int Tunable::blockMin() const { return device::warp_size(); }
 
+  bool Tunable::tuned() const
+  { // report whether this tunable needs no tuning: either tuning is off or its key is already in the tune cache
+    // not tuning is equivalent to already tuned
+    if (!getTuning()) return true;
+
+    TuneKey key = tuneKey();
+    if (use_managed_memory()) strcat(key.aux, ",managed"); // appended in place; assumes key.aux has room - TODO confirm
+    // if key is present in cache then already tuned
+    return getTuneCache().find(key) != getTuneCache().end();
+  }
+
+  std::string Tunable::paramString(const TuneParam &param) const
+  { // format param as text using the TuneParam stream-insertion operator
+    std::stringstream ps;
+    ps << param;
+    return ps.str();
+  }
+
+  std::string Tunable::perfString(float time) const
+  { // format flops()/time and bytes()/time as "<x> Gflop/s, <y> GB/s"; time is presumably in seconds - TODO confirm
+    float gflops = flops() / (1e9 * time); // no guard against time == 0
+    float gbytes = bytes() / (1e9 * time);
+    std::stringstream ss;
+    ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, ";
+    ss << gbytes << " GB/s"; // fixed/precision(2) set above remain in effect on ss
+    return ss.str();
+  }
+
+  std::string Tunable::miscString(const TuneParam &) const { return std::string(); } // returns an empty string; the argument is unused
+
   int32_t Tunable::getTuneRank() const
   {
     static bool init = false;
@@ -804,6 +848,7 @@ namespace quda
     TuneKey key = tunable.tuneKey();
     if (use_managed_memory()) strcat(key.aux, ",managed");
     last_key = key;
+    bool is_policy = strncmp(key.aux, "policy,", 7) == 0 ? true : false;
 
 #ifdef LAUNCH_TIMER
     launchTimer.TPSTOP(QUDA_PROFILE_INIT);
@@ -846,6 +891,10 @@ namespace quda
         trace_list.push_back(trace_entry);
       }
 
+      if (!is_policy) {
+        Tunable::flops_global(Tunable::flops_global() + tunable.flops()); // increment flops counter
+        Tunable::bytes_global(Tunable::bytes_global() + tunable.bytes()); // increment bytes counter
+      }
       return param_tuned;
     }
 
@@ -864,6 +913,10 @@ namespace quda
       logQuda(QUDA_DEBUG_VERBOSE, "Launching %s with %s at vol=%s with %s (untuned)\n", key.name, key.aux, key.volume,
               tunable.paramString(param_default).c_str());
 
+      if (!is_policy) {
+        Tunable::flops_global(Tunable::flops_global() + tunable.flops()); // increment flops counter
+        Tunable::bytes_global(Tunable::bytes_global() + tunable.bytes()); // increment bytes counter
+      }
       return param_default;
     } else if (!tuning) {
 
@@ -1077,6 +1130,10 @@ namespace quda
 
     param.n_calls = profile_count ? 1 : 0;
 
+    if (!is_policy) {
+      Tunable::flops_global(Tunable::flops_global() + tunable.flops()); // increment flops counter
+      Tunable::bytes_global(Tunable::bytes_global() + tunable.bytes()); // increment bytes counter
+    }
     return param;
   }
 
diff --git a/lib/unitarize_force_quda.cu b/lib/unitarize_force_quda.cu
index 29b315d2ef..84b94a0d54 100644
--- a/lib/unitarize_force_quda.cu
+++ b/lib/unitarize_force_quda.cu
@@ -56,6 +56,7 @@ namespace quda {
     void unitarizeForce(GaugeField &newForce, const GaugeField &oldForce, const GaugeField &u,
 			int* fails)
     {
+      getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
       checkReconstruct(u, oldForce, newForce);
       checkPrecision(u, oldForce, newForce);
 
@@ -63,6 +64,7 @@ namespace quda {
         errorQuda("Only native order supported");
 
       instantiate<ForceUnitarize, ReconstructNone>(newForce, oldForce, u, fails);
+      getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     }
 #else
     void unitarizeForce(GaugeField &, const GaugeField &, const GaugeField &, int*)
diff --git a/lib/unitarize_links_quda.cu b/lib/unitarize_links_quda.cu
index fa006f0b4a..8bddafea55 100644
--- a/lib/unitarize_links_quda.cu
+++ b/lib/unitarize_links_quda.cu
@@ -60,14 +60,14 @@ namespace quda {
     for (unsigned int i = 0; i < infield.Volume(); ++i) {
       for (int dir=0; dir<4; ++dir){
 	if (infield.Precision() == QUDA_SINGLE_PRECISION) {
-	  copyArrayToLink(inlink, ((float*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
-	  unitarizeLinkNewton(outlink, inlink, max_iter_newton);
-	  copyLinkToArray(((float*)(outfield.Gauge_p()) + (i*4 + dir)*18), outlink);
-	} else if (infield.Precision() == QUDA_DOUBLE_PRECISION) {
-	  copyArrayToLink(inlink, ((double*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
-	  unitarizeLinkNewton(outlink, inlink, max_iter_newton);
-	  copyLinkToArray(((double*)(outfield.Gauge_p()) + (i*4 + dir)*18), outlink);
-	} // precision?
+          copyArrayToLink(inlink, infield.data<float *>() + (i * 4 + dir) * 18); // order of arguments?
+          unitarizeLinkNewton(outlink, inlink, max_iter_newton);
+          copyLinkToArray(outfield.data<float *>() + (i * 4 + dir) * 18, outlink);
+        } else if (infield.Precision() == QUDA_DOUBLE_PRECISION) {
+          copyArrayToLink(inlink, infield.data<double *>() + (i * 4 + dir) * 18); // order of arguments?
+          unitarizeLinkNewton(outlink, inlink, max_iter_newton);
+          copyLinkToArray(outfield.data<double *>() + (i * 4 + dir) * 18, outlink);
+        } // precision?
       } // dir
     }   // loop over volume
   }
@@ -81,13 +81,13 @@ namespace quda {
     for (unsigned int i = 0; i < field.Volume(); ++i) {
       for (int dir=0; dir<4; ++dir) {
 	if (field.Precision() == QUDA_SINGLE_PRECISION) {
-	  copyArrayToLink(link, ((float*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
-	} else if (field.Precision() == QUDA_DOUBLE_PRECISION) {
-	  copyArrayToLink(link, ((double*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments?
-	} else {
-	  errorQuda("Unsupported precision\n");
-	}
-	if (link.isUnitary(max_error) == false) {
+          copyArrayToLink(link, field.data<float *>() + (i * 4 + dir) * 18); // order of arguments?
+        } else if (field.Precision() == QUDA_DOUBLE_PRECISION) {
+          copyArrayToLink(link, field.data<double *>() + (i * 4 + dir) * 18); // order of arguments?
+        } else {
+          errorQuda("Unsupported precision\n");
+        }
+        if (link.isUnitary(max_error) == false) {
 	  printf("Unitarity failure\n");
 	  printf("site index = %u,\t direction = %d\n", i, dir);
 	  printLink(link);
@@ -125,9 +125,12 @@ namespace quda {
                         UnitarizeArg<Float, nColor, recon>(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error));
     }
 
-    void preTune() { if (in.Gauge_p() == out.Gauge_p()) out.backup(); }
+    void preTune()
+    {
+      if (in.data() == out.data()) out.backup(); // in and out share one data pointer: back up out (postTune restores it)
+    }
     void postTune() {
-      if (in.Gauge_p() == out.Gauge_p()) out.restore();
+      if (in.data() == out.data()) out.restore(); // same shared-pointer condition under which preTune called backup()
       qudaMemset(fails, 0, sizeof(int)); // reset fails counter
     }
 
@@ -138,8 +141,10 @@ namespace quda {
 
   void unitarizeLinks(GaugeField& out, const GaugeField &in, int* fails)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE); // paired with the TPSTOP on the last line of this function
     checkPrecision(out, in);
     instantiate<UnitarizeLinks, ReconstructNo12>(out, in, fails);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
   void unitarizeLinks(GaugeField &links, int* fails) { unitarizeLinks(links, links, fails); }
@@ -179,11 +184,13 @@ namespace quda {
 
   void projectSU3(GaugeField &u, double tol, int *fails)
   {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE); // region covers the phase check and the instantiate call below
     // check the the field doesn't have staggered phases applied
     if (u.StaggeredPhaseApplied())
       errorQuda("Cannot project gauge field with staggered phases applied");
 
     instantiate<ProjectSU3>(u, tol, fails);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
   }
 
 } // namespace quda
diff --git a/lib/util_quda.cpp b/lib/util_quda.cpp
index 02a78834d0..93b9fb4627 100644
--- a/lib/util_quda.cpp
+++ b/lib/util_quda.cpp
@@ -134,17 +134,17 @@ void popOutputPrefix()
 
 char *getPrintBuffer() { return buffer_; }
 
-char* getOmpThreadStr() {
-  static char omp_thread_string[128];
+const char *getOmpThreadStr()
+{
+  static std::string omp_thread_string; // stays empty unless QUDA_OPENMP is defined
   static bool init = false;
   if (!init) {
-    strcpy(omp_thread_string,"omp_threads=");
-    char *omp_threads = getenv("OMP_NUM_THREADS");
-    strcat(omp_thread_string, omp_threads ? omp_threads : "1");
-    strcat(omp_thread_string, ",");
+#ifdef QUDA_OPENMP
+    omp_thread_string = std::string("omp_threads=" + std::to_string(omp_get_max_threads()) + ","); // built once, on the first call
+#endif
     init = true;
   }
-  return omp_thread_string;
+  return omp_thread_string.c_str(); // points into the function-local static string declared above
 }
 
 void errorQuda_(const char *func, const char *file, int line, ...)
diff --git a/lib/vector_io.cpp b/lib/vector_io.cpp
index b247177876..ed469750f7 100644
--- a/lib/vector_io.cpp
+++ b/lib/vector_io.cpp
@@ -52,7 +52,7 @@ namespace quda
       std::vector<void *> V(Nvec * Ls);
       for (int i = 0; i < Nvec; i++) {
         auto &v = create_tmp ? tmp[i] : vecs[i];
-        for (int j = 0; j < Ls; j++) { V[i * Ls + j] = static_cast<char *>(v.V()) + j * stride; }
+        for (int j = 0; j < Ls; j++) { V[i * Ls + j] = v.data<char *>() + j * stride; }
       }
 
       // time loading
@@ -137,7 +137,7 @@ namespace quda
       std::vector<const void *> V(Nvec * Ls);
       for (int i = 0; i < Nvec; i++) {
         auto &v = create_tmp ? tmp[i] : vecs[i];
-        for (int j = 0; j < Ls; j++) { V[i * Ls + j] = static_cast<const char *>(v.V()) + j * stride; }
+        for (int j = 0; j < Ls; j++) { V[i * Ls + j] = v.data<const char *>() + j * stride; }
       }
 
       // time saving
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 6e086db09f..7184a71b9d 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -9,6 +9,15 @@ target_compile_options(
   $<IF:$<CONFIG:RELEASE>,-w,-Wall -Wextra>
   $<$<CONFIG:STRICT>:-Werror>
 )
+
+# ignore any unknown pragmas if not using OpenMP (if() dereferences QUDA_OPENMP itself, so an unset value is simply false)
+if(NOT QUDA_OPENMP)
+  target_compile_options(quda_test PUBLIC $<$<COMPILE_LANGUAGE:CXX>:
+    $<$<CXX_COMPILER_ID:Clang>:-Wno-unknown-pragmas>
+    $<$<CXX_COMPILER_ID:GNU>:-Wno-unknown-pragmas>
+    >)
+endif()
+
 if(BUILD_SHARED_LIBS)
   install(TARGETS quda_test ${QUDA_EXCLUDE_FROM_INSTALL} DESTINATION ${CMAKE_INSTALL_LIBDIR})
 endif()
@@ -270,8 +279,9 @@ if(QUDA_MPI OR QUDA_QMP)
   if(DEFINED ENV{QUDA_TEST_GRID_SIZE})
     get_test_ranks($ENV{QUDA_TEST_GRID_SIZE} QUDA_TEST_NUM_PROCS)
   endif()
-  set(QUDA_CTEST_LAUNCH "${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${QUDA_TEST_NUM_PROCS} ${MPIEXEC_PREFLAGS}"
-      CACHE STRING "CTest Launcher command for QUDA's tests")
+  message(STATUS "ctest will run on ${QUDA_TEST_NUM_PROCS} processes")
+  set(QUDA_CTEST_LAUNCH ${MPIEXEC_EXECUTABLE};${MPIEXEC_NUMPROC_FLAG};${QUDA_TEST_NUM_PROCS};${MPIEXEC_PREFLAGS}
+    CACHE STRING "CTest Launcher command for QUDA's tests")
 endif()
 
 # BLAS tests
@@ -380,6 +390,18 @@ foreach(pol IN LISTS DSLASH_POLICIES)
       set_tests_properties(dslash_${DIRAC_NAME}_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
     endif()
 
+    add_test(NAME dslash_${DIRAC_NAME}_splitgrid_policy${pol2}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --all-partitions 0
+                     --test Dslash
+                     --dim 2 4 6 8
+                     --gtest_output=xml:dslash_${DIRAC_NAME}_splitgrid_test_pol${pol2}.xml)
+    if(polenv) # APPEND is used below because a plain property set would replace the whole ENVIRONMENT list
+      set_property(TEST dslash_${DIRAC_NAME}_splitgrid_policy${pol2} APPEND PROPERTY ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
+    endif()
+    set_property(TEST dslash_${DIRAC_NAME}_splitgrid_policy${pol2} APPEND PROPERTY ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
+
     add_test(NAME benchmark_dslash_${DIRAC_NAME}_policy${pol2}
              COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
                      --dslash-type ${DIRAC_NAME}
@@ -835,6 +857,17 @@ endif()
       set_tests_properties(dslash_${DIRAC_NAME}_mat_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2})
     endif()
 
+    add_test(NAME dslash_${DIRAC_NAME}_splitgrid_policy${pol2}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --test MatPC
+                     --dim 2 4 6 8
+                     --gtest_output=xml:dslash_${DIRAC_NAME}_splitgrid_test_pol${pol2}.xml) # report file named after this test
+    if(polenv) # APPEND is used below because a plain property set would replace the whole ENVIRONMENT list
+      set_property(TEST dslash_${DIRAC_NAME}_splitgrid_policy${pol2} APPEND PROPERTY ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2})
+    endif()
+    set_property(TEST dslash_${DIRAC_NAME}_splitgrid_policy${pol2} APPEND PROPERTY ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
+
     add_test(NAME benchmark_dslash_${DIRAC_NAME}_policy${pol2}
     COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_dslash_ctest> ${MPIEXEC_POSTFLAGS}
             --dslash-type ${DIRAC_NAME}
@@ -936,6 +969,20 @@ foreach(prec IN LISTS TEST_PRECS)
       --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
       --enable-testing true
       --gtest_output=xml:invert_test_wilson_${prec}.xml)
+
+      if(DEFINED ENV{QUDA_ENABLE_TUNING}) # outer guard: the inner test expands $ENV{...} textually, so it must be set
+        if($ENV{QUDA_ENABLE_TUNING} EQUAL 0) # environment is read when CMake configures; test is registered only for the value 0
+          add_test(NAME invert_test_splitgrid_wilson_${prec}
+            COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:invert_test> ${MPIEXEC_POSTFLAGS}
+            --dslash-type wilson --ngcrkrylov 8
+            --dim 2 4 6 8 --prec ${prec} --tol ${tol} --tolhq ${tol} --niter 1000
+            --nsrc ${QUDA_TEST_NUM_PROCS}
+            --enable-testing true
+            --gtest_output=xml:invert_test_splitgrid_wilson_${prec}.xml)
+
+          set_tests_properties(invert_test_splitgrid_wilson_${prec} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
+        endif() # QUDA_ENABLE_TUNING equals 0
+      endif() # QUDA_ENABLE_TUNING is defined in the environment
   endif()
   
   if(QUDA_DIRAC_TWISTED_MASS)
@@ -1267,9 +1314,7 @@ foreach(prec IN LISTS TEST_PRECS)
              COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:unitarize_link_test> ${MPIEXEC_POSTFLAGS}
                      --dim 2 4 6 8 --prec ${prec}
                      --gtest_output=xml:unitarize_link_test_${prec}.xml)
-  endif()
 
-  if(QUDA_FORCE_HISQ)
     add_test(NAME hisq_paths_force_${prec}
              COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:hisq_paths_force_test> ${MPIEXEC_POSTFLAGS}
                      --dim 2 4 6 8 --prec ${prec}
@@ -1289,7 +1334,7 @@ foreach(prec IN LISTS TEST_PRECS)
   if (TARGET dilution_test)
     add_test(NAME dilution_test_${prec}
              COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dilution_test> ${MPIEXEC_POSTFLAGS}
-                     --dim 4 6 8 10 --prec ${prec}
+                     --dim 4 6 8 10 --dilution-block-size 4 6 4 5 --prec ${prec}
                      --gtest_output=xml:dilution_test_${prec}.xml)
   endif()
 
diff --git a/tests/TMCloverForce_test.cpp b/tests/TMCloverForce_test.cpp
index 1aae9c2502..7557a83139 100644
--- a/tests/TMCloverForce_test.cpp
+++ b/tests/TMCloverForce_test.cpp
@@ -60,15 +60,15 @@ void init(int argc, char **argv)
     setInvertParam(inv_param);
   }
   inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;
-  inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
-  inv_param.cuda_prec_refinement_sloppy = QUDA_DOUBLE_PRECISION;
-  inv_param.cuda_prec_precondition = QUDA_HALF_PRECISION;
-  inv_param.cuda_prec_eigensolver = QUDA_HALF_PRECISION;
+  //inv_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
+  //inv_param.cuda_prec_refinement_sloppy = QUDA_DOUBLE_PRECISION;
+  //inv_param.cuda_prec_precondition = QUDA_HALF_PRECISION;
+  //inv_param.cuda_prec_eigensolver = QUDA_HALF_PRECISION;
   // inv_param.clover_location = QUDA_CUDA_FIELD_LOCATION;
-  inv_param.clover_cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
-  inv_param.clover_cuda_prec_refinement_sloppy = QUDA_DOUBLE_PRECISION;
-  inv_param.clover_cuda_prec_precondition = QUDA_HALF_PRECISION;
-  inv_param.clover_cuda_prec_eigensolver = QUDA_HALF_PRECISION;
+  //inv_param.clover_cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
+  //inv_param.clover_cuda_prec_refinement_sloppy = QUDA_DOUBLE_PRECISION;
+  //inv_param.clover_cuda_prec_precondition = QUDA_HALF_PRECISION;
+  //inv_param.clover_cuda_prec_eigensolver = QUDA_HALF_PRECISION;
 
   if (inv_deflate) {
     setEigParam(eig_param);
@@ -179,26 +179,28 @@ void TMCloverForce_test()
   // param.reconstruct = QUDA_RECONSTRUCT_NO;
 
   param.create = QUDA_ZERO_FIELD_CREATE;
-  quda::cpuGaugeField Mom_milc(param);
-  quda::cpuGaugeField Mom_ref_milc(param);
+  quda::GaugeField Mom_milc(param);
+  quda::GaugeField Mom_ref_milc(param);
 
   param.order = QUDA_QDP_GAUGE_ORDER;
-  quda::cpuGaugeField Mom_qdp(param);
+  quda::GaugeField Mom_qdp(param);
 
   // initialize some data in cpuMom
-  createMomCPU(Mom_ref_milc.Gauge_p(), gauge_param.cpu_prec, 0.0);
+  createMomCPU(Mom_ref_milc.data(), gauge_param.cpu_prec, 0.0);
   if (gauge_order == QUDA_MILC_GAUGE_ORDER) Mom_milc.copy(Mom_ref_milc);
   if (gauge_order == QUDA_QDP_GAUGE_ORDER) Mom_qdp.copy(Mom_ref_milc);
 
   void *mom = nullptr;
+  void *mom_array[QUDA_MAX_DIM];
   // void *sitelink = nullptr;
 
   if (gauge_order == QUDA_MILC_GAUGE_ORDER) {
     // sitelink = U_milc.Gauge_p();
-    mom = Mom_milc.Gauge_p();
+    mom = Mom_milc.data();
   } else if (gauge_order == QUDA_QDP_GAUGE_ORDER) {
     // sitelink = U_qdp.Gauge_p();
-    mom = Mom_qdp.Gauge_p();
+    for (int d = 0; d < 4; d++) mom_array[d] = Mom_qdp.data(d);
+    mom = reinterpret_cast<void *>(mom_array);
   } else {
     errorQuda("Unsupported gauge order %d", gauge_order);
   }
@@ -236,7 +238,7 @@ void TMCloverForce_test()
     for (int n = 0; n < Nsrc; n++) {
       out_multishift[n * multishift + i] = quda::ColorSpinorField(cs_param);
       spinorNoise(out_multishift[n * multishift + i], rng, QUDA_NOISE_GAUSS);
-      in[n][i] = out_multishift[n * multishift + i].V();
+      in[n][i] = out_multishift[n * multishift + i].data();
       ////////////my init
       // double *vin = (double *)in[0][0];
       // for (int x = 0; x < 2 * 4 * 4 * 2 * 24; x++) {
@@ -275,16 +277,16 @@ void TMCloverForce_test()
   // The number comes from CPU implementation in MILC, gauge_force_imp.c
   int flops = 153004;
 
-  void *refmom = Mom_ref_milc.Gauge_p();
+  void *refmom = Mom_ref_milc.data();
   // int *check_out = compute_force ? &force_check : &path_check;
   int *check_out = true ? &force_check : &path_check;
   if (verify_results) {
     
     TMCloverForce_reference(refmom, in[0].data(), coeff, 1, gauge, clover, clover_inv, &gauge_param, &inv_param);
     *check_out
-      = compare_floats(Mom_milc.Gauge_p(), refmom, 4 * V * mom_site_size, getTolerance(cuda_prec), gauge_param.cpu_prec);
+      = compare_floats(Mom_milc.data(), refmom, 4 * V * mom_site_size, getTolerance(cuda_prec), gauge_param.cpu_prec);
     // if (compute_force)
-    strong_check_mom(Mom_milc.Gauge_p(), refmom, 4 * V, gauge_param.cpu_prec);
+    strong_check_mom(Mom_milc.data(), refmom, 4 * V, gauge_param.cpu_prec);
   }
 
   logQuda(QUDA_VERBOSE, "\nComputing momentum action\n");
diff --git a/tests/blas_test.cpp b/tests/blas_test.cpp
index 713ffb5d0a..b2315c3f70 100644
--- a/tests/blas_test.cpp
+++ b/tests/blas_test.cpp
@@ -9,6 +9,8 @@
 #include <host_utils.h>
 #include <command_line_params.h>
 
+#include <tune_quda.h>
+
 // include because of nasty globals used in the tests
 #include <dslash_reference.h>
 
@@ -1152,14 +1154,13 @@ TEST_P(BlasTest, benchmark)
   // do the initial tune
   benchmark(kernel, 1);
 
-  // now rerun with more iterations to get accurate speed measurements
-  quda::blas::flops = 0;
-  quda::blas::bytes = 0;
+  auto flops0 = quda::Tunable::flops_global();
+  auto bytes0 = quda::Tunable::bytes_global();
 
   double secs = benchmark(kernel, niter);
 
-  double gflops = (quda::blas::flops * 1e-9) / (secs);
-  double gbytes = quda::blas::bytes / (secs * 1e9);
+  double gflops = (quda::Tunable::flops_global() - flops0) * 1e-9 / secs;
+  double gbytes = (quda::Tunable::bytes_global() - bytes0) / (secs * 1e9);
   RecordProperty("Gflops", std::to_string(gflops));
   RecordProperty("GBs", std::to_string(gbytes));
   printfQuda("%-31s: Gflop/s = %6.1f, GB/s = %6.1f\n", kernel_map.at(kernel).c_str(), gflops, gbytes);
diff --git a/tests/covdev_test.cpp b/tests/covdev_test.cpp
index 197b1a47ac..d296a553af 100644
--- a/tests/covdev_test.cpp
+++ b/tests/covdev_test.cpp
@@ -25,7 +25,7 @@ using namespace quda;
 QudaGaugeParam gauge_param;
 QudaInvertParam inv_param;
 
-cpuGaugeField *cpuLink = nullptr;
+GaugeField *cpuLink = nullptr;
 
 std::unique_ptr<ColorSpinorField> spinor, spinorOut, spinorRef;
 std::unique_ptr<ColorSpinorField> cudaSpinor, cudaSpinorOut;
@@ -34,8 +34,6 @@ std::unique_ptr<ColorSpinorField> tmp;
 
 void *links[4];
 
-void **ghostLink;
-
 QudaParity parity = QUDA_EVEN_PARITY;
 
 GaugeCovDev *dirac;
@@ -96,8 +94,7 @@ void init(int argc, char **argv)
   // cpuLink is only used for ghost allocation
   GaugeFieldParam cpuParam(gauge_param, links);
   cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-  cpuLink = new cpuGaugeField(cpuParam);
-  ghostLink = cpuLink->Ghost();
+  cpuLink = new GaugeField(cpuParam);
 
   printfQuda("Links sending...");
   loadGaugeQuda(links, &gauge_param);
@@ -164,9 +161,9 @@ void covdevRef(int mu)
   // compare to dslash reference implementation
   printfQuda("Calculating reference implementation...");
 #ifdef MULTI_GPU
-  mat_mg4dir(*spinorRef, links, ghostLink, *spinor, dagger, mu, inv_param.cpu_prec, gauge_param.cpu_prec);
+  mat_mg4dir(*spinorRef, *cpuLink, *spinor, dagger, mu);
 #else
-  mat(spinorRef->V(), links, spinor->V(), dagger, mu, inv_param.cpu_prec, gauge_param.cpu_prec);
+  mat(*spinorRef, *cpuLink, *spinor, dagger, mu);
 #endif
   printfQuda("done.\n");
 }
diff --git a/tests/dilution_test.cpp b/tests/dilution_test.cpp
index 9d7475d114..267f350b7e 100644
--- a/tests/dilution_test.cpp
+++ b/tests/dilution_test.cpp
@@ -42,12 +42,25 @@ TEST_P(DilutionTest, verify)
   ColorSpinorParam param;
   constructWilsonTestSpinorParam(&param, &inv_param, &gauge_param);
   param.siteSubset = site_subset;
+  if (site_subset == QUDA_PARITY_SITE_SUBSET) param.x[0] /= 2;
   param.nSpin = nSpin;
   param.setPrecision(inv_param.cuda_prec, inv_param.cuda_prec, true); // change order to native order
   param.location = QUDA_CUDA_FIELD_LOCATION;
   param.create = QUDA_NULL_FIELD_CREATE;
   ColorSpinorField src(param);
 
+  // compute number of blocks when using block dilution
+  int block_volume = 1;
+  lat_dim_t block_size = {dilution_block_size[0], dilution_block_size[1], dilution_block_size[2], dilution_block_size[3]};
+  if (src.SiteSubset() == QUDA_PARITY_SITE_SUBSET) block_size[0] /= 2;
+  for (int i = 0; i < src.Ndim(); i++) block_volume *= block_size[i];
+  int n_blocks = comm_size() * src.Volume() / block_volume;
+  if (dilution_type == QUDA_DILUTION_BLOCK) {
+    logQuda(QUDA_VERBOSE, "Dilution block size = %d x %d x %d x %d\n", block_size[0], block_size[1], block_size[2],
+            block_size[3]);
+    logQuda(QUDA_VERBOSE, "Number of dilution blocks = %d\n", n_blocks);
+  }
+
   RNG rng(src, 1234);
 
   for (int i = 0; i < Nsrc; i++) {
@@ -59,11 +72,12 @@ TEST_P(DilutionTest, verify)
     case QUDA_DILUTION_COLOR: size = src.Ncolor(); break;
     case QUDA_DILUTION_SPIN_COLOR: size = src.Nspin() * src.Ncolor(); break;
     case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: size = src.Nspin() * src.Ncolor() * src.SiteSubset(); break;
+    case QUDA_DILUTION_BLOCK: size = n_blocks; break;
     default: errorQuda("Invalid dilution type %d", dilution_type);
     }
 
     std::vector<ColorSpinorField> v(size, param);
-    spinorDilute(v, src, dilution_type);
+    spinorDilute(v, src, dilution_type, block_size);
 
     param.create = QUDA_ZERO_FIELD_CREATE;
     ColorSpinorField sum(param);
@@ -85,16 +99,31 @@ TEST_P(DilutionTest, verify)
 using ::testing::Combine;
 using ::testing::Values;
 
+INSTANTIATE_TEST_SUITE_P(WilsonFull, DilutionTest,
+                         Combine(Values(QUDA_FULL_SITE_SUBSET),
+                                 Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR,
+                                        QUDA_DILUTION_SPIN_COLOR_EVEN_ODD, QUDA_DILUTION_BLOCK),
+                                 Values(4)),
+                         [](testing::TestParamInfo<test_t> param) {
+                           return get_dilution_type_str(::testing::get<1>(param.param));
+                         });
+
+INSTANTIATE_TEST_SUITE_P(
+  WilsonParity, DilutionTest,
+  Combine(Values(QUDA_PARITY_SITE_SUBSET),
+          Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, QUDA_DILUTION_BLOCK), Values(4)),
+  [](testing::TestParamInfo<test_t> param) { return get_dilution_type_str(::testing::get<1>(param.param)); });
+
 INSTANTIATE_TEST_SUITE_P(
-  WilsonFull, DilutionTest,
+  CoarseFull, DilutionTest,
   Combine(Values(QUDA_FULL_SITE_SUBSET),
           Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, QUDA_DILUTION_SPIN_COLOR_EVEN_ODD),
-          Values(4)),
+          Values(2)),
   [](testing::TestParamInfo<test_t> param) { return get_dilution_type_str(::testing::get<1>(param.param)); });
 
-INSTANTIATE_TEST_SUITE_P(WilsonParity, DilutionTest,
+INSTANTIATE_TEST_SUITE_P(CoarseParity, DilutionTest,
                          Combine(Values(QUDA_PARITY_SITE_SUBSET),
-                                 Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR), Values(4)),
+                                 Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR), Values(2)),
                          [](testing::TestParamInfo<test_t> param) {
                            return get_dilution_type_str(::testing::get<1>(param.param));
                          });
diff --git a/tests/dslash_ctest.cpp b/tests/dslash_ctest.cpp
index d75d1f3da9..631d640203 100644
--- a/tests/dslash_ctest.cpp
+++ b/tests/dslash_ctest.cpp
@@ -37,10 +37,6 @@ class DslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, in
       return true;
     }
 
-    // work out if test_split_grid is enabled
-    bool test_split_grid = (grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3] > 1);
-    if (::testing::get<2>(GetParam()) > 0 && test_split_grid) { return true; }
-
     const std::array<bool, 16> partition_enabled {true, true, true,  false,  true,  false, false, false,
                                                   true, false, false, false, true, false, true, true};
     if (!ctest_all_partitions && !partition_enabled[::testing::get<2>(GetParam())]) return true;
@@ -68,8 +64,6 @@ class DslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, in
   }
 
 public:
-  DslashTest() : dslash_test_wrapper(dtest_type) { }
-
   virtual void SetUp()
   {
     int prec = ::testing::get<0>(GetParam());
@@ -94,12 +88,20 @@ class DslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, in
     commDimPartitionedReset();
   }
 
-  static void SetUpTestCase() { initQuda(device_ordinal); }
+  static void SetUpTestCase()
+  {
+    initQuda(device_ordinal);
+    DslashTestWrapper::dtest_type = dtest_type; // hand the test type to the wrapper through its static member
+  }
 
   // Per-test-case tear-down.
   // Called after the last test in this test case.
   // Can be omitted if not needed.
-  static void TearDownTestCase() { endQuda(); }
+  static void TearDownTestCase()
+  {
+    DslashTestWrapper::destroy(); // NOTE(review): presumably releases the wrapper's static data before endQuda() - confirm
+    endQuda();
+  }
 };
 
 TEST_P(DslashTest, verify)
diff --git a/tests/dslash_test.cpp b/tests/dslash_test.cpp
index 2eec7d4bba..11f8265e7f 100644
--- a/tests/dslash_test.cpp
+++ b/tests/dslash_test.cpp
@@ -33,8 +33,6 @@ class DslashTest : public ::testing::Test
   }
 
 public:
-  DslashTest() : dslash_test_wrapper(dtest_type) { }
-
   virtual void SetUp()
   {
     dslash_test_wrapper.init_test(argc_copy, argv_copy);
@@ -43,12 +41,20 @@ class DslashTest : public ::testing::Test
 
   virtual void TearDown() { dslash_test_wrapper.end(); }
 
-  static void SetUpTestCase() { initQuda(device_ordinal); }
+  static void SetUpTestCase()
+  {
+    initQuda(device_ordinal);
+    DslashTestWrapper::dtest_type = dtest_type; // hand the test type to the wrapper through its static member
+  }
 
   // Per-test-case tear-down.
   // Called after the last test in this test case.
   // Can be omitted if not needed.
-  static void TearDownTestCase() { endQuda(); }
+  static void TearDownTestCase()
+  {
+    DslashTestWrapper::destroy(); // NOTE(review): presumably releases the wrapper's static data before endQuda() - confirm
+    endQuda();
+  }
 };
 
 TEST_F(DslashTest, benchmark) { dslash_test_wrapper.run_test(niter, /**show_metrics =*/true); }
diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h
index abc1270cb4..3d13d11478 100644
--- a/tests/dslash_test_utils.h
+++ b/tests/dslash_test_utils.h
@@ -25,6 +25,7 @@
 #include <gtest/gtest.h>
 
 #include <color_spinor_field.h>
+#include <tune_quda.h>
 
 using namespace quda;
 
@@ -49,14 +50,14 @@ struct DslashTime {
 struct DslashTestWrapper {
 
   // CPU color spinor fields
-  ColorSpinorField spinor;
-  ColorSpinorField spinorOut;
-  ColorSpinorField spinorRef;
-  ColorSpinorField spinorTmp;
+  static inline ColorSpinorField spinor;
+  static inline ColorSpinorField spinorOut;
+  static inline ColorSpinorField spinorRef;
+  static inline ColorSpinorField spinorTmp;
   // For split grid
-  std::vector<ColorSpinorField> vp_spinor;
-  std::vector<ColorSpinorField> vp_spinorOut;
-  std::vector<ColorSpinorField> vp_spinorRef;
+  static inline std::vector<ColorSpinorField> vp_spinor;
+  static inline std::vector<ColorSpinorField> vp_spinorOut;
+  static inline std::vector<ColorSpinorField> vp_spinorRef;
 
   // CUDA color spinor fields
   ColorSpinorField cudaSpinor;
@@ -68,9 +69,9 @@ struct DslashTestWrapper {
   quda::DiracDomainWall4DPC *dirac_4dpc = nullptr;
 
   // Raw pointers
-  void *hostGauge[4] = {nullptr};
-  void *hostClover = nullptr;
-  void *hostCloverInv = nullptr;
+  static inline void *hostGauge[4] = {nullptr};
+  static inline void *hostClover = nullptr;
+  static inline void *hostCloverInv = nullptr;
 
   // Parameters
   QudaGaugeParam gauge_param;
@@ -78,14 +79,12 @@ struct DslashTestWrapper {
 
   // Test options
   QudaParity parity = QUDA_EVEN_PARITY;
-  dslash_test_type dtest_type = dslash_test_type::Dslash;
-  bool test_split_grid = false;
+  static inline dslash_test_type dtest_type = dslash_test_type::Dslash;
+  static inline bool test_split_grid = false;
   int num_src = 1;
 
   const bool transfer = false;
 
-  DslashTestWrapper(dslash_test_type dtest) : dtest_type(dtest) { }
-
   void init_ctest(int argc, char **argv, int precision, QudaReconstructType link_recon)
   {
     cuda_prec = getPrecision(precision);
@@ -107,7 +106,12 @@ struct DslashTestWrapper {
 
     inv_param.cuda_prec = cuda_prec;
 
-    init(argc, argv);
+    static bool first_time = true;
+    if (first_time) {
+      init_host(argc, argv);
+      first_time = false;
+    }
+    init();
   }
 
   void init_test(int argc, char **argv)
@@ -117,15 +121,16 @@ struct DslashTestWrapper {
     setWilsonGaugeParam(gauge_param);
     setInvertParam(inv_param);
 
-    init(argc, argv);
+    static bool first_time = true;
+    if (first_time) {
+      init_host(argc, argv);
+      first_time = false;
+    }
+    init();
   }
 
-  void init(int argc, char **argv)
+  void init_host(int argc, char **argv)
   {
-    num_src = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
-    test_split_grid = num_src > 1;
-    if (test_split_grid) { dtest_type = dslash_test_type::Dslash; }
-
     if (dslash_type == QUDA_ASQTAD_DSLASH || dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) {
       errorQuda("Asqtad not supported.  Please try staggered_dslash_test instead");
     } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH || dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH
@@ -137,6 +142,13 @@ struct DslashTestWrapper {
       Ls = 1;
     }
 
+    if (inv_param.cpu_prec != gauge_param.cpu_prec) errorQuda("Gauge and spinor CPU precisions must match");
+
+    for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i];
+    num_src = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
+    test_split_grid = num_src > 1;
+    if (test_split_grid) { dtest_type = dslash_test_type::Dslash; }
+
     inv_param.dagger = dagger ? QUDA_DAG_YES : QUDA_DAG_NO;
     inv_param.solve_type = (dtest_type == dslash_test_type::Mat || dtest_type == dslash_test_type::MatDagMat) ?
       QUDA_DIRECT_SOLVE :
@@ -177,8 +189,6 @@ struct DslashTestWrapper {
       }
     }
 
-    if (inv_param.cpu_prec != gauge_param.cpu_prec) errorQuda("Gauge and spinor CPU precisions must match");
-
     // construct input fields
     for (int dir = 0; dir < 4; dir++) hostGauge[dir] = safe_malloc((size_t)V * gauge_site_size * gauge_param.cpu_prec);
 
@@ -186,8 +196,18 @@ struct DslashTestWrapper {
         || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
       hostClover = safe_malloc((size_t)V * clover_site_size * inv_param.clover_cpu_prec);
       hostCloverInv = safe_malloc((size_t)V * clover_site_size * inv_param.clover_cpu_prec);
+
+      if (compute_clover)
+        printfQuda("Computing clover field on GPU\n");
+      else {
+        printfQuda("Sending clover field to GPU\n");
+        constructHostCloverField(hostClover, hostCloverInv, inv_param);
+      }
     }
 
+    printfQuda("Randomizing fields... ");
+    constructHostGaugeField(hostGauge, gauge_param, argc, argv);
+
     ColorSpinorParam csParam;
     csParam.nColor = 3;
     csParam.nSpin = 4;
@@ -199,11 +219,7 @@ struct DslashTestWrapper {
       csParam.x[4] = Ls;
     }
 
-    if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
-      csParam.pc_type = QUDA_5D_PC;
-    } else {
-      csParam.pc_type = QUDA_4D_PC;
-    }
+    csParam.pc_type = dslash_type == QUDA_DOMAIN_WALL_DSLASH ? QUDA_5D_PC : QUDA_4D_PC;
 
     // ndeg_tm
     if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
@@ -235,11 +251,6 @@ struct DslashTestWrapper {
 
     spinor.Source(QUDA_RANDOM_SOURCE);
 
-    inv_param.split_grid[0] = grid_partition[0];
-    inv_param.split_grid[1] = grid_partition[1];
-    inv_param.split_grid[2] = grid_partition[2];
-    inv_param.split_grid[3] = grid_partition[3];
-
     if (test_split_grid) {
       inv_param.num_src = num_src;
       inv_param.num_src_per_sub_partition = 1;
@@ -250,26 +261,18 @@ struct DslashTestWrapper {
       std::fill(vp_spinor.begin(), vp_spinor.end(), spinor);
     }
 
-    csParam.x[0] = gauge_param.X[0];
-
     // set verbosity prior to loadGaugeQuda
     setVerbosity(verbosity);
     inv_param.verbosity = verbosity;
+  }
 
-    printfQuda("Randomizing fields... ");
-    constructHostGaugeField(hostGauge, gauge_param, argc, argv);
-
+  void init()
+  {
     printfQuda("Sending gauge field to GPU\n");
     loadGaugeQuda(hostGauge, &gauge_param);
 
     if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH
         || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) {
-      if (compute_clover)
-        printfQuda("Computing clover field on GPU\n");
-      else {
-        printfQuda("Sending clover field to GPU\n");
-        constructHostCloverField(hostClover, hostCloverInv, inv_param);
-      }
       inv_param.compute_clover = compute_clover;
       inv_param.return_clover = compute_clover;
       inv_param.compute_clover_inverse = true;
@@ -279,28 +282,16 @@ struct DslashTestWrapper {
     }
 
     if (!transfer) {
+      ColorSpinorParam csParam(spinor);
       csParam.location = QUDA_CUDA_FIELD_LOCATION;
       csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
       csParam.setPrecision(inv_param.cuda_prec, inv_param.cuda_prec, true);
 
-      if (inv_param.solution_type == QUDA_MAT_SOLUTION || inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) {
-        csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
-      } else {
-        csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
-        csParam.x[0] /= 2;
-      }
-
       printfQuda("Creating cudaSpinor with nParity = %d\n", csParam.siteSubset);
       cudaSpinor = ColorSpinorField(csParam);
       printfQuda("Creating cudaSpinorOut with nParity = %d\n", csParam.siteSubset);
       cudaSpinorOut = ColorSpinorField(csParam);
 
-      if (inv_param.solution_type == QUDA_MAT_SOLUTION || inv_param.solution_type == QUDA_MATDAG_MAT_SOLUTION) {
-        csParam.x[0] /= 2;
-      }
-
-      csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
-
       printfQuda("Sending spinor field to GPU\n");
       cudaSpinor = spinor;
 
@@ -329,12 +320,28 @@ struct DslashTestWrapper {
         dirac = nullptr;
       }
     }
+  }
+
+  static void destroy() // free host gauge/clover buffers (nulling each pointer) and reset host spinors
+  {
+    for (int dir = 0; dir < 4; dir++)
+      if (hostGauge[dir]) { host_free(hostGauge[dir]); hostGauge[dir] = nullptr; }
 
-    for (int dir = 0; dir < 4; dir++) host_free(hostGauge[dir]);
     if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH
         || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) {
-      host_free(hostClover);
-      host_free(hostCloverInv);
+      if (hostClover) { host_free(hostClover); hostClover = nullptr; }
+      if (hostCloverInv) { host_free(hostCloverInv); hostCloverInv = nullptr; }
+    }
+    // pointers are nulled above so the guards also hold on a repeated call; now reset the host spinor fields
+    spinor = {};
+    spinorOut = {};
+    spinorRef = {};
+    spinorTmp = {};
+    // the per-source spinor vectors are only cleared when split-grid testing is enabled
+    if (test_split_grid) {
+      vp_spinor.clear();
+      vp_spinorOut.clear();
+      vp_spinorRef.clear();
     }
   }
 
@@ -347,52 +354,55 @@ struct DslashTestWrapper {
     if (dslash_type == QUDA_WILSON_DSLASH) {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        wil_dslash(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, inv_param.cpu_prec, gauge_param);
+        wil_dslash(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatPC:
-        wil_matpc(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.matpc_type, inv_param.dagger,
+        wil_matpc(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.matpc_type, inv_param.dagger,
                   inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::Mat:
-        wil_mat(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec, gauge_param);
+        wil_mat(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec,
+                gauge_param);
         break;
       case dslash_test_type::MatPCDagMatPC:
-        wil_matpc(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.matpc_type, inv_param.dagger,
+        wil_matpc(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.matpc_type, inv_param.dagger,
                   inv_param.cpu_prec, gauge_param);
-        wil_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, inv_param.matpc_type, not_dagger,
+        wil_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, inv_param.matpc_type, not_dagger,
                   inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatDagMat:
-        wil_mat(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-        wil_mat(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, not_dagger, inv_param.cpu_prec, gauge_param);
+        wil_mat(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec,
+                gauge_param);
+        wil_mat(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, not_dagger, inv_param.cpu_prec,
+                gauge_param);
         break;
       default: printfQuda("Test type not defined\n"); exit(-1);
       }
     } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        clover_dslash(spinorRef.V(), hostGauge, hostCloverInv, spinor.V(), parity, inv_param.dagger, inv_param.cpu_prec,
-                      gauge_param);
+        clover_dslash(spinorRef.data(), hostGauge, hostCloverInv, spinor.data(), parity, inv_param.dagger,
+                      inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatPC:
-        clover_matpc(spinorRef.V(), hostGauge, hostClover, hostCloverInv, spinor.V(), inv_param.kappa,
+        clover_matpc(spinorRef.data(), hostGauge, hostClover, hostCloverInv, spinor.data(), inv_param.kappa,
                      inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::Mat:
-        clover_mat(spinorRef.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.dagger,
+        clover_mat(spinorRef.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.dagger,
                    inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatPCDagMatPC:
-        clover_matpc(spinorTmp.V(), hostGauge, hostClover, hostCloverInv, spinor.V(), inv_param.kappa,
+        clover_matpc(spinorTmp.data(), hostGauge, hostClover, hostCloverInv, spinor.data(), inv_param.kappa,
                      inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-        clover_matpc(spinorRef.V(), hostGauge, hostClover, hostCloverInv, spinorTmp.V(), inv_param.kappa,
+        clover_matpc(spinorRef.data(), hostGauge, hostClover, hostCloverInv, spinorTmp.data(), inv_param.kappa,
                      inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatDagMat:
-        clover_mat(spinorTmp.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.dagger,
+        clover_mat(spinorTmp.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.dagger,
+                   inv_param.cpu_prec, gauge_param);
+        clover_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa, not_dagger,
                    inv_param.cpu_prec, gauge_param);
-        clover_mat(spinorRef.V(), hostGauge, hostClover, spinorTmp.V(), inv_param.kappa, not_dagger, inv_param.cpu_prec,
-                   gauge_param);
         break;
       default: printfQuda("Test type not defined\n"); exit(-1);
       }
@@ -401,38 +411,39 @@ struct DslashTestWrapper {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
         // My dslash should be the same as the clover dslash
-        clover_dslash(spinorRef.V(), hostGauge, hostCloverInv, spinor.V(), parity, inv_param.dagger, inv_param.cpu_prec,
-                      gauge_param);
+        clover_dslash(spinorRef.data(), hostGauge, hostCloverInv, spinor.data(), parity, inv_param.dagger,
+                      inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatPC:
         // my matpc op
-        cloverHasenbuschTwist_matpc(spinorRef.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa,
-                                    inv_param.mu, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec,
-                                    gauge_param);
+        cloverHasenbuschTwist_matpc(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv,
+                                    inv_param.kappa, inv_param.mu, inv_param.matpc_type, inv_param.dagger,
+                                    inv_param.cpu_prec, gauge_param);
 
         break;
       case dslash_test_type::Mat:
         // my mat
-        cloverHasenbuchTwist_mat(spinorRef.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu,
+        cloverHasenbuchTwist_mat(spinorRef.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu,
                                  inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.matpc_type);
         break;
       case dslash_test_type::MatPCDagMatPC:
         // matpc^\dagger matpc
         // my matpc op
-        cloverHasenbuschTwist_matpc(spinorTmp.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa,
-                                    inv_param.mu, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec,
-                                    gauge_param);
+        cloverHasenbuschTwist_matpc(spinorTmp.data(), hostGauge, spinor.data(), hostClover, hostCloverInv,
+                                    inv_param.kappa, inv_param.mu, inv_param.matpc_type, inv_param.dagger,
+                                    inv_param.cpu_prec, gauge_param);
 
-        cloverHasenbuschTwist_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), hostClover, hostCloverInv, inv_param.kappa,
-                                    inv_param.mu, inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param);
+        cloverHasenbuschTwist_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), hostClover, hostCloverInv,
+                                    inv_param.kappa, inv_param.mu, inv_param.matpc_type, not_dagger, inv_param.cpu_prec,
+                                    gauge_param);
 
         break;
       case dslash_test_type::MatDagMat:
         // my mat
-        cloverHasenbuchTwist_mat(spinorTmp.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu,
+        cloverHasenbuchTwist_mat(spinorTmp.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu,
                                  inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.matpc_type);
-        cloverHasenbuchTwist_mat(spinorRef.V(), hostGauge, hostClover, spinorTmp.V(), inv_param.kappa, inv_param.mu,
-                                 not_dagger, inv_param.cpu_prec, gauge_param, inv_param.matpc_type);
+        cloverHasenbuchTwist_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa,
+                                 inv_param.mu, not_dagger, inv_param.cpu_prec, gauge_param, inv_param.matpc_type);
 
         break;
       default: printfQuda("Test type not defined\n"); exit(-1);
@@ -441,54 +452,54 @@ struct DslashTestWrapper {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET)
-          tm_dslash(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, parity,
-                    inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
+          tm_dslash(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+                    parity, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         else {
-          tm_ndeg_dslash(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon, parity,
-                         inv_param.dagger, inv_param.matpc_type, inv_param.cpu_prec, gauge_param);
+          tm_ndeg_dslash(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
+                         parity, inv_param.dagger, inv_param.matpc_type, inv_param.cpu_prec, gauge_param);
         }
         break;
       case dslash_test_type::MatPC:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET)
-          tm_matpc(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+          tm_matpc(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
                    inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         else {
-          tm_ndeg_matpc(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
+          tm_ndeg_matpc(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
                         inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         }
         break;
       case dslash_test_type::Mat:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET)
-          tm_mat(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+          tm_mat(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
                  inv_param.dagger, inv_param.cpu_prec, gauge_param);
         else {
-          tm_ndeg_mat(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
+          tm_ndeg_mat(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
                       inv_param.dagger, inv_param.cpu_prec, gauge_param);
         }
         break;
       case dslash_test_type::MatPCDagMatPC:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) {
-          tm_matpc(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+          tm_matpc(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
                    inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tm_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+          tm_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
                    inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param);
         } else {
-          tm_ndeg_matpc(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
+          tm_ndeg_matpc(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
                         inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tm_ndeg_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
+          tm_ndeg_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
                         inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param);
         }
         break;
       case dslash_test_type::MatDagMat:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) {
-          tm_mat(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+          tm_mat(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
                  inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tm_mat(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
+          tm_mat(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor,
                  not_dagger, inv_param.cpu_prec, gauge_param);
         } else {
-          tm_ndeg_mat(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
+          tm_ndeg_mat(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
                       inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tm_ndeg_mat(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
+          tm_ndeg_mat(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon,
                       not_dagger, inv_param.cpu_prec, gauge_param);
         }
         break;
@@ -498,54 +509,57 @@ struct DslashTestWrapper {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET)
-          tmc_dslash(spinorRef.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
-                     inv_param.twist_flavor, parity, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec,
-                     gauge_param);
+          tmc_dslash(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa,
+                     inv_param.mu, inv_param.twist_flavor, parity, inv_param.matpc_type, inv_param.dagger,
+                     inv_param.cpu_prec, gauge_param);
         else
-          tmc_ndeg_dslash(spinorRef.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa,
+          tmc_ndeg_dslash(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa,
                           inv_param.mu, inv_param.epsilon, parity, inv_param.matpc_type, inv_param.dagger,
                           inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatPC:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET)
-          tmc_matpc(spinorRef.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
+          tmc_matpc(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
                     inv_param.twist_flavor, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         else
-          tmc_ndeg_matpc(spinorRef.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
-                         inv_param.epsilon, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
+          tmc_ndeg_matpc(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa,
+                         inv_param.mu, inv_param.epsilon, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec,
+                         gauge_param);
         break;
       case dslash_test_type::Mat:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET)
-          tmc_mat(spinorRef.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu,
+          tmc_mat(spinorRef.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu,
                   inv_param.twist_flavor, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         else
-          tmc_ndeg_mat(spinorRef.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu,
+          tmc_ndeg_mat(spinorRef.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu,
                        inv_param.epsilon, inv_param.dagger, inv_param.cpu_prec, gauge_param);
         break;
       case dslash_test_type::MatPCDagMatPC:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) {
-          tmc_matpc(spinorTmp.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
+          tmc_matpc(spinorTmp.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
                     inv_param.twist_flavor, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tmc_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
-                    inv_param.twist_flavor, inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param);
+          tmc_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), hostClover, hostCloverInv, inv_param.kappa,
+                    inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, not_dagger, inv_param.cpu_prec,
+                    gauge_param);
         } else {
-          tmc_ndeg_matpc(spinorTmp.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu,
-                         inv_param.epsilon, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tmc_ndeg_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), hostClover, hostCloverInv, inv_param.kappa,
+          tmc_ndeg_matpc(spinorTmp.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa,
+                         inv_param.mu, inv_param.epsilon, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec,
+                         gauge_param);
+          tmc_ndeg_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), hostClover, hostCloverInv, inv_param.kappa,
                          inv_param.mu, inv_param.epsilon, inv_param.matpc_type, not_dagger, inv_param.cpu_prec,
                          gauge_param);
         }
         break;
       case dslash_test_type::MatDagMat:
         if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) {
-          tmc_mat(spinorTmp.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu,
+          tmc_mat(spinorTmp.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu,
                   inv_param.twist_flavor, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tmc_mat(spinorRef.V(), hostGauge, hostClover, spinorTmp.V(), inv_param.kappa, inv_param.mu,
+          tmc_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa, inv_param.mu,
                   inv_param.twist_flavor, not_dagger, inv_param.cpu_prec, gauge_param);
         } else {
-          tmc_ndeg_mat(spinorTmp.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu,
+          tmc_ndeg_mat(spinorTmp.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu,
                        inv_param.epsilon, inv_param.dagger, inv_param.cpu_prec, gauge_param);
-          tmc_ndeg_mat(spinorRef.V(), hostGauge, hostClover, spinorTmp.V(), inv_param.kappa, inv_param.mu,
+          tmc_ndeg_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa, inv_param.mu,
                        inv_param.epsilon, not_dagger, inv_param.cpu_prec, gauge_param);
         }
         break;
@@ -554,26 +568,26 @@ struct DslashTestWrapper {
     } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        dw_dslash(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                  inv_param.mass);
+        dw_dslash(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
+                  gauge_param, inv_param.mass);
         break;
       case dslash_test_type::MatPC:
-        dw_matpc(spinorRef.V(), hostGauge, spinor.V(), kappa5, inv_param.matpc_type, inv_param.dagger,
+        dw_matpc(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.matpc_type, inv_param.dagger,
                  gauge_param.cpu_prec, gauge_param, inv_param.mass);
         break;
       case dslash_test_type::Mat:
-        dw_mat(spinorRef.V(), hostGauge, spinor.V(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
+        dw_mat(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
                inv_param.mass);
         break;
       case dslash_test_type::MatPCDagMatPC:
-        dw_matpc(spinorTmp.V(), hostGauge, spinor.V(), kappa5, inv_param.matpc_type, inv_param.dagger,
+        dw_matpc(spinorTmp.data(), hostGauge, spinor.data(), kappa5, inv_param.matpc_type, inv_param.dagger,
                  gauge_param.cpu_prec, gauge_param, inv_param.mass);
-        dw_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), kappa5, inv_param.matpc_type, not_dagger,
+        dw_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), kappa5, inv_param.matpc_type, not_dagger,
                  gauge_param.cpu_prec, gauge_param, inv_param.mass);
         break;
       case dslash_test_type::MatDagMat:
-        dw_matdagmat(spinorRef.V(), hostGauge, spinor.V(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                     inv_param.mass);
+        dw_matdagmat(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec,
+                     gauge_param, inv_param.mass);
         break;
       default: printf("Test type not supported for domain wall\n"); exit(-1);
       }
@@ -582,35 +596,35 @@ struct DslashTestWrapper {
       for (int xs = 0; xs < Ls; xs++) kappa_5[xs] = kappa5;
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        dslash_4_4d(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                    inv_param.mass);
+        dslash_4_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
+                    gauge_param, inv_param.mass);
         break;
       case dslash_test_type::M5:
-        dw_dslash_5_4d(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec,
+        dw_dslash_5_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
                        gauge_param, inv_param.mass, true);
         break;
       case dslash_test_type::M5inv:
-        dslash_5_inv(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                     inv_param.mass, kappa_5);
+        dslash_5_inv(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
+                     gauge_param, inv_param.mass, kappa_5);
         break;
       case dslash_test_type::MatPC:
-        dw_4d_matpc(spinorRef.V(), hostGauge, spinor.V(), kappa5, inv_param.matpc_type, inv_param.dagger,
+        dw_4d_matpc(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.matpc_type, inv_param.dagger,
                     gauge_param.cpu_prec, gauge_param, inv_param.mass);
         break;
       case dslash_test_type::Mat:
-        dw_4d_mat(spinorRef.V(), hostGauge, spinor.V(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                  inv_param.mass);
+        dw_4d_mat(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec,
+                  gauge_param, inv_param.mass);
         break;
       case dslash_test_type::MatPCDagMatPC:
-        dw_4d_matpc(spinorTmp.V(), hostGauge, spinor.V(), kappa5, inv_param.matpc_type, inv_param.dagger,
+        dw_4d_matpc(spinorTmp.data(), hostGauge, spinor.data(), kappa5, inv_param.matpc_type, inv_param.dagger,
                     gauge_param.cpu_prec, gauge_param, inv_param.mass);
-        dw_4d_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), kappa5, inv_param.matpc_type, not_dagger,
+        dw_4d_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), kappa5, inv_param.matpc_type, not_dagger,
                     gauge_param.cpu_prec, gauge_param, inv_param.mass);
         break;
       case dslash_test_type::MatDagMat:
-        dw_4d_mat(spinorTmp.V(), hostGauge, spinor.V(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                  inv_param.mass);
-        dw_4d_mat(spinorRef.V(), hostGauge, spinorTmp.V(), kappa5, not_dagger, gauge_param.cpu_prec, gauge_param,
+        dw_4d_mat(spinorTmp.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec,
+                  gauge_param, inv_param.mass);
+        dw_4d_mat(spinorRef.data(), hostGauge, spinorTmp.data(), kappa5, not_dagger, gauge_param.cpu_prec, gauge_param,
                   inv_param.mass);
         break;
       default: printf("Test type not supported for domain wall\n"); exit(-1);
@@ -629,44 +643,44 @@ struct DslashTestWrapper {
       }
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        dslash_4_4d(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                    inv_param.mass);
+        dslash_4_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
+                    gauge_param, inv_param.mass);
         break;
       case dslash_test_type::M5:
-        mdw_dslash_5(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                     inv_param.mass, kappa_5, true);
+        mdw_dslash_5(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
+                     gauge_param, inv_param.mass, kappa_5, true);
         break;
       case dslash_test_type::Dslash4pre:
-        mdw_dslash_4_pre(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec,
+        mdw_dslash_4_pre(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
                          gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5, true);
         break;
       case dslash_test_type::M5inv:
-        mdw_dslash_5_inv(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec,
+        mdw_dslash_5_inv(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
                          gauge_param, inv_param.mass, kappa_mdwf);
         break;
       case dslash_test_type::MatPC:
-        mdw_matpc(spinorRef.V(), hostGauge, spinor.V(), kappa_b, kappa_c, inv_param.matpc_type, inv_param.dagger,
+        mdw_matpc(spinorRef.data(), hostGauge, spinor.data(), kappa_b, kappa_c, inv_param.matpc_type, inv_param.dagger,
                   gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
         break;
       case dslash_test_type::Mat:
-        mdw_mat(spinorRef.V(), hostGauge, spinor.V(), kappa_b, kappa_c, inv_param.dagger, gauge_param.cpu_prec,
+        mdw_mat(spinorRef.data(), hostGauge, spinor.data(), kappa_b, kappa_c, inv_param.dagger, gauge_param.cpu_prec,
                 gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
         break;
       case dslash_test_type::MatPCDagMatPC:
-        mdw_matpc(spinorTmp.V(), hostGauge, spinor.V(), kappa_b, kappa_c, inv_param.matpc_type, inv_param.dagger,
+        mdw_matpc(spinorTmp.data(), hostGauge, spinor.data(), kappa_b, kappa_c, inv_param.matpc_type, inv_param.dagger,
                   gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
-        mdw_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), kappa_b, kappa_c, inv_param.matpc_type, not_dagger,
+        mdw_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), kappa_b, kappa_c, inv_param.matpc_type, not_dagger,
                   gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
         break;
       case dslash_test_type::MatDagMat:
-        mdw_mat(spinorTmp.V(), hostGauge, spinor.V(), kappa_b, kappa_c, inv_param.dagger, gauge_param.cpu_prec,
+        mdw_mat(spinorTmp.data(), hostGauge, spinor.data(), kappa_b, kappa_c, inv_param.dagger, gauge_param.cpu_prec,
                 gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
-        mdw_mat(spinorRef.V(), hostGauge, spinorTmp.V(), kappa_b, kappa_c, not_dagger, gauge_param.cpu_prec,
+        mdw_mat(spinorRef.data(), hostGauge, spinorTmp.data(), kappa_b, kappa_c, not_dagger, gauge_param.cpu_prec,
                 gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
         break;
       case dslash_test_type::MatPCDagMatPCLocal:
         // reference for MdagM local operator
-        mdw_mdagm_local(spinorRef.V(), hostGauge, spinor.V(), kappa_b, kappa_c, inv_param.matpc_type,
+        mdw_mdagm_local(spinorRef.data(), hostGauge, spinor.data(), kappa_b, kappa_c, inv_param.matpc_type,
                         gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
         break;
       default: printf("Test type not supported for Mobius domain wall\n"); exit(-1);
@@ -688,49 +702,49 @@ struct DslashTestWrapper {
       }
       switch (dtest_type) {
       case dslash_test_type::Dslash:
-        dslash_4_4d(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param,
-                    inv_param.mass);
+        dslash_4_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
+                    gauge_param, inv_param.mass);
         break;
       case dslash_test_type::M5:
-        mdw_eofa_m5(spinorRef.V(), spinor.V(), parity, inv_param.dagger, inv_param.mass, inv_param.m5,
+        mdw_eofa_m5(spinorRef.data(), spinor.data(), parity, inv_param.dagger, inv_param.mass, inv_param.m5,
                     (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2,
                     inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift, gauge_param.cpu_prec);
         break;
       case dslash_test_type::Dslash4pre:
-        mdw_dslash_4_pre(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec,
+        mdw_dslash_4_pre(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec,
                          gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5, true);
         break;
       case dslash_test_type::M5inv:
-        mdw_eofa_m5inv(spinorRef.V(), spinor.V(), parity, inv_param.dagger, inv_param.mass, inv_param.m5,
+        mdw_eofa_m5inv(spinorRef.data(), spinor.data(), parity, inv_param.dagger, inv_param.mass, inv_param.m5,
                        (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2,
                        inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift, gauge_param.cpu_prec);
         break;
       case dslash_test_type::Mat:
-        mdw_eofa_mat(spinorRef.V(), hostGauge, spinor.V(), inv_param.dagger, gauge_param.cpu_prec, gauge_param,
+        mdw_eofa_mat(spinorRef.data(), hostGauge, spinor.data(), inv_param.dagger, gauge_param.cpu_prec, gauge_param,
                      inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]),
                      inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift);
         break;
       case dslash_test_type::MatDagMat:
-        mdw_eofa_mat(spinorTmp.V(), hostGauge, spinor.V(), inv_param.dagger, gauge_param.cpu_prec, gauge_param,
+        mdw_eofa_mat(spinorTmp.data(), hostGauge, spinor.data(), inv_param.dagger, gauge_param.cpu_prec, gauge_param,
                      inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]),
                      inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift);
-        mdw_eofa_mat(spinorRef.V(), hostGauge, spinorTmp.V(), not_dagger, gauge_param.cpu_prec, gauge_param,
+        mdw_eofa_mat(spinorRef.data(), hostGauge, spinorTmp.data(), not_dagger, gauge_param.cpu_prec, gauge_param,
                      inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]),
                      inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift);
         break;
       case dslash_test_type::MatPC:
-        mdw_eofa_matpc(spinorRef.V(), hostGauge, spinor.V(), inv_param.matpc_type, inv_param.dagger,
+        mdw_eofa_matpc(spinorRef.data(), hostGauge, spinor.data(), inv_param.matpc_type, inv_param.dagger,
                        gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]),
                        (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm,
                        inv_param.eofa_shift);
         break;
       case dslash_test_type::MatPCDagMatPC:
-        mdw_eofa_matpc(spinorTmp.V(), hostGauge, spinor.V(), inv_param.matpc_type, inv_param.dagger,
+        mdw_eofa_matpc(spinorTmp.data(), hostGauge, spinor.data(), inv_param.matpc_type, inv_param.dagger,
                        gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]),
                        (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm,
                        inv_param.eofa_shift);
-        mdw_eofa_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.matpc_type, not_dagger, gauge_param.cpu_prec,
-                       gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]),
+        mdw_eofa_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.matpc_type, not_dagger,
+                       gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]),
                        (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm,
                        inv_param.eofa_shift);
         break;
@@ -764,8 +778,8 @@ struct DslashTestWrapper {
       std::vector<void *> _hp_x(inv_param.num_src);
       std::vector<void *> _hp_b(inv_param.num_src);
       for (int i = 0; i < inv_param.num_src; i++) {
-        _hp_x[i] = vp_spinorOut[i].V();
-        _hp_b[i] = vp_spinor[i].V();
+        _hp_x[i] = vp_spinorOut[i].data();
+        _hp_b[i] = vp_spinor[i].data();
       }
 
       if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH
@@ -786,21 +800,21 @@ struct DslashTestWrapper {
           switch (dtest_type) {
           case dslash_test_type::Dslash:
             if (transfer) {
-              dslashQuda_4dpc(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type);
+              dslashQuda_4dpc(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type);
             } else {
               static_cast<quda::DiracDomainWall4DPC *>(dirac)->Dslash4(cudaSpinorOut, cudaSpinor, parity);
             }
             break;
           case dslash_test_type::M5:
             if (transfer) {
-              dslashQuda_4dpc(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type);
+              dslashQuda_4dpc(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type);
             } else {
               static_cast<quda::DiracDomainWall4DPC *>(dirac)->Dslash5(cudaSpinorOut, cudaSpinor);
             }
             break;
           case dslash_test_type::M5inv:
             if (transfer) {
-              dslashQuda_4dpc(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type);
+              dslashQuda_4dpc(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type);
             } else {
               static_cast<quda::DiracDomainWall4DPC *>(dirac)->M5inv(cudaSpinorOut, cudaSpinor);
             }
@@ -808,7 +822,7 @@ struct DslashTestWrapper {
           case dslash_test_type::MatPC:
           case dslash_test_type::Mat:
             if (transfer) {
-              MatQuda(spinorOut.V(), spinor.V(), &inv_param);
+              MatQuda(spinorOut.data(), spinor.data(), &inv_param);
             } else {
               dirac->M(cudaSpinorOut, cudaSpinor);
             }
@@ -816,7 +830,7 @@ struct DslashTestWrapper {
           case dslash_test_type::MatPCDagMatPC:
           case dslash_test_type::MatDagMat:
             if (transfer) {
-              MatDagMatQuda(spinorOut.V(), spinor.V(), &inv_param);
+              MatDagMatQuda(spinorOut.data(), spinor.data(), &inv_param);
             } else {
               dirac->MdagM(cudaSpinorOut, cudaSpinor);
             }
@@ -828,28 +842,28 @@ struct DslashTestWrapper {
           switch (dtest_type) {
           case dslash_test_type::Dslash:
             if (transfer) {
-              dslashQuda_mdwf(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type);
+              dslashQuda_mdwf(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type);
             } else {
               static_cast<quda::DiracMobiusPC *>(dirac)->Dslash4(cudaSpinorOut, cudaSpinor, parity);
             }
             break;
           case dslash_test_type::M5:
             if (transfer) {
-              dslashQuda_mdwf(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type);
+              dslashQuda_mdwf(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type);
             } else {
               static_cast<quda::DiracMobiusPC *>(dirac)->Dslash5(cudaSpinorOut, cudaSpinor);
             }
             break;
           case dslash_test_type::Dslash4pre:
             if (transfer) {
-              dslashQuda_mdwf(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type);
+              dslashQuda_mdwf(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type);
             } else {
               static_cast<quda::DiracMobiusPC *>(dirac)->Dslash4pre(cudaSpinorOut, cudaSpinor);
             }
             break;
           case dslash_test_type::M5inv:
             if (transfer) {
-              dslashQuda_mdwf(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type);
+              dslashQuda_mdwf(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type);
             } else {
               static_cast<quda::DiracMobiusPC *>(dirac)->M5inv(cudaSpinorOut, cudaSpinor);
             }
@@ -857,7 +871,7 @@ struct DslashTestWrapper {
           case dslash_test_type::MatPC:
           case dslash_test_type::Mat:
             if (transfer) {
-              MatQuda(spinorOut.V(), spinor.V(), &inv_param);
+              MatQuda(spinorOut.data(), spinor.data(), &inv_param);
             } else {
               dirac->M(cudaSpinorOut, cudaSpinor);
             }
@@ -865,7 +879,7 @@ struct DslashTestWrapper {
           case dslash_test_type::MatPCDagMatPC:
           case dslash_test_type::MatDagMat:
             if (transfer) {
-              MatDagMatQuda(spinorOut.V(), spinor.V(), &inv_param);
+              MatDagMatQuda(spinorOut.data(), spinor.data(), &inv_param);
             } else {
               dirac->MdagM(cudaSpinorOut, cudaSpinor);
             }
@@ -940,13 +954,13 @@ struct DslashTestWrapper {
           case dslash_test_type::Dslash:
             if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
               if (transfer) {
-                dslashQuda(spinorOut.V(), spinor.V(), &inv_param, parity);
+                dslashQuda(spinorOut.data(), spinor.data(), &inv_param, parity);
               } else {
                 dirac->Dslash(cudaSpinorOut, cudaSpinor, parity);
               }
             } else {
               if (transfer) {
-                dslashQuda(spinorOut.V(), spinor.V(), &inv_param, parity);
+                dslashQuda(spinorOut.data(), spinor.data(), &inv_param, parity);
               } else {
                 dirac->Dslash(cudaSpinorOut, cudaSpinor, parity);
               }
@@ -955,7 +969,7 @@ struct DslashTestWrapper {
           case dslash_test_type::MatPC:
           case dslash_test_type::Mat:
             if (transfer) {
-              MatQuda(spinorOut.V(), spinor.V(), &inv_param);
+              MatQuda(spinorOut.data(), spinor.data(), &inv_param);
             } else {
               dirac->M(cudaSpinorOut, cudaSpinor);
             }
@@ -963,7 +977,7 @@ struct DslashTestWrapper {
           case dslash_test_type::MatPCDagMatPC:
           case dslash_test_type::MatDagMat:
             if (transfer) {
-              MatDagMatQuda(spinorOut.V(), spinor.V(), &inv_param);
+              MatDagMatQuda(spinorOut.data(), spinor.data(), &inv_param);
             } else {
               dirac->MdagM(cudaSpinorOut, cudaSpinor);
             }
@@ -995,22 +1009,33 @@ struct DslashTestWrapper {
       printfQuda("Tuning...\n");
       dslashCUDA(1); // warm-up run
     }
+
+    auto flops0 = quda::Tunable::flops_global();
+    auto bytes0 = quda::Tunable::bytes_global();
+
     printfQuda("Executing %d kernel loops...\n", niter);
-    if (!transfer) dirac->Flops();
     DslashTime dslash_time = dslashCUDA(niter);
     printfQuda("done.\n\n");
 
+    unsigned long long flops = (quda::Tunable::flops_global() - flops0);
+    unsigned long long bytes = (quda::Tunable::bytes_global() - bytes0);
+
     if (!test_split_grid) {
       if (!transfer) spinorOut = cudaSpinorOut;
 
       // print timing information
       printfQuda("%fus per kernel call\n", 1e6 * dslash_time.event_time / niter);
-      // FIXME No flops count for twisted-clover yet
-      unsigned long long flops = 0;
-      if (!transfer) flops = dirac->Flops();
-      printfQuda("%llu flops per kernel call, %llu flops per site\n", flops / niter,
-                 (flops / niter) / cudaSpinor.Volume());
-      printfQuda("GFLOPS = %f\n", 1.0e-9 * flops / dslash_time.event_time);
+
+      printfQuda("%llu flops per kernel call, %llu flops per site %llu bytes per site\n", flops / niter,
+                 (flops / niter) / cudaSpinor.Volume(), (bytes / niter) / cudaSpinor.Volume());
+
+      double gflops = 1.0e-9 * flops / dslash_time.event_time;
+      printfQuda("GFLOPS = %f\n", gflops);
+      ::testing::Test::RecordProperty("Gflops", std::to_string(gflops));
+
+      double gbytes = 1.0e-9 * bytes / dslash_time.event_time;
+      printfQuda("GBYTES = %f\n", gbytes);
+      ::testing::Test::RecordProperty("Gbytes", std::to_string(gbytes));
 
       size_t ghost_bytes = cudaSpinor.GhostBytes();
 
diff --git a/tests/eigensolve_test.cpp b/tests/eigensolve_test.cpp
index e22879ff92..7c17540a60 100644
--- a/tests/eigensolve_test.cpp
+++ b/tests/eigensolve_test.cpp
@@ -179,7 +179,7 @@ std::vector<double> eigensolve(test_t test_param)
   // Allocate host side memory and pointers
   for (int i = 0; i < n_eig; i++) {
     evecs[i] = quda::ColorSpinorField(cs_param);
-    host_evecs_ptr[i] = evecs[i].V();
+    host_evecs_ptr[i] = evecs[i].data();
   }
 
   // Complex eigenvalues
@@ -208,12 +208,12 @@ std::vector<double> eigensolve(test_t test_param)
     for (int i = 0; i < eig_n_conv; i++) {
       if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) {
         double _Complex sigma = evals[i];
-        residua[i] = verifyWilsonTypeSingularVector(evecs[i].V(), evecs[i + eig_n_conv].V(), sigma, i, gauge_param,
+        residua[i] = verifyWilsonTypeSingularVector(evecs[i].data(), evecs[i + eig_n_conv].data(), sigma, i, gauge_param,
                                                     eig_param, gauge.data(), clover.data(), clover_inv.data());
 
       } else {
         double _Complex lambda = evals[i];
-        residua[i] = verifyWilsonTypeEigenvector(evecs[i].V(), lambda, i, gauge_param, eig_param, gauge.data(),
+        residua[i] = verifyWilsonTypeEigenvector(evecs[i].data(), lambda, i, gauge_param, eig_param, gauge.data(),
                                                  clover.data(), clover_inv.data());
       }
     }
diff --git a/tests/gauge_alg_test.cpp b/tests/gauge_alg_test.cpp
index adf22a0f30..00ba1f3689 100644
--- a/tests/gauge_alg_test.cpp
+++ b/tests/gauge_alg_test.cpp
@@ -108,7 +108,7 @@ class GaugeAlgTest : public ::testing::Test
           gParam.x[d] += 2 * gParam.r[d];
         }
 
-        U = new cudaGaugeField(gParam);
+        U = new GaugeField(gParam);
 
         RNG randstates(*U, 1234);
 
@@ -160,12 +160,12 @@ class GaugeAlgTest : public ::testing::Test
           for (int d = 0; d < 4; d++)
             if (comm_dim_partitioned(d)) R[d] = 2;
           static TimeProfile GaugeFix("GaugeFix");
-          cudaGaugeField *tmp = new cudaGaugeField(gauge_field_param);
+          GaugeField *tmp = new GaugeField(gauge_field_param);
           tmp->copy(*host);
           U = createExtendedGauge(*tmp, R, GaugeFix);
           delete tmp;
         } else {
-          U = new cudaGaugeField(gauge_field_param);
+          U = new GaugeField(gauge_field_param);
           U->copy(*host);
         }
 
@@ -266,7 +266,7 @@ class GaugeAlgTest : public ::testing::Test
     gParam.reconstruct = param.reconstruct;
     gParam.setPrecision(gParam.Precision(), true);
 
-    cudaGaugeField *gauge = new cudaGaugeField(gParam);
+    GaugeField *gauge = new GaugeField(gParam);
 
     // copy into regular field
     copyExtendedGauge(*gauge, *U, QUDA_CUDA_FIELD_LOCATION);
diff --git a/tests/gauge_path_test.cpp b/tests/gauge_path_test.cpp
index 712d0c6204..370470fc50 100644
--- a/tests/gauge_path_test.cpp
+++ b/tests/gauge_path_test.cpp
@@ -124,16 +124,17 @@ void gauge_force_test(bool compute_force = true)
   }
 
   quda::GaugeFieldParam param(gauge_param);
+  param.location = QUDA_CPU_FIELD_LOCATION;
   param.create = QUDA_NULL_FIELD_CREATE;
   param.order = QUDA_QDP_GAUGE_ORDER;
   param.location = QUDA_CPU_FIELD_LOCATION;
-  quda::cpuGaugeField U_qdp(param);
+  quda::GaugeField U_qdp(param);
 
   // fills the gauge field with random numbers
-  createSiteLinkCPU((void **)U_qdp.Gauge_p(), gauge_param.cpu_prec, 0);
+  createSiteLinkCPU(U_qdp, gauge_param.cpu_prec, 0);
 
   param.order = QUDA_MILC_GAUGE_ORDER;
-  quda::cpuGaugeField U_milc(param);
+  quda::GaugeField U_milc(param);
   if (gauge_order == QUDA_MILC_GAUGE_ORDER) U_milc.copy(U_qdp);
   if (compute_force) {
     param.reconstruct = QUDA_RECONSTRUCT_10;
@@ -142,27 +143,31 @@ void gauge_force_test(bool compute_force = true)
     param.reconstruct = QUDA_RECONSTRUCT_NO;
   }
   param.create = QUDA_ZERO_FIELD_CREATE;
-  quda::cpuGaugeField Mom_milc(param);
-  quda::cpuGaugeField Mom_ref_milc(param);
+  quda::GaugeField Mom_milc(param);
+  quda::GaugeField Mom_ref_milc(param);
 
   param.order = QUDA_QDP_GAUGE_ORDER;
-  quda::cpuGaugeField Mom_qdp(param);
+  quda::GaugeField Mom_qdp(param);
 
   // initialize some data in cpuMom
   if (compute_force) {
-    createMomCPU(Mom_ref_milc.Gauge_p(), gauge_param.cpu_prec);
+    createMomCPU(Mom_ref_milc.data(), gauge_param.cpu_prec);
     if (gauge_order == QUDA_MILC_GAUGE_ORDER) Mom_milc.copy(Mom_ref_milc);
     if (gauge_order == QUDA_QDP_GAUGE_ORDER) Mom_qdp.copy(Mom_ref_milc);
   }
   void *mom = nullptr;
   void *sitelink = nullptr;
+  void *sitelink_array[QUDA_MAX_DIM];
+  void *mom_array[QUDA_MAX_DIM];
 
   if (gauge_order == QUDA_MILC_GAUGE_ORDER) {
-    sitelink = U_milc.Gauge_p();
-    mom = Mom_milc.Gauge_p();
+    sitelink = U_milc.data();
+    mom = Mom_milc.data();
   } else if (gauge_order == QUDA_QDP_GAUGE_ORDER) {
-    sitelink = U_qdp.Gauge_p();
-    mom = Mom_qdp.Gauge_p();
+    for (int d = 0; d < 4; d++) sitelink_array[d] = U_qdp.data(d);
+    sitelink = reinterpret_cast<void *>(sitelink_array);
+    for (int d = 0; d < 4; d++) mom_array[d] = Mom_qdp.data(d);
+    mom = reinterpret_cast<void *>(mom_array);
   } else {
     errorQuda("Unsupported gauge order %d", gauge_order);
   }
@@ -196,14 +201,17 @@ void gauge_force_test(bool compute_force = true)
   // The number comes from CPU implementation in MILC, gauge_force_imp.c
   int flops = 153004;
 
-  void *refmom = Mom_ref_milc.Gauge_p();
+  void *refmom = Mom_ref_milc.data();
   int *check_out = compute_force ? &force_check : &path_check;
   if (verify_results) {
-    gauge_force_reference(refmom, eb3, (void **)U_qdp.Gauge_p(), gauge_param.cpu_prec, input_path_buf, length,
-                          loop_coeff, num_paths, compute_force);
+    quda::host_timer_t verify_timer;
+    verify_timer.start();
+    gauge_force_reference(refmom, eb3, U_qdp, input_path_buf, length, loop_coeff, num_paths, compute_force);
     *check_out
-      = compare_floats(Mom_milc.Gauge_p(), refmom, 4 * V * mom_site_size, getTolerance(cuda_prec), gauge_param.cpu_prec);
-    if (compute_force) strong_check_mom(Mom_milc.Gauge_p(), refmom, 4 * V, gauge_param.cpu_prec);
+      = compare_floats(Mom_milc.data(), refmom, 4 * V * mom_site_size, getTolerance(cuda_prec), gauge_param.cpu_prec);
+    if (compute_force) strong_check_mom(Mom_milc.data(), refmom, 4 * V, gauge_param.cpu_prec);
+    verify_timer.stop(); // NOTE(review): last() is treated as seconds elsewhere in this file - confirm against timer.h
+    printfQuda("Verification time = %.2f ms\n", verify_timer.last() * 1e+3); // scale to match the "ms" label
   }
 
   if (compute_force) {
@@ -255,21 +263,23 @@ void gauge_loop_test()
   param.create = QUDA_NULL_FIELD_CREATE;
   param.order = QUDA_QDP_GAUGE_ORDER;
   param.location = QUDA_CPU_FIELD_LOCATION;
-  quda::cpuGaugeField U_qdp(param);
+  quda::GaugeField U_qdp(param);
 
   // fills the gauge field with random numbers
-  createSiteLinkCPU((void **)U_qdp.Gauge_p(), gauge_param.cpu_prec, 0);
+  createSiteLinkCPU(U_qdp, gauge_param.cpu_prec, 0);
 
   param.order = QUDA_MILC_GAUGE_ORDER;
-  quda::cpuGaugeField U_milc(param);
+  quda::GaugeField U_milc(param);
   if (gauge_order == QUDA_MILC_GAUGE_ORDER) U_milc.copy(U_qdp);
 
   void *sitelink = nullptr;
+  void *sitelink_array[QUDA_MAX_DIM];
 
   if (gauge_order == QUDA_MILC_GAUGE_ORDER) {
-    sitelink = U_milc.Gauge_p();
+    sitelink = U_milc.data();
   } else if (gauge_order == QUDA_QDP_GAUGE_ORDER) {
-    sitelink = U_qdp.Gauge_p();
+    for (int d = 0; d < 4; d++) sitelink_array[d] = U_qdp.data(d);
+    sitelink = reinterpret_cast<void *>(sitelink_array);
   } else {
     errorQuda("Unsupported gauge order %d", gauge_order);
   }
@@ -311,8 +321,11 @@ void gauge_loop_test()
   std::vector<quda::Complex> traces_ref(num_paths);
 
   if (verify_results) {
-    gauge_loop_trace_reference((void **)U_qdp.Gauge_p(), gauge_param.cpu_prec, traces_ref, scale_factor, trace_path_p,
-                               trace_loop_length_p, trace_loop_coeff_p, num_paths);
+    quda::host_timer_t verify_timer;
+    verify_timer.start();
+
+    gauge_loop_trace_reference(U_qdp, traces_ref, scale_factor, trace_path_p, trace_loop_length_p, trace_loop_coeff_p,
+                               num_paths);
 
     loop_deviation = 0;
     for (int i = 0; i < num_paths; i++) {
@@ -342,6 +355,9 @@ void gauge_loop_test()
             "Plaquette loop space %e time %e total %e ; plaqQuda space %e time %e total %e ; deviation %e\n",
             plaq_loop[0], plaq_loop[1], plaq_loop[2], obsParam.plaquette[0], obsParam.plaquette[1],
             obsParam.plaquette[2], plaq_deviation);
+
+    verify_timer.stop(); // NOTE(review): last() is treated as seconds by the perf computation just below - confirm
+    printfQuda("Verification time = %.2f ms\n", verify_timer.last() * 1e+3); // scale to match the "ms" label
   }
 
   double perf = 1.0 * niter * flops * V / (host_timer.last() * 1e+9);
diff --git a/tests/heatbath_test.cpp b/tests/heatbath_test.cpp
index 673712201f..771909ffa5 100644
--- a/tests/heatbath_test.cpp
+++ b/tests/heatbath_test.cpp
@@ -53,33 +53,9 @@ void display_test_info()
              dimPartitioned(3));
 }
 
-int main(int argc, char **argv)
+void heatbath_test(int argc, char **argv)
 {
-  // command line options
-  auto app = make_app();
-  add_heatbath_option_group(app);
-  try {
-    app->parse(argc, argv);
-  } catch (const CLI::ParseError &e) {
-    return app->exit(e);
-  }
-
-  if (prec_sloppy == QUDA_INVALID_PRECISION) prec_sloppy = prec;
-  if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID) link_recon_sloppy = link_recon;
-
-  // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp)
-  initComms(argc, argv, gridsize_from_cmdline);
-
-  // call srand() with a rank-dependent seed
-  initRand();
-
-  display_test_info();
-
-  // initialize the QUDA library
-  initQuda(device_ordinal);
-
   // *** QUDA parameters begin here.
-
   QudaGaugeParam gauge_param = newQudaGaugeParam();
   setWilsonGaugeParam(gauge_param);
   gauge_param.t_boundary = QUDA_PERIODIC_T;
@@ -91,12 +67,17 @@ int main(int argc, char **argv)
   // Allocate space on the host (always best to allocate and free in the same scope)
   for (int dir = 0; dir < 4; dir++) { load_gauge[dir] = safe_malloc(V * gauge_site_size * gauge_param.cpu_prec); }
   constructHostGaugeField(load_gauge, gauge_param, argc, argv);
+
+  if (prec_sloppy == QUDA_INVALID_PRECISION) prec_sloppy = prec;
+  if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID) link_recon_sloppy = link_recon;
+
   // Load the gauge field to the device
   loadGaugeQuda((void *)load_gauge, &gauge_param);
 
-  int *num_failures_h = (int *)mapped_malloc(sizeof(int));
-  int *num_failures_d = (int *)get_mapped_device_pointer(num_failures_h);
-  *num_failures_h = 0;
+  quda::quda_ptr num_failures(QUDA_MEMORY_MAPPED, sizeof(int), false);
+  int &num_failures_h = *static_cast<int *>(num_failures.data_host());
+  int &num_failures_d = *static_cast<int *>(num_failures.data_device());
+  num_failures_h = 0;
 
   // start the timer
   double time0 = -((double)clock());
@@ -110,7 +91,7 @@ int main(int argc, char **argv)
     gParam.link_type = gauge_param.type;
     gParam.reconstruct = gauge_param.reconstruct;
     gParam.setPrecision(gParam.Precision(), true);
-    cudaGaugeField *gauge = new cudaGaugeField(gParam);
+    GaugeField gauge(gParam);
 
     int pad = 0;
     lat_dim_t y;
@@ -126,9 +107,9 @@ int main(int argc, char **argv)
     gParamEx.t_boundary = gParam.t_boundary;
     gParamEx.nFace = 1;
     for (int dir = 0; dir < 4; ++dir) gParamEx.r[dir] = R[dir];
-    cudaGaugeField *gaugeEx = new cudaGaugeField(gParamEx);
+    GaugeField gaugeEx(gParamEx);
     // CURAND random generator initialization
-    RNG *randstates = new RNG(*gauge, 1234);
+    RNG randstates(gauge, 1234);
 
     int nsteps = heatbath_num_steps;
     int nwarm = heatbath_warmup_steps;
@@ -145,21 +126,21 @@ int main(int argc, char **argv)
 
     if (latfile.size() > 0) { // We loaded in a gauge field
       // copy internal extended field to gaugeEx
-      copyExtendedResidentGaugeQuda((void *)gaugeEx);
+      copyExtendedResidentGaugeQuda(&gaugeEx);
     } else {
       if (coldstart)
-        InitGaugeField(*gaugeEx);
+        InitGaugeField(gaugeEx);
       else
-        InitGaugeField(*gaugeEx, *randstates);
+        InitGaugeField(gaugeEx, randstates);
 
       // copy into regular field
-      copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
+      copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION);
 
       // load the gauge field from gauge
-      gauge_param.gauge_order = gauge->Order();
+      gauge_param.gauge_order = gauge.Order();
       gauge_param.location = QUDA_CUDA_FIELD_LOCATION;
 
-      loadGaugeQuda(gauge->Gauge_p(), &gauge_param);
+      loadGaugeQuda(gauge.data(), &gauge_param);
     }
 
     QudaGaugeObservableParam param = newQudaGaugeObservableParam();
@@ -175,37 +156,37 @@ int main(int argc, char **argv)
     // Do a warmup if requested
     if (nwarm > 0) {
       for (int step = 1; step <= nwarm; ++step) {
-        Monte(*gaugeEx, *randstates, beta_value, nhbsteps, novrsteps);
+        Monte(gaugeEx, randstates, beta_value, nhbsteps, novrsteps);
 
-        quda::unitarizeLinks(*gaugeEx, num_failures_d);
-        if (*num_failures_h > 0) errorQuda("Error in the unitarization\n");
+        quda::unitarizeLinks(gaugeEx, &num_failures_d);
+        if (num_failures_h > 0) errorQuda("Error in the unitarization\n");
       }
     }
 
     // copy into regular field
-    copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
+    copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION);
 
     // load the gauge field from gauge
-    gauge_param.gauge_order = gauge->Order();
+    gauge_param.gauge_order = gauge.Order();
     gauge_param.location = QUDA_CUDA_FIELD_LOCATION;
 
-    loadGaugeQuda(gauge->Gauge_p(), &gauge_param);
+    loadGaugeQuda(gauge.data(), &gauge_param);
     gaugeObservablesQuda(&param);
     printfQuda("step=0 plaquette = %e topological charge = %e\n", param.plaquette[0], param.qcharge);
 
     freeGaugeQuda();
 
     for (int step = 1; step <= nsteps; ++step) {
-      Monte(*gaugeEx, *randstates, beta_value, nhbsteps, novrsteps);
+      Monte(gaugeEx, randstates, beta_value, nhbsteps, novrsteps);
 
       // Reunitarize gauge links...
-      quda::unitarizeLinks(*gaugeEx, num_failures_d);
-      if (*num_failures_h > 0) errorQuda("Error in the unitarization\n");
+      quda::unitarizeLinks(gaugeEx, &num_failures_d);
+      if (num_failures_h > 0) errorQuda("Error in the unitarization\n");
 
       // copy into regular field
-      copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
+      copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION);
 
-      loadGaugeQuda(gauge->Gauge_p(), &gauge_param);
+      loadGaugeQuda(gauge.data(), &gauge_param);
       gaugeObservablesQuda(&param);
       printfQuda("step=%d plaquette = %e topological charge = %e\n", step, param.plaquette[0], param.qcharge);
 
@@ -219,14 +200,15 @@ int main(int argc, char **argv)
 
       QudaGaugeParam gauge_param = newQudaGaugeParam();
       setWilsonGaugeParam(gauge_param);
+      gauge_param.t_boundary = gauge.TBoundary();
 
       void *cpu_gauge[4];
       for (int dir = 0; dir < 4; dir++) { cpu_gauge[dir] = safe_malloc(V * gauge_site_size * gauge_param.cpu_prec); }
 
       // copy into regular field
-      copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
+      copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION);
 
-      saveGaugeFieldQuda((void *)cpu_gauge, (void *)gauge, &gauge_param);
+      saveGaugeFieldQuda((void *)cpu_gauge, &gauge, &gauge_param);
 
       write_gauge_field(gauge_outfile.c_str(), cpu_gauge, gauge_param.cpu_prec, gauge_param.X, 0, (char **)0);
 
@@ -235,27 +217,44 @@ int main(int argc, char **argv)
       printfQuda("No output file specified.\n");
     }
 
-    delete gauge;
-    delete gaugeEx;
     // Release all temporary memory used for data exchange between GPUs in multi-GPU mode
     PGaugeExchangeFree();
-
-    delete randstates;
   }
 
   // stop the timer
   time0 += clock();
   time0 /= CLOCKS_PER_SEC;
 
-  // printfQuda("\nDone: %i iter / %g secs = %g Gflops, total time = %g secs\n",
-  // inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs, time0);
   printfQuda("\nDone, total time = %g secs\n", time0);
 
-  host_free(num_failures_h);
-
   freeGaugeQuda();
-
   for (int dir = 0; dir < 4; dir++) host_free(load_gauge[dir]);
+}
+
+int main(int argc, char **argv)
+{
+  // command line options
+  auto app = make_app();
+  add_heatbath_option_group(app);
+  try {
+    app->parse(argc, argv);
+  } catch (const CLI::ParseError &e) {
+    return app->exit(e);
+  }
+
+  // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp)
+  initComms(argc, argv, gridsize_from_cmdline);
+
+  // call srand() with a rank-dependent seed
+  initRand();
+
+  display_test_info();
+
+  // initialize the QUDA library
+  initQuda(device_ordinal);
+
+  // run the test
+  heatbath_test(argc, argv);
 
   // finalize the QUDA library
   endQuda();
diff --git a/tests/hisq_paths_force_test.cpp b/tests/hisq_paths_force_test.cpp
index 646a661bed..7560dbc105 100644
--- a/tests/hisq_paths_force_test.cpp
+++ b/tests/hisq_paths_force_test.cpp
@@ -15,34 +15,34 @@
 
 using namespace quda;
 
-cpuGaugeField *cpuGauge = NULL;
-cudaGaugeField *cudaForce = NULL;
-cpuGaugeField *cpuForce = NULL;
-cpuGaugeField *hostVerifyForce = NULL;
+GaugeField *cpuGauge = NULL;
+GaugeField *cudaForce = NULL;
+GaugeField *cpuForce = NULL;
+GaugeField *hostVerifyForce = NULL;
 
-cudaGaugeField *cudaMom = NULL;
-cpuGaugeField *cpuMom = NULL;
-cpuGaugeField *refMom = NULL;
+GaugeField *cudaMom = NULL;
+GaugeField *cpuMom = NULL;
+GaugeField *refMom = NULL;
 
 QudaGaugeFieldOrder gauge_order = QUDA_QDP_GAUGE_ORDER;
 
-cpuGaugeField *cpuOprod = NULL;
-cudaGaugeField *cudaOprod = NULL;
-cpuGaugeField *cpuLongLinkOprod = NULL;
-cudaGaugeField *cudaLongLinkOprod = NULL;
+GaugeField *cpuOprod = NULL;
+GaugeField *cudaOprod = NULL;
+GaugeField *cpuLongLinkOprod = NULL;
+GaugeField *cudaLongLinkOprod = NULL;
 
 int ODD_BIT = 1;
 
 QudaPrecision force_prec = QUDA_DOUBLE_PRECISION;
 
-cudaGaugeField *cudaGauge_ex = NULL;
-cpuGaugeField *cpuGauge_ex = NULL;
-cudaGaugeField *cudaForce_ex = NULL;
-cpuGaugeField *cpuForce_ex = NULL;
-cpuGaugeField *cpuOprod_ex = NULL;
-cudaGaugeField *cudaOprod_ex = NULL;
-cpuGaugeField *cpuLongLinkOprod_ex = NULL;
-cudaGaugeField *cudaLongLinkOprod_ex = NULL;
+GaugeField *cudaGauge_ex = NULL;
+GaugeField *cpuGauge_ex = NULL;
+GaugeField *cudaForce_ex = NULL;
+GaugeField *cpuForce_ex = NULL;
+GaugeField *cpuOprod_ex = NULL;
+GaugeField *cudaOprod_ex = NULL;
+GaugeField *cpuLongLinkOprod_ex = NULL;
+GaugeField *cudaLongLinkOprod_ex = NULL;
 
 static void setPrecision(QudaPrecision precision)
 {
@@ -227,7 +227,7 @@ static void hisq_force_startup()
     gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0;
     gParam_ex.x[d] = X[d] + 2 * gParam_ex.r[d];
   } // set halo region for GPU
-  cudaGauge_ex = new cudaGaugeField(gParam_ex);
+  cudaGauge_ex = new GaugeField(gParam_ex);
 
   // Create the host gauge field
   memcpy(&qudaGaugeParam_ex, &qudaGaugeParam, sizeof(QudaGaugeParam));
@@ -238,7 +238,7 @@ static void hisq_force_startup()
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.order = gauge_order;
-  cpuGauge = new cpuGaugeField(gParam);
+  cpuGauge = new GaugeField(gParam);
 
   gParam_ex = GaugeFieldParam(qudaGaugeParam_ex);
   gParam.location = QUDA_CPU_FIELD_LOCATION;
@@ -250,12 +250,12 @@ static void hisq_force_startup()
     gParam_ex.r[d] = R[d];
     gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d];
   } // set halo region for CPU
-  cpuGauge_ex = new cpuGaugeField(gParam_ex);
+  cpuGauge_ex = new GaugeField(gParam_ex);
 
   auto generated_link_type = (link_recon == QUDA_RECONSTRUCT_NO ?
                                 SITELINK_PHASE_NO :
                                 (link_recon == QUDA_RECONSTRUCT_13 ? SITELINK_PHASE_U1 : SITELINK_PHASE_MILC));
-  createSiteLinkCPU((void **)cpuGauge->Gauge_p(), qudaGaugeParam.cpu_prec, generated_link_type);
+  createSiteLinkCPU(*cpuGauge, qudaGaugeParam.cpu_prec, generated_link_type);
   copyExtendedGauge(*cpuGauge_ex, *cpuGauge, QUDA_CPU_FIELD_LOCATION);
 
   qudaGaugeParam.type = QUDA_GENERAL_LINKS;
@@ -279,8 +279,8 @@ static void hisq_force_startup()
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.order = gauge_order;
-  cpuForce = new cpuGaugeField(gParam);
-  hostVerifyForce = new cpuGaugeField(gParam);
+  cpuForce = new GaugeField(gParam);
+  hostVerifyForce = new GaugeField(gParam);
 
   gParam_ex.location = QUDA_CPU_FIELD_LOCATION;
   gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
@@ -292,7 +292,7 @@ static void hisq_force_startup()
     gParam_ex.r[d] = R[d];
     gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d];
   }
-  cpuForce_ex = new cpuGaugeField(gParam_ex);
+  cpuForce_ex = new GaugeField(gParam_ex);
 
   // create the momentum matrix
   gParam.location = QUDA_CPU_FIELD_LOCATION;
@@ -302,8 +302,8 @@ static void hisq_force_startup()
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   gParam.order = QUDA_MILC_GAUGE_ORDER;
   gParam.create = QUDA_NULL_FIELD_CREATE;
-  cpuMom = new cpuGaugeField(gParam);
-  refMom = new cpuGaugeField(gParam);
+  cpuMom = new GaugeField(gParam);
+  refMom = new GaugeField(gParam);
 
   /**********************************
    * Create the outer product fields *
@@ -316,8 +316,8 @@ static void hisq_force_startup()
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.order = gauge_order;
-  cpuOprod = new cpuGaugeField(gParam);
-  cpuLongLinkOprod = new cpuGaugeField(gParam);
+  cpuOprod = new GaugeField(gParam);
+  cpuLongLinkOprod = new GaugeField(gParam);
 
   // Create extended outer product fields
   gParam_ex.location = QUDA_CPU_FIELD_LOCATION;
@@ -328,13 +328,13 @@ static void hisq_force_startup()
     gParam_ex.r[d] = R[d];
     gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d];
   } // set halo region for CPU
-  cpuOprod_ex = new cpuGaugeField(gParam_ex);
-  cpuLongLinkOprod_ex = new cpuGaugeField(gParam_ex);
+  cpuOprod_ex = new GaugeField(gParam_ex);
+  cpuLongLinkOprod_ex = new GaugeField(gParam_ex);
 
   // initialize the CPU outer product fields and exchange once
   createStagForOprodCPU(stag_for_oprod, force_prec, qudaGaugeParam.X, *rng);
-  computeLinkOrderedOuterProduct(stag_for_oprod, cpuOprod->Gauge_p(), force_prec, 1);
-  computeLinkOrderedOuterProduct(stag_for_oprod, cpuLongLinkOprod->Gauge_p(), force_prec, 3);
+  computeLinkOrderedOuterProduct(stag_for_oprod, *cpuOprod, force_prec, 1);
+  computeLinkOrderedOuterProduct(stag_for_oprod, *cpuLongLinkOprod, force_prec, 3);
 
   copyExtendedGauge(*cpuOprod_ex, *cpuOprod, QUDA_CPU_FIELD_LOCATION);
   copyExtendedGauge(*cpuLongLinkOprod_ex, *cpuLongLinkOprod, QUDA_CPU_FIELD_LOCATION);
@@ -352,9 +352,9 @@ static void hisq_force_startup()
     gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0;
     gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d];
   } // set halo region
-  cudaForce_ex = new cudaGaugeField(gParam_ex);
-  cudaOprod_ex = new cudaGaugeField(gParam_ex);
-  cudaLongLinkOprod_ex = new cudaGaugeField(gParam_ex);
+  cudaForce_ex = new GaugeField(gParam_ex);
+  cudaOprod_ex = new GaugeField(gParam_ex);
+  cudaLongLinkOprod_ex = new GaugeField(gParam_ex);
 
   // create a device force for verify
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
@@ -362,7 +362,7 @@ static void hisq_force_startup()
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.link_type = QUDA_GENERAL_LINKS;
   gParam.setPrecision(prec, true);
-  cudaForce = new cudaGaugeField(gParam);
+  cudaForce = new GaugeField(gParam);
 
   // create the device momentum field
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
@@ -370,21 +370,21 @@ static void hisq_force_startup()
   gParam.reconstruct = QUDA_RECONSTRUCT_10;
   gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
   gParam.setPrecision(prec, true);
-  cudaMom = new cudaGaugeField(gParam);
+  cudaMom = new GaugeField(gParam);
 
   /********************************************************************
    * Copy to and exchange gauge and outer product fields on the device *
    ********************************************************************/
   cpuGauge_ex->exchangeExtendedGhost(R, true);
-  cudaGauge_ex->loadCPUField(*cpuGauge);
+  cudaGauge_ex->copy(*cpuGauge);
   cudaGauge_ex->exchangeExtendedGhost(cudaGauge_ex->R());
 
   cpuOprod_ex->exchangeExtendedGhost(R, true);
-  cudaOprod_ex->loadCPUField(*cpuOprod);
+  cudaOprod_ex->copy(*cpuOprod);
   cudaOprod_ex->exchangeExtendedGhost(cudaOprod_ex->R());
 
   cpuLongLinkOprod_ex->exchangeExtendedGhost(R, true);
-  cudaLongLinkOprod_ex->loadCPUField(*cpuLongLinkOprod);
+  cudaLongLinkOprod_ex->copy(*cpuLongLinkOprod);
   cudaLongLinkOprod_ex->exchangeExtendedGhost(cudaLongLinkOprod_ex->R());
 
   /**********************
@@ -460,18 +460,15 @@ static int hisq_force_test(bool lepage)
 
     copyExtendedGauge(*cpuForce, *cpuForce_ex, QUDA_CPU_FIELD_LOCATION);
     copyExtendedGauge(*cudaForce, *cudaForce_ex, QUDA_CUDA_FIELD_LOCATION);
-    cudaForce->saveCPUField(*hostVerifyForce);
+    hostVerifyForce->copy(*cudaForce);
 
     int res = 1;
     for (int dir = 0; dir < 4; dir++) {
-      res &= compare_floats(reinterpret_cast<void **>(cpuForce->Gauge_p())[dir],
-                            reinterpret_cast<void **>(hostVerifyForce->Gauge_p())[dir], V * gauge_site_size,
+      res &= compare_floats(cpuForce->data<void *>(dir), hostVerifyForce->data<void *>(dir), V * gauge_site_size,
                             getTolerance(force_prec), force_prec);
     }
 
-    strong_check_link(reinterpret_cast<void **>(hostVerifyForce->Gauge_p()),
-                      "GPU results: ", reinterpret_cast<void **>(cpuForce->Gauge_p()), "CPU reference results:", V,
-                      force_prec);
+    strong_check_link(*hostVerifyForce, "GPU results: ", *cpuForce, "CPU reference results:");
     logQuda(QUDA_SUMMARIZE, "Lepage %s staples force test %s\n\n", lepage ? "enabled" : "disabled",
             (1 == res) ? "PASSED" : "FAILED");
   }
@@ -497,18 +494,15 @@ static int hisq_force_test(bool lepage)
 
       copyExtendedGauge(*cpuForce, *cpuForce_ex, QUDA_CPU_FIELD_LOCATION);
       copyExtendedGauge(*cudaForce, *cudaForce_ex, QUDA_CUDA_FIELD_LOCATION);
-      cudaForce->saveCPUField(*hostVerifyForce);
+      hostVerifyForce->copy(*cudaForce);
 
       int res = 1;
       for (int dir = 0; dir < 4; dir++) {
-        res &= compare_floats(reinterpret_cast<void **>(cpuForce->Gauge_p())[dir],
-                              reinterpret_cast<void **>(hostVerifyForce->Gauge_p())[dir], V * gauge_site_size,
+        res &= compare_floats(cpuForce->data(dir), hostVerifyForce->data(dir), V * gauge_site_size,
                               getTolerance(force_prec), force_prec);
       }
 
-      strong_check_link(reinterpret_cast<void **>(hostVerifyForce->Gauge_p()),
-                        "GPU results: ", reinterpret_cast<void **>(cpuForce->Gauge_p()), "CPU reference results:", V,
-                        force_prec);
+      strong_check_link(*hostVerifyForce, "GPU results: ", *cpuForce, "CPU reference results:");
       logQuda(QUDA_SUMMARIZE, "Long link force test %s\n\n", (1 == res) ? "PASSED" : "FAILED");
     }
   }
@@ -526,14 +520,14 @@ static int hisq_force_test(bool lepage)
     host_timer.stop();
     host_time_sec += host_timer.last();
 
-    cudaMom->saveCPUField(*cpuMom);
+    cpuMom->copy(*cudaMom);
   }
 
   int accuracy_level = 3;
   if (verify_results) {
-    int res = compare_floats(cpuMom->Gauge_p(), refMom->Gauge_p(), 4 * cpuMom->Volume() * mom_site_size,
+    int res = compare_floats(cpuMom->data(), refMom->data(), 4 * cpuMom->Volume() * mom_site_size,
                              getTolerance(force_prec), force_prec);
-    accuracy_level = strong_check_mom(cpuMom->Gauge_p(), refMom->Gauge_p(), 4 * cpuMom->Volume(), force_prec);
+    accuracy_level = strong_check_mom(cpuMom->data(), refMom->data(), 4 * cpuMom->Volume(), force_prec);
     logQuda(QUDA_SUMMARIZE, "Test (lepage coeff %e) %s\n", d_act_path_coeff[5], (1 == res) ? "PASSED" : "FAILED");
   }
   long long staple_io, staple_flops, long_io, long_flops, complete_io, complete_flops;
diff --git a/tests/hisq_stencil_test.cpp b/tests/hisq_stencil_test.cpp
index cb6572b25b..98c2ae91d3 100644
--- a/tests/hisq_stencil_test.cpp
+++ b/tests/hisq_stencil_test.cpp
@@ -104,8 +104,10 @@ static void hisq_test()
   double u4 = u2 * u2;
   double u6 = u4 * u2;
 
+  std::array<std::array<double, 6>, 3> act_paths;
+
   // First path: create V, W links
-  double act_path_coeff_1[6] = {
+  act_paths[0] = {
     (1.0 / 8.0),                             /* one link */
     u2 * (0.0),                              /* Naik */
     u2 * (-1.0 / 8.0) * 0.5,                 /* simple staple */
@@ -115,7 +117,7 @@ static void hisq_test()
   };
 
   // Second path: create X, long links
-  double act_path_coeff_2[6] = {
+  act_paths[1] = {
     ((1.0 / 8.0) + (2.0 * 6.0 / 16.0) + (1.0 / 8.0)), /* one link */
                                                       /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */
     (-1.0 / 24.0),                                    /* Naik */
@@ -126,7 +128,7 @@ static void hisq_test()
   };
 
   // Paths for epsilon corrections. Not used if n_naiks = 1.
-  double act_path_coeff_3[6] = {
+  act_paths[2] = {
     (1.0 / 8.0),   /* one link b/c of Naik */
     (-1.0 / 24.0), /* Naik */
     0.0,           /* simple staple */
@@ -185,7 +187,7 @@ static void hisq_test()
   // Tuning run...
   {
     printfQuda("Tuning...\n");
-    computeKSLinkQuda(vlink, longlink, wlink, milc_sitelink, act_path_coeff_2, &qudaGaugeParam);
+    computeKSLinkQuda(vlink, longlink, wlink, milc_sitelink, act_paths[1].data(), &qudaGaugeParam);
   }
 
   struct timeval t0, t1;
@@ -196,11 +198,11 @@ static void hisq_test()
     // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying!
 
     // Create V links (fat7 links) and W links (unitarized V links), 1st path table set
-    computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_path_coeff_1, &qudaGaugeParam);
+    computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &qudaGaugeParam);
 
     if (n_naiks > 1) {
       // Create Naiks, 3rd path table set
-      computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_path_coeff_3, &qudaGaugeParam);
+      computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &qudaGaugeParam);
 
       // Rescale+copy Naiks into Naik field
       cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size);
@@ -211,7 +213,7 @@ static void hisq_test()
     }
 
     // Create X and long links, 2nd path table set
-    computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_path_coeff_2, &qudaGaugeParam);
+    computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &qudaGaugeParam);
 
     if (n_naiks > 1) {
       // Add into Naik field
@@ -244,9 +246,6 @@ static void hisq_test()
   }
 
   if (verify_results) {
-
-    double *act_paths[3] = {act_path_coeff_1, act_path_coeff_2, act_path_coeff_3};
-
     computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, sitelink, &qudaGaugeParam,
                         act_paths, eps_naik);
   }
diff --git a/tests/hisq_unitarize_force_test.cpp b/tests/hisq_unitarize_force_test.cpp
index 1ab7b6a71b..01e3c78c18 100644
--- a/tests/hisq_unitarize_force_test.cpp
+++ b/tests/hisq_unitarize_force_test.cpp
@@ -12,21 +12,21 @@
 #include <sys/time.h>
 #include <gtest/gtest.h>
 
-quda::cudaGaugeField *cudaFatLink = NULL;
-quda::cpuGaugeField *cpuFatLink = NULL;
+quda::GaugeField *cudaFatLink = NULL;
+quda::GaugeField *cpuFatLink = NULL;
 
-quda::cudaGaugeField *cudaOprod = NULL;
-quda::cpuGaugeField *cpuOprod = NULL;
+quda::GaugeField *cudaOprod = NULL;
+quda::GaugeField *cpuOprod = NULL;
 
-quda::cudaGaugeField *cudaResult = NULL;
-quda::cpuGaugeField *cpuResult = NULL;
+quda::GaugeField *cudaResult = NULL;
+quda::GaugeField *cpuResult = NULL;
 
-quda::cpuGaugeField *cpuReference = NULL;
+quda::GaugeField *cpuReference = NULL;
 
 static QudaGaugeParam gaugeParam;
 
 // Create a field of links that are not su3_matrices
-void createNoisyLinkCPU(void **field, QudaPrecision prec, int seed)
+void createNoisyLinkCPU(quda::GaugeField &field, QudaPrecision prec, int seed)
 {
   createSiteLinkCPU(field, prec, 0);
 
@@ -34,10 +34,10 @@ void createNoisyLinkCPU(void **field, QudaPrecision prec, int seed)
   for (int dir = 0; dir < 4; ++dir) {
     for (int i = 0; i < V * 18; ++i) {
       if (prec == QUDA_DOUBLE_PRECISION) {
-        double *ptr = ((double **)field)[dir] + i;
+        double *ptr = field.data<double *>(dir) + i;
         *ptr += (rand() - RAND_MAX / 2.0) / (20.0 * RAND_MAX);
       } else if (prec == QUDA_SINGLE_PRECISION) {
-        float *ptr = ((float **)field)[dir] + i;
+        float *ptr = field.data<float *>(dir) + i;
         *ptr += (rand() - RAND_MAX / 2.0) / (20.0 * RAND_MAX);
       }
     }
@@ -66,10 +66,10 @@ static void hisq_force_init()
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   gParam.anisotropy = 1;
 
-  cpuFatLink = new quda::cpuGaugeField(gParam);
-  cpuOprod = new quda::cpuGaugeField(gParam);
-  cpuResult = new quda::cpuGaugeField(gParam);
-  cpuReference = new quda::cpuGaugeField(gParam);
+  cpuFatLink = new quda::GaugeField(gParam);
+  cpuOprod = new quda::GaugeField(gParam);
+  cpuResult = new quda::GaugeField(gParam);
+  cpuReference = new quda::GaugeField(gParam);
 
   // create "gauge fields"
   int seed = 0;
@@ -77,20 +77,20 @@ static void hisq_force_init()
   seed += quda::comm_rank();
 #endif
 
-  createNoisyLinkCPU((void **)cpuFatLink->Gauge_p(), gaugeParam.cpu_prec, seed);
-  createNoisyLinkCPU((void **)cpuOprod->Gauge_p(), gaugeParam.cpu_prec, seed + 1);
+  createNoisyLinkCPU(*cpuFatLink, gaugeParam.cpu_prec, seed);
+  createNoisyLinkCPU(*cpuOprod, gaugeParam.cpu_prec, seed + 1);
 
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.setPrecision(gaugeParam.cuda_prec, true);
 
-  cudaFatLink = new quda::cudaGaugeField(gParam);
-  cudaOprod = new quda::cudaGaugeField(gParam);
-  cudaResult = new quda::cudaGaugeField(gParam);
+  cudaFatLink = new quda::GaugeField(gParam);
+  cudaOprod = new quda::GaugeField(gParam);
+  cudaResult = new quda::GaugeField(gParam);
 
   gParam.order = QUDA_QDP_GAUGE_ORDER;
 
-  cudaFatLink->loadCPUField(*cpuFatLink);
-  cudaOprod->loadCPUField(*cpuOprod);
+  cudaFatLink->copy(*cpuFatLink);
+  cudaOprod->copy(*cpuOprod);
 }
 
 static void hisq_force_end()
@@ -135,14 +135,14 @@ TEST(hisq_force_unitarize, verify)
     quda::fermion_force::unitarizeForceCPU(*cpuResult, *cpuOprod, *cpuFatLink);
   }
 
-  cudaResult->saveCPUField(*cpuReference);
+  cpuReference->copy(*cudaResult);
 
   printfQuda("Comparing CPU and GPU results\n");
   int res[4];
 
   double accuracy = prec == QUDA_DOUBLE_PRECISION ? 1e-10 : 1e-5;
   for (int dir = 0; dir < 4; ++dir) {
-    res[dir] = compare_floats(((char **)cpuReference->Gauge_p())[dir], ((char **)cpuResult->Gauge_p())[dir],
+    res[dir] = compare_floats(cpuReference->data<void *>(dir), cpuResult->data<void *>(dir),
                               cpuReference->Volume() * gauge_site_size, accuracy, gaugeParam.cpu_prec);
 
     quda::comm_allreduce_int(res[dir]);
diff --git a/tests/host_reference/CloverForce_reference.h b/tests/host_reference/CloverForce_reference.h
index eac2f57c42..c635f3b5f8 100644
--- a/tests/host_reference/CloverForce_reference.h
+++ b/tests/host_reference/CloverForce_reference.h
@@ -251,11 +251,11 @@ void CloverForce_kernel_host(std::array<void *, 4> gauge, void *h_mom, quda::Col
 {
 
   gFloat **gaugeFull = (gFloat **)gauge.data();
-  sFloat *spinorField = (sFloat *)inB.V();
+  sFloat *spinorField = (sFloat *)inB.data();
 
   gFloat *gaugeEven[4], *gaugeOdd[4];
 
-  sFloat *A = (sFloat *)inA.V();
+  sFloat *A = (sFloat *)inA.data();
 
   for (int dir = 0; dir < 4; dir++) {
     gaugeEven[dir] = gaugeFull[dir];
@@ -892,6 +892,9 @@ void cloverDerivative_reference(void *h_mom, void **gauge, void *oprod, int pari
   // auto oprod_ex = quda::createExtendedTensorGauge(oprod_qdp.data(), param, R);
   // printf("HERE before oprod_ex created\n");
 
+  void *u_array[QUDA_MAX_DIM];
+  for (int d = 0; d < 4; d++) u_array[d] = qdp_ex->data(d);
+
   for (int i = 0; i < Vh; i++) {
     for (int yIndex = 0; yIndex < 2; yIndex++) {
       for (int mu = 0; mu < 4; mu++) {
@@ -899,15 +902,17 @@ void cloverDerivative_reference(void *h_mom, void **gauge, void *oprod, int pari
           if (nu == mu)
             continue;
           else if (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION)
-            computeForce_reference<double>(h_mom, (void **)qdp_ex->Gauge_p(), lat, oprod, i, yIndex, parity, mu, nu);
+            computeForce_reference<double>(h_mom, u_array, lat, oprod, i, yIndex, parity, mu, nu);
           else if (gauge_param.cpu_prec == QUDA_SINGLE_PRECISION)
-            computeForce_reference<float>(h_mom, (void **)qdp_ex->Gauge_p(), lat, oprod, i, yIndex, parity, mu, nu);
+            computeForce_reference<float>(h_mom, u_array, lat, oprod, i, yIndex, parity, mu, nu);
           else
             errorQuda("Unsupported precision %d", gauge_param.cpu_prec);
         }
       }
     }
   }
+
+  delete qdp_ex;
 }
 
 template <typename sFloat, typename gFloat>
@@ -916,8 +921,8 @@ void CloverSigmaOprod_reference(void *oprod_, quda::ColorSpinorField &inp, quda:
 {
   int nColor = 3;
   gFloat *oprod = (gFloat *)oprod_;
-  sFloat *x = (sFloat *)inx.V();
-  sFloat *p = (sFloat *)inp.V();
+  sFloat *x = (sFloat *)inx.data();
+  sFloat *p = (sFloat *)inp.data();
 
   gFloat oprod_f[gauge_site_size];
   gFloat oprod_imx2[gauge_site_size];
diff --git a/tests/host_reference/TMCloverForce_reference.cpp b/tests/host_reference/TMCloverForce_reference.cpp
index d7fac10c37..dfb0ab5d50 100644
--- a/tests/host_reference/TMCloverForce_reference.cpp
+++ b/tests/host_reference/TMCloverForce_reference.cpp
@@ -98,7 +98,7 @@ void TMCloverForce_reference(void *h_mom, void **h_x, double *coeff, int nvector
   x.Odd() = load_half;
   qParam.create = QUDA_NULL_FIELD_CREATE;
 
-  Gamma5_host((double *)tmp.V(), (double *)x.Odd().V(), x.Odd().VolumeCB());
+  Gamma5_host(tmp.data<double *>(), x.Odd().data<double *>(), x.Odd().VolumeCB());
 
   int parity = 0;
   QudaMatPCType myMatPCType = inv_param->matpc_type;
@@ -106,26 +106,26 @@ void TMCloverForce_reference(void *h_mom, void **h_x, double *coeff, int nvector
   if (myMatPCType == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || myMatPCType == QUDA_MATPC_ODD_ODD_ASYMMETRIC) {
 
     if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-      tmc_dslash(x.Even().V(), gauge.data(), tmp.V(), clover.data(), clover_inv.data(), inv_param->kappa, inv_param->mu,
+      tmc_dslash(x.Even().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(), inv_param->kappa, inv_param->mu,
                  inv_param->twist_flavor, parity, myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec, *gauge_param);
     } else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
-      clover_dslash(x.Even().V(), gauge.data(), clover_inv.data(), tmp.V(), parity, QUDA_DAG_YES, inv_param->cpu_prec,
+      clover_dslash(x.Even().data(), gauge.data(), clover_inv.data(), tmp.data(), parity, QUDA_DAG_YES, inv_param->cpu_prec,
                     *gauge_param);
     } else {
       errorQuda("TMCloverForce_reference: dslash_type not supported\n");
     }
-    Gamma5_host((double *)x.Even().V(), (double *)x.Even().V(), x.Even().VolumeCB());
+    Gamma5_host(x.Even().data<double *>(), x.Even().data<double *>(), x.Even().VolumeCB());
 
     if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
-      tmc_matpc(p.Odd().V(), gauge.data(), tmp.V(), clover.data(), clover_inv.data(), inv_param->kappa, inv_param->mu,
+      tmc_matpc(p.Odd().data(), gauge.data(), tmp.data(), clover.data(), clover_inv.data(), inv_param->kappa, inv_param->mu,
                 inv_param->twist_flavor, myMatPCType, QUDA_DAG_YES, inv_param->cpu_prec, *gauge_param);
-      tmc_dslash(p.Even().V(), gauge.data(), p.Odd().V(), clover.data(), clover_inv.data(), inv_param->kappa,
+      tmc_dslash(p.Even().data(), gauge.data(), p.Odd().data(), clover.data(), clover_inv.data(), inv_param->kappa,
                  inv_param->mu, inv_param->twist_flavor, parity, myMatPCType, QUDA_DAG_NO, inv_param->cpu_prec,
                  *gauge_param);
     } else if (inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
-      clover_matpc(p.Odd().V(), gauge.data(), clover.data(), clover_inv.data(), tmp.V(), inv_param->kappa, myMatPCType,
+      clover_matpc(p.Odd().data(), gauge.data(), clover.data(), clover_inv.data(), tmp.data(), inv_param->kappa, myMatPCType,
                    QUDA_DAG_YES, inv_param->cpu_prec, *gauge_param);
-      clover_dslash(p.Even().V(), gauge.data(), clover_inv.data(), p.Odd().V(), parity, QUDA_DAG_NO,
+      clover_dslash(p.Even().data(), gauge.data(), clover_inv.data(), p.Odd().data(), parity, QUDA_DAG_NO,
                     inv_param->cpu_prec, *gauge_param);
     } else {
       errorQuda("TMCloverForce_reference: dslash_type not supported\n");
@@ -135,8 +135,8 @@ void TMCloverForce_reference(void *h_mom, void **h_x, double *coeff, int nvector
     errorQuda("TMCloverForce_reference: MATPC type not supported\n");
   }
 
-  Gamma5_host((double *)p.Even().V(), (double *)p.Even().V(), p.Even().VolumeCB());
-  Gamma5_host((double *)p.Odd().V(), (double *)p.Odd().V(), p.Odd().VolumeCB());
+  Gamma5_host(p.Even().data<double *>(), p.Even().data<double *>(), p.Even().VolumeCB());
+  Gamma5_host(p.Odd().data<double *>(), p.Odd().data<double *>(), p.Odd().VolumeCB());
 
   double force_coeff = coeff[0];
   quda::GaugeFieldParam momparam(*gauge_param);
@@ -146,9 +146,9 @@ void TMCloverForce_reference(void *h_mom, void **h_x, double *coeff, int nvector
   momparam.reconstruct = QUDA_RECONSTRUCT_10;
   momparam.link_type = QUDA_ASQTAD_MOM_LINKS;
   momparam.create = QUDA_ZERO_FIELD_CREATE;
-  quda::cpuGaugeField mom(momparam);
-  createMomCPU(mom.Gauge_p(), gauge_param->cpu_prec, 0.0);
-  void *refmom = mom.Gauge_p();
+  quda::GaugeField mom(momparam);
+  createMomCPU(mom.data(), gauge_param->cpu_prec, 0.0);
+  void *refmom = mom.data();
 
   // derivative of the wilson operator it correspond to deriv_Sb(OE,...) plus  deriv_Sb(EO,...) in tmLQCD
   CloverForce_reference(refmom, gauge, x, p, force_coeff);
@@ -189,18 +189,19 @@ void TMCloverForce_reference(void *h_mom, void **h_x, double *coeff, int nvector
 
   // create extended field
   quda::GaugeFieldParam gParamMom(*gauge_param, h_mom, QUDA_ASQTAD_MOM_LINKS);
+  gParamMom.location = QUDA_CUDA_FIELD_LOCATION;
   gParamMom.link_type = QUDA_GENERAL_LINKS;
   gParamMom.create = QUDA_ZERO_FIELD_CREATE;
   gParamMom.order = QUDA_FLOAT2_GAUGE_ORDER;
   gParamMom.reconstruct = QUDA_RECONSTRUCT_NO;
   gParamMom.geometry = QUDA_TENSOR_GEOMETRY;
-  quda::cudaGaugeField cudaOprod(gParamMom);
+  quda::GaugeField cudaOprod(gParamMom);
   cudaOprod.copy_from_buffer(oprod);
 
   quda::lat_dim_t R;
   for (int d = 0; d < 4; d++) R[d] = 2 * quda::comm_dim_partitioned(d);
   quda::TimeProfile profile_host("profile_host");
-  quda::cudaGaugeField *cudaOprodEx = createExtendedGauge(cudaOprod, R, profile_host);
+  quda::GaugeField *cudaOprodEx = createExtendedGauge(cudaOprod, R, profile_host);
 
   int ghostFace[4];
   int ghost_size = 0;
@@ -229,5 +230,7 @@ void TMCloverForce_reference(void *h_mom, void **h_x, double *coeff, int nvector
   cloverDerivative_reference(refmom, gauge.data(), oprod_ex, QUDA_ODD_PARITY, *gauge_param);
   cloverDerivative_reference(refmom, gauge.data(), oprod_ex, QUDA_EVEN_PARITY, *gauge_param);
 
-  add_mom((double *)h_mom, (double *)mom.Gauge_p(), 4 * V * mom_site_size, -1.0);
-}
\ No newline at end of file
+  add_mom((double *)h_mom, (double *)mom.data(), 4 * V * mom_site_size, -1.0);
+
+  delete cudaOprodEx;
+}
diff --git a/tests/host_reference/covdev_reference.cpp b/tests/host_reference/covdev_reference.cpp
index 081b19142c..66aaf85fa8 100644
--- a/tests/host_reference/covdev_reference.cpp
+++ b/tests/host_reference/covdev_reference.cpp
@@ -82,32 +82,31 @@ void covdev_dslash(void *res, void **link, void *spinorField, int oddBit, int da
   }
 }
 
-template <typename sFloat, typename gFloat> void Mat(sFloat *out, gFloat **link, sFloat *in, int daggerBit, int mu)
+template <typename sFloat, typename gFloat>
+void Mat(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu)
 {
-  sFloat *inEven = in;
-  sFloat *inOdd = in + Vh * spinor_site_size;
-  sFloat *outEven = out;
-  sFloat *outOdd = out + Vh * spinor_site_size;
-
   // full dslash operator
-  covdevReference(outOdd, link, inEven, 1, daggerBit, mu);
-  covdevReference(outEven, link, inOdd, 0, daggerBit, mu);
+  void *links[4] = {link.data(0), link.data(1), link.data(2), link.data(3)};
+  gFloat **U = reinterpret_cast<gFloat **>(links); // typed view of the four per-direction link pointers
+  auto ptr = [](const ColorSpinorField &f) { return reinterpret_cast<sFloat *>(f.data()); };
+  covdevReference(ptr(out.Odd()), U, ptr(in.Even()), 1, daggerBit, mu); // even input -> odd output
+  covdevReference(ptr(out.Even()), U, ptr(in.Odd()), 0, daggerBit, mu); // odd input -> even output
 }
 
-void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
+void mat(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu)
 {
 
-  if (sPrecision == QUDA_DOUBLE_PRECISION) {
-    if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      Mat((double *)out, (double **)link, (double *)in, dagger_bit, mu);
+  if (checkPrecision(in, out) == QUDA_DOUBLE_PRECISION) { // spinor precision selects Mat's first template argument
+    if (link.Precision() == QUDA_DOUBLE_PRECISION) {      // link precision selects Mat's second template argument
+      Mat<double, double>(out, link, in, dagger_bit, mu);
     } else {
-      Mat((double *)out, (float **)link, (double *)in, dagger_bit, mu);
+      Mat<double, float>(out, link, in, dagger_bit, mu); // any non-double link precision is handled as float
     }
   } else {
-    if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      Mat((float *)out, (double **)link, (float *)in, dagger_bit, mu);
+    if (link.Precision() == QUDA_DOUBLE_PRECISION) { // any non-double spinor precision is handled as float
+      Mat<float, double>(out, link, in, dagger_bit, mu);
     } else {
-      Mat((float *)out, (float **)link, (float *)in, dagger_bit, mu);
+      Mat<float, float>(out, link, in, dagger_bit, mu);
     }
   }
 }
@@ -179,8 +178,8 @@ void covdevReference_mg4dir(sFloat *res, gFloat **link, gFloat **ghostLink, cons
     int offset = spinor_site_size * sid;
 
     gFloat *lnk = gaugeLink_mg4dir(sid, mu, oddBit, linkEven, linkOdd, ghostLinkEven, ghostLinkOdd, 1, 1);
-    const sFloat *spinor = spinorNeighbor_mg4dir(sid, mu, oddBit, static_cast<const sFloat *>(in.V()), fwd_nbr_spinor,
-                                                 back_nbr_spinor, 1, 1);
+    const sFloat *spinor = spinorNeighbor_mg4dir(sid, mu, oddBit, static_cast<const sFloat *>(in.data()),
+                                                 fwd_nbr_spinor, back_nbr_spinor, 1, 1);
 
     sFloat gaugedSpinor[spinor_site_size];
 
@@ -193,7 +192,7 @@ void covdevReference_mg4dir(sFloat *res, gFloat **link, gFloat **ghostLink, cons
   } // 4-d volume
 }
 
-void covdev_dslash_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int oddBit,
+void covdev_dslash_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int oddBit,
                           int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
 {
   QudaParity otherparity = QUDA_INVALID_PARITY;
@@ -208,32 +207,42 @@ void covdev_dslash_mg4dir(ColorSpinorField &out, void **link, void **ghostLink,
 
   in.exchangeGhost(otherparity, nFace, daggerBit);
 
+  void *data[4] = {link.data(0), link.data(1), link.data(2), link.data(3)};
+  void *ghostLink[4] = {link.Ghost()[0].data(), link.Ghost()[1].data(), link.Ghost()[2].data(), link.Ghost()[3].data()};
+
   if (sPrecision == QUDA_DOUBLE_PRECISION) {
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      covdevReference_mg4dir((double *)out.V(), (double **)link, (double **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((double *)out.data(), reinterpret_cast<double **>(data), (double **)ghostLink, in, oddBit,
+                             daggerBit, mu);
     } else {
-      covdevReference_mg4dir((double *)out.V(), (float **)link, (float **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((double *)out.data(), reinterpret_cast<float **>(data), (float **)ghostLink, in, oddBit,
+                             daggerBit, mu);
     }
   } else {
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      covdevReference_mg4dir((float *)out.V(), (double **)link, (double **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((float *)out.data(), reinterpret_cast<double **>(data), (double **)ghostLink, in, oddBit,
+                             daggerBit, mu);
     } else {
-      covdevReference_mg4dir((float *)out.V(), (float **)link, (float **)ghostLink, in, oddBit, daggerBit, mu);
+      covdevReference_mg4dir((float *)out.data(), reinterpret_cast<float **>(data), (float **)ghostLink, in, oddBit,
+                             daggerBit, mu);
     }
   }
 }
 
 template <typename sFloat, typename gFloat>
-void Mat_mg4dir(ColorSpinorField &out, gFloat **link, gFloat **ghostLink, const ColorSpinorField &in, int daggerBit,
-                int mu)
+void Mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu)
 {
+  void *data[4] = {link.data(0), link.data(1), link.data(2), link.data(3)};
+  void *ghostLink[4] = {link.Ghost()[0].data(), link.Ghost()[1].data(), link.Ghost()[2].data(), link.Ghost()[3].data()};
+
   const int nFace = 1;
   {
     auto &inEven = in.Even();
     auto &outOdd = out.Odd();
 
     inEven.exchangeGhost(QUDA_EVEN_PARITY, nFace, daggerBit);
-    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outOdd.V()), link, ghostLink, in.Even(), 1, daggerBit, mu);
+    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outOdd.data()), reinterpret_cast<gFloat **>(data),
+                           reinterpret_cast<gFloat **>(ghostLink), in.Even(), 1, daggerBit, mu);
   }
 
   {
@@ -241,31 +250,30 @@ void Mat_mg4dir(ColorSpinorField &out, gFloat **link, gFloat **ghostLink, const
     auto &outEven = out.Even();
 
     inOdd.exchangeGhost(QUDA_ODD_PARITY, nFace, daggerBit);
-    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outEven.V()), link, ghostLink, in.Odd(), 0, daggerBit, mu);
+    covdevReference_mg4dir(reinterpret_cast<sFloat *>(outEven.data()), reinterpret_cast<gFloat **>(data),
+                           reinterpret_cast<gFloat **>(ghostLink), in.Odd(), 0, daggerBit, mu);
   }
 }
 
-void mat_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int dagger_bit,
-                int mu, QudaPrecision sPrecision, QudaPrecision gPrecision)
+void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu)
 {
-  if (sPrecision == QUDA_DOUBLE_PRECISION) {
-    if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      Mat_mg4dir<double, double>(out, (double **)link, (double **)ghostLink, in, dagger_bit, mu);
+  if (checkPrecision(in, out) == QUDA_DOUBLE_PRECISION) {
+    if (link.Precision() == QUDA_DOUBLE_PRECISION) {
+      Mat_mg4dir<double, double>(out, link, in, dagger_bit, mu);
     } else {
-      Mat_mg4dir<double, float>(out, (float **)link, (float **)ghostLink, in, dagger_bit, mu);
+      Mat_mg4dir<double, float>(out, link, in, dagger_bit, mu);
     }
   } else {
-    if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      Mat_mg4dir<float, double>(out, (double **)link, (double **)ghostLink, in, dagger_bit, mu);
+    if (link.Precision() == QUDA_DOUBLE_PRECISION) {
+      Mat_mg4dir<float, double>(out, link, in, dagger_bit, mu);
     } else {
-      Mat_mg4dir<float, float>(out, (float **)link, (float **)ghostLink, in, dagger_bit, mu);
+      Mat_mg4dir<float, float>(out, link, in, dagger_bit, mu);
     }
   }
 }
 
-void matdagmat_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int dagger_bit,
-                      int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp,
-                      QudaParity parity)
+void matdagmat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu,
+                      QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity)
 {
   // assert sPrecision and gPrecision must be the same
   if (sPrecision != gPrecision) errorQuda("Spinor precision and gPrecison is not the same");
@@ -279,9 +287,9 @@ void matdagmat_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, cons
     errorQuda("full parity not supported");
   }
 
-  covdev_dslash_mg4dir(tmp, link, ghostLink, in, otherparity, dagger_bit, mu, sPrecision, gPrecision);
+  covdev_dslash_mg4dir(tmp, link, in, otherparity, dagger_bit, mu, sPrecision, gPrecision);
 
-  covdev_dslash_mg4dir(out, link, ghostLink, tmp, parity, dagger_bit, mu, sPrecision, gPrecision);
+  covdev_dslash_mg4dir(out, link, tmp, parity, dagger_bit, mu, sPrecision, gPrecision);
 }
 
 #endif
diff --git a/tests/host_reference/covdev_reference.h b/tests/host_reference/covdev_reference.h
index 19b1809cf0..3c0c1b18e3 100644
--- a/tests/host_reference/covdev_reference.h
+++ b/tests/host_reference/covdev_reference.h
@@ -6,18 +6,16 @@ using namespace quda;
 
 void setDims(int *);
 
-void covdev_dslash(void *res, void **link, void *spinorField, int oddBit, int daggerBit, int mu,
+void covdev_dslash(void *res, const GaugeField &link, void *spinorField, int oddBit, int daggerBit, int mu,
                    QudaPrecision sPrecision, QudaPrecision gPrecision);
-void covdev_dslash_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int oddBit,
+void covdev_dslash_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int oddBit,
                           int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision);
 
-void mat(void *out, void **link, void *in, int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision);
+void mat(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu);
 
-void matdagmat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision,
+void matdagmat(void *out, const GaugeField &link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision,
                QudaPrecision gPrecision, void *tmp, QudaParity parity);
 
-void mat_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int daggerBit, int mu,
-                QudaPrecision sPrecision, QudaPrecision gPrecision);
-void matdagmat_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int dagger_bit,
-                      int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp,
-                      QudaParity parity);
+void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu);
+void matdagmat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu,
+                      QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity);
diff --git a/tests/host_reference/domain_wall_dslash_reference.cpp b/tests/host_reference/domain_wall_dslash_reference.cpp
index c42b0e4a71..29edd18a44 100644
--- a/tests/host_reference/domain_wall_dslash_reference.cpp
+++ b/tests/host_reference/domain_wall_dslash_reference.cpp
@@ -746,8 +746,8 @@ void mdw_eofa_m5inv(void *res, void *spinorField, int oddBit, int daggerBit, dou
 
 // this actually applies the preconditioned dslash, e.g., D_ee^{-1} D_eo or D_oo^{-1} D_oe
 #ifndef MULTI_GPU
-void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &,
-               double mferm)
+void dw_dslash(void *out, void *const *gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+               QudaGaugeParam &, double mferm)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
     dslashReference_4d_sgpu<QUDA_5D_PC>((double *)out, (double **)gauge, (double *)in, oddBit, daggerBit);
@@ -758,13 +758,13 @@ void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, Qud
   }
 }
 #else
-void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+void dw_dslash(void *out, void *const *gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision,
                QudaGaugeParam &gauge_param, double mferm)
 {
-  GaugeFieldParam gauge_field_param(gauge_param, gauge);
+  GaugeFieldParam gauge_field_param(gauge_param, (void **)gauge);
   gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-  cpuGaugeField cpu(gauge_field_param);
-  void **ghostGauge = (void **)cpu.Ghost();
+  GaugeField cpu(gauge_field_param);
+  void *ghostGauge[4] = {cpu.Ghost()[0].data(), cpu.Ghost()[1].data(), cpu.Ghost()[2].data(), cpu.Ghost()[3].data()};
 
   // Get spinor ghost fields
   // First wrap the input spinor into a ColorSpinorField
@@ -815,7 +815,7 @@ void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, Qud
 #endif
 
 #ifndef MULTI_GPU
-void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+void dslash_4_4d(void *out, void *const *gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision,
                  QudaGaugeParam &, double)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
@@ -825,13 +825,13 @@ void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, Q
   }
 }
 #else
-void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+void dslash_4_4d(void *out, void *const *gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision,
                  QudaGaugeParam &gauge_param, double)
 {
-  GaugeFieldParam gauge_field_param(gauge_param, gauge);
+  GaugeFieldParam gauge_field_param(gauge_param, (void **)gauge);
   gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-  cpuGaugeField cpu(gauge_field_param);
-  void **ghostGauge = (void **)cpu.Ghost();
+  GaugeField cpu(gauge_field_param);
+  void *ghostGauge[4] = {cpu.Ghost()[0].data(), cpu.Ghost()[1].data(), cpu.Ghost()[2].data(), cpu.Ghost()[3].data()};
 
   // Get spinor ghost fields
   // First wrap the input spinor into a ColorSpinorField
@@ -879,8 +879,8 @@ void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, Q
 }
 #endif
 
-void dw_dslash_5_4d(void *out, void **, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &,
-                    double mferm, bool zero_initialize)
+void dw_dslash_5_4d(void *out, void *const *, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+                    QudaGaugeParam &, double mferm, bool zero_initialize)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
     if (zero_initialize)
@@ -895,8 +895,8 @@ void dw_dslash_5_4d(void *out, void **, void *in, int oddBit, int daggerBit, Qud
   }
 }
 
-void dslash_5_inv(void *out, void **, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &,
-                  double mferm, double *kappa)
+void dslash_5_inv(void *out, void *const *, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+                  QudaGaugeParam &, double mferm, double *kappa)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
     dslashReference_5th_inv((double *)out, (double *)in, oddBit, daggerBit, mferm, kappa);
@@ -905,7 +905,7 @@ void dslash_5_inv(void *out, void **, void *in, int oddBit, int daggerBit, QudaP
   }
 }
 
-void mdw_dslash_5_inv(void *out, void **, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+void mdw_dslash_5_inv(void *out, void *const *, void *in, int oddBit, int daggerBit, QudaPrecision precision,
                       QudaGaugeParam &, double mferm, double _Complex *kappa)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
@@ -915,8 +915,8 @@ void mdw_dslash_5_inv(void *out, void **, void *in, int oddBit, int daggerBit, Q
   }
 }
 
-void mdw_dslash_5(void *out, void **, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &,
-                  double mferm, double _Complex *kappa, bool zero_initialize)
+void mdw_dslash_5(void *out, void *const *, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+                  QudaGaugeParam &, double mferm, double _Complex *kappa, bool zero_initialize)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
     if (zero_initialize)
@@ -935,7 +935,7 @@ void mdw_dslash_5(void *out, void **, void *in, int oddBit, int daggerBit, QudaP
   }
 }
 
-void mdw_dslash_4_pre(void *out, void **, void *in, int oddBit, int daggerBit, QudaPrecision precision,
+void mdw_dslash_4_pre(void *out, void *const *, void *in, int oddBit, int daggerBit, QudaPrecision precision,
                       QudaGaugeParam &, double mferm, double _Complex *b5, double _Complex *c5, bool zero_initialize)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
@@ -960,7 +960,7 @@ void mdw_dslash_4_pre(void *out, void **, void *in, int oddBit, int daggerBit, Q
   }
 }
 
-void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision,
+void dw_mat(void *out, void *const *gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision,
             QudaGaugeParam &gauge_param, double mferm)
 {
 
@@ -976,7 +976,7 @@ void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, Qud
   xpay(in, -kappa, out, V5 * spinor_site_size, precision);
 }
 
-void dw_4d_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision,
+void dw_4d_mat(void *out, void *const *gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision,
                QudaGaugeParam &gauge_param, double mferm)
 {
 
@@ -995,7 +995,7 @@ void dw_4d_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit,
   xpay(in, -kappa, out, V5 * spinor_site_size, precision);
 }
 
-void mdw_mat(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, int dagger,
+void mdw_mat(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, int dagger,
              QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5)
 {
   void *tmp = safe_malloc(V5h * spinor_site_size * precision);
@@ -1042,9 +1042,9 @@ void mdw_mat(void *out, void **gauge, void *in, double _Complex *kappa_b, double
   host_free(tmp);
 }
 
-void mdw_eofa_mat(void *out, void **gauge, void *in, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param,
-                  double mferm, double m5, double b, double c, double mq1, double mq2, double mq3, int eofa_pm,
-                  double eofa_shift)
+void mdw_eofa_mat(void *out, void *const *gauge, void *in, int dagger, QudaPrecision precision,
+                  QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, double mq1, double mq2,
+                  double mq3, int eofa_pm, double eofa_shift)
 {
   void *tmp = safe_malloc(V5h * spinor_site_size * precision);
 
@@ -1096,7 +1096,7 @@ void mdw_eofa_mat(void *out, void **gauge, void *in, int dagger, QudaPrecision p
   host_free(tmp);
 }
 //
-void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision,
+void dw_matdagmat(void *out, void *const *gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision,
                   QudaGaugeParam &gauge_param, double mferm)
 {
   void *tmp = safe_malloc(V5 * spinor_site_size * precision);
@@ -1108,7 +1108,7 @@ void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger_bi
   host_free(tmp);
 }
 
-void dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit,
+void dw_matpc(void *out, void *const *gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit,
               QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
 {
   void *tmp = safe_malloc(V5h * spinor_site_size * precision);
@@ -1128,7 +1128,7 @@ void dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType mat
   host_free(tmp);
 }
 
-void dw_4d_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit,
+void dw_4d_matpc(void *out, void *const *gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit,
                  QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm)
 {
   double kappa2 = -kappa * kappa;
@@ -1168,7 +1168,7 @@ void dw_4d_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType
   host_free(kappa5);
 }
 
-void mdw_matpc(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
+void mdw_matpc(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
                QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm,
                double _Complex *b5, double _Complex *c5)
 {
@@ -1240,9 +1240,9 @@ void mdw_matpc(void *out, void **gauge, void *in, double _Complex *kappa_b, doub
   host_free(kappa_mdwf);
 }
 
-void mdw_eofa_matpc(void *out, void **gauge, void *in, QudaMatPCType matpc_type, int dagger, QudaPrecision precision,
-                    QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, double mq1, double mq2,
-                    double mq3, int eofa_pm, double eofa_shift)
+void mdw_eofa_matpc(void *out, void *const *gauge, void *in, QudaMatPCType matpc_type, int dagger,
+                    QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c,
+                    double mq1, double mq2, double mq3, int eofa_pm, double eofa_shift)
 {
   void *tmp = safe_malloc(V5h * spinor_site_size * precision);
 
@@ -1311,14 +1311,14 @@ void mdw_eofa_matpc(void *out, void **gauge, void *in, QudaMatPCType matpc_type,
   host_free(tmp);
 }
 
-void mdw_mdagm_local(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
+void mdw_mdagm_local(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
                      QudaMatPCType matpc_type, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm,
                      double _Complex *b5, double _Complex *c5)
 {
   lat_dim_t R;
   for (int d = 0; d < 4; d++) { R[d] = comm_dim_partitioned(d) ? 2 : 0; }
 
-  cpuGaugeField *padded_gauge = createExtendedGauge(gauge, gauge_param, R);
+  GaugeField *padded_gauge = createExtendedGauge((void **)gauge, gauge_param, R);
 
   int padded_V = 1;
   int W[4];
@@ -1357,7 +1357,7 @@ void mdw_mdagm_local(void *out, void **gauge, void *in, double _Complex *kappa_b
   QudaGaugeParam padded_gauge_param(gauge_param);
   for (int d = 0; d < 4; d++) { padded_gauge_param.X[d] += 2 * R[d]; }
 
-  void **padded_gauge_p = (void **)(padded_gauge->Gauge_p());
+  void *padded_gauge_p[] = {padded_gauge->data(0), padded_gauge->data(1), padded_gauge->data(2), padded_gauge->data(3)};
 
   // Extend these global variables then restore them
   int V5_old = V5;
@@ -1458,7 +1458,7 @@ void MatPCDag(sFloat *outEven, gFloat **gauge, sFloat *inEven, sFloat kappa,
 }
 */
 
-void matpc(void *, void **, void *, double, QudaMatPCType, int, QudaPrecision, QudaPrecision, double)
+void matpc(void *, void *const *, void *, double, QudaMatPCType, int, QudaPrecision, QudaPrecision, double)
 {
   /*
     if (!dagger_bit) {
@@ -1513,7 +1513,7 @@ void MatPCDagMatPC(sFloat *out, gFloat **gauge, sFloat *in, sFloat kappa,
 }
 */
 // Wrapper to templates that handles different precisions.
-void matdagmat(void *, void **, void *, double, QudaPrecision, QudaPrecision, double)
+void matdagmat(void *, void *const *, void *, double, QudaPrecision, QudaPrecision, double)
 {
   /*
     if (sPrecision == QUDA_DOUBLE_PRECISION) {
@@ -1533,7 +1533,7 @@ void matdagmat(void *, void **, void *, double, QudaPrecision, QudaPrecision, do
 }
 
 // Wrapper to templates that handles different precisions.
-void matpcdagmatpc(void *, void **, void *, double, QudaPrecision, QudaPrecision, double, QudaMatPCType)
+void matpcdagmatpc(void *, void *const *, void *, double, QudaPrecision, QudaPrecision, double, QudaMatPCType)
 {
   /*
     if (sPrecision == QUDA_DOUBLE_PRECISION) {
diff --git a/tests/host_reference/domain_wall_dslash_reference.h b/tests/host_reference/domain_wall_dslash_reference.h
index 3751fe88f4..4e6ff1edfb 100644
--- a/tests/host_reference/domain_wall_dslash_reference.h
+++ b/tests/host_reference/domain_wall_dslash_reference.h
@@ -8,51 +8,51 @@
 extern "C" {
 #endif
 
-void dw_dslash(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
+void dw_dslash(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
                QudaGaugeParam &param, double mferm);
 
-void dslash_4_4d(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
+void dslash_4_4d(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
                  QudaGaugeParam &param, double mferm);
 
-void dw_dslash_5_4d(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
-                    QudaGaugeParam &param, double mferm, bool zero_initialize);
+void dw_dslash_5_4d(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger,
+                    QudaPrecision precision, QudaGaugeParam &param, double mferm, bool zero_initialize);
 
-void dslash_5_inv(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
+void dslash_5_inv(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
                   QudaGaugeParam &param, double mferm, double *kappa);
 
-void mdw_dslash_5_inv(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
-                      QudaGaugeParam &param, double mferm, double _Complex *kappa);
+void mdw_dslash_5_inv(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger,
+                      QudaPrecision precision, QudaGaugeParam &param, double mferm, double _Complex *kappa);
 
-void mdw_dslash_5(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
+void mdw_dslash_5(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
                   QudaGaugeParam &param, double mferm, double _Complex *kappa, bool zero_initialize);
 
-void mdw_dslash_4_pre(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision,
-                      QudaGaugeParam &param, double mferm, double _Complex *b5, double _Complex *c5,
-                      bool zero_initialize);
+void mdw_dslash_4_pre(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger,
+                      QudaPrecision precision, QudaGaugeParam &param, double mferm, double _Complex *b5,
+                      double _Complex *c5, bool zero_initialize);
 
-void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger, QudaPrecision precision, QudaGaugeParam &param,
-            double mferm);
+void dw_mat(void *out, void *const *gauge, void *in, double kappa, int dagger, QudaPrecision precision,
+            QudaGaugeParam &param, double mferm);
 
-void dw_4d_mat(void *out, void **gauge, void *in, double kappa, int dagger, QudaPrecision precision,
+void dw_4d_mat(void *out, void *const *gauge, void *in, double kappa, int dagger, QudaPrecision precision,
                QudaGaugeParam &param, double mferm);
 
-void mdw_mat(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, int dagger,
+void mdw_mat(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, int dagger,
              QudaPrecision precision, QudaGaugeParam &param, double mferm, double _Complex *b5, double _Complex *c5);
 
-void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger, QudaPrecision precision,
+void dw_matdagmat(void *out, void *const *gauge, void *in, double kappa, int dagger, QudaPrecision precision,
                   QudaGaugeParam &param, double mferm);
 
-void dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger,
+void dw_matpc(void *out, void *const *gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger,
               QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm);
 
-void dw_4d_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger,
+void dw_4d_matpc(void *out, void *const *gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger,
                  QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm);
 
-void mdw_matpc(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
+void mdw_matpc(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
                QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm,
                double _Complex *b5, double _Complex *c5);
 
-void mdw_mdagm_local(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
+void mdw_mdagm_local(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c,
                      QudaMatPCType matpc_type, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm,
                      double _Complex *b5, double _Complex *c5);
 void mdw_eofa_m5(void *res, void *spinorField, int oddBit, int daggerBit, double mferm, double m5, double b, double c,
@@ -61,13 +61,13 @@ void mdw_eofa_m5(void *res, void *spinorField, int oddBit, int daggerBit, double
 void mdw_eofa_m5inv(void *res, void *spinorField, int oddBit, int daggerBit, double mferm, double m5, double b, double c,
                     double mq1, double mq2, double mq3, int eofa_pm, double eofa_shift, QudaPrecision precision);
 
-void mdw_eofa_mat(void *out, void **gauge, void *in, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param,
-                  double mferm, double m5, double b, double c, double mq1, double mq2, double mq3, int eofa_pm,
-                  double eofa_shift);
+void mdw_eofa_mat(void *out, void *const *gauge, void *in, int dagger, QudaPrecision precision,
+                  QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, double mq1, double mq2,
+                  double mq3, int eofa_pm, double eofa_shift);
 
-void mdw_eofa_matpc(void *out, void **gauge, void *in, QudaMatPCType matpc_type, int dagger, QudaPrecision precision,
-                    QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, double mq1, double mq2,
-                    double mq3, int eofa_pm, double eofa_shift);
+void mdw_eofa_matpc(void *out, void *const *gauge, void *in, QudaMatPCType matpc_type, int dagger,
+                    QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c,
+                    double mq1, double mq2, double mq3, int eofa_pm, double eofa_shift);
 
 #ifdef __cplusplus
 }
diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp
index 9fc53fe6bf..4edc471143 100644
--- a/tests/host_reference/dslash_reference.cpp
+++ b/tests/host_reference/dslash_reference.cpp
@@ -744,10 +744,17 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou
 }
 
 double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                quda::ColorSpinorField &out, double mass, void *qdp_fatlink[], void *qdp_longlink[],
-                                void **ghost_fatlink, void **ghost_longlink, QudaGaugeParam &gauge_param,
-                                QudaInvertParam &inv_param, int shift)
+                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fatlink,
+                                quda::GaugeField &longlink, QudaGaugeParam &gauge_param, QudaInvertParam &inv_param,
+                                int shift)
 {
+  void *qdp_fatlink[] = {fatlink.data(0), fatlink.data(1), fatlink.data(2), fatlink.data(3)};
+  void *qdp_longlink[] = {longlink.data(0), longlink.data(1), longlink.data(2), longlink.data(3)};
+  void *ghost_fatlink[]
+    = {fatlink.Ghost()[0].data(), fatlink.Ghost()[1].data(), fatlink.Ghost()[2].data(), fatlink.Ghost()[3].data()};
+  void *ghost_longlink[]
+    = {longlink.Ghost()[0].data(), longlink.Ghost()[1].data(), longlink.Ghost()[2].data(), longlink.Ghost()[3].data()};
+
   switch (test_type) {
   case 0: // full parity solution, full parity system
   case 1: // full parity solution, solving EVEN EVEN prec system
@@ -763,10 +770,10 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi
                     QUDA_DAG_YES, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
 
     if (dslash_type == QUDA_LAPLACE_DSLASH) {
-      xpay(out.V(), kappa, ref.V(), ref.Length(), gauge_param.cpu_prec);
-      ax(0.5 / kappa, ref.V(), ref.Length(), gauge_param.cpu_prec);
+      xpay(out.data(), kappa, ref.data(), ref.Length(), gauge_param.cpu_prec);
+      ax(0.5 / kappa, ref.data(), ref.Length(), gauge_param.cpu_prec);
     } else {
-      axpy(2 * mass, out.V(), ref.V(), ref.Length(), gauge_param.cpu_prec);
+      axpy(2 * mass, out.data(), ref.data(), ref.Length(), gauge_param.cpu_prec);
     }
     break;
 
@@ -788,9 +795,9 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi
     len = Vh;
   }
 
-  mxpy(in.V(), ref.V(), len * stag_spinor_site_size, inv_param.cpu_prec);
-  double nrm2 = norm_2(ref.V(), len * stag_spinor_site_size, inv_param.cpu_prec);
-  double src2 = norm_2(in.V(), len * stag_spinor_site_size, inv_param.cpu_prec);
+  mxpy(in.data(), ref.data(), len * stag_spinor_site_size, inv_param.cpu_prec);
+  double nrm2 = norm_2(ref.data(), len * stag_spinor_site_size, inv_param.cpu_prec);
+  double src2 = norm_2(in.data(), len * stag_spinor_site_size, inv_param.cpu_prec);
   double hqr = sqrt(quda::blas::HeavyQuarkResidualNorm(out, ref).z);
   double l2r = sqrt(nrm2 / src2);
 
diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h
index 44392628c2..82745008fc 100644
--- a/tests/host_reference/dslash_reference.h
+++ b/tests/host_reference/dslash_reference.h
@@ -3,6 +3,7 @@
 #include <array>
 #include <host_utils.h>
 #include <comm_quda.h>
+#include <gauge_field.h>
 
 template <typename Float> static inline void sum(Float *dst, Float *a, Float *b, int cnt)
 {
@@ -109,9 +110,9 @@ std::array<double, 2> verifyWilsonTypeInversion(void *spinorOut, void **spinorOu
                                                 QudaInvertParam &inv_param, void **gauge, void *clover, void *clover_inv);
 
 double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in,
-                                quda::ColorSpinorField &out, double mass, void *qdp_fatlink[], void *qdp_longlink[],
-                                void **ghost_fatlink, void **ghost_longlink, QudaGaugeParam &gauge_param,
-                                QudaInvertParam &inv_param, int shift);
+                                quda::ColorSpinorField &out, double mass, quda::GaugeField &fatlink,
+                                quda::GaugeField &longlink, QudaGaugeParam &gauge_param, QudaInvertParam &inv_param,
+                                int shift);
 
 // i represents a "half index" into an even or odd "half lattice".
 // when oddBit={0,1} the half lattice is {even,odd}.
diff --git a/tests/host_reference/dslash_test_helpers.cpp b/tests/host_reference/dslash_test_helpers.cpp
index b46b69ff75..be2a7cac18 100644
--- a/tests/host_reference/dslash_test_helpers.cpp
+++ b/tests/host_reference/dslash_test_helpers.cpp
@@ -7,9 +7,9 @@
 using namespace quda;
 
 // need a better solution here but as long as they gauge field live in interface probably ok
-extern cudaGaugeField *gaugePrecise;
-extern cudaGaugeField *gaugeFatPrecise;
-extern cudaGaugeField *gaugeLongPrecise;
+extern GaugeField *gaugePrecise;
+extern GaugeField *gaugeFatPrecise;
+extern GaugeField *gaugeLongPrecise;
 
 void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, dslash_test_type test_type)
 {
diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp
index 1e2eaa58c7..fd555e3898 100644
--- a/tests/host_reference/gauge_force_reference.cpp
+++ b/tests/host_reference/gauge_force_reference.cpp
@@ -8,6 +8,7 @@
 #include "host_utils.h"
 #include "misc.h"
 #include "gauge_force_reference.h"
+#include "timer.h"
 
 extern int Z[4];
 extern int V;
@@ -67,8 +68,16 @@ struct fcomplex {
 struct dcomplex {
   double real;
   double imag;
+
+  void operator+=(const dcomplex &other)
+  {
+    real += other.real;
+    imag += other.imag;
+  }
 };
 
+#pragma omp declare reduction(dcomplex_sum:dcomplex : omp_out += omp_in)
+
 struct fsu3_matrix {
   using real_t = float;
   using complex_t = fcomplex;
@@ -302,9 +311,7 @@ int gf_neighborIndexFullLattice(size_t i, int dx[], const lattice_t &lat)
 template <typename su3_matrix>
 static su3_matrix compute_gauge_path(su3_matrix **sitelink, int i, int *path, int len, int dx[4], const lattice_t &lat)
 {
-  su3_matrix prev_matrix, curr_matrix;
-
-  memset(&curr_matrix, 0, sizeof(curr_matrix));
+  su3_matrix prev_matrix, curr_matrix = {};
 
   curr_matrix.e[0][0].real = 1;
   curr_matrix.e[1][1].real = 1;
@@ -346,16 +353,14 @@ template <typename su3_matrix, typename Float>
 static void compute_path_product(su3_matrix *staple, su3_matrix **sitelink, int *path, int len, Float loop_coeff,
                                  int dir, const lattice_t &lat)
 {
-  su3_matrix curr_matrix, tmat;
-  int dx[4];
-
+#pragma omp parallel for
   for (size_t i = 0; i < lat.volume; i++) {
-    memset(dx, 0, sizeof(dx));
-
+    int dx[4] = {};
     dx[dir] = 1;
 
-    curr_matrix = compute_gauge_path(sitelink, i, path, len, dx, lat);
+    su3_matrix curr_matrix = compute_gauge_path(sitelink, i, path, len, dx, lat);
 
+    su3_matrix tmat;
     su3_adjoint(&curr_matrix, &tmat);
     scalar_mult_add_su3_matrix(staple + i, &tmat, loop_coeff, staple + i);
   } // i
@@ -364,16 +369,14 @@ static void compute_path_product(su3_matrix *staple, su3_matrix **sitelink, int
 template <typename su3_matrix>
 static dcomplex compute_loop_trace(su3_matrix **sitelink, int *path, int len, double loop_coeff, const lattice_t &lat)
 {
-  su3_matrix tmat;
-  dcomplex accum;
-  memset(&accum, 0, sizeof(accum));
-  int dx[4];
+  dcomplex accum = {};
 
+#pragma omp parallel for reduction(dcomplex_sum : accum)
   for (size_t i = 0; i < lat.volume; i++) {
-    memset(dx, 0, sizeof(dx));
-    tmat = compute_gauge_path(sitelink, i, path, len, dx, lat);
+    int dx[4] = {};
+    su3_matrix tmat = compute_gauge_path(sitelink, i, path, len, dx, lat);
     auto tr = trace_su3(&tmat);
-    CSUM(accum, tr);
+    accum += dcomplex {tr.real, tr.imag};
   }
 
   CSCALE(accum, loop_coeff);
@@ -385,6 +388,7 @@ template <typename su3_matrix, typename anti_hermitmat, typename Float>
 static void update_mom(anti_hermitmat *momentum, int dir, su3_matrix **sitelink, su3_matrix *staple, Float eb3,
                        const lattice_t &lat)
 {
+#pragma omp parallel for
   for (size_t i = 0; i < lat.volume; i++) {
     su3_matrix tmat1;
     su3_matrix tmat2;
@@ -406,6 +410,7 @@ template <typename su3_matrix, typename Float>
 static void update_gauge(su3_matrix *gauge, int dir, su3_matrix **sitelink, su3_matrix *staple, Float eb3,
                          const lattice_t &lat)
 {
+#pragma omp parallel for
   for (size_t i = 0; i < lat.volume; i++) {
     su3_matrix tmat;
 
@@ -422,11 +427,11 @@ static void update_gauge(su3_matrix *gauge, int dir, su3_matrix **sitelink, su3_
 /* This function only computes one direction @dir
  *
  */
-void gauge_force_reference_dir(void *refMom, int dir, double eb3, void **sitelink, void **sitelink_ex,
+void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *sitelink, void *const *sitelink_ex,
                                QudaPrecision prec, int **path_dir, int *length, void *loop_coeff, int num_paths,
                                const lattice_t &lat, bool compute_force)
 {
-  size_t size = V * 2 * lat.n_color * lat.n_color * prec;
+  size_t size = size_t(V) * 2 * lat.n_color * lat.n_color * prec;
   void *staple = safe_malloc(size);
   memset(staple, 0, size);
 
@@ -458,9 +463,11 @@ void gauge_force_reference_dir(void *refMom, int dir, double eb3, void **sitelin
   host_free(staple);
 }
 
-void gauge_force_reference(void *refMom, double eb3, void **sitelink, QudaPrecision prec, int ***path_dir, int *length,
+void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int ***path_dir, int *length,
                            void *loop_coeff, int num_paths, bool compute_force)
 {
+  void *sitelink[] = {u.data(0), u.data(1), u.data(2), u.data(3)};
+
   // created extended field
   quda::lat_dim_t R;
   for (int d = 0; d < 4; d++) R[d] = 2 * quda::comm_dim_partitioned(d);
@@ -472,17 +479,20 @@ void gauge_force_reference(void *refMom, double eb3, void **sitelink, QudaPrecis
   auto qdp_ex = quda::createExtendedGauge((void **)sitelink, param, R);
   lattice_t lat(*qdp_ex);
 
+  void *sitelink_ex[] = {qdp_ex->data(0), qdp_ex->data(1), qdp_ex->data(2), qdp_ex->data(3)};
   for (int dir = 0; dir < 4; dir++) {
-    gauge_force_reference_dir(refMom, dir, eb3, sitelink, (void **)qdp_ex->Gauge_p(), prec, path_dir[dir], length,
-                              loop_coeff, num_paths, lat, compute_force);
+    gauge_force_reference_dir(refMom, dir, eb3, sitelink, sitelink_ex, u.Precision(), path_dir[dir], length, loop_coeff,
+                              num_paths, lat, compute_force);
   }
 
   delete qdp_ex;
 }
 
-void gauge_loop_trace_reference(void **sitelink, QudaPrecision prec, std::vector<quda::Complex> &loop_traces,
-                                double factor, int **input_path, int *length, double *path_coeff, int num_paths)
+void gauge_loop_trace_reference(quda::GaugeField &u, std::vector<quda::Complex> &loop_traces, double factor,
+                                int **input_path, int *length, double *path_coeff, int num_paths)
 {
+  void *sitelink[] = {u.data(0), u.data(1), u.data(2), u.data(3)};
+
   // create extended field
   quda::lat_dim_t R;
   for (int d = 0; d < 4; d++) R[d] = 2 * quda::comm_dim_partitioned(d);
@@ -493,12 +503,12 @@ void gauge_loop_trace_reference(void **sitelink, QudaPrecision prec, std::vector
 
   auto qdp_ex = quda::createExtendedGauge((void **)sitelink, param, R);
   lattice_t lat(*qdp_ex);
-  void **sitelink_ex = (void **)qdp_ex->Gauge_p();
+  void *sitelink_ex[] = {qdp_ex->data(0), qdp_ex->data(1), qdp_ex->data(2), qdp_ex->data(3)};
 
   std::vector<double> loop_tr_dbl(2 * num_paths);
 
   for (int i = 0; i < num_paths; i++) {
-    if (prec == QUDA_DOUBLE_PRECISION) {
+    if (u.Precision() == QUDA_DOUBLE_PRECISION) {
       dcomplex tr = compute_loop_trace((dsu3_matrix **)sitelink_ex, input_path[i], length[i], path_coeff[i], lat);
       loop_tr_dbl[2 * i] = factor * tr.real;
       loop_tr_dbl[2 * i + 1] = factor * tr.imag;
diff --git a/tests/host_reference/gauge_force_reference.h b/tests/host_reference/gauge_force_reference.h
index 27f0447f18..5331b99386 100644
--- a/tests/host_reference/gauge_force_reference.h
+++ b/tests/host_reference/gauge_force_reference.h
@@ -24,8 +24,10 @@ struct lattice_t {
 
 int gf_neighborIndexFullLattice(size_t i, int dx[], const lattice_t &lat);
 
-void gauge_force_reference(void *refMom, double eb3, void **sitelink, QudaPrecision prec, int ***path_dir, int *length,
+#include <gauge_field.h>
+
+void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int ***path_dir, int *length,
                            void *loop_coeff, int num_paths, bool compute_force);
 
-void gauge_loop_trace_reference(void **sitelink, QudaPrecision prec, std::vector<quda::Complex> &loop_traces,
-                                double factor, int **input_path, int *length, double *path_coeff, int num_paths);
+void gauge_loop_trace_reference(quda::GaugeField &u, std::vector<quda::Complex> &loop_traces, double factor,
+                                int **input_path, int *length, double *path_coeff, int num_paths);
diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp
index e8efded3e8..5e85375bb3 100644
--- a/tests/host_reference/hisq_force_reference.cpp
+++ b/tests/host_reference/hisq_force_reference.cpp
@@ -84,9 +84,9 @@ typedef struct {
   double space;
 } danti_hermitmat;
 
-template <typename su3_matrix> su3_matrix *get_su3_matrix(su3_matrix *p, int idx, int dir)
+template <typename su3_matrix> su3_matrix *get_su3_matrix(quda::GaugeField &p, int idx, int dir)
 {
-  su3_matrix *data = ((su3_matrix **)p)[dir];
+  auto data = static_cast<su3_matrix *>(p.data(dir));
   return data + idx;
 }
 
@@ -96,28 +96,29 @@ template <typename su3_vector, typename su3_matrix> void su3_projector(su3_vecto
     for (int j = 0; j < 3; j++) CMUL_J(a->c[i], b->c[j], c->e[i][j]);
 }
 
-template <typename su3_vector, typename su3_matrix>
-void computeLinkOrderedOuterProduct(su3_vector *src, su3_matrix *dest, size_t nhops)
+template <typename su3_matrix, typename su3_vector>
+void computeLinkOrderedOuterProduct(su3_vector *src, quda::GaugeField &dest, size_t nhops)
 {
-  int dx[4];
+#pragma omp parallel for
   for (int i = 0; i < V; ++i) {
+    int dx[4];
     for (int dir = 0; dir < 4; ++dir) {
       dx[3] = dx[2] = dx[1] = dx[0] = 0;
       dx[dir] = nhops;
       int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
       su3_vector *hw = src + nbr_idx;
-      su3_matrix *p = get_su3_matrix(dest, i, dir);
+      su3_matrix *p = get_su3_matrix<su3_matrix>(dest, i, dir);
       su3_projector(hw, &src[i], p);
     } // dir
   }   // i
 }
 
-void computeLinkOrderedOuterProduct(void *src, void *dst, QudaPrecision precision, size_t nhops)
+void computeLinkOrderedOuterProduct(void *src, quda::GaugeField &dst, QudaPrecision precision, size_t nhops)
 {
   if (precision == QUDA_SINGLE_PRECISION) {
-    computeLinkOrderedOuterProduct((fsu3_vector *)src, (fsu3_matrix *)dst, nhops);
+    computeLinkOrderedOuterProduct<fsu3_matrix>((fsu3_vector *)src, dst, nhops);
   } else {
-    computeLinkOrderedOuterProduct((dsu3_vector *)src, (dsu3_matrix *)dst, nhops);
+    computeLinkOrderedOuterProduct<dsu3_matrix>((dsu3_vector *)src, dst, nhops);
   }
 }
 
@@ -894,10 +895,12 @@ void computeMiddleLinkField(const int dim[4], const Real *const oprod, const Rea
   // To keep the code as close to the GPU code as possible, we'll
   // loop over the even sites first and then the odd sites
   LoadStore<Real> ls(volume);
+#pragma omp parallel for
   for (int site = 0; site < loop_count; ++site) {
     computeMiddleLinkSite<Real, 0>(site, dim, oprod, Qprev, link, sig, mu, coeff, ls, Pmu, P3, Qmu, newOprod);
   }
   // Loop over odd lattice sites
+#pragma omp parallel for
   for (int site = 0; site < loop_count; ++site) {
     computeMiddleLinkSite<Real, 1>(site, dim, oprod, Qprev, link, sig, mu, coeff, ls, Pmu, P3, Qmu, newOprod);
   }
@@ -988,10 +991,12 @@ void computeSideLinkField(const int dim[4], const Real *const P3,
 #endif
   LoadStore<Real> ls(volume);
 
+#pragma omp parallel for
   for (int site = 0; site < loop_count; ++site) {
     computeSideLinkSite<Real, 0>(site, dim, P3, Qprod, link, sig, mu, coeff, accumu_coeff, ls, shortP, newOprod);
   }
 
+#pragma omp parallel for
   for (int site = 0; site < loop_count; ++site) {
     computeSideLinkSite<Real, 1>(site, dim, P3, Qprod, link, sig, mu, coeff, accumu_coeff, ls, shortP, newOprod);
   }
@@ -1098,6 +1103,7 @@ void computeAllLinkField(const int dim[4], const Real *const oprod, const Real *
 #endif
 
   LoadStore<Real> ls(volume);
+#pragma omp parallel for
   for (int site = 0; site < loop_count; ++site) {
 
     computeAllLinkSite<Real, 0>(site, dim, oprod, Qprev, link, sig, mu, coeff, accumu_coeff, ls, shortP, newOprod);
@@ -1197,17 +1203,17 @@ void doHisqStaplesForceCPU(const int dim[4], PathCoefficients<double> staple_coe
 #undef Qmu
 #undef Qnumu
 
-void hisqStaplesForceCPU(const double *path_coeff, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link,
-                         quda::cpuGaugeField *newOprod)
+void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda::GaugeField &link,
+                         quda::GaugeField *newOprod)
 {
   int X_[4];
   for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d];
   QudaPrecision precision = oprod.Precision();
 
 #ifdef MULTI_GPU
-  int len = Vh_ex * 2;
+  uint64_t len = Vh_ex * 2;
 #else
-  int len = 1;
+  uint64_t len = 1;
   for (int dir = 0; dir < 4; ++dir) len *= X_[dir];
 #endif
   // allocate memory for temporary fields
@@ -1222,13 +1228,17 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::cpuGaugeField &oprod, q
   act_path_coeff.seven = path_coeff[4];
   act_path_coeff.lepage = path_coeff[5];
 
+  void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)};
+  void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)};
+  void *noprod_array[] = {newOprod->data(0), newOprod->data(1), newOprod->data(2), newOprod->data(3)};
   if (precision == QUDA_DOUBLE_PRECISION) {
-    doHisqStaplesForceCPU<double>(X_, act_path_coeff, (double *)oprod.Gauge_p(), (double *)link.Gauge_p(),
-                                  (double **)tempmat, (double *)newOprod->Gauge_p());
-
+    doHisqStaplesForceCPU<double>(X_, act_path_coeff, reinterpret_cast<double *>(oprod_array),
+                                  reinterpret_cast<double *>(link_array), (double **)tempmat,
+                                  reinterpret_cast<double *>(noprod_array));
   } else if (precision == QUDA_SINGLE_PRECISION) {
-    doHisqStaplesForceCPU<float>(X_, act_path_coeff, (float *)oprod.Gauge_p(), (float *)link.Gauge_p(),
-                                 (float **)tempmat, (float *)newOprod->Gauge_p());
+    doHisqStaplesForceCPU<float>(X_, act_path_coeff, reinterpret_cast<float *>(oprod_array),
+                                 reinterpret_cast<float *>(link_array), (float **)tempmat,
+                                 reinterpret_cast<float *>(noprod_array));
   } else {
     errorQuda("Unsupported precision");
   }
@@ -1293,31 +1303,35 @@ void computeLongLinkField(const int dim[4], const Real *const oprod, const Real
   const int half_volume = volume / 2;
 
   LoadStore<Real> ls(volume);
+#pragma omp parallel for
   for (int site = 0; site < half_volume; ++site) {
     computeLongLinkSite<Real, 0>(site, dim, oprod, link, sig, coeff, ls, output);
   }
   // Loop over odd lattice sites
+#pragma omp parallel for
   for (int site = 0; site < half_volume; ++site) {
     computeLongLinkSite<Real, 1>(site, dim, oprod, link, sig, coeff, ls, output);
   }
 }
 
-void hisqLongLinkForceCPU(double coeff, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link,
-                          quda::cpuGaugeField *newOprod)
+void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeField &link, quda::GaugeField *newOprod)
 {
   int X_[4];
   for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d];
   QudaPrecision precision = oprod.Precision();
 
+  void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)};
+  void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)};
+  void *noprod_array[] = {newOprod->data(0), newOprod->data(1), newOprod->data(2), newOprod->data(3)};
   for (int sig = 0; sig < 4; ++sig) {
     if (precision == QUDA_SINGLE_PRECISION) {
-      computeLongLinkField<float>(X_, (float *)oprod.Gauge_p(), (float *)link.Gauge_p(), sig, coeff,
-                                  (float *)newOprod->Gauge_p());
+      computeLongLinkField<float>(X_, reinterpret_cast<float *>(oprod_array), reinterpret_cast<float *>(link_array),
+                                  sig, coeff, reinterpret_cast<float *>(noprod_array));
     } else if (precision == QUDA_DOUBLE_PRECISION) {
-      computeLongLinkField<double>(X_, (double *)oprod.Gauge_p(), (double *)link.Gauge_p(), sig, coeff,
-                                   (double *)newOprod->Gauge_p());
+      computeLongLinkField<double>(X_, reinterpret_cast<double *>(oprod_array), reinterpret_cast<double *>(link_array),
+                                   sig, coeff, reinterpret_cast<double *>(noprod_array));
     } else {
-      errorQuda("Unrecognised precision\n");
+      errorQuda("Unrecognised precision");
     }
   } // sig
 }
@@ -1357,23 +1371,29 @@ void completeForceField(const int dim[4], const Real *const oprod, const Real *c
   const int half_volume = volume / 2;
   LoadStore<Real> ls(volume);
 
+#pragma omp parallel for
   for (int site = 0; site < half_volume; ++site) { completeForceSite<Real, 0>(site, dim, oprod, link, sig, ls, mom); }
+#pragma omp parallel for
   for (int site = 0; site < half_volume; ++site) { completeForceSite<Real, 1>(site, dim, oprod, link, sig, ls, mom); }
 }
 
-void hisqCompleteForceCPU(quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *mom)
+void hisqCompleteForceCPU(quda::GaugeField &oprod, quda::GaugeField &link, quda::GaugeField *mom)
 {
   int X_[4];
   for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d];
   QudaPrecision precision = oprod.Precision();
 
+  void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)};
+  void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)};
   for (int sig = 0; sig < 4; ++sig) {
     if (precision == QUDA_SINGLE_PRECISION) {
-      completeForceField<float>(X_, (float *)oprod.Gauge_p(), (float *)link.Gauge_p(), sig, (float *)mom->Gauge_p());
+      completeForceField<float>(X_, reinterpret_cast<float *>(oprod_array), reinterpret_cast<float *>(link_array), sig,
+                                mom->data<float *>());
     } else if (precision == QUDA_DOUBLE_PRECISION) {
-      completeForceField<double>(X_, (double *)oprod.Gauge_p(), (double *)link.Gauge_p(), sig, (double *)mom->Gauge_p());
+      completeForceField<double>(X_, reinterpret_cast<double *>(oprod_array), reinterpret_cast<double *>(link_array),
+                                 sig, mom->data<double *>());
     } else {
-      errorQuda("Unrecognised precision\n");
+      errorQuda("Unrecognised precision");
     }
   } // loop over sig
 }
diff --git a/tests/host_reference/hisq_force_reference.h b/tests/host_reference/hisq_force_reference.h
index 6e5e2923e4..bdf78c4750 100644
--- a/tests/host_reference/hisq_force_reference.h
+++ b/tests/host_reference/hisq_force_reference.h
@@ -8,11 +8,11 @@
 /**
    @brief Compute a staggered spinor outer product for some offset, CPU version
    @param[in] src Pointer to an appropriately sized host staggered spinor field
-   @param[out] dest Pointer to an appropriately sized output outer product field
+   @param[out] dest Reference to a gauge field for the outer product
    @param[in] precision Precision of data (single or double)
    @param[in] separation Offset for outer product (1 for fat links, 3 for long links)
 */
-void computeLinkOrderedOuterProduct(void *src, void *dest, QudaPrecision precision, size_t separation);
+void computeLinkOrderedOuterProduct(void *src, quda::GaugeField &dest, QudaPrecision precision, size_t separation);
 
 /**
    @brief Compute the force contribution from the fat links, CPU version
@@ -21,8 +21,8 @@ void computeLinkOrderedOuterProduct(void *src, void *dest, QudaPrecision precisi
    @param[in] link Gauge field links
    @param[out] newOprod Force accumulated with fat link contributions
 */
-void hisqStaplesForceCPU(const double *path_coeff, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link,
-                         quda::cpuGaugeField *newOprod);
+void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda::GaugeField &link,
+                         quda::GaugeField *newOprod);
 
 /**
    @brief Compute the force contribution from the long link, CPU version
@@ -31,8 +31,7 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::cpuGaugeField &oprod, q
    @param[in] link Gauge field links
    @param[out] newOprod Force accumulated with fat link contributions
 */
-void hisqLongLinkForceCPU(double coeff, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link,
-                          quda::cpuGaugeField *newOprod);
+void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeField &link, quda::GaugeField *newOprod);
 
 /**
    @brief Accumulate the force contributions into the momentum field, CPU version
@@ -40,6 +39,6 @@ void hisqLongLinkForceCPU(double coeff, quda::cpuGaugeField &oprod, quda::cpuGau
    @param[in] link Gauge field links
    @param[out] mom Accumulated momentum
 */
-void hisqCompleteForceCPU(quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *mom);
+void hisqCompleteForceCPU(quda::GaugeField &oprod, quda::GaugeField &link, quda::GaugeField *mom);
 
 #endif
diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp
index 86ecd17464..14852a9f22 100644
--- a/tests/host_reference/staggered_dslash_reference.cpp
+++ b/tests/host_reference/staggered_dslash_reference.cpp
@@ -42,6 +42,7 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink,
                               sFloat **, sFloat **, int oddBit, int daggerBit, QudaDslashType dslash_type)
 #endif
 {
+#pragma omp parallel for
   for (auto i = 0lu; i < Vh * stag_spinor_site_size; i++) res[i] = 0.0;
 
   gFloat *fatlinkEven[4], *fatlinkOdd[4];
@@ -66,6 +67,7 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink,
 #endif
   }
 
+#pragma omp parallel for
   for (int sid = 0; sid < Vh; sid++) {
     int offset = stag_spinor_site_size * sid;
 
@@ -122,8 +124,8 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink,
   } // 4-d volume
 }
 
-void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink,
-                     void **ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit,
+void staggeredDslash(ColorSpinorField &out, void *const *fatlink, void *const *longlink, void *const *ghost_fatlink,
+                     void *const *ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit,
                      QudaPrecision sPrecision, QudaPrecision gPrecision, QudaDslashType dslash_type)
 {
   QudaParity otherparity = QUDA_INVALID_PARITY;
@@ -138,34 +140,34 @@ void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, voi
 
   in.exchangeGhost(otherparity, nFace, daggerBit);
 
-  void **fwd_nbr_spinor = in.fwdGhostFaceBuffer;
-  void **back_nbr_spinor = in.backGhostFaceBuffer;
+  auto fwd_nbr_spinor = in.fwdGhostFaceBuffer;
+  auto back_nbr_spinor = in.backGhostFaceBuffer;
 
   if (sPrecision == QUDA_DOUBLE_PRECISION) {
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      staggeredDslashReference((double *)out.V(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink,
-                               (double **)ghost_longlink, (double *)in.V(), (double **)fwd_nbr_spinor,
+      staggeredDslashReference((double *)out.data(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink,
+                               (double **)ghost_longlink, (double *)in.data(), (double **)fwd_nbr_spinor,
                                (double **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
     } else {
-      staggeredDslashReference((double *)out.V(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink,
-                               (float **)ghost_longlink, (double *)in.V(), (double **)fwd_nbr_spinor,
+      staggeredDslashReference((double *)out.data(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink,
+                               (float **)ghost_longlink, (double *)in.data(), (double **)fwd_nbr_spinor,
                                (double **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
     }
   } else {
     if (gPrecision == QUDA_DOUBLE_PRECISION) {
-      staggeredDslashReference((float *)out.V(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink,
-                               (double **)ghost_longlink, (float *)in.V(), (float **)fwd_nbr_spinor,
+      staggeredDslashReference((float *)out.data(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink,
+                               (double **)ghost_longlink, (float *)in.data(), (float **)fwd_nbr_spinor,
                                (float **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
     } else {
-      staggeredDslashReference((float *)out.V(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink,
-                               (float **)ghost_longlink, (float *)in.V(), (float **)fwd_nbr_spinor,
+      staggeredDslashReference((float *)out.data(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink,
+                               (float **)ghost_longlink, (float *)in.data(), (float **)fwd_nbr_spinor,
                                (float **)back_nbr_spinor, oddBit, daggerBit, dslash_type);
     }
   }
 }
 
-void staggeredMatDagMat(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink,
-                        void **ghost_longlink, const ColorSpinorField &in, double mass, int dagger_bit,
+void staggeredMatDagMat(ColorSpinorField &out, void *const *fatlink, void *const *longlink, void *const *ghost_fatlink,
+                        void *const *ghost_longlink, const ColorSpinorField &in, double mass, int dagger_bit,
                         QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity,
                         QudaDslashType dslash_type)
 {
@@ -189,8 +191,8 @@ void staggeredMatDagMat(ColorSpinorField &out, void **fatlink, void **longlink,
 
   double msq_x4 = mass * mass * 4;
   if (sPrecision == QUDA_DOUBLE_PRECISION) {
-    axmy((double *)in.V(), (double)msq_x4, (double *)out.V(), Vh * stag_spinor_site_size);
+    axmy((double *)in.data(), (double)msq_x4, (double *)out.data(), Vh * stag_spinor_site_size);
   } else {
-    axmy((float *)in.V(), (float)msq_x4, (float *)out.V(), Vh * stag_spinor_site_size);
+    axmy((float *)in.data(), (float)msq_x4, (float *)out.data(), Vh * stag_spinor_site_size);
   }
 }
diff --git a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h
index 54d40fdc0d..2d47138dc0 100644
--- a/tests/host_reference/staggered_dslash_reference.h
+++ b/tests/host_reference/staggered_dslash_reference.h
@@ -16,11 +16,11 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink,
                               gFloat **ghostLonglink, sFloat *spinorField, sFloat **fwd_nbr_spinor,
                               sFloat **back_nbr_spinor, int oddBit, int daggerBit, int nSrc, QudaDslashType dslash_type);
 
-void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink,
-                     void **ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit,
+void staggeredDslash(ColorSpinorField &out, void *const *fatlink, void *const *longlink, void *const *ghost_fatlink,
+                     void *const *ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit,
                      QudaPrecision sPrecision, QudaPrecision gPrecision, QudaDslashType dslash_type);
 
-void staggeredMatDagMat(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink,
-                        void **ghost_longlink, const ColorSpinorField &in, double mass, int dagger_bit,
+void staggeredMatDagMat(ColorSpinorField &out, void *const *fatlink, void *const *longlink, void *const *ghost_fatlink,
+                        void *const *ghost_longlink, const ColorSpinorField &in, double mass, int dagger_bit,
                         QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity,
                         QudaDslashType dslash_type);
diff --git a/tests/host_reference/wilson_dslash_reference.cpp b/tests/host_reference/wilson_dslash_reference.cpp
index 3a766e570c..471f79c38d 100644
--- a/tests/host_reference/wilson_dslash_reference.cpp
+++ b/tests/host_reference/wilson_dslash_reference.cpp
@@ -191,8 +191,9 @@ void wil_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, Qu
 
   GaugeFieldParam gauge_field_param(gauge_param, gauge);
   gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-  cpuGaugeField cpu(gauge_field_param);
-  void **ghostGauge = (void **)cpu.Ghost();
+  gauge_field_param.location = QUDA_CPU_FIELD_LOCATION;
+  GaugeField cpu(gauge_field_param);
+  void *ghostGauge[4] = {cpu.Ghost()[0].data(), cpu.Ghost()[1].data(), cpu.Ghost()[2].data(), cpu.Ghost()[3].data()};
 
   // Get spinor ghost fields
   // First wrap the input spinor into a ColorSpinorField
diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp
index 362b1695cd..eda5c778a7 100644
--- a/tests/invert_test.cpp
+++ b/tests/invert_test.cpp
@@ -20,6 +20,7 @@ QudaMultigridParam mg_param;
 QudaInvertParam mg_inv_param;
 QudaEigParam mg_eig_param[QUDA_MAX_MG_LEVEL];
 QudaEigParam eig_param;
+bool use_split_grid = false;
 
 // if --enable-testing true is passed, we run the tests defined in here
 #include <invert_test_gtest.hpp>
@@ -206,7 +207,7 @@ std::vector<std::array<double, 2>> solve(test_t param)
   // params corresponds to split grid
   for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i];
   int num_sub_partition = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
-  bool use_split_grid = num_sub_partition > 1;
+  use_split_grid = num_sub_partition > 1;
 
   // Now QUDA is initialised and the fields are loaded, we may setup the preconditioner
   void *mg_preconditioner = nullptr;
@@ -214,6 +215,8 @@ std::vector<std::array<double, 2>> solve(test_t param)
     if (use_split_grid) { errorQuda("Split grid does not work with MG yet."); }
     mg_preconditioner = newMultigridQuda(&mg_param);
     inv_param.preconditioner = mg_preconditioner;
+
+    printfQuda("MG Setup Done: %g secs, %g Gflops\n", mg_param.secs, mg_param.gflops / mg_param.secs);
   }
 
   // Vector construct START
@@ -260,7 +263,7 @@ std::vector<std::array<double, 2>> solve(test_t param)
       // Allocate memory and set pointers
       for (int n = 0; n < Nsrc; n++) {
         out_multishift[n * multishift + i] = quda::ColorSpinorField(cs_param);
-        _hp_multi_x[n][i] = out_multishift[n * multishift + i].V();
+        _hp_multi_x[n][i] = out_multishift[n * multishift + i].data();
       }
     }
   }
@@ -285,9 +288,9 @@ std::vector<std::array<double, 2>> solve(test_t param)
       if (inv_deflate) eig_param.preserve_deflation = i < Nsrc - 1 ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
       // Perform QUDA inversions
       if (multishift > 1) {
-        invertMultiShiftQuda(_hp_multi_x[i].data(), in[i].V(), &inv_param);
+        invertMultiShiftQuda(_hp_multi_x[i].data(), in[i].data(), &inv_param);
       } else {
-        invertQuda(out[i].V(), in[i].V(), &inv_param);
+        invertQuda(out[i].data(), in[i].data(), &inv_param);
       }
 
       time[i] = inv_param.secs;
@@ -304,8 +307,8 @@ std::vector<std::array<double, 2>> solve(test_t param)
     std::vector<void *> _hp_x(Nsrc);
     std::vector<void *> _hp_b(Nsrc);
     for (int i = 0; i < Nsrc; i++) {
-      _hp_x[i] = out[i].V();
-      _hp_b[i] = in[i].V();
+      _hp_x[i] = out[i].data();
+      _hp_b[i] = in[i].data();
     }
     // Run split grid
     if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH
@@ -338,7 +341,7 @@ std::vector<std::array<double, 2>> solve(test_t param)
   // Perform host side verification of inversion if requested
   if (verify_results) {
     for (int i = 0; i < Nsrc; i++) {
-      res[i] = verifyInversion(out[i].V(), _hp_multi_x[i].data(), in[i].V(), check.V(), gauge_param, inv_param,
+      res[i] = verifyInversion(out[i].data(), _hp_multi_x[i].data(), in[i].data(), check.data(), gauge_param, inv_param,
                                gauge.data(), clover.data(), clover_inv.data());
     }
   }
diff --git a/tests/invert_test_gtest.hpp b/tests/invert_test_gtest.hpp
index 3d16341bcb..74866dbcd7 100644
--- a/tests/invert_test_gtest.hpp
+++ b/tests/invert_test_gtest.hpp
@@ -117,6 +117,8 @@ bool skip_test(test_t param)
     return true;
 #endif
   }
+  // split-grid doesn't support multishift at present
+  if (use_split_grid && multishift > 1) return true;
 
   return false;
 }
diff --git a/tests/llfat_test.cpp b/tests/llfat_test.cpp
index 221ab352e3..0342007de3 100644
--- a/tests/llfat_test.cpp
+++ b/tests/llfat_test.cpp
@@ -41,6 +41,7 @@ static void llfat_test()
   qudaGaugeParam.X[3] = tdim;
 
   setDims(qudaGaugeParam.X);
+  setVerbosity(verbosity);
 
   qudaGaugeParam.cpu_prec = cpu_prec;
   qudaGaugeParam.cuda_prec = qudaGaugeParam.cuda_prec_sloppy = prec;
diff --git a/tests/multigrid_benchmark_test.cpp b/tests/multigrid_benchmark_test.cpp
index c24e374064..02c66e1644 100644
--- a/tests/multigrid_benchmark_test.cpp
+++ b/tests/multigrid_benchmark_test.cpp
@@ -12,6 +12,7 @@
 // include because of nasty globals used in the tests
 #include <dslash_reference.h>
 #include <dirac_quda.h>
+#include <tune_quda.h>
 #include <gauge_tools.h>
 #include <gtest/gtest.h>
 
@@ -23,7 +24,7 @@ using namespace quda;
 
 std::vector<ColorSpinorField> xD, yD;
 
-std::shared_ptr<cudaGaugeField> Y_d, X_d, Xinv_d, Yhat_d;
+std::shared_ptr<GaugeField> Y_d, X_d, Xinv_d, Yhat_d;
 
 int Ncolor;
 
@@ -96,15 +97,14 @@ void initFields(QudaPrecision prec)
   gParam.setPrecision(prec_sloppy);
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-
-  Y_d = std::make_shared<cudaGaugeField>(gParam);
-  Yhat_d = std::make_shared<cudaGaugeField>(gParam);
+  Y_d = std::make_shared<GaugeField>(gParam);
+  Yhat_d = std::make_shared<GaugeField>(gParam);
 
   gParam.geometry = QUDA_SCALAR_GEOMETRY;
   gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
   gParam.nFace = 0;
-  X_d = std::make_shared<cudaGaugeField>(gParam);
-  Xinv_d = std::make_shared<cudaGaugeField>(gParam);
+  X_d = std::make_shared<GaugeField>(gParam);
+  Xinv_d = std::make_shared<GaugeField>(gParam);
 
   // insert random noise into the gauge fields
   {
@@ -279,12 +279,11 @@ int main(int argc, char **argv)
     if (test_rc != 0) warningQuda("Tests failed");
   }
 
-  // now rerun with more iterations to get accurate speed measurements
-  dirac->Flops();    // reset flops counter
-  dirac_pc->Flops(); // reset flops counter
-
+  auto flops0 = quda::Tunable::flops_global();
   double secs = benchmark(test_type, niter);
-  double gflops = ((test_type < 5 ? dirac->Flops() : dirac_pc->Flops()) * 1e-9) / (secs);
+  auto flops1 = quda::Tunable::flops_global();
+
+  double gflops = (flops1 - flops0) * 1e-9 / secs;
 
   printfQuda("Ncolor = %2d, %-31s: Gflop/s = %6.1f\n", Ncolor, names[test_type], gflops);
 
diff --git a/tests/multigrid_evolve_test.cpp b/tests/multigrid_evolve_test.cpp
index d8c72f19fc..9545942a51 100644
--- a/tests/multigrid_evolve_test.cpp
+++ b/tests/multigrid_evolve_test.cpp
@@ -35,13 +35,13 @@ void setReunitarizationConsts()
   setUnitarizeLinksConstants(unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error);
 }
 
-void CallUnitarizeLinks(quda::cudaGaugeField *cudaInGauge)
+void CallUnitarizeLinks(quda::GaugeField &gauge)
 {
   using namespace quda;
   int *num_failures_dev = (int *)device_malloc(sizeof(int));
   int num_failures;
   qudaMemset(num_failures_dev, 0, sizeof(int));
-  unitarizeLinks(*cudaInGauge, num_failures_dev);
+  unitarizeLinks(gauge, num_failures_dev);
 
   qudaMemcpy(&num_failures, num_failures_dev, sizeof(int), qudaMemcpyDeviceToHost);
   if (num_failures > 0) errorQuda("Error in the unitarization\n");
@@ -219,16 +219,17 @@ int main(int argc, char **argv)
   {
     using namespace quda;
     GaugeFieldParam gParam(gauge_param);
+    gParam.location = QUDA_CUDA_FIELD_LOCATION;
     gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
     gParam.create = QUDA_NULL_FIELD_CREATE;
     gParam.link_type = gauge_param.type;
     gParam.reconstruct = gauge_param.reconstruct;
     gParam.setPrecision(gParam.Precision(), true);
-    cudaGaugeField *gauge = new cudaGaugeField(gParam);
+    GaugeField gauge(gParam);
 
     int pad = 0;
-    lat_dim_t y;
-    lat_dim_t R;
+    lat_dim_t y = {};
+    lat_dim_t R = {};
     for (int dir = 0; dir < 4; ++dir)
       if (comm_dim_partitioned(dir)) R[dir] = 2;
     for (int dir = 0; dir < 4; ++dir) y[dir] = gauge_param.X[dir] + 2 * R[dir];
@@ -239,15 +240,16 @@ int main(int argc, char **argv)
     gParamEx.siteSubset = QUDA_FULL_SITE_SUBSET;
     gParamEx.t_boundary = gParam.t_boundary;
     gParamEx.nFace = 1;
-    for (int dir = 0; dir < 4; ++dir) gParamEx.r[dir] = R[dir];
-    cudaGaugeField *gaugeEx = new cudaGaugeField(gParamEx);
+    gParamEx.r = R;
+
+    GaugeField gaugeEx(gParamEx);
 
     QudaGaugeObservableParam obs_param = newQudaGaugeObservableParam();
     obs_param.compute_plaquette = QUDA_BOOLEAN_TRUE;
     obs_param.compute_qcharge = QUDA_BOOLEAN_TRUE;
 
     // CURAND random generator initialization
-    RNG *randstates = new RNG(*gauge, 1234);
+    RNG randstates(gauge, 1234);
     int nsteps = 10;
     int nhbsteps = 1;
     int novrsteps = 1;
@@ -255,22 +257,22 @@ int main(int argc, char **argv)
     double beta_value = 6.2;
 
     if (link_recon != QUDA_RECONSTRUCT_8 && coldstart)
-      InitGaugeField(*gaugeEx);
+      InitGaugeField(gaugeEx);
     else
-      InitGaugeField(*gaugeEx, *randstates);
+      InitGaugeField(gaugeEx, randstates);
     // Reunitarization setup
     setReunitarizationConsts();
 
     // Do a series of Heatbath updates
-    Monte(*gaugeEx, *randstates, beta_value, 100 * nhbsteps, 100 * novrsteps);
+    Monte(gaugeEx, randstates, beta_value, 100 * nhbsteps, 100 * novrsteps);
 
     // Copy into regular field
-    copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
+    copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION);
 
     // load the gauge field from gauge
-    gauge_param.gauge_order = gauge->Order();
+    gauge_param.gauge_order = gauge.Order();
     gauge_param.location = QUDA_CUDA_FIELD_LOCATION;
-    loadGaugeQuda(gauge->Gauge_p(), &gauge_param);
+    loadGaugeQuda(gauge.data(), &gauge_param);
     gaugeObservablesQuda(&obs_param);
 
     // Demonstrate MG evolution on an evolving gauge field
@@ -311,14 +313,14 @@ int main(int argc, char **argv)
 
     for (int step = 1; step < nsteps; ++step) {
       freeGaugeQuda();
-      Monte(*gaugeEx, *randstates, beta_value, nhbsteps, novrsteps);
+      Monte(gaugeEx, randstates, beta_value, nhbsteps, novrsteps);
 
       // Reunitarize gauge links
       CallUnitarizeLinks(gaugeEx);
 
       // Copy into regular field
-      copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
-      loadGaugeQuda(gauge->Gauge_p(), &gauge_param);
+      copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION);
+      loadGaugeQuda(gauge.data(), &gauge_param);
 
       if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
         constructHostCloverField(clover, clover_inv, inv_param);
@@ -382,9 +384,9 @@ int main(int argc, char **argv)
     CallUnitarizeLinks(gaugeEx);
 
     // copy into regular field
-    copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION);
+    copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION);
 
-    loadGaugeQuda(gauge->Gauge_p(), &gauge_param);
+    loadGaugeQuda(gauge.data(), &gauge_param);
     // Recompute Gauge Observables
     gaugeObservablesQuda(&obs_param);
 
@@ -447,12 +449,8 @@ int main(int argc, char **argv)
     // free the multigrid solver
     if (inv_multigrid) destroyMultigridQuda(mg_preconditioner);
 
-    delete gauge;
-    delete gaugeEx;
     // Release all temporary memory used for data exchange between GPUs in multi-GPU mode
     PGaugeExchangeFree();
-
-    delete randstates;
   }
 
   // stop the timer
diff --git a/tests/pack_test.cpp b/tests/pack_test.cpp
index 3c5974ddc8..fe68c2645c 100644
--- a/tests/pack_test.cpp
+++ b/tests/pack_test.cpp
@@ -108,20 +108,20 @@ void packTest()
     param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER;
 
     GaugeFieldParam cpsParam(param, cpsCpuGauge_p);
-    cpuGaugeField cpsCpuGauge(cpsParam);
+    GaugeField cpsCpuGauge(cpsParam);
     cpsParam.create = QUDA_NULL_FIELD_CREATE;
     cpsParam.reconstruct = param.reconstruct;
     cpsParam.setPrecision(param.cuda_prec, true);
     cpsParam.pad = param.ga_pad;
-    cudaGaugeField cudaCpsGauge(cpsParam);
+    GaugeField cudaCpsGauge(cpsParam);
 
     host_timer.start();
-    cudaCpsGauge.loadCPUField(cpsCpuGauge);
+    cudaCpsGauge.copy(cpsCpuGauge);
     host_timer.stop();
     printfQuda("CPS Gauge send time = %e seconds\n", host_timer.last());
 
     host_timer.start();
-    cudaCpsGauge.saveCPUField(cpsCpuGauge);
+    cpsCpuGauge.copy(cudaCpsGauge);
     host_timer.stop();
     printfQuda("CPS Gauge restore time = %e seconds\n", host_timer.last());
   }
@@ -132,20 +132,20 @@ void packTest()
     param.gauge_order = QUDA_QDP_GAUGE_ORDER;
 
     GaugeFieldParam qdpParam(param, qdpCpuGauge_p);
-    cpuGaugeField qdpCpuGauge(qdpParam);
+    GaugeField qdpCpuGauge(qdpParam);
     qdpParam.create = QUDA_NULL_FIELD_CREATE;
     qdpParam.reconstruct = param.reconstruct;
     qdpParam.setPrecision(param.cuda_prec, true);
     qdpParam.pad = param.ga_pad;
-    cudaGaugeField cudaQdpGauge(qdpParam);
+    GaugeField cudaQdpGauge(qdpParam);
 
     host_timer.start();
-    cudaQdpGauge.loadCPUField(qdpCpuGauge);
+    cudaQdpGauge.copy(qdpCpuGauge);
     host_timer.stop();
     printfQuda("QDP Gauge send time = %e seconds\n", host_timer.last());
 
     host_timer.start();
-    cudaQdpGauge.saveCPUField(qdpCpuGauge);
+    qdpCpuGauge.copy(cudaQdpGauge);
     host_timer.stop();
     printfQuda("QDP Gauge restore time = %e seconds\n", host_timer.last());
   }
diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp
index 8505b04fe6..78a735ad13 100644
--- a/tests/staggered_dslash_ctest.cpp
+++ b/tests/staggered_dslash_ctest.cpp
@@ -62,8 +62,6 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple<int
   }
 
 public:
-  virtual ~StaggeredDslashTest() { }
-
   virtual void SetUp()
   {
     int prec = ::testing::get<0>(GetParam());
@@ -92,7 +90,11 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple<int
   // Per-test-case tear-down.
   // Called after the last test in this test case.
   // Can be omitted if not needed.
-  static void TearDownTestCase() { endQuda(); }
+  static void TearDownTestCase()
+  {
+    StaggeredDslashTestWrapper::destroy();
+    endQuda();
+  }
 };
 
 TEST_P(StaggeredDslashTest, verify)
@@ -102,7 +104,10 @@ TEST_P(StaggeredDslashTest, verify)
 
   double deviation = dslash_test_wrapper.verify();
   double tol = getTolerance(dslash_test_wrapper.inv_param.cuda_prec);
-
+  if ((dslash_test_wrapper.gauge_param.reconstruct == QUDA_RECONSTRUCT_8
+       || dslash_test_wrapper.gauge_param.reconstruct == QUDA_RECONSTRUCT_9)
+      && dslash_test_wrapper.inv_param.cuda_prec >= QUDA_HALF_PRECISION)
+    tol *= 10; // if recon 8 or 9, we tolerate a greater deviation
   ASSERT_LE(deviation, tol) << "Reference CPU and QUDA implementations do not agree";
 }
 
diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp
index f61e62d88c..0a3a063e45 100644
--- a/tests/staggered_dslash_test.cpp
+++ b/tests/staggered_dslash_test.cpp
@@ -22,8 +22,6 @@ class StaggeredDslashTest : public ::testing::Test
   }
 
 public:
-  StaggeredDslashTest() = default;
-
   virtual void SetUp()
   {
     dslash_test_wrapper.init_test(argc_copy, argv_copy);
@@ -37,7 +35,11 @@ class StaggeredDslashTest : public ::testing::Test
   // Per-test-case tear-down.
   // Called after the last test in this test case.
   // Can be omitted if not needed.
-  static void TearDownTestCase() { endQuda(); }
+  static void TearDownTestCase()
+  {
+    StaggeredDslashTestWrapper::destroy();
+    endQuda();
+  }
 };
 
 TEST_F(StaggeredDslashTest, benchmark) { dslash_test_wrapper.run_test(niter, /**show_metrics =*/true); }
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 5c6d885673..a4aaac347b 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -19,6 +19,7 @@
 #include "dslash_test_helpers.h"
 #include <assert.h>
 #include <gtest/gtest.h>
+#include <tune_quda.h>
 
 using namespace quda;
 
@@ -44,7 +45,10 @@ struct DslashTime {
 
 struct StaggeredDslashTestWrapper {
 
-  void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr};
+  static inline void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr};
+  // In the HISQ case, we include building fat/long links in this unit test
+  static inline void *qdp_fatlink_cpu[4] = {};
+  static inline void *qdp_longlink_cpu[4] = {};
 
   QudaGaugeParam gauge_param;
   QudaInvertParam inv_param;
@@ -52,23 +56,21 @@ struct StaggeredDslashTestWrapper {
   void *milc_fatlink_gpu;
   void *milc_longlink_gpu;
 
-  cpuGaugeField *cpuFat = nullptr;
-  cpuGaugeField *cpuLong = nullptr;
+  GaugeField *cpuFat = nullptr;
+  GaugeField *cpuLong = nullptr;
 
-  ColorSpinorField spinor;
-  ColorSpinorField spinorOut;
-  ColorSpinorField spinorRef;
-  ColorSpinorField tmpCpu;
+  static inline ColorSpinorField spinor;
+  static inline ColorSpinorField spinorOut;
+  static inline ColorSpinorField spinorRef;
+  static inline ColorSpinorField tmpCpu;
   ColorSpinorField cudaSpinor;
   ColorSpinorField cudaSpinorOut;
 
-  std::vector<ColorSpinorField> vp_spinor;
-  std::vector<ColorSpinorField> vp_spinor_out;
+  static inline std::vector<ColorSpinorField> vp_spinor;
+  static inline std::vector<ColorSpinorField> vp_spinor_out;
 
-  // In the HISQ case, we include building fat/long links in this unit test
-  void *qdp_fatlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *qdp_longlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr};
-  void **ghost_fatlink_cpu = nullptr, **ghost_longlink_cpu = nullptr;
+  void *ghost_fatlink_cpu[4] = {};
+  void *ghost_longlink_cpu[4] = {};
 
   QudaParity parity = QUDA_EVEN_PARITY;
 
@@ -79,7 +81,7 @@ struct StaggeredDslashTestWrapper {
   char **argv_copy;
 
   // Split grid options
-  bool test_split_grid = false;
+  static inline bool test_split_grid = false;
   int num_src = 1;
 
   void staggeredDslashRef()
@@ -102,9 +104,9 @@ struct StaggeredDslashTestWrapper {
       staggeredDslash(spinorRef.Odd(), qdp_fatlink_cpu, qdp_longlink_cpu, ghost_fatlink_cpu, ghost_longlink_cpu,
                       spinor.Even(), QUDA_ODD_PARITY, !dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type);
       if (dslash_type == QUDA_LAPLACE_DSLASH) {
-        xpay(spinor.V(), kappa, spinorRef.V(), spinor.Length(), gauge_param.cpu_prec);
+        xpay(spinor.data(), kappa, spinorRef.data(), spinor.Length(), gauge_param.cpu_prec);
       } else {
-        axpy(2 * mass, spinor.V(), spinorRef.V(), spinor.Length(), gauge_param.cpu_prec);
+        axpy(2 * mass, spinor.data(), spinorRef.data(), spinor.Length(), gauge_param.cpu_prec);
       }
       break;
     default: errorQuda("Test type %d not defined", static_cast<int>(dtest_type));
@@ -131,7 +133,12 @@ struct StaggeredDslashTestWrapper {
 
     link_recon = link_recon_;
 
-    init(argc, argv);
+    static bool first_time = true;
+    if (first_time) {
+      init_host(argc, argv);
+      first_time = false;
+    }
+    init();
   }
 
   void init_test(int argc, char **argv)
@@ -142,52 +149,122 @@ struct StaggeredDslashTestWrapper {
     setStaggeredGaugeParam(gauge_param);
     setStaggeredInvertParam(inv_param);
 
-    init(argc, argv);
+    static bool first_time = true;
+    if (first_time) {
+      init_host(argc, argv);
+      first_time = false;
+    }
+    init();
   }
 
-  void init(int argc, char **argv)
+  void init_host(int argc, char **argv)
   {
-    inv_param.split_grid[0] = grid_partition[0];
-    inv_param.split_grid[1] = grid_partition[1];
-    inv_param.split_grid[2] = grid_partition[2];
-    inv_param.split_grid[3] = grid_partition[3];
+    setDims(gauge_param.X);
+    dw_setDims(gauge_param.X, 1);
+    if (Nsrc != 1) {
+      warningQuda("Ignoring Nsrc = %d, setting to 1.", Nsrc);
+      Nsrc = 1;
+    }
 
+    for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i];
     num_src = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
     test_split_grid = num_src > 1;
     if (test_split_grid) { dtest_type = dslash_test_type::Dslash; }
 
-    inv_param.dagger = dagger ? QUDA_DAG_YES : QUDA_DAG_NO;
+    for (int dir = 0; dir < 4; dir++) {
+      qdp_inlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      qdp_fatlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+      qdp_longlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
+    }
 
-    setDims(gauge_param.X);
-    dw_setDims(gauge_param.X, 1);
-    if (Nsrc != 1) {
-      warningQuda("Ignoring Nsrc = %d, setting to 1.", Nsrc);
-      Nsrc = 1;
+    bool compute_on_gpu = false; // reference fat/long fields should be computed on cpu
+    constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_fatlink_cpu, gauge_param, argc, argv,
+                                     compute_on_gpu);
+
+    ColorSpinorParam csParam;
+    csParam.nColor = 3;
+    csParam.nSpin = 1;
+    csParam.nDim = 4;
+    for (int d = 0; d < 4; d++) { csParam.x[d] = gauge_param.X[d]; }
+    csParam.x[4] = 1;
+
+    csParam.setPrecision(inv_param.cpu_prec);
+    // inv_param.solution_type = QUDA_MAT_SOLUTION;
+    csParam.pad = 0;
+    if (dtest_type != dslash_test_type::Mat && dslash_type != QUDA_LAPLACE_DSLASH) {
+      csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
+      csParam.x[0] /= 2;
+      inv_param.solution_type = QUDA_MATPC_SOLUTION;
+    } else {
+      csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
+      inv_param.solution_type = QUDA_MAT_SOLUTION;
     }
 
-    // Allocate a lot of memory because I'm very confused
-    void *milc_fatlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-    void *milc_longlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
+    csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
+    csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
+    csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered
+    csParam.create = QUDA_ZERO_FIELD_CREATE;
+    csParam.pc_type = QUDA_4D_PC;
+    csParam.location = QUDA_CPU_FIELD_LOCATION;
 
-    milc_fatlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-    milc_longlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
+    spinor = ColorSpinorField(csParam);
+    spinorOut = ColorSpinorField(csParam);
+    spinorRef = ColorSpinorField(csParam);
+    tmpCpu = ColorSpinorField(csParam);
+
+    spinor.Source(QUDA_RANDOM_SOURCE);
 
+    if (test_split_grid) {
+      inv_param.num_src = num_src;
+      inv_param.num_src_per_sub_partition = 1;
+      resize(vp_spinor, num_src, csParam);
+      resize(vp_spinor_out, num_src, csParam);
+      std::fill(vp_spinor.begin(), vp_spinor.end(), spinor);
+    }
+
+    inv_param.dagger = dagger ? QUDA_DAG_YES : QUDA_DAG_NO;
+
+    // set verbosity prior to loadGaugeQuda
+    setVerbosity(verbosity);
+  }
+
+  void init()
+  {
+    // Prepare the fields to be used for the GPU computation
     void *qdp_fatlink_gpu[4];
     void *qdp_longlink_gpu[4];
-
     for (int dir = 0; dir < 4; dir++) {
-      qdp_inlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-
       qdp_fatlink_gpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
       qdp_longlink_gpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-
-      qdp_fatlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-      qdp_longlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
     }
+    // QUDA_STAGGERED_DSLASH and QUDA_LAPLACE_DSLASH follow the same codepath
+    // whether or not you "compute" the fat/long links.
+    if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) {
+      for (int dir = 0; dir < 4; dir++) {
+        memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
+        memset(qdp_longlink_gpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size);
+      }
+    } else {
+      // QUDA_ASQTAD_DSLASH
+      if (compute_fatlong) {
+        computeFatLongGPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_inlink, gauge_param, host_gauge_data_type_size,
+                          n_naiks, eps_naik);
+      } else {
+        // Not computing FatLong
+        for (int dir = 0; dir < 4; dir++) {
+          memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
+          memcpy(qdp_longlink_gpu[dir], qdp_longlink_cpu[dir], V * gauge_site_size * host_gauge_data_type_size);
+        }
+      }
+    }
+
+    // Create ghost zones for CPU fields,
+    // prepare and load the GPU fields
+    void *milc_fatlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
+    void *milc_longlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
 
-    bool gauge_loaded = false;
-    constructStaggeredHostDeviceGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_longlink_gpu, qdp_fatlink_cpu,
-                                           qdp_fatlink_gpu, gauge_param, argc, argv, gauge_loaded);
+    milc_fatlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
+    milc_longlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
 
     // Alright, we've created all the void** links.
     // Create the void* pointers
@@ -195,23 +272,21 @@ struct StaggeredDslashTestWrapper {
     reorderQDPtoMILC(milc_fatlink_cpu, qdp_fatlink_cpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
     reorderQDPtoMILC(milc_longlink_gpu, qdp_longlink_gpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
     reorderQDPtoMILC(milc_longlink_cpu, qdp_longlink_cpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-    // Create ghost zones for CPU fields,
-    // prepare and load the GPU fields
 
 #ifdef MULTI_GPU
     gauge_param.type = (dslash_type == QUDA_ASQTAD_DSLASH) ? QUDA_ASQTAD_FAT_LINKS : QUDA_SU3_LINKS;
     gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
     GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink_cpu);
     cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-    cpuFat = new cpuGaugeField(cpuFatParam);
-    ghost_fatlink_cpu = cpuFat->Ghost();
+    cpuFat = new GaugeField(cpuFatParam);
+    for (int i = 0; i < 4; i++) ghost_fatlink_cpu[i] = cpuFat->Ghost()[i].data();
 
     if (dslash_type == QUDA_ASQTAD_DSLASH) {
       gauge_param.type = QUDA_ASQTAD_LONG_LINKS;
       GaugeFieldParam cpuLongParam(gauge_param, milc_longlink_cpu);
       cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-      cpuLong = new cpuGaugeField(cpuLongParam);
-      ghost_longlink_cpu = cpuLong ? cpuLong->Ghost() : nullptr;
+      cpuLong = new GaugeField(cpuLongParam);
+      for (int i = 0; i < 4; i++) ghost_longlink_cpu[i] = cpuLong ? cpuLong->Ghost()[i].data() : nullptr;
     }
 #endif
 
@@ -225,9 +300,6 @@ struct StaggeredDslashTestWrapper {
       gauge_param.reconstruct = gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
     }
 
-    // set verbosity prior to loadGaugeQuda
-    setVerbosity(verbosity);
-
     printfQuda("Sending fat links to GPU\n");
     loadGaugeQuda(milc_fatlink_gpu, &gauge_param);
 
@@ -247,47 +319,7 @@ struct StaggeredDslashTestWrapper {
       loadGaugeQuda(milc_longlink_gpu, &gauge_param);
     }
 
-    ColorSpinorParam csParam;
-    csParam.nColor = 3;
-    csParam.nSpin = 1;
-    csParam.nDim = 4;
-    for (int d = 0; d < 4; d++) { csParam.x[d] = gauge_param.X[d]; }
-    csParam.x[4] = 1;
-
-    csParam.setPrecision(inv_param.cpu_prec);
-    // inv_param.solution_type = QUDA_MAT_SOLUTION;
-    csParam.pad = 0;
-    if (dtest_type != dslash_test_type::Mat && dslash_type != QUDA_LAPLACE_DSLASH) {
-      csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
-      csParam.x[0] /= 2;
-      inv_param.solution_type = QUDA_MATPC_SOLUTION;
-    } else {
-      csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
-      inv_param.solution_type = QUDA_MAT_SOLUTION;
-    }
-
-    csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
-    csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
-    csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered
-    csParam.create = QUDA_ZERO_FIELD_CREATE;
-    csParam.pc_type = QUDA_4D_PC;
-    csParam.location = QUDA_CPU_FIELD_LOCATION;
-
-    spinor = ColorSpinorField(csParam);
-    spinorOut = ColorSpinorField(csParam);
-    spinorRef = ColorSpinorField(csParam);
-    tmpCpu = ColorSpinorField(csParam);
-
-    spinor.Source(QUDA_RANDOM_SOURCE);
-
-    if (test_split_grid) {
-      inv_param.num_src = num_src;
-      inv_param.num_src_per_sub_partition = 1;
-      resize(vp_spinor, num_src, csParam);
-      resize(vp_spinor_out, num_src, csParam);
-      std::fill(vp_spinor.begin(), vp_spinor.end(), spinor);
-    }
-
+    ColorSpinorParam csParam(spinor);
     csParam.fieldOrder = colorspinor::getNative(inv_param.cuda_prec, 1);
     csParam.pad = 0;
     csParam.setPrecision(inv_param.cuda_prec);
@@ -303,28 +335,23 @@ struct StaggeredDslashTestWrapper {
     setDiracParam(diracParam, &inv_param, pc);
     dirac = Dirac::create(diracParam);
 
-    for (int dir = 0; dir < 4; dir++) {
-      host_free(qdp_fatlink_gpu[dir]);
-      host_free(qdp_longlink_gpu[dir]);
-      host_free(qdp_inlink[dir]);
-    }
     host_free(milc_fatlink_cpu);
     host_free(milc_longlink_cpu);
-  }
 
-  void end()
-  {
     for (int dir = 0; dir < 4; dir++) {
-      if (qdp_fatlink_cpu[dir] != nullptr) {
-        host_free(qdp_fatlink_cpu[dir]);
-        qdp_fatlink_cpu[dir] = nullptr;
+      if (qdp_fatlink_gpu[dir] != nullptr) {
+        host_free(qdp_fatlink_gpu[dir]);
+        qdp_fatlink_gpu[dir] = nullptr;
       }
-      if (qdp_longlink_cpu[dir] != nullptr) {
-        host_free(qdp_longlink_cpu[dir]);
-        qdp_longlink_cpu[dir] = nullptr;
+      if (qdp_longlink_gpu[dir] != nullptr) {
+        host_free(qdp_longlink_gpu[dir]);
+        qdp_longlink_gpu[dir] = nullptr;
       }
     }
+  }
 
+  void end()
+  {
     if (dirac != nullptr) {
       delete dirac;
       dirac = nullptr;
@@ -348,6 +375,25 @@ struct StaggeredDslashTestWrapper {
     commDimPartitionedReset();
   }
 
+  static void destroy()
+  {
+    for (int dir = 0; dir < 4; dir++) {
+      if (qdp_inlink[dir]) host_free(qdp_inlink[dir]);
+      if (qdp_fatlink_cpu[dir]) host_free(qdp_fatlink_cpu[dir]);
+      if (qdp_longlink_cpu[dir]) host_free(qdp_longlink_cpu[dir]);
+    }
+
+    spinor = {};
+    spinorOut = {};
+    spinorRef = {};
+    tmpCpu = {};
+
+    if (test_split_grid) {
+      vp_spinor.clear();
+      vp_spinor_out.clear();
+    }
+  }
+
   DslashTime dslashCUDA(int niter)
   {
     DslashTime dslash_time;
@@ -363,8 +409,8 @@ struct StaggeredDslashTestWrapper {
       std::vector<void *> _hp_x(inv_param.num_src);
       std::vector<void *> _hp_b(inv_param.num_src);
       for (int i = 0; i < inv_param.num_src; i++) {
-        _hp_x[i] = vp_spinor_out[i].V();
-        _hp_b[i] = vp_spinor[i].V();
+        _hp_x[i] = vp_spinor_out[i].data();
+        _hp_b[i] = vp_spinor[i].data();
       }
       dslashMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, parity, milc_fatlink_gpu, milc_longlink_gpu,
                                   &gauge_param);
@@ -404,20 +450,30 @@ struct StaggeredDslashTestWrapper {
     printfQuda("Tuning...\n");
     dslashCUDA(1);
 
-    // reset flop counter
-    dirac->Flops();
+    auto flops0 = quda::Tunable::flops_global();
+    auto bytes0 = quda::Tunable::bytes_global();
 
     DslashTime dslash_time = dslashCUDA(niter);
+
+    unsigned long long flops = (quda::Tunable::flops_global() - flops0);
+    unsigned long long bytes = (quda::Tunable::bytes_global() - bytes0);
+
     spinorOut = cudaSpinorOut;
 
     if (print_metrics) {
       printfQuda("%fus per kernel call\n", 1e6 * dslash_time.event_time / niter);
 
-      unsigned long long flops = dirac->Flops();
+      printfQuda("%llu flops per kernel call, %llu flops per site %llu bytes per site\n", flops / niter,
+                 (flops / niter) / cudaSpinor.Volume(), (bytes / niter) / cudaSpinor.Volume());
+
       double gflops = 1.0e-9 * flops / dslash_time.event_time;
       printfQuda("GFLOPS = %f\n", gflops);
       ::testing::Test::RecordProperty("Gflops", std::to_string(gflops));
 
+      double gbytes = 1.0e-9 * bytes / dslash_time.event_time;
+      printfQuda("GBYTES = %f\n", gbytes);
+      ::testing::Test::RecordProperty("Gbytes", std::to_string(gbytes));
+
       size_t ghost_bytes = cudaSpinor.GhostBytes();
 
       ::testing::Test::RecordProperty("Halo_bidirectitonal_BW_GPU",
diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp
index 70877d36d2..a495a11251 100644
--- a/tests/staggered_eigensolve_test.cpp
+++ b/tests/staggered_eigensolve_test.cpp
@@ -132,7 +132,7 @@ int main(int argc, char **argv)
   milc_fatlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
   milc_longlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
 
-  constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv);
+  constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv, true);
 
   // Compute plaquette. Routine is aware that the gauge fields already have the phases on them.
   double plaq[3];
diff --git a/tests/staggered_gsmear_test_utils.h b/tests/staggered_gsmear_test_utils.h
index 42bd43e293..06c252292d 100644
--- a/tests/staggered_gsmear_test_utils.h
+++ b/tests/staggered_gsmear_test_utils.h
@@ -128,17 +128,17 @@ struct StaggeredGSmearTestWrapper { //
         quda::blas::ax(ftmp, tmp);
         quda::blas::axpy(a, tmp, tmp2);
 
-        staggeredTwoLinkGaussianSmear(spinorRef.Even(), qdp_twolnk, (void **)cpuTwoLink->Ghost(), tmp.Even(),
-                                      &gauge_param, &inv_param, 0, smear_coeff, smear_t0, gauge_param.cpu_prec);
-        staggeredTwoLinkGaussianSmear(spinorRef.Odd(), qdp_twolnk, (void **)cpuTwoLink->Ghost(), tmp.Odd(),
-                                      &gauge_param, &inv_param, 1, smear_coeff, smear_t0, gauge_param.cpu_prec);
+        staggeredTwoLinkGaussianSmear(spinorRef.Even(), qdp_twolnk, *cpuTwoLink, tmp.Even(), &gauge_param, &inv_param,
+                                      0, smear_coeff, smear_t0, gauge_param.cpu_prec);
+        staggeredTwoLinkGaussianSmear(spinorRef.Odd(), qdp_twolnk, *cpuTwoLink, tmp.Odd(), &gauge_param, &inv_param, 1,
+                                      smear_coeff, smear_t0, gauge_param.cpu_prec);
 
         // blas::xpay(*tmp2, -1.0, *spinorRef);
-        xpay(tmp2.Even().V(), -1.0, spinorRef.Even().V(), spinor.Even().Length(), gauge_param.cpu_prec);
-        xpay(tmp2.Odd().V(), -1.0, spinorRef.Odd().V(), spinor.Odd().Length(), gauge_param.cpu_prec);
+        xpay(tmp2.Even().data(), -1.0, spinorRef.Even().data(), spinor.Even().Length(), gauge_param.cpu_prec);
+        xpay(tmp2.Odd().data(), -1.0, spinorRef.Odd().data(), spinor.Odd().Length(), gauge_param.cpu_prec);
         //
-        memset(tmp2.Even().V(), 0, spinor.Even().Length() * gauge_param.cpu_prec);
-        memset(tmp2.Odd().V(), 0, spinor.Odd().Length() * gauge_param.cpu_prec);
+        memset(tmp2.Even().data(), 0, spinor.Even().Length() * gauge_param.cpu_prec);
+        memset(tmp2.Odd().data(), 0, spinor.Odd().Length() * gauge_param.cpu_prec);
       }
       break;
     }
@@ -327,7 +327,7 @@ struct StaggeredGSmearTestWrapper { //
         qsm_param.delete_2link = smear_delete_two_link;
         qsm_param.t0 = smear_t0;
 
-        performTwoLinkGaussianSmearNStep(spinor.V(), &qsm_param);
+        performTwoLinkGaussianSmearNStep(spinor.data(), &qsm_param);
 
         quda_gflops = qsm_param.gflops;
 
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index d6dccacca2..5fcd338b4b 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -102,60 +102,8 @@ void display_test_info()
              dimPartitioned(3));
 }
 
-int main(int argc, char **argv)
+void test(int argc, char **argv)
 {
-  setQudaDefaultMgTestParams();
-  // Parse command line options
-  auto app = make_app();
-  add_eigen_option_group(app);
-  add_deflation_option_group(app);
-  add_multigrid_option_group(app);
-  add_comms_option_group(app);
-  CLI::TransformPairs<int> test_type_map {{"full", 0}, {"full_ee_prec", 1}, {"full_oo_prec", 2}, {"even", 3},
-                                          {"odd", 4},  {"mcg_even", 5},     {"mcg_odd", 6}};
-  app->add_option("--test", test_type, "Test method")->transform(CLI::CheckedTransformer(test_type_map));
-  try {
-    app->parse(argc, argv);
-  } catch (const CLI::ParseError &e) {
-    return app->exit(e);
-  }
-  setVerbosity(verbosity);
-  if (!inv_multigrid) solve_type = QUDA_INVALID_SOLVE;
-
-  if (inv_deflate && inv_multigrid) {
-    printfQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve.\n");
-    exit(0);
-  }
-
-  // Set values for precisions via the command line.
-  setQudaPrecisions();
-
-  // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp)
-  initComms(argc, argv, gridsize_from_cmdline);
-
-  initRand();
-
-  // Only these fermions are supported in this file. Ensure a reasonable default,
-  // ensure that the default is improved staggered
-  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) {
-    printfQuda("dslash_type %s not supported, defaulting to %s\n", get_dslash_str(dslash_type),
-               get_dslash_str(QUDA_ASQTAD_DSLASH));
-    dslash_type = QUDA_ASQTAD_DSLASH;
-  }
-
-  // Need to add support for LAPLACE MG?
-  if (inv_multigrid) {
-    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) {
-      printfQuda("dslash_type %s not supported for multigrid preconditioner\n", get_dslash_str(dslash_type));
-      exit(0);
-    }
-  }
-
-  // Deduce operator, solution, and operator preconditioning types
-  if (!inv_multigrid) setQudaStaggeredInvTestParams();
-
-  display_test_info();
-
   // Set QUDA internal parameters
   QudaGaugeParam gauge_param = newQudaGaugeParam();
   QudaInvertParam inv_param = newQudaInvertParam();
@@ -167,11 +115,7 @@ int main(int argc, char **argv)
   QudaEigParam mg_eig_param[mg_levels];
 
   // params related to split grid.
-  inv_param.split_grid[0] = grid_partition[0];
-  inv_param.split_grid[1] = grid_partition[1];
-  inv_param.split_grid[2] = grid_partition[2];
-  inv_param.split_grid[3] = grid_partition[3];
-
+  for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i];
   int num_sub_partition = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
   bool use_split_grid = num_sub_partition > 1;
 
@@ -205,9 +149,6 @@ int main(int argc, char **argv)
     inv_param.eig_param = nullptr;
   }
 
-  // This must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME)
-  initQuda(device_ordinal);
-
   setDims(gauge_param.X);
   // Hack: use the domain wall dimensions so we may use the 5th dim for multi indexing
   dw_setDims(gauge_param.X, 1);
@@ -215,29 +156,36 @@ int main(int argc, char **argv)
   // Staggered Gauge construct START
   //-----------------------------------------------------------------------------------
   // Allocate host staggered gauge fields
-  void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr};
-  void *milc_fatlink = nullptr;
-  void *milc_longlink = nullptr;
-  GaugeField *cpuFat = nullptr;
-  GaugeField *cpuLong = nullptr;
-
-  for (int dir = 0; dir < 4; dir++) {
-    qdp_inlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-    qdp_fatlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-    qdp_longlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size);
-  }
-  milc_fatlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-  milc_longlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size);
-
-  // For load, etc
+  gauge_param.type = (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) ?
+    QUDA_SU3_LINKS :
+    QUDA_ASQTAD_FAT_LINKS;
   gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
+  gauge_param.location = QUDA_CPU_FIELD_LOCATION;
+
+  GaugeFieldParam cpuParam(gauge_param);
+  cpuParam.create = QUDA_NULL_FIELD_CREATE;
+  cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
+  cpuParam.order = QUDA_QDP_GAUGE_ORDER;
+  GaugeField cpuIn = GaugeField(cpuParam);
+  GaugeField cpuFatQDP = GaugeField(cpuParam);
+  cpuParam.order = QUDA_MILC_GAUGE_ORDER;
+  GaugeField cpuFatMILC = GaugeField(cpuParam);
+
+  cpuParam.link_type = QUDA_ASQTAD_LONG_LINKS;
+  cpuParam.nFace = 3;
+  cpuParam.order = QUDA_QDP_GAUGE_ORDER;
+  GaugeField cpuLongQDP = GaugeField(cpuParam);
+  cpuParam.order = QUDA_MILC_GAUGE_ORDER;
+  GaugeField cpuLongMILC = GaugeField(cpuParam);
+
+  void *qdp_inlink[4] = {cpuIn.data(0), cpuIn.data(1), cpuIn.data(2), cpuIn.data(3)};
+  void *qdp_fatlink[4] = {cpuFatQDP.data(0), cpuFatQDP.data(1), cpuFatQDP.data(2), cpuFatQDP.data(3)};
+  void *qdp_longlink[4] = {cpuLongQDP.data(0), cpuLongQDP.data(1), cpuLongQDP.data(2), cpuLongQDP.data(3)};
+  constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv, true);
 
-  constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv);
   // Reorder gauge fields to MILC order
-  reorderQDPtoMILC(milc_fatlink, qdp_fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
-  reorderQDPtoMILC(milc_longlink, qdp_longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec);
+  cpuFatMILC = cpuFatQDP;
+  cpuLongMILC = cpuLongQDP;
 
   // Compute plaquette. Routine is aware that the gauge fields already have the phases on them.
   // This needs to be called before `loadFatLongGaugeQuda` because this routine also loads the
@@ -252,23 +200,14 @@ int main(int argc, char **argv)
     printfQuda("Computed fat link plaquette is %e (spatial = %e, temporal = %e)\n", plaq[0], plaq[1], plaq[2]);
   }
 
-  // Create ghost gauge fields in case of multi GPU builds.
-  gauge_param.type = (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) ?
-    QUDA_SU3_LINKS :
-    QUDA_ASQTAD_FAT_LINKS;
-  gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
-  gauge_param.location = QUDA_CPU_FIELD_LOCATION;
-
-  GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink);
-  cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-  cpuFat = GaugeField::Create(cpuFatParam);
+  loadFatLongGaugeQuda(cpuFatMILC.data(), cpuLongMILC.data(), gauge_param);
 
-  gauge_param.type = QUDA_ASQTAD_LONG_LINKS;
-  GaugeFieldParam cpuLongParam(gauge_param, milc_longlink);
-  cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-  cpuLong = GaugeField::Create(cpuLongParam);
-
-  loadFatLongGaugeQuda(milc_fatlink, milc_longlink, gauge_param);
+  // now copy back to QDP aliases, since these are used for the reference dslash
+  cpuFatQDP = cpuFatMILC;
+  cpuLongQDP = cpuLongMILC;
+  // ensure QDP alias has exchanged ghosts
+  cpuFatQDP.exchangeGhost();
+  cpuLongQDP.exchangeGhost();
 
   // Staggered Gauge construct END
   //-----------------------------------------------------------------------------------
@@ -279,37 +218,33 @@ int main(int argc, char **argv)
     if (use_split_grid) { errorQuda("Split grid does not work with MG yet."); }
     mg_preconditioner = newMultigridQuda(&mg_param);
     inv_param.preconditioner = mg_preconditioner;
+
+    printfQuda("MG Setup Done: %g secs, %g Gflops\n", mg_param.secs, mg_param.gflops / mg_param.secs);
   }
 
   // Staggered vector construct START
   //-----------------------------------------------------------------------------------
-  std::vector<quda::ColorSpinorField *> in;
-  std::vector<quda::ColorSpinorField *> out;
-  quda::ColorSpinorField *ref;
-  quda::ColorSpinorField *tmp;
+  std::vector<quda::ColorSpinorField> in(Nsrc);
+  std::vector<quda::ColorSpinorField> out(Nsrc);
   quda::ColorSpinorParam cs_param;
   constructStaggeredTestSpinorParam(&cs_param, &inv_param, &gauge_param);
   for (int k = 0; k < Nsrc; k++) {
-    in.emplace_back(quda::ColorSpinorField::Create(cs_param));
-    out.emplace_back(quda::ColorSpinorField::Create(cs_param));
+    in[k] = quda::ColorSpinorField(cs_param);
+    out[k] = quda::ColorSpinorField(cs_param);
   }
-  ref = quda::ColorSpinorField::Create(cs_param);
-  tmp = quda::ColorSpinorField::Create(cs_param);
+  ColorSpinorField ref(cs_param);
+  ColorSpinorField tmp(cs_param);
   // Staggered vector construct END
   //-----------------------------------------------------------------------------------
 
   // Prepare rng
-  auto *rng = new quda::RNG(*ref, 1234);
+  quda::RNG rng(ref, 1234);
 
   // Performance measuring
   std::vector<double> time(Nsrc);
   std::vector<double> gflops(Nsrc);
   std::vector<int> iter(Nsrc);
 
-  // Pointers for split grid tests
-  std::vector<quda::ColorSpinorField *> _h_b(Nsrc, nullptr);
-  std::vector<quda::ColorSpinorField *> _h_x(Nsrc, nullptr);
-
   // QUDA invert test
   //----------------------------------------------------------------------------
 
@@ -320,17 +255,14 @@ int main(int argc, char **argv)
     // case 3: // even parity solution, solving EVEN system
     // case 4: // odd parity solution, solving ODD system
 
-    if (multishift != 1) {
-      printfQuda("Multishift not supported for test %d\n", test_type);
-      exit(0);
-    }
+    if (multishift != 1) errorQuda("Multishift not supported for test %d", test_type);
 
-    for (int k = 0; k < Nsrc; k++) { quda::spinorNoise(*in[k], *rng, QUDA_NOISE_UNIFORM); }
+    for (int k = 0; k < Nsrc; k++) { quda::spinorNoise(in[k], rng, QUDA_NOISE_UNIFORM); }
 
     if (!use_split_grid) {
       for (int k = 0; k < Nsrc; k++) {
         if (inv_deflate) eig_param.preserve_deflation = k < Nsrc - 1 ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE;
-        invertQuda(out[k]->V(), in[k]->V(), &inv_param);
+        invertQuda(out[k].data(), in[k].data(), &inv_param);
         time[k] = inv_param.secs;
         gflops[k] = inv_param.gflops / inv_param.secs;
         iter[k] = inv_param.iter;
@@ -341,12 +273,12 @@ int main(int argc, char **argv)
       std::vector<void *> _hp_x(Nsrc);
       std::vector<void *> _hp_b(Nsrc);
       for (int k = 0; k < Nsrc; k++) {
-        _hp_x[k] = out[k]->V();
-        _hp_b[k] = in[k]->V();
+        _hp_x[k] = out[k].data();
+        _hp_b[k] = in[k].data();
       }
       inv_param.num_src = Nsrc;
       inv_param.num_src_per_sub_partition = Nsrc / num_sub_partition;
-      invertMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, (void *)milc_fatlink, (void *)milc_longlink,
+      invertMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, cpuFatMILC.data(), cpuLongMILC.data(),
                                   &gauge_param);
       quda::comm_allreduce_int(inv_param.iter);
       inv_param.iter /= comm_size() / num_sub_partition;
@@ -359,8 +291,7 @@ int main(int argc, char **argv)
 
     for (int k = 0; k < Nsrc; k++) {
       if (verify_results)
-        verifyStaggeredInversion(*tmp, *ref, *in[k], *out[k], mass, qdp_fatlink, qdp_longlink, (void **)cpuFat->Ghost(),
-                                 (void **)cpuLong->Ghost(), gauge_param, inv_param, 0);
+        verifyStaggeredInversion(tmp, ref, in[k], out[k], mass, cpuFatQDP, cpuLongQDP, gauge_param, inv_param, 0);
     }
   } else if (test_type == 5 || test_type == 6) {
     // case 5: // multi mass CG, even parity solution, solving EVEN system
@@ -398,14 +329,14 @@ int main(int argc, char **argv)
       inv_param.tol_offset[i] = (multishift_tols.size() == 0 ? inv_param.tol : multishift_tols[i]);
       inv_param.tol_hq_offset[i] = (multishift_tols_hq.size() == 0 ? inv_param.tol_hq : multishift_tols_hq[i]);
 
-      outArray[i] = qudaOutArray[i].V();
+      outArray[i] = qudaOutArray[i].data();
 
       logQuda(QUDA_VERBOSE, "Multishift mass %d = %e ; tolerance %e ; hq tolerance %e\n", i, masses[i], inv_param.tol_offset[i], inv_param.tol_hq_offset[i]);
     }
 
     for (int k = 0; k < Nsrc; k++) {
-      quda::spinorNoise(*in[k], *rng, QUDA_NOISE_UNIFORM);
-      invertMultiShiftQuda((void **)outArray.data(), in[k]->V(), &inv_param);
+      quda::spinorNoise(in[k], rng, QUDA_NOISE_UNIFORM);
+      invertMultiShiftQuda((void **)outArray.data(), in[k].data(), &inv_param);
 
       time[k] = inv_param.secs;
       gflops[k] = inv_param.gflops / inv_param.secs;
@@ -415,8 +346,8 @@ int main(int argc, char **argv)
 
       for (int i = 0; i < multishift; i++) {
         printfQuda("%dth solution: mass=%f, ", i, masses[i]);
-        verifyStaggeredInversion(*tmp, *ref, *in[k], qudaOutArray[i], masses[i], qdp_fatlink, qdp_longlink,
-                                 (void **)cpuFat->Ghost(), (void **)cpuLong->Ghost(), gauge_param, inv_param, i);
+        verifyStaggeredInversion(tmp, ref, in[k], qudaOutArray[i], masses[i], cpuFatQDP, cpuLongQDP, gauge_param,
+                                 inv_param, i);
       }
     }
   } else {
@@ -426,40 +357,66 @@ int main(int argc, char **argv)
   // Compute timings
   if (Nsrc > 1 && !use_split_grid) performanceStats(time, gflops, iter);
 
-  // Free RNG
-  delete rng;
-
   // Free the multigrid solver
   if (inv_multigrid) destroyMultigridQuda(mg_preconditioner);
+}
 
-  // Clean up gauge fields
-  for (int dir = 0; dir < 4; dir++) {
-    host_free(qdp_inlink[dir]);
-    host_free(qdp_fatlink[dir]);
-    host_free(qdp_longlink[dir]);
+int main(int argc, char **argv)
+{
+  setQudaDefaultMgTestParams();
+  // Parse command line options
+  auto app = make_app();
+  add_eigen_option_group(app);
+  add_deflation_option_group(app);
+  add_multigrid_option_group(app);
+  add_comms_option_group(app);
+  CLI::TransformPairs<int> test_type_map {{"full", 0}, {"full_ee_prec", 1}, {"full_oo_prec", 2}, {"even", 3},
+                                          {"odd", 4},  {"mcg_even", 5},     {"mcg_odd", 6}};
+  app->add_option("--test", test_type, "Test method")->transform(CLI::CheckedTransformer(test_type_map));
+  try {
+    app->parse(argc, argv);
+  } catch (const CLI::ParseError &e) {
+    return app->exit(e);
   }
-  host_free(milc_fatlink);
-  host_free(milc_longlink);
+  setVerbosity(verbosity);
+  if (!inv_multigrid) solve_type = QUDA_INVALID_SOLVE;
 
-  if (cpuFat != nullptr) {
-    delete cpuFat;
-    cpuFat = nullptr;
-  }
-  if (cpuLong != nullptr) {
-    delete cpuLong;
-    cpuLong = nullptr;
+  if (inv_deflate && inv_multigrid) {
+    errorQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve");
   }
 
-  for (auto in_vec : in) { delete in_vec; }
-  for (auto out_vec : out) { delete out_vec; }
-  delete ref;
-  delete tmp;
+  // Set values for precisions via the command line.
+  setQudaPrecisions();
+
+  // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp)
+  initComms(argc, argv, gridsize_from_cmdline);
 
-  if (use_split_grid) {
-    for (auto p : _h_b) { delete p; }
-    for (auto p : _h_x) { delete p; }
+  initRand();
+
+  // Only these fermions are supported in this file. Ensure a reasonable default,
+  // ensure that the default is improved staggered
+  if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) {
+    printfQuda("dslash_type %s not supported, defaulting to %s\n", get_dslash_str(dslash_type),
+               get_dslash_str(QUDA_ASQTAD_DSLASH));
+    dslash_type = QUDA_ASQTAD_DSLASH;
   }
 
+  // Need to add support for LAPLACE MG?
+  if (inv_multigrid) {
+    if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) {
+      errorQuda("dslash_type %s not supported for multigrid preconditioner", get_dslash_str(dslash_type));
+    }
+  }
+
+  // Deduce operator, solution, and operator preconditioning types
+  if (!inv_multigrid) setQudaStaggeredInvTestParams();
+
+  display_test_info();
+
+  initQuda(device_ordinal);
+
+  test(argc, argv);
+
   // Finalize the QUDA library
   endQuda();
 
diff --git a/tests/unitarize_link_test.cpp b/tests/unitarize_link_test.cpp
index a0322e397b..4cd8553fdd 100644
--- a/tests/unitarize_link_test.cpp
+++ b/tests/unitarize_link_test.cpp
@@ -32,18 +32,18 @@ static double max_allowed_error = 1e-11;
 
 static QudaGaugeFieldOrder gauge_order = QUDA_MILC_GAUGE_ORDER;
 
-quda::cpuGaugeField *cpuFatLink, *cpuULink, *cudaResult;
-quda::cudaGaugeField *cudaFatLink, *cudaULink;
+quda::GaugeField *cpuFatLink, *cpuULink, *cudaResult;
+quda::GaugeField *cudaFatLink, *cudaULink;
 
 const double unittol = (prec == QUDA_DOUBLE_PRECISION) ? 1e-10 : 1e-6;
 
 TEST(unitarization, verify)
 {
   unitarizeLinksCPU(*cpuULink, *cpuFatLink);
-  cudaULink->saveCPUField(*cudaResult);
+  cudaResult->copy(*cudaULink);
 
-  int res = compare_floats(cudaResult->Gauge_p(), cpuULink->Gauge_p(), 4 * cudaResult->Volume() * gauge_site_size,
-                           unittol, cpu_prec);
+  int res = compare_floats(cudaResult->data(), cpuULink->data(), 4 * cudaResult->Volume() * gauge_site_size, unittol,
+                           cpu_prec);
 
 #ifdef MULTI_GPU
   quda::comm_allreduce_int(res);
@@ -124,21 +124,21 @@ static int unitarize_link_test(int &test_rc)
   gParam.create = QUDA_REFERENCE_FIELD_CREATE;
   gParam.gauge = fatlink;
   gParam.location = QUDA_CPU_FIELD_LOCATION;
-  cpuFatLink = new quda::cpuGaugeField(gParam);
+  cpuFatLink = new quda::GaugeField(gParam);
 
   gParam.create = QUDA_ZERO_FIELD_CREATE;
-  cpuULink = new quda::cpuGaugeField(gParam);
+  cpuULink = new quda::GaugeField(gParam);
 
   gParam.create = QUDA_ZERO_FIELD_CREATE;
-  cudaResult = new quda::cpuGaugeField(gParam);
+  cudaResult = new quda::GaugeField(gParam);
 
   gParam.pad = 0;
   gParam.create = QUDA_NULL_FIELD_CREATE;
   gParam.reconstruct = QUDA_RECONSTRUCT_NO;
   gParam.setPrecision(prec, true);
   gParam.location = QUDA_CUDA_FIELD_LOCATION;
-  cudaFatLink = new quda::cudaGaugeField(gParam);
-  cudaULink = new quda::cudaGaugeField(gParam);
+  cudaFatLink = new quda::GaugeField(gParam);
+  cudaULink = new quda::GaugeField(gParam);
 
   { // create fat links
     double act_path_coeff[6];
@@ -151,7 +151,7 @@ static int unitarize_link_test(int &test_rc)
 
     computeKSLinkQuda(fatlink, NULL, NULL, inlink, act_path_coeff, &qudaGaugeParam);
 
-    cudaFatLink->loadCPUField(*cpuFatLink);
+    cudaFatLink->copy(*cpuFatLink);
   }
 
   quda::setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error,
diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp
index c56ec2bd14..d842349f1d 100644
--- a/tests/utils/command_line_params.cpp
+++ b/tests/utils/command_line_params.cpp
@@ -121,6 +121,7 @@ QudaMatPCType matpc_type = QUDA_MATPC_EVEN_EVEN;
 QudaSolveType solve_type = QUDA_NORMOP_PC_SOLVE;
 QudaSolutionType solution_type = QUDA_MAT_SOLUTION;
 QudaTboundary fermion_t_boundary = QUDA_ANTI_PERIODIC_T;
+std::array<int, 4> dilution_block_size = {8, 8, 8, 8};
 
 int mg_levels = 2;
 
@@ -608,6 +609,11 @@ std::shared_ptr<QUDAApp> make_app(std::string app_description, std::string app_n
                  "The fermoinic temporal boundary conditions (anti-periodic (default), periodic")
     ->transform(CLI::QUDACheckedTransformer(fermion_t_boundary_map));
 
+  quda_app
+    ->add_option("--dilution-block-size", dilution_block_size,
+                 "Set the dilution block size in all four dimension (default 8 8 8 8)")
+    ->expected(4);
+
   quda_app
     ->add_option("--solve-type", solve_type,
                  "The type of solve to do (direct, direct-pc, normop, normop-pc, normerr, normerr-pc)")
diff --git a/tests/utils/command_line_params.h b/tests/utils/command_line_params.h
index 0bc4a7ea55..a5105ff5b6 100644
--- a/tests/utils/command_line_params.h
+++ b/tests/utils/command_line_params.h
@@ -256,6 +256,7 @@ extern QudaMatPCType matpc_type;
 extern QudaSolveType solve_type;
 extern QudaSolutionType solution_type;
 extern QudaTboundary fermion_t_boundary;
+extern std::array<int, 4> dilution_block_size;
 
 extern int mg_levels;
 
diff --git a/tests/utils/face_gauge.cpp b/tests/utils/face_gauge.cpp
index 4b4af5ca62..bebaae5622 100644
--- a/tests/utils/face_gauge.cpp
+++ b/tests/utils/face_gauge.cpp
@@ -906,15 +906,10 @@ void do_exchange_cpu_staple(Float *staple, Float **ghost_staple, Float **staple_
     Float *ghost_staple_back = ghost_staple[dir];
     Float *ghost_staple_fwd = ghost_staple[dir] + 2 * Vsh[dir] * gauge_site_size;
 
-    MsgHandle *mh_recv_back;
-    MsgHandle *mh_recv_fwd;
-    MsgHandle *mh_send_fwd;
-    MsgHandle *mh_send_back;
-
-    mh_recv_back = comm_declare_receive_relative(ghost_staple_back, dir, -1, 2 * len[dir]);
-    mh_recv_fwd = comm_declare_receive_relative(ghost_staple_fwd, dir, +1, 2 * len[dir]);
-    mh_send_fwd = comm_declare_send_relative(staple_fwd_sendbuf[dir], dir, +1, 2 * len[dir]);
-    mh_send_back = comm_declare_send_relative(staple_back_sendbuf[dir], dir, -1, 2 * len[dir]);
+    MsgHandle *mh_recv_back = comm_declare_receive_relative(ghost_staple_back, dir, -1, 2 * len[dir]);
+    MsgHandle *mh_recv_fwd = comm_declare_receive_relative(ghost_staple_fwd, dir, +1, 2 * len[dir]);
+    MsgHandle *mh_send_fwd = comm_declare_send_relative(staple_fwd_sendbuf[dir], dir, +1, 2 * len[dir]);
+    MsgHandle *mh_send_back = comm_declare_send_relative(staple_back_sendbuf[dir], dir, -1, 2 * len[dir]);
 
     comm_start(mh_recv_back);
     comm_start(mh_recv_fwd);
diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp
index 3d5ac65be9..31a405bc68 100644
--- a/tests/utils/host_utils.cpp
+++ b/tests/utils/host_utils.cpp
@@ -1054,6 +1054,7 @@ template <typename Float> void constructUnitGaugeField(Float **res, QudaGaugePar
   }
 
   for (int dir = 0; dir < 4; dir++) {
+#pragma omp parallel for
     for (int i = 0; i < Vh; i++) {
       for (int m = 0; m < 3; m++) {
         for (int n = 0; n < 3; n++) {
@@ -1269,14 +1270,21 @@ template <typename Float> static void checkGauge(Float **oldG, Float **newG, dou
 
   for (int d = 0; d < 4; d++) {
     for (int eo = 0; eo < 2; eo++) {
+#pragma omp parallel for
       for (int i = 0; i < Vh; i++) {
         int ga_idx = (eo * Vh + i);
         for (int j = 0; j < 18; j++) {
           double diff = fabs(newG[d][ga_idx * 18 + j] - oldG[d][ga_idx * 18 + j]); /// fabs(oldG[d][ga_idx*18+j]);
 
           for (int f = 0; f < fail_check; f++)
-            if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) fail[d][f]++;
-          if (diff > epsilon || std::isnan(diff)) iter[d][j]++;
+            if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) {
+#pragma omp atomic
+              fail[d][f]++;
+            }
+          if (diff > epsilon || std::isnan(diff)) {
+#pragma omp atomic
+            iter[d][j]++;
+          }
         }
       }
     }
@@ -1302,7 +1310,7 @@ void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precisi
     checkGauge((float **)oldG, (float **)newG, epsilon);
 }
 
-void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
+void createSiteLinkCPU(void *const *link, QudaPrecision precision, int phase)
 {
   if (precision == QUDA_DOUBLE_PRECISION) {
     constructUnitaryGaugeField((double **)link);
@@ -1311,6 +1319,7 @@ void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
   }
 
   if (phase == SITELINK_PHASE_MILC) {
+#pragma omp parallel for
     for (int i = 0; i < V; i++) {
       for (int dir = XUP; dir <= TUP; dir++) {
         int idx = i;
@@ -1443,6 +1452,12 @@ void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
   return;
 }
 
+void createSiteLinkCPU(quda::GaugeField &u, QudaPrecision precision, int phase)
+{
+  void *dir_ptrs[4] = {u.data(0), u.data(1), u.data(2), u.data(3)}; // unpack the field for the pointer-array overload
+  createSiteLinkCPU(dir_ptrs, precision, phase);
+}
+
 template <typename Float> int compareLink(Float **linkA, Float **linkB, int len)
 {
   const int fail_check = 16;
@@ -1453,14 +1468,21 @@ template <typename Float> int compareLink(Float **linkA, Float **linkB, int len)
   for (int i = 0; i < 18; i++) iter[i] = 0;
 
   for (int dir = 0; dir < 4; dir++) {
+#pragma omp parallel for
     for (int i = 0; i < len; i++) {
       for (int j = 0; j < 18; j++) {
         int is = i * 18 + j;
         double diff = fabs(linkA[dir][is] - linkB[dir][is]);
         for (int f = 0; f < fail_check; f++)
-          if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) fail[f]++;
+          if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) {
+#pragma omp atomic
+            fail[f]++;
+          }
         // if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
-        if (diff > 1e-3 || std::isnan(diff)) iter[j]++;
+        if (diff > 1e-3 || std::isnan(diff)) {
+#pragma omp atomic
+          iter[j]++;
+        }
       }
     }
   }
@@ -1493,6 +1515,21 @@ static int compare_link(void **linkA, void **linkB, int len, QudaPrecision preci
   return ret;
 }
 
+static int compare_link(const GaugeField &linkA, const GaugeField &linkB)
+{
+  // Collect the four per-direction link pointers of each field for the pointer-array comparison.
+  void *a[] = {linkA.data(0), linkA.data(1), linkA.data(2), linkA.data(3)};
+  void *b[] = {linkB.data(0), linkB.data(1), linkB.data(2), linkB.data(3)};
+
+  // The precision reported by checkPrecision selects which instantiation of compareLink is used.
+  const bool use_double = checkPrecision(linkA, linkB) == QUDA_DOUBLE_PRECISION;
+  const auto n_sites = linkA.Volume();
+
+  // Anything other than double precision is compared as float.
+  if (use_double) return compareLink((double **)a, (double **)b, n_sites);
+  return compareLink((float **)a, (float **)b, n_sites);
+}
+
 // X indexes the lattice site
 static void printLinkElement(void *link, int X, QudaPrecision precision)
 {
@@ -1524,8 +1561,30 @@ int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *
     printfQuda("\n");
   }
 
-  int ret = compare_link(linkA, linkB, len, prec);
-  return ret;
+  return compare_link(linkA, linkB, len, prec);
+}
+
+int strong_check_link(const GaugeField &linkA, const std::string &msgA, const GaugeField &linkB, const std::string &msgB)
+{
+  if (verbosity >= QUDA_VERBOSE) { // print sample links, decoded at each field's own precision
+    printfQuda("%s\n", msgA.c_str());
+    printLinkElement(linkA.data(0), 0, linkA.Precision());
+    printfQuda("\n");
+    printLinkElement(linkA.data(0), 1, linkA.Precision());
+    printfQuda("...\n");
+    printLinkElement(linkA.data(3), linkA.Volume() - 1, linkA.Precision());
+    printfQuda("\n");
+
+    printfQuda("\n%s\n", msgB.c_str());
+    printLinkElement(linkB.data(0), 0, linkB.Precision());
+    printfQuda("\n");
+    printLinkElement(linkB.data(0), 1, linkB.Precision());
+    printfQuda("...\n");
+    printLinkElement(linkB.data(3), linkB.Volume() - 1, linkB.Precision());
+    printfQuda("\n");
+  }
+  // The returned value comes from the element-wise comparison, which runs regardless of verbosity.
+  return compare_link(linkA, linkB);
 }
 
 void createMomCPU(void *mom, QudaPrecision precision, double max_val)
@@ -1581,14 +1640,21 @@ template <typename Float> int compare_mom(Float *momA, Float *momB, int len)
   int iter[mom_site_size];
   for (auto i = 0lu; i < mom_site_size; i++) iter[i] = 0;
 
+#pragma omp parallel for
   for (int i = 0; i < len; i++) {
     for (auto j = 0lu; j < mom_site_size - 1; j++) {
       int is = i * mom_site_size + j;
       double diff = fabs(momA[is] - momB[is]);
       for (int f = 0; f < fail_check; f++)
-        if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) fail[f]++;
+        if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) {
+#pragma omp atomic
+          fail[f]++;
+        }
       // if (diff > 1e-1) printf("%d %d %e\n", i, j, diff);
-      if (diff > 1e-3 || std::isnan(diff)) iter[j]++;
+      if (diff > 1e-3 || std::isnan(diff)) {
+#pragma omp atomic
+        iter[j]++;
+      }
     }
   }
 
diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h
index 85302be2df..873b23b226 100644
--- a/tests/utils/host_utils.h
+++ b/tests/utils/host_utils.h
@@ -49,17 +49,19 @@ void setQudaStaggeredInvTestParams();
 //------------------------------------------------------
 void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longlink_cpu, void **qdp_longlink_gpu,
                                             void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param,
-                                            int argc, char **argv, bool &gauge_loaded);
+                                            int argc, char **argv);
 void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink,
-                                      QudaGaugeParam &gauge_param, int argc, char **argv);
+                                      QudaGaugeParam &gauge_param, int argc, char **argv, bool compute_on_gpu);
 void constructFatLongGaugeField(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *,
                                 QudaDslashType dslash_type);
 void loadFatLongGaugeQuda(void *milc_fatlink, void *milc_longlink, QudaGaugeParam &gauge_param);
 void computeLongLinkCPU(void **longlink, void **sitelink, QudaPrecision prec, void *act_path_coeff);
 void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void **longlink_eps, void **sitelink,
-                         void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik);
+                         void *qudaGaugeParamPtr, std::array<std::array<double, 6>, 3> &act_path_coeffs, double eps_naik);
 void computeTwoLinkCPU(void **twolink, void **sitelink, QudaGaugeParam *gauge_param);
-void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], void** ghost_twolnk,  quda::ColorSpinorField &in, QudaGaugeParam *qudaGaugeParam, QudaInvertParam *inv_param, const int oddBit, const double width, const int t0, QudaPrecision prec);
+void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], const quda::GaugeField &twolnk,
+                                   quda::ColorSpinorField &in, QudaGaugeParam *qudaGaugeParam, QudaInvertParam *inv_param,
+                                   const int oddBit, const double width, const int t0, QudaPrecision prec);
 template <typename Float>
 void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type);
 void applyGaugeFieldScaling_long(void **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type,
@@ -168,7 +170,9 @@ enum {
    @param[in] precision Precision of field
    @param[in] phase Type of phase; 0 == no additional phase, 1 == MILC phases, 2 == U(1) phase
  */
-void createSiteLinkCPU(void **link, QudaPrecision precision, int phase);
+void createSiteLinkCPU(void *const *const link, QudaPrecision precision, int phase);
+void createSiteLinkCPU(quda::GaugeField &u, QudaPrecision precision, int phase);
+
 void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision);
 void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision,
                      QudaGaugeParam *param);
@@ -181,6 +185,8 @@ double compare_floats_v2(void *a, void *b, int len, double epsilon, QudaPrecisio
 void check_gauge(void **, void **, double epsilon, QudaPrecision precision);
 
 int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec);
+int strong_check_link(const quda::GaugeField &linkA, const std::string &msgA, const quda::GaugeField &linkB,
+                      const std::string &msgB);
 int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec);
 
 /**
diff --git a/tests/utils/llfat_utils.cpp b/tests/utils/llfat_utils.cpp
index 43d2acaffa..548c6976b1 100644
--- a/tests/utils/llfat_utils.cpp
+++ b/tests/utils/llfat_utils.cpp
@@ -29,10 +29,6 @@ template <typename su3_matrix, typename Real>
 void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matrix *mulink, su3_matrix **sitelink,
                                     void **fatlink, Real coef, int use_staple)
 {
-  su3_matrix tmat1, tmat2;
-  int i;
-  su3_matrix *fat1;
-
   /* Upper staple */
   /* Computes the staple :
    *                mu (B)
@@ -46,16 +42,15 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr
    * It also adds the computed staple to the fatlink[mu] with weight coef.
    */
 
-  int dx[4];
-
   /* upper staple */
 
-  for (i = 0; i < V; i++) {
+#pragma omp parallel for
+  for (int i = 0; i < V; i++) {
 
-    fat1 = ((su3_matrix *)fatlink[mu]) + i;
+    auto fat1 = ((su3_matrix *)fatlink[mu]) + i;
     su3_matrix *A = sitelink[nu] + i;
 
-    memset(dx, 0, sizeof(dx));
+    int dx[4] = {};
     dx[nu] = 1;
     int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
     su3_matrix *B;
@@ -70,6 +65,7 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr
     nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
     su3_matrix *C = sitelink[nu] + nbr_idx;
 
+    su3_matrix tmat1, tmat2;
     llfat_mult_su3_nn(A, B, &tmat1);
 
     if (staple != NULL) { /* Save the staple */
@@ -89,10 +85,11 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr
    *
    *********************************************/
 
-  for (i = 0; i < V; i++) {
+#pragma omp parallel for
+  for (int i = 0; i < V; i++) {
 
-    fat1 = ((su3_matrix *)fatlink[mu]) + i;
-    memset(dx, 0, sizeof(dx));
+    auto fat1 = ((su3_matrix *)fatlink[mu]) + i;
+    int dx[4] = {};
     dx[nu] = -1;
     int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]);
     if (nbr_idx >= V || nbr_idx < 0) {
@@ -113,6 +110,7 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr
     nbr_idx = neighborIndexFullLattice(nbr_idx, dx[3], dx[2], dx[1], dx[0]);
     su3_matrix *C = sitelink[nu] + nbr_idx;
 
+    su3_matrix tmat1, tmat2;
     llfat_mult_su3_an(A, B, &tmat1);
     llfat_mult_su3_nn(&tmat1, C, &tmat2);
 
@@ -148,6 +146,7 @@ void llfat_cpu(void **fatlink, su3_matrix **sitelink, Float *act_path_coeff)
   for (int dir = XUP; dir <= TUP; dir++) {
 
     // Intialize fat links with c_1*U_\mu(x)
+#pragma omp parallel for
     for (int i = 0; i < V; i++) {
       su3_matrix *fat1 = ((su3_matrix *)fatlink[dir]) + i;
       llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1);
@@ -210,10 +209,6 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m
                                        su3_matrix **ghost_mulink, su3_matrix **sitelink, su3_matrix **ghost_sitelink,
                                        su3_matrix **ghost_sitelink_diag, void **fatlink, Real coef, int use_staple)
 {
-  su3_matrix tmat1, tmat2;
-  int i;
-  su3_matrix *fat1;
-
   int X1 = Z[0];
   int X2 = Z[1];
   int X3 = Z[2];
@@ -237,11 +232,10 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m
    * It also adds the computed staple to the fatlink[mu] with weight coef.
    */
 
-  int dx[4];
-
   // upper staple
 
-  for (i = 0; i < V; i++) {
+#pragma omp parallel for
+  for (int i = 0; i < V; i++) {
 
     int half_index = i;
     int oddBit = 0;
@@ -264,10 +258,10 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m
     int space_con[4] = {(x4 * X3X2 + x3 * X2 + x2) / 2, (x4 * X3X1 + x3 * X1 + x1) / 2, (x4 * X2X1 + x2 * X1 + x1) / 2,
                         (x3 * X2X1 + x2 * X1 + x1) / 2};
 
-    fat1 = ((su3_matrix *)fatlink[mu]) + i;
+    auto fat1 = ((su3_matrix *)fatlink[mu]) + i;
     su3_matrix *A = sitelink[nu] + i;
 
-    memset(dx, 0, sizeof(dx));
+    int dx[4] = {};
     dx[nu] = 1;
     int nbr_idx;
 
@@ -299,6 +293,7 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m
       C = sitelink[nu] + nbr_idx;
     }
 
+    su3_matrix tmat1, tmat2;
     llfat_mult_su3_nn(A, B, &tmat1);
 
     if (staple != NULL) { /* Save the staple */
@@ -318,7 +313,8 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m
    *
    *********************************************/
 
-  for (i = 0; i < V; i++) {
+#pragma omp parallel for
+  for (int i = 0; i < V; i++) {
 
     int half_index = i;
     int oddBit = 0;
@@ -342,11 +338,11 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m
 
     // int x4 = x4_from_full_index(i);
 
-    fat1 = ((su3_matrix *)fatlink[mu]) + i;
+    auto fat1 = ((su3_matrix *)fatlink[mu]) + i;
 
     // we could be in the ghost link area if nu is T and we are at low T boundary
     su3_matrix *A;
-    memset(dx, 0, sizeof(dx));
+    int dx[4] = {};
     dx[nu] = -1;
 
     int nbr_idx;
@@ -412,6 +408,7 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m
     } else {
       C = sitelink[nu] + nbr_idx;
     }
+    su3_matrix tmat1, tmat2;
     llfat_mult_su3_an(A, B, &tmat1);
     llfat_mult_su3_nn(&tmat1, C, &tmat2);
 
@@ -430,12 +427,7 @@ template <typename su3_matrix, typename Float>
 void llfat_cpu_mg(void **fatlink, su3_matrix **sitelink, su3_matrix **ghost_sitelink, su3_matrix **ghost_sitelink_diag,
                   Float *act_path_coeff)
 {
-  QudaPrecision prec;
-  if (sizeof(Float) == 4) {
-    prec = QUDA_SINGLE_PRECISION;
-  } else {
-    prec = QUDA_DOUBLE_PRECISION;
-  }
+  QudaPrecision prec = sizeof(Float) == 4 ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION;
 
   su3_matrix *staple = (su3_matrix *)safe_malloc(V * sizeof(su3_matrix));
 
@@ -455,6 +447,7 @@ void llfat_cpu_mg(void **fatlink, su3_matrix **sitelink, su3_matrix **ghost_site
   for (int dir = XUP; dir <= TUP; dir++) {
 
     // Intialize fat links with c_1*U_\mu(x)
+#pragma omp parallel for
     for (int i = 0; i < V; i++) {
       su3_matrix *fat1 = ((su3_matrix *)fatlink[dir]) + i;
       llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1);
diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp
index c07fdb9e5d..fd920e5e16 100644
--- a/tests/utils/misc.cpp
+++ b/tests/utils/misc.cpp
@@ -352,7 +352,9 @@ const char *get_memory_type_str(QudaMemoryType type)
 
   switch (type) {
   case QUDA_MEMORY_DEVICE: s = "device"; break;
-  case QUDA_MEMORY_PINNED: s = "pinned"; break;
+  case QUDA_MEMORY_DEVICE_PINNED: s = "device_pinned"; break;
+  case QUDA_MEMORY_HOST: s = "host"; break;
+  case QUDA_MEMORY_HOST_PINNED: s = "host_pinned"; break;
   case QUDA_MEMORY_MAPPED: s = "mapped"; break;
   default: fprintf(stderr, "Error: invalid memory type\n"); exit(1);
   }
@@ -369,6 +371,7 @@ std::string get_dilution_type_str(QudaDilutionType type)
   case QUDA_DILUTION_COLOR: s = std::string("color"); break;
   case QUDA_DILUTION_SPIN_COLOR: s = std::string("spin_color"); break;
   case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: s = std::string("spin_color_even_odd"); break;
+  case QUDA_DILUTION_BLOCK: s = std::string("block"); break;
   default: fprintf(stderr, "Error: invalid dilution type\n"); exit(1);
   }
   return s;
diff --git a/tests/utils/staggered_gauge_utils.cpp b/tests/utils/staggered_gauge_utils.cpp
index 2759e3489b..b020939c5f 100644
--- a/tests/utils/staggered_gauge_utils.cpp
+++ b/tests/utils/staggered_gauge_utils.cpp
@@ -23,8 +23,9 @@ static double max_allowed_error = 1e-11;
 
 // Wrap everything for the GPU construction of fat/long links here
 void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fatlink_eps, void **qdp_longlink_eps,
-                         void **qdp_inlink, QudaGaugeParam &gauge_param_in, double **act_path_coeffs, double eps_naik,
-                         size_t gSize, int n_naiks)
+                         void **qdp_inlink, QudaGaugeParam &gauge_param_in,
+                         std::array<std::array<double, 6>, 3> &act_path_coeffs, double eps_naik, size_t gSize,
+                         int n_naiks)
 {
   // since a lot of intermediaries can be general matrices, override the recon in `gauge_param_in`
   auto gauge_param = gauge_param_in;
@@ -52,11 +53,11 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat
   }
 
   // Create V links (fat7 links) and W links (unitarized V links), 1st path table set
-  computeKSLinkQuda(milc_vlink, nullptr, milc_wlink, milc_inlink, act_path_coeffs[0], &gauge_param);
+  computeKSLinkQuda(milc_vlink, nullptr, milc_wlink, milc_inlink, act_path_coeffs[0].data(), &gauge_param);
 
   if (n_naiks > 1) {
     // Create Naiks, 3rd path table set
-    computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, act_path_coeffs[2], &gauge_param);
+    computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, act_path_coeffs[2].data(), &gauge_param);
 
     // Rescale+copy Naiks into Naik field
     cpu_axy(gauge_param.cpu_prec, eps_naik, milc_fatlink, milc_fatlink_eps, V * 4 * gauge_site_size);
@@ -67,7 +68,7 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat
   }
 
   // Create X and long links, 2nd path table set
-  computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, act_path_coeffs[1], &gauge_param);
+  computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, act_path_coeffs[1].data(), &gauge_param);
 
   if (n_naiks > 1) {
     // Add into Naik field
@@ -98,7 +99,7 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat
   }
 }
 
-void setActionPaths(double **act_paths)
+template <class T> void setActionPaths(T &act_paths)
 {
   ///////////////////////////
   // Set path coefficients //
@@ -160,8 +161,7 @@ void setActionPaths(double **act_paths)
 void computeFatLongGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param,
                        size_t gSize, int n_naiks, double eps_naik)
 {
-  double **act_paths = new double *[3];
-  for (int i = 0; i < 3; i++) act_paths[i] = new double[6];
+  std::array<std::array<double, 6>, 3> act_paths;
   setActionPaths(act_paths);
 
   ///////////////////////////////////////////////////////////////////////
@@ -196,17 +196,12 @@ void computeFatLongGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlin
       host_free(qdp_longlink_naik_temp[dir]);
     }
   }
-
-  for (int i = 0; i < 3; i++) delete[] act_paths[i];
-  delete[] act_paths;
 }
 
-void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu,
-                             void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize,
-                             int n_naiks, double eps_naik)
+void computeFatLongCPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param,
+                       size_t gSize, int n_naiks, double eps_naik)
 {
-  double **act_paths = new double *[3];
-  for (int i = 0; i < 3; i++) act_paths[i] = new double[6];
+  std::array<std::array<double, 6>, 3> act_paths;
   setActionPaths(act_paths);
 
   ///////////////////////////////////////////////////////////////////////
@@ -229,41 +224,26 @@ void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, vo
   //////////////////////////
 
   // defined in "llfat_reference.cpp"
-  computeHISQLinksCPU(qdp_fatlink_cpu, qdp_longlink_cpu, (n_naiks == 2) ? qdp_fatlink_naik_temp : nullptr,
+  computeHISQLinksCPU(qdp_fatlink, qdp_longlink, (n_naiks == 2) ? qdp_fatlink_naik_temp : nullptr,
                       (n_naiks == 2) ? qdp_longlink_naik_temp : nullptr, qdp_inlink, &gauge_param, act_paths, eps_naik);
 
   if (n_naiks == 2) {
     // Override the naik fields into the fat/long link fields
     for (int dir = 0; dir < 4; dir++) {
-      memcpy(qdp_fatlink_cpu[dir], qdp_fatlink_naik_temp[dir], V * gauge_site_size * gSize);
-      memcpy(qdp_longlink_cpu[dir], qdp_longlink_naik_temp[dir], V * gauge_site_size * gSize);
-      memset(qdp_fatlink_naik_temp[dir], 0, V * gauge_site_size * gSize);
-      memset(qdp_longlink_naik_temp[dir], 0, V * gauge_site_size * gSize);
-    }
-  }
-
-  //////////////////////////
-  // Create the GPU links //
-  //////////////////////////
-
-  // Skip eps field for now
-  // Note: GPU link creation only works for single and double precision
-  computeHISQLinksGPU(qdp_fatlink_gpu, qdp_longlink_gpu, (n_naiks == 2) ? qdp_fatlink_naik_temp : nullptr,
-                      (n_naiks == 2) ? qdp_longlink_naik_temp : nullptr, qdp_inlink, gauge_param, act_paths, eps_naik,
-                      gSize, n_naiks);
-
-  if (n_naiks == 2) {
-    // Override the naik fields into the fat/long link fields
-    for (int dir = 0; dir < 4; dir++) {
-      memcpy(qdp_fatlink_gpu[dir], qdp_fatlink_naik_temp[dir], V * gauge_site_size * gSize);
-      memcpy(qdp_longlink_gpu[dir], qdp_longlink_naik_temp[dir], V * gauge_site_size * gSize);
+      memcpy(qdp_fatlink[dir], qdp_fatlink_naik_temp[dir], V * gauge_site_size * gSize);
+      memcpy(qdp_longlink[dir], qdp_longlink_naik_temp[dir], V * gauge_site_size * gSize);
       host_free(qdp_fatlink_naik_temp[dir]);
       host_free(qdp_longlink_naik_temp[dir]);
     }
   }
+}
 
-  for (int i = 0; i < 3; i++) delete[] act_paths[i];
-  delete[] act_paths;
+void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu,
+                             void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize,
+                             int n_naiks, double eps_naik)
+{
+  computeFatLongGPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_inlink, gauge_param, gSize, n_naiks, eps_naik);
+  computeFatLongCPU(qdp_fatlink_cpu, qdp_longlink_cpu, qdp_inlink, gauge_param, gSize, n_naiks, eps_naik);
 }
 
 // Routine that takes in a QDP-ordered field and outputs the plaquette.
diff --git a/tests/utils/staggered_gauge_utils.h b/tests/utils/staggered_gauge_utils.h
index f2cc4e4749..d969d437e1 100644
--- a/tests/utils/staggered_gauge_utils.h
+++ b/tests/utils/staggered_gauge_utils.h
@@ -18,6 +18,9 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat
 void computeFatLongGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param,
                        size_t gSize, int n_naiks, double eps_naik);
 
+void computeFatLongCPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param,
+                       size_t gSize, int n_naiks, double eps_naik);
+
 void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu,
                              void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize,
                              int n_naiks, double eps_naik);
diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp
index 021bbd6877..95efb0d85a 100644
--- a/tests/utils/staggered_host_utils.cpp
+++ b/tests/utils/staggered_host_utils.cpp
@@ -26,25 +26,24 @@ template <typename T> using complex = std::complex<T>;
 
 // Staggered gauge field utils
 //------------------------------------------------------
-void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longlink_cpu, void **qdp_longlink_gpu,
-                                            void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param,
-                                            int argc, char **argv, bool &gauge_loaded)
+void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink,
+                                      QudaGaugeParam &gauge_param, int argc, char **argv, bool compute_on_gpu)
 {
+  gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
+
   // load a field WITHOUT PHASES
   if (latfile.size() > 0) {
-    if (!gauge_loaded) {
-      read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv);
-      if (dslash_type != QUDA_LAPLACE_DSLASH) {
-        applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec);
-      }
-      gauge_loaded = true;
-    } // else it's already been loaded
+    // load in the command line supplied gauge field using QIO and LIME
+    read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv);
+    if (dslash_type != QUDA_LAPLACE_DSLASH) {
+      applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec);
+    }
   } else {
     int construct_type = (unit_gauge) ? 0 : 1;
     if (dslash_type == QUDA_LAPLACE_DSLASH) {
       constructQudaGaugeField(qdp_inlink, construct_type, gauge_param.cpu_prec, &gauge_param);
     } else {
-      constructFatLongGaugeField(qdp_inlink, qdp_longlink_cpu, construct_type, gauge_param.cpu_prec, &gauge_param,
+      constructFatLongGaugeField(qdp_inlink, qdp_longlink, construct_type, gauge_param.cpu_prec, &gauge_param,
                                  compute_fatlong ? QUDA_STAGGERED_DSLASH : dslash_type);
     }
   }
@@ -53,62 +52,49 @@ void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longli
   // "compute" the fat/long links or not.
   if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) {
     for (int dir = 0; dir < 4; dir++) {
-      memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
-      memcpy(qdp_fatlink_cpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
-      memset(qdp_longlink_gpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size);
-      memset(qdp_longlink_cpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size);
+      memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
+      memset(qdp_longlink[dir], 0, V * gauge_site_size * host_gauge_data_type_size);
     }
   } else {
     // QUDA_ASQTAD_DSLASH
     if (compute_fatlong) {
-      computeFatLongGPUandCPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_fatlink_cpu, qdp_longlink_cpu, qdp_inlink,
-                              gauge_param, host_gauge_data_type_size, n_naiks, eps_naik);
+      if (compute_on_gpu)
+        computeFatLongGPU(qdp_fatlink, qdp_longlink, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks,
+                          eps_naik);
+      else
+        computeFatLongCPU(qdp_fatlink, qdp_longlink, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks,
+                          eps_naik);
     } else {
-      // Not computing FatLong
       for (int dir = 0; dir < 4; dir++) {
-        memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
-        memcpy(qdp_fatlink_cpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
-        memcpy(qdp_longlink_gpu[dir], qdp_longlink_cpu[dir], V * gauge_site_size * host_gauge_data_type_size);
+        memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
       }
     }
   }
 }
 
-void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink,
-                                      QudaGaugeParam &gauge_param, int argc, char **argv)
+void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longlink_cpu, void **qdp_longlink_gpu,
+                                            void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param,
+                                            int argc, char **argv)
 {
-  gauge_param.reconstruct = QUDA_RECONSTRUCT_NO;
-
-  if (latfile.size() > 0) {
-    // load in the command line supplied gauge field using QIO and LIME
-    read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv);
-    if (dslash_type != QUDA_LAPLACE_DSLASH) {
-      applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec);
-    }
-  } else {
-    int construct_type = (unit_gauge) ? 0 : 1;
-    if (dslash_type == QUDA_LAPLACE_DSLASH) {
-      constructQudaGaugeField(qdp_inlink, construct_type, gauge_param.cpu_prec, &gauge_param);
-    } else {
-      constructFatLongGaugeField(qdp_inlink, qdp_longlink, construct_type, gauge_param.cpu_prec, &gauge_param,
-                                 compute_fatlong ? QUDA_STAGGERED_DSLASH : dslash_type);
-    }
-  }
+  constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_fatlink_cpu, gauge_param, argc, argv, false);
 
   // QUDA_STAGGERED_DSLASH follows the same codepath whether or not you
   // "compute" the fat/long links or not.
   if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) {
     for (int dir = 0; dir < 4; dir++) {
-      memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
-      memset(qdp_longlink[dir], 0, V * gauge_site_size * host_gauge_data_type_size);
+      memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
+      memset(qdp_longlink_gpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size);
     }
   } else {
     // QUDA_ASQTAD_DSLASH
     if (compute_fatlong) {
-      computeFatLongGPU(qdp_fatlink, qdp_longlink, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks, eps_naik);
+      computeFatLongGPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks,
+                        eps_naik);
     } else {
+      // Not computing FatLong
       for (int dir = 0; dir < 4; dir++) {
-        memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
+        memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size);
+        memcpy(qdp_longlink_gpu[dir], qdp_longlink_cpu[dir], V * gauge_site_size * host_gauge_data_type_size);
       }
     }
   }
@@ -188,6 +174,7 @@ void constructFatLongGaugeField(void **fatlink, void **longlink, int type, QudaP
   // FIXME: may break host comparison
   if (dslash_type == QUDA_STAGGERED_DSLASH) {
     for (int dir = 0; dir < 4; ++dir) {
+#pragma omp parallel for
       for (int i = 0; i < V; ++i) {
         for (auto j = 0lu; j < gauge_site_size; j += 2) {
           if (precision == QUDA_DOUBLE_PRECISION) {
@@ -252,11 +239,12 @@ void loadFatLongGaugeQuda(void *milc_fatlink, void *milc_longlink, QudaGaugePara
 template <typename su3_matrix, typename Float>
 void computeLongLinkCPU(void **longlink, su3_matrix **sitelink, Float *act_path_coeff)
 {
-  su3_matrix temp;
   for (int dir = XUP; dir <= TUP; ++dir) {
-    int dx[4] = {0, 0, 0, 0};
+#pragma omp parallel for
     for (int i = 0; i < V; ++i) {
+      int dx[4] = {0, 0, 0, 0};
       // Initialize the longlinks
+      su3_matrix temp;
       su3_matrix *llink = ((su3_matrix *)longlink[dir]) + i;
       llfat_scalar_mult_su3_matrix(sitelink[dir] + i, act_path_coeff[1], llink);
       dx[dir] = 1;
@@ -267,7 +255,6 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelink, Float *act_path_
       llfat_mult_su3_nn(&temp, sitelink[dir] + nbr_idx, llink);
     }
   }
-  return;
 }
 #else
 
@@ -278,7 +265,7 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelinkEx, Float *act_pat
   for (int dir = 0; dir < 4; ++dir) E[dir] = Z[dir] + 4;
   const int extended_volume = E[3] * E[2] * E[1] * E[0];
 
-  su3_matrix temp;
+#pragma omp parallel for
   for (int t = 0; t < Z[3]; ++t) {
     for (int z = 0; z < Z[2]; ++z) {
       for (int y = 0; y < Z[1]; ++y) {
@@ -294,6 +281,7 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelinkEx, Float *act_pat
             llfat_scalar_mult_su3_matrix(sitelinkEx[dir] + large_index, act_path_coeff[1], llink);
             dx[dir] = 1;
             int nbr_index = neighborIndexFullLattice(E, large_index, dx);
+            su3_matrix temp;
             llfat_mult_su3_nn(llink, sitelinkEx[dir] + nbr_index, &temp);
             dx[dir] = 2;
             nbr_index = neighborIndexFullLattice(E, large_index, dx);
@@ -303,7 +291,6 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelinkEx, Float *act_pat
       }   // y
     }     // z
   }       // t
-  return;
 }
 #endif
 
@@ -399,6 +386,7 @@ void staggeredTwoLinkGaussianSmear(sFloat *res, gFloat **twolink, gFloat **ghost
   }
 
   {
+#pragma omp parallel for
     for (int i = 0; i < Vh; i++) {
       // Get local time-slice index:
       const int local_t = i / Vsh_t;
@@ -437,11 +425,14 @@ void staggeredTwoLinkGaussianSmear(sFloat *res, gFloat **twolink, gFloat **ghost
   return;
 }
 
-void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], void **ghost_twolnk,
+void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], const quda::GaugeField &twolnk,
                                    quda::ColorSpinorField &in, QudaGaugeParam * /*qudaGaugeParam*/,
                                    QudaInvertParam * /*inv_param*/, const int oddBit, const double /*width*/,
                                    const int t0, QudaPrecision prec)
 {
+  void *ghost[4];
+  for (int i = 0; i < 4; i++) ghost[i] = twolnk.Ghost()[i].data();
+
   QudaParity otherparity = QUDA_INVALID_PARITY;
   if (oddBit == QUDA_EVEN_PARITY) {
     otherparity = QUDA_ODD_PARITY;
@@ -459,19 +450,20 @@ void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk
 
   if (prec == QUDA_DOUBLE_PRECISION) {
     {
-      staggeredTwoLinkGaussianSmear((double *)out.V(), (double **)qdp_twolnk, (double **)ghost_twolnk, (double *)in.V(),
+      staggeredTwoLinkGaussianSmear((double *)out.data(), (double **)qdp_twolnk, (double **)ghost, (double *)in.data(),
                                     (double **)fwd_nbr_spinor, (double **)back_nbr_spinor, t0, oddBit);
     } 
   } else {
     {
-      staggeredTwoLinkGaussianSmear((float *)out.V(), (float **)qdp_twolnk, (float **)ghost_twolnk, (float *)in.V(),
+      staggeredTwoLinkGaussianSmear((float *)out.data(), (float **)qdp_twolnk, (float **)ghost, (float *)in.data(),
                                     (float **)fwd_nbr_spinor, (float **)back_nbr_spinor, t0, oddBit);
     }
   }
   return;
 }
 #else
-void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &, void **, void** ,  quda::ColorSpinorField&, QudaGaugeParam* , QudaInvertParam* , const int , const double , const int , QudaPrecision )
+void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &, void **, const quda::GaugeField &, quda::ColorSpinorField &,
+                                   QudaGaugeParam *, QudaInvertParam *, const int, const double, const int, QudaPrecision)
 {}
 #endif
 
@@ -480,7 +472,7 @@ void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &, void **, void** ,
 // If "eps_naik" is 0, there's no naik correction,
 // and this routine skips building the paths in "act_path_coeffs[2]"
 void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void **longlink_eps, void **sitelink,
-                         void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik)
+                         void *qudaGaugeParamPtr, std::array<std::array<double, 6>, 3> &act_path_coeffs, double eps_naik)
 {
   // Prepare various things
   QudaGaugeParam &qudaGaugeParam = *((QudaGaugeParam *)qudaGaugeParamPtr);
@@ -513,6 +505,7 @@ void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, vo
   int X3 = Z[2];
   int X4 = Z[3];
 
+#pragma omp parallel for
   for (int i = 0; i < V_ex; i++) {
     int sid = i;
     int oddBit = 0;
@@ -642,7 +635,7 @@ void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, vo
   unitarizeLinksCPU(*cpuWLink, *cpuVLink);
 
   // Copy back into "w_reflink"
-  reorderMILCtoQDP(w_reflink, cpuWLink->Gauge_p(), V, gauge_site_size, prec, prec);
+  reorderMILCtoQDP(w_reflink, cpuWLink->data(), V, gauge_site_size, prec, prec);
 
   // Clean up cpuGaugeFields, we don't need them anymore.
   delete cpuVLink;
@@ -652,6 +645,7 @@ void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, vo
   // Prepare for extended W fields //
   ///////////////////////////////////
 
+#pragma omp parallel for
   for (int i = 0; i < V_ex; i++) {
     int sid = i;
     int oddBit = 0;
@@ -868,6 +862,7 @@ void reorderQDPtoMILC(void *milc_out, void **qdp_in, int V, int siteSize, QudaPr
 
 template <typename Out, typename In> void reorderMILCtoQDP(Out **qdp_out, In *milc_in, int V, int siteSize)
 {
+#pragma omp parallel for
   for (int i = 0; i < V; i++) {
     for (int dir = 0; dir < 4; dir++) {
       for (int j = 0; j < siteSize; j++) {
@@ -922,6 +917,7 @@ void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, Q
   for (int d = 0; d < 3; d++) {
 
     // even
+#pragma omp parallel for
     for (int i = 0; i < Vh; i++) {
 
       int index = fullLatticeIndex(i, 0);
@@ -970,6 +966,7 @@ void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, Q
 
   // Apply boundary conditions to temporal links
   if (param->t_boundary == QUDA_ANTI_PERIODIC_T && last_node_in_t()) {
+#pragma omp parallel for
     for (int j = 0; j < Vh; j++) {
       int sign = 1;
       if (dslash_type == QUDA_ASQTAD_DSLASH) {