diff --git a/include/kernels/block_transpose.cuh b/include/kernels/block_transpose.cuh index 153c2c9695..91aaec4d63 100644 --- a/include/kernels/block_transpose.cuh +++ b/include/kernels/block_transpose.cuh @@ -47,6 +47,15 @@ namespace quda constexpr BlockTransposeKernel(const Arg &arg) : arg(arg) { } static constexpr const char *filename() { return KERNEL_FILE; } + struct CacheDims { + static constexpr dim3 dims(dim3 block) + { + block.x += 1; + block.z = 1; + return block; + } + }; + /** @brief Transpose between the two different orders of batched colorspinor fields: - B: nVec -> spatial/N -> spin/color -> N, where N is for that in floatN @@ -60,7 +69,7 @@ namespace quda int parity = parity_color / Arg::nColor; using color_spinor_t = ColorSpinor; - SharedMemoryCache cache({target::block_dim().x + 1, target::block_dim().y, 1}); + SharedMemoryCache cache; int x_offset = target::block_dim().x * target::block_idx().x; int v_offset = target::block_dim().y * target::block_idx().y; diff --git a/include/kernels/coarse_op_kernel.cuh b/include/kernels/coarse_op_kernel.cuh index b63bf7f435..03f9d4b75e 100644 --- a/include/kernels/coarse_op_kernel.cuh +++ b/include/kernels/coarse_op_kernel.cuh @@ -10,6 +10,7 @@ #include #include #include +#include namespace quda { @@ -1387,14 +1388,20 @@ namespace quda { }; template <> struct storeCoarseSharedAtomic_impl { + template + using CacheT = complex[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4] + [Arg::coarseSpin][Arg::coarseSpin]; + template using Cache = SharedMemoryCache, DimsStatic<2, 1, 1>>; + template inline __device__ void operator()(VUV &vuv, bool isDiagonal, int coarse_x_cb, int coarse_parity, int i0, int j0, int parity, const Pack &pack, const Arg &arg) { using real = typename Arg::Float; using TileType = typename Arg::vuvTileType; const int dim_index = arg.dim_index % arg.Y_atomic.geometry; - __shared__ complex X[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; - __shared__ complex Y[Arg::max_color_height_per_block][Arg::max_color_width_per_block][4][Arg::coarseSpin][Arg::coarseSpin]; + Cache cache; + auto &X = cache.data()[0]; + auto &Y = cache.data()[1]; int x_ = coarse_x_cb % arg.aggregates_per_block; int tx = virtualThreadIdx(arg); @@ -1416,7 +1423,7 @@ namespace quda { } } - __syncthreads(); + cache.sync(); #pragma unroll for (int i = 0; i < TileType::M; i++) { @@ -1445,7 +1452,7 @@ namespace quda { } } - __syncthreads(); + cache.sync(); if (tx < Arg::coarseSpin*Arg::coarseSpin && (parity == 0 || arg.parity_flip == 1) ) { diff --git a/include/kernels/color_spinor_pack.cuh b/include/kernels/color_spinor_pack.cuh index ff1a7969c4..38100fff50 100644 --- a/include/kernels/color_spinor_pack.cuh +++ b/include/kernels/color_spinor_pack.cuh @@ -172,17 +172,24 @@ namespace quda { }; template <> struct site_max { + template struct CacheDims { + static constexpr int Ms = spins_per_thread(Arg::nSpin); + static constexpr int Mc = colors_per_thread(Arg::nColor); + static constexpr int color_spin_threads = (Arg::nSpin / Ms) * (Arg::nColor / Mc); + static constexpr dim3 dims(dim3 block) + { + // pad the shared block size to avoid bank conflicts for native ordering + if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); + block.y = color_spin_threads; // state the y block since we know it at compile time + return block; + } + }; + template __device__ inline auto operator()(typename Arg::real thread_max, Arg &) { using real = 
typename Arg::real; - constexpr int Ms = spins_per_thread(Arg::nSpin); - constexpr int Mc = colors_per_thread(Arg::nColor); - constexpr int color_spin_threads = (Arg::nSpin/Ms) * (Arg::nColor/Mc); - auto block = target::block_dim(); - // pad the shared block size to avoid bank conflicts for native ordering - if (Arg::is_native) block.x = ((block.x + device::warp_size() - 1) / device::warp_size()) * device::warp_size(); - block.y = color_spin_threads; // state the y block since we know it at compile time - SharedMemoryCache cache(block); + constexpr int color_spin_threads = CacheDims::color_spin_threads; + SharedMemoryCache> cache; cache.save(thread_max); cache.sync(); real this_site_max = static_cast(0); diff --git a/include/kernels/dslash_clover_helper.cuh b/include/kernels/dslash_clover_helper.cuh index 00b61a9b3e..4cf6f1a311 100644 --- a/include/kernels/dslash_clover_helper.cuh +++ b/include/kernels/dslash_clover_helper.cuh @@ -203,7 +203,7 @@ namespace quda { Mat A = arg.clover(x_cb, clover_parity, chirality); - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; half_fermion in_chi[n_flavor]; // flavor array of chirally projected fermion #pragma unroll diff --git a/include/kernels/dslash_coarse.cuh b/include/kernels/dslash_coarse.cuh index 6d74bcda41..744a74f874 100644 --- a/include/kernels/dslash_coarse.cuh +++ b/include/kernels/dslash_coarse.cuh @@ -301,7 +301,7 @@ namespace quda { template <> struct dim_collapse { template __device__ __host__ inline void operator()(T &out, int dir, int dim, const Arg &arg) { - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; // only need to write to shared memory if not master thread if (dim > 0 || dir) cache.save(out); diff --git a/include/kernels/dslash_domain_wall_m5.cuh b/include/kernels/dslash_domain_wall_m5.cuh index bab21d4c11..dfce18bede 100644 --- a/include/kernels/dslash_domain_wall_m5.cuh +++ b/include/kernels/dslash_domain_wall_m5.cuh @@ -220,7 +220,7 @@ namespace quda if (mobius_m5::use_half_vector()) { // if using shared-memory caching then load spinor field for my site into cache typedef ColorSpinor HalfVector; - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; { // forwards direction constexpr int proj_dir = dagger ? +1 : -1; @@ -271,7 +271,7 @@ namespace quda } else { // use_half_vector // if using shared-memory caching then load spinor field for my site into cache - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; if (shared) { if (sync) { cache.sync(); } cache.save(in); @@ -377,7 +377,7 @@ namespace quda const auto inv = arg.inv; // if using shared-memory caching then load spinor field for my site into cache - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; if (shared) { // cache.save(arg.in(s_ * arg.volume_4d_cb + x_cb, parity)); if (sync) { cache.sync(); } @@ -436,7 +436,7 @@ namespace quda Vector out; if (mobius_m5::use_half_vector()) { - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; { // first do R constexpr int proj_dir = dagger ? 
-1 : +1; @@ -495,7 +495,7 @@ namespace quda out += l.reconstruct(4, proj_dir); } } else { // use_half_vector - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; if (shared) { if (sync) { cache.sync(); } cache.save(in); diff --git a/include/kernels/dslash_mobius_eofa.cuh b/include/kernels/dslash_mobius_eofa.cuh index f5e0a5c8ac..3d62ec3923 100644 --- a/include/kernels/dslash_mobius_eofa.cuh +++ b/include/kernels/dslash_mobius_eofa.cuh @@ -107,7 +107,7 @@ namespace quda using real = typename Arg::real; typedef ColorSpinor Vector; - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; Vector out; cache.save(arg.in(s * arg.volume_4d_cb + x_cb, parity)); @@ -185,7 +185,7 @@ namespace quda typedef ColorSpinor Vector; const auto sherman_morrison = arg.sherman_morrison; - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; cache.save(arg.in(s * arg.volume_4d_cb + x_cb, parity)); cache.sync(); diff --git a/include/kernels/dslash_ndeg_twisted_clover.cuh b/include/kernels/dslash_ndeg_twisted_clover.cuh index 108f8c5e84..cb8bc61ad7 100644 --- a/include/kernels/dslash_ndeg_twisted_clover.cuh +++ b/include/kernels/dslash_ndeg_twisted_clover.cuh @@ -72,7 +72,7 @@ namespace quda // apply the chiral and flavor twists // use consistent load order across s to ensure better cache locality Vector x = arg.x(my_flavor_idx, my_spinor_parity); - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; cache.save(x); x.toRel(); // switch to chiral basis diff --git a/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh b/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh index bdbff30817..ebd8f71da6 100644 --- a/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh +++ b/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh @@ -91,7 +91,7 @@ namespace quda int chirality = flavor; // relabel flavor as chirality - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; enum swizzle_direction { FORWARDS = 0, diff --git a/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh b/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh index 98e72eb61a..8bab3d5623 100644 --- a/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh +++ b/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh @@ -95,7 +95,7 @@ namespace quda } if (!dagger || Arg::asymmetric) { // apply A^{-1} to D*in - SharedMemoryCache cache(target::block_dim()); + SharedMemoryCache cache; if (isComplete(arg, coord) && active) { // to apply the preconditioner we need to put "out" in shared memory so the other flavor can access it cache.save(out); diff --git a/include/kernels/gauge_stout.cuh b/include/kernels/gauge_stout.cuh index 45cc176d88..2a512e77e3 100644 --- a/include/kernels/gauge_stout.cuh +++ b/include/kernels/gauge_stout.cuh @@ -6,6 +6,7 @@ #include #include #include +#include namespace quda { @@ -134,8 +135,8 @@ namespace quda } Link U, Q; - SharedMemoryCache Stap(target::block_dim()); - SharedMemoryCache Rect(target::block_dim(), sizeof(Link)); + ThreadLocalCache Stap; + ThreadLocalCache Rect; // offset by Stap type to ensure non-overlapping allocations // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. 
From the following paper: diff --git a/include/kernels/gauge_utils.cuh b/include/kernels/gauge_utils.cuh index 48c7e6c1cc..ded8c9377a 100644 --- a/include/kernels/gauge_utils.cuh +++ b/include/kernels/gauge_utils.cuh @@ -19,6 +19,7 @@ namespace quda // matrix+matrix = 18 floating-point ops // => Total number of floating point ops per function call // dims * (2*18 + 4*198) = dims*828 + using computeStapleOps = thread_array; template __host__ __device__ inline void computeStaple(const Arg &arg, const int *x, const Int *X, const int parity, const int nu, Staple &staple, const int dir_ignore) { @@ -94,6 +95,7 @@ namespace quda // matrix+matrix = 18 floating-point ops // => Total number of floating point ops per function call // dims * (8*18 + 28*198) = dims*5688 + using computeStapleRectangleOps = thread_array; template __host__ __device__ inline void computeStapleRectangle(const Arg &arg, const int *x, const Int *X, const int parity, const int nu, Staple &staple, Rectangle &rectangle, const int dir_ignore) diff --git a/include/kernels/gauge_wilson_flow.cuh b/include/kernels/gauge_wilson_flow.cuh index 327f7c7eb0..457d93beab 100644 --- a/include/kernels/gauge_wilson_flow.cuh +++ b/include/kernels/gauge_wilson_flow.cuh @@ -4,6 +4,7 @@ #include #include #include +#include namespace quda { @@ -71,8 +72,8 @@ namespace quda // This function gets stap = S_{mu,nu} i.e., the staple of length 3, // and the 1x2 and 2x1 rectangles of length 5. From the following paper: // https://arxiv.org/abs/0801.1165 - SharedMemoryCache Stap(target::block_dim()); - SharedMemoryCache Rect(target::block_dim(), sizeof(Link)); // offset to ensure non-overlapping allocations + ThreadLocalCache Stap; + ThreadLocalCache Rect; // offset by Stap type to ensure non-overlapping allocations computeStapleRectangle(arg, x, arg.E, parity, dir, Stap, Rect, Arg::wflow_dim); Z = arg.coeff1x1 * static_cast(Stap) + arg.coeff2x1 * static_cast(Rect); break; diff --git a/include/kernels/hisq_paths_force.cuh b/include/kernels/hisq_paths_force.cuh index cac909bf8a..a16f7783eb 100644 --- a/include/kernels/hisq_paths_force.cuh +++ b/include/kernels/hisq_paths_force.cuh @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace quda { @@ -272,7 +272,7 @@ namespace quda { * A _______ B * mu_next | | * H| |G - * + * * Variables have been named to reflection dimensionality for * mu_positive == true, sig_positive == true, mu_next_positive == true **************************************************************************/ @@ -372,7 +372,7 @@ namespace quda { @param[in] point_b 1-d checkerboard index for the unit site shifted in the sig direction @param[in] parity_a Parity of the coordinate x @param[in/out] force_mu Accumulated force in the mu direction - @param[in] Uab_cache Shared memory cache that stores the gauge link going from a to b (read) + @param[in] Uab_cache Thread local cache that stores the gauge link going from a to b (read) @details This subset of the code computes the Lepage contribution to the fermion force. 
Data traffic: READ: cb_link, id_link, pMu_at_c @@ -386,7 +386,10 @@ namespace quda { Flops: 2 multiplies, 1 add, 1 rescale */ - __device__ __host__ inline void lepage_force(int x[4], int point_a, int parity_a, Link &force_mu, SharedMemoryCache &Uab_cache) { + template + __device__ __host__ inline void lepage_force(int x[4], int point_a, int parity_a, Link &force_mu, + LinkCache &Uab_cache) + { int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); int parity_b = 1 - parity_a; @@ -414,7 +417,7 @@ namespace quda { Link Ow = mu_positive ? (conj(Ucb) * Oc) : (Ucb * Oc); { - Link Uab = Uab_cache.load(); + Link Uab = Uab_cache; Link Oy = sig_positive ? Uab * Ow : conj(Uab) * Ow; Link Ox = mu_positive ? (Oy * Uid) : (Uid * conj(Oy)); auto mycoeff_lepage = -coeff_sign(parity_a)*coeff_sign(parity_a)*arg.coeff_lepage; @@ -440,7 +443,7 @@ namespace quda { @param[in] point_a 1-d checkerboard index for the unit site in the full extended lattice @param[in] point_b 1-d checkerboard index for the unit site shifted in the sig direction @param[in] parity_a Parity of the coordinate x - @param[in] Uab_cache Shared memory cache that stores the gauge link going from a to b (read) + @param[in] Uab_cache Thread local cache that stores the gauge link going from a to b (read) Data traffic: READ: gb_link, oProd_at_h WRITE: pMu_next_at_b, p3_at_a @@ -454,7 +457,8 @@ namespace quda { Flops: 2 multiplies, 1 add, 1 rescale */ - __device__ __host__ inline void middle_three(int x[4], int point_a, int parity_a, SharedMemoryCache &Uab_cache) + template + __device__ __host__ inline void middle_three(int x[4], int point_a, int parity_a, LinkCache &Uab_cache) { int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); int parity_b = 1 - parity_a; @@ -487,7 +491,7 @@ namespace quda { arg.pMu_next(0, point_b, parity_b) = Oz; { // scoped Uab load - Link Uab = Uab_cache.load(); + Link Uab = Uab_cache; if constexpr (!sig_positive) Uab = conj(Uab); arg.p3(0, point_a, parity_a) = Uab * Oz; } @@ -535,8 +539,7 @@ namespace quda { /* * The "extra" low point corresponds to the Lepage contribution to the * force_mu term. - * - * + * * sig * F E * | | @@ -557,7 +560,7 @@ namespace quda { int point_a = e_cb; int parity_a = parity; - SharedMemoryCache Uab_cache(target::block_dim()); + ThreadLocalCache Uab_cache; // Scoped load of Uab { int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); @@ -636,7 +639,7 @@ namespace quda { Link force; Link shortP; Link p5; - + const Link pMu; // double-buffer: read pNuMu, qNuMu for side 5, middle 7 @@ -688,7 +691,7 @@ namespace quda { @param[in] point_a 1-d checkerboard index for the unit site in the full extended lattice @param[in] point_b 1-d checkerboard index for the unit site shifted in the sig direction @param[in] parity_a Parity of the coordinate x - @param[in/out] Matrix_cache Shared memory cache that maintains the accumulated P5 contribution (write) + @param[in/out] Matrix_cache Thread local cache that maintains the accumulated P5 contribution (write) the gauge link going from a to b (read), as well as force_sig when sig is positive (read/write) @details This subset of the code computes the full seven link contribution to the HISQ force. 
Data traffic: @@ -705,8 +708,9 @@ namespace quda { Flops: 4 multiplies, 2 adds, 2 rescales */ - __device__ __host__ inline void all_link(int x[4], int point_a, int parity_a, - SharedMemoryCache &Matrix_cache) { + template + __device__ __host__ inline void all_link(int x[4], int point_a, int parity_a, LinkCache &Matrix_cache) + { auto mycoeff_seven = parity_sign(parity_a) * coeff_sign(parity_a) * arg.coeff_seven; int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); @@ -735,19 +739,19 @@ namespace quda { UbeOeOf = Ube * OeOf; // Cache Ube to below - Matrix_cache.save_z(Ube, 1); + Matrix_cache.save(Ube, 1); } // Take care of force_sig --- contribution from the negative rho direction Link Uaf = arg.link(arg.rho, point_a, parity_a); if constexpr (sig_positive) { - Link force_sig = Matrix_cache.load_z(2); + Link force_sig = Matrix_cache[2]; force_sig = mm_add(mycoeff_seven * UbeOeOf, conj(Uaf), force_sig); - Matrix_cache.save_z(force_sig, 2); + Matrix_cache.save(force_sig, 2); } // Compute the force_rho --- contribution from the negative rho direction - Link Uab = Matrix_cache.load_z(0); + Link Uab = Matrix_cache[0]; if constexpr (!sig_positive) Uab = conj(Uab); Link force_rho = arg.force(arg.rho, point_a, parity_a); force_rho = mm_add(mycoeff_seven * conj(UbeOeOf), conj(Uab), force_rho); @@ -756,7 +760,7 @@ namespace quda { Link Ufe = arg.link(arg.sig, fe_link_nbr_idx, fe_link_nbr_parity); // Load Ube from the cache - Link Ube = Matrix_cache.load_z(1); + Link Ube = Matrix_cache[1]; // Form the product UfeUebOb Link UfeUeb = (sig_positive ? Ufe : conj(Ufe)) * conj(Ube); @@ -788,7 +792,7 @@ namespace quda { Link Oz = Ucb * Ob; Link Oy = (sig_positive ? Udc : conj(Udc)) * Oz; p5_sig = mm_add(arg.accumu_coeff_seven * conj(Uda), Oy, p5_sig); - Matrix_cache.save_z(p5_sig, 1); + Matrix_cache.save(p5_sig, 1); // When sig is positive, compute the force_sig contribution from the // positive rho direction @@ -796,11 +800,10 @@ namespace quda { Link Od = arg.qNuMu(0, point_d, parity_d); Link Oc = arg.pNuMu(0, point_c, parity_c); Link Oz = conj(Ucb) * Oc; - Link force_sig = Matrix_cache.load_z(2); + Link force_sig = Matrix_cache[2]; force_sig = mm_add(mycoeff_seven * Oz, Od * Uda, force_sig); - Matrix_cache.save_z(force_sig, 2); + Matrix_cache.save(force_sig, 2); } - } /** @@ -808,7 +811,7 @@ namespace quda { @param[in] x Local coordinate @param[in] point_a 1-d checkerboard index for the unit site in the full extended lattice @param[in] parity_a Parity of the coordinate x - @param[in/out] Matrix_cache Shared memory cache that maintains the full P5 contribution + @param[in/out] Matrix_cache Thread local cache that maintains the full P5 contribution summed from the previous middle five and all seven (read), as well as force_sig when sig is positive (read/write) @details This subset of the code computes the side link five link contribution to the HISQ force. @@ -818,7 +821,9 @@ namespace quda { Flops: 2 multiplies, 2 adds, 2 rescales */ - __device__ __host__ inline void side_five(int x[4], int point_a, int parity_a, SharedMemoryCache &Matrix_cache) { + template + __device__ __host__ inline void side_five(int x[4], int point_a, int parity_a, LinkCache &Matrix_cache) + { int y[4] = {x[0], x[1], x[2], x[3]}; int point_h = updateCoordExtendedIndexShiftMILC(y, arg.nu, arg); int parity_h = 1 - parity_a; @@ -832,7 +837,7 @@ namespace quda { int qh_link_nbr_idx = mu_positive ? point_q : point_h; int qh_link_nbr_parity = mu_positive ? 
parity_q : parity_h; - Link P5 = Matrix_cache.load_z(1); + Link P5 = Matrix_cache[1]; Link Uah = arg.link(arg.nu, ha_link_nbr_idx, ha_link_nbr_parity); Link Ow = nu_positive ? Uah * P5 : conj(Uah) * P5; @@ -857,7 +862,7 @@ namespace quda { @param[in] point_a 1-d checkerboard index for the unit site in the full extended lattice @param[in] point_b 1-d checkerboard index for the unit site shifted in the sig direction @param[in] parity_a Parity of the coordinate x - @param[in/out] Matrix_cache Helper shared memory cache that maintains the gauge link going + @param[in/out] Matrix_cache Thread local cache that maintains the gauge link going from a to b (read) and, when sig is positive, force_sig (read/write) @details This subset of the code computes the middle link five link contribution to the HISQ force. Data traffic: @@ -870,8 +875,9 @@ namespace quda { Flops: 1 multiply, 1 add, 1 rescale */ - __device__ __host__ inline void middle_five(int x[4], int point_a, int parity_a, - SharedMemoryCache &Matrix_cache) { + template + __device__ __host__ inline void middle_five(int x[4], int point_a, int parity_a, LinkCache &Matrix_cache) + { int point_b = linkExtendedIndexShiftMILC(x, arg.sig, arg); int parity_b = 1 - parity_a; @@ -902,7 +908,7 @@ namespace quda { arg.pNuMu_next(0, point_b, parity_b) = Ow; { // scoped Uab load - Link Uab = Matrix_cache.load_z(0); + Link Uab = Matrix_cache[0]; if constexpr (!sig_positive) Uab = conj(Uab); arg.p5(0, point_a, parity_a) = Uab * Ow; } @@ -917,9 +923,9 @@ namespace quda { // compute the force in the sigma direction if sig is positive if constexpr (sig_positive) { - Link force_sig = Matrix_cache.load_z(2); + Link force_sig = Matrix_cache[2]; force_sig = mm_add(arg.coeff_five * Ow, Ox, force_sig); - Matrix_cache.save_z(force_sig, 2); + Matrix_cache.save(force_sig, 2); } } @@ -955,14 +961,14 @@ namespace quda { int point_a = e_cb; int parity_a = parity; - + // calculate p5_sig - auto block_dim = target::block_dim(); - block_dim.z = (sig_positive ? 3 : 2); - SharedMemoryCache Matrix_cache(block_dim); + constexpr int cacheLen = sig_positive ? 3 : 2; + ThreadLocalCache Matrix_cache; + if constexpr (sig_positive) { Link force_sig = arg.force(arg.sig, point_a, parity_a); - Matrix_cache.save_z(force_sig, 2); + Matrix_cache.save(force_sig, 2); } // Scoped load of Uab @@ -972,7 +978,7 @@ namespace quda { int ab_link_nbr_idx = (sig_positive) ? point_a : point_b; int ab_link_nbr_parity = (sig_positive) ? parity_a : parity_b; Link Uab = arg.link(arg.sig, ab_link_nbr_idx, ab_link_nbr_parity); - Matrix_cache.save_z(Uab, 0); + Matrix_cache.save(Uab, 0); } // accumulate into P5, force_sig @@ -987,7 +993,7 @@ namespace quda { // update the force in the sigma direction if constexpr (sig_positive) { - Link force_sig = Matrix_cache.load_z(2); + Link force_sig = Matrix_cache[2]; arg.force(arg.sig, point_a, parity_a) = force_sig; } diff --git a/include/targets/cuda/load_store.h b/include/targets/cuda/load_store.h index 4a5420b166..0550ad62dd 100644 --- a/include/targets/cuda/load_store.h +++ b/include/targets/cuda/load_store.h @@ -6,6 +6,12 @@ namespace quda { + /** + @brief Element type used for coalesced storage. 
+ */ + template + using atom_t = std::conditional_t>; + // pre-declaration of vector_load that we wish to specialize template struct vector_load_impl; diff --git a/include/targets/cuda/shared_memory_cache_helper.h b/include/targets/cuda/shared_memory_cache_helper.h index 7c7c0a1b28..73be0cd01b 100644 --- a/include/targets/cuda/shared_memory_cache_helper.h +++ b/include/targets/cuda/shared_memory_cache_helper.h @@ -1,295 +1 @@ -#pragma once - -#include -#include - -/** - @file shared_memory_cache_helper.h - - Helper functionality for aiding the use of the shared memory for - sharing data between threads in a thread block. - */ - -namespace quda -{ - - /** - @brief Class which wraps around a shared memory cache for type T, - where each thread in the thread block stores a unique value in - the cache which any other thread can access. - - This accessor supports both explicit run-time block size and - compile-time sizing. - - * For run-time block size, the constructor should be initialied - with the desired block size. - - * For compile-time block size, no arguments should be passed to - the constructor, and then the second and third template - parameters correspond to the y and z dimensions of the block, - respectively. The x dimension of the block will be set - according the maximum number of threads possible, given these - dimensions. - */ - template class SharedMemoryCache - { - public: - using value_type = T; - static constexpr int block_size_y = block_size_y_; - static constexpr int block_size_z = block_size_z_; - static constexpr bool dynamic = dynamic_; - - private: - /** maximum number of threads in x given the y and z block sizes */ - static constexpr int block_size_x = device::max_block_size(); - - using atom_t = std::conditional_t>; - static_assert(sizeof(T) % 4 == 0, "Shared memory cache does not support sub-word size types"); - - // The number of elements of type atom_t that we break T into for optimal shared-memory access - static constexpr int n_element = sizeof(T) / sizeof(atom_t); - - const dim3 block; - const int stride; - const unsigned int offset = 0; // dynamic offset in bytes - - /** - @brief This is a dummy instantiation for the host compiler - */ - template struct cache_dynamic { - atom_t *operator()(unsigned) - { - static atom_t *cache_; - return reinterpret_cast(cache_); - } - }; - - /** - @brief This is the handle to the shared memory, dynamic specialization - @return Shared memory pointer - */ - template struct cache_dynamic { - __device__ inline atom_t *operator()(unsigned int offset) - { - extern __shared__ int cache_[]; - return reinterpret_cast(reinterpret_cast(cache_) + offset); - } - }; - - /** - @brief This is a dummy instantiation for the host compiler - */ - template struct cache_static { - atom_t *operator()() - { - static atom_t *cache_; - return reinterpret_cast(cache_); - } - }; - - /** - @brief This is the handle to the shared memory, static specialization - @return Shared memory pointer - */ - template struct cache_static { - __device__ inline atom_t *operator()() - { - static __shared__ atom_t cache_[n_element * block_size_x * block_size_y * block_size_z]; - return reinterpret_cast(cache_); - } - }; - - template __device__ __host__ inline std::enable_if_t cache() const - { - return target::dispatch(offset); - } - - template __device__ __host__ inline std::enable_if_t cache() const - { - return target::dispatch(); - } - - __device__ __host__ inline void save_detail(const T &a, int x, int y, int z) const - { - atom_t tmp[n_element]; - memcpy(tmp, (void 
*)&a, sizeof(T)); - int j = (z * block.y + y) * block.x + x; -#pragma unroll - for (int i = 0; i < n_element; i++) cache()[i * stride + j] = tmp[i]; - } - - __device__ __host__ inline T load_detail(int x, int y, int z) const - { - atom_t tmp[n_element]; - int j = (z * block.y + y) * block.x + x; -#pragma unroll - for (int i = 0; i < n_element; i++) tmp[i] = cache()[i * stride + j]; - T a; - memcpy((void *)&a, tmp, sizeof(T)); - return a; - } - - /** - @brief Dummy instantiation for the host compiler - */ - template struct sync_impl { - void operator()() { } - }; - - /** - @brief Synchronize the cache when on the device - */ - template struct sync_impl { - __device__ inline void operator()() { __syncthreads(); } - }; - - public: - /** - @brief constructor for SharedMemory cache. If no arguments are - pass, then the dimensions are set according to the templates - block_size_y and block_size_z, together with the derived - block_size_x. Otherwise use the block sizes passed into the - constructor. - - @param[in] block Block dimensions for the 3-d shared memory object - @param[in] thread_offset "Perceived" offset from dynamic shared - memory base pointer (used when we have multiple caches in - scope). Need to include block size to actual offset. - */ - constexpr SharedMemoryCache(dim3 block = dim3(block_size_x, block_size_y, block_size_z), - unsigned int thread_offset = 0) : - block(block), stride(block.x * block.y * block.z), offset(stride * thread_offset) - { - } - - /** - @brief Grab the raw base address to shared memory. - */ - __device__ __host__ inline auto data() const { return reinterpret_cast(cache()); } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] x The x index to use - @param[in] y The y index to use - @param[in] z The z index to use - */ - __device__ __host__ inline void save(const T &a, int x = -1, int y = -1, int z = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - y = (y == -1) ? tid.y : y; - z = (z == -1) ? tid.z : z; - save_detail(a, x, y, z); - } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] x The x index to use - */ - __device__ __host__ inline void save_x(const T &a, int x = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - save_detail(a, x, tid.y, tid.z); - } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] y The y index to use - */ - __device__ __host__ inline void save_y(const T &a, int y = -1) const - { - auto tid = target::thread_idx(); - y = (y == -1) ? tid.y : y; - save_detail(a, tid.x, y, tid.z); - } - - /** - @brief Save the value into the 3-d shared memory cache. - @param[in] a The value to store in the shared memory cache - @param[in] z The z index to use - */ - __device__ __host__ inline void save_z(const T &a, int z = -1) const - { - auto tid = target::thread_idx(); - z = (z == -1) ? tid.z : z; - save_detail(a, tid.x, tid.y, z); - } - - /** - @brief Load a value from the shared memory cache - @param[in] x The x index to use - @param[in] y The y index to use - @param[in] z The z index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load(int x = -1, int y = -1, int z = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - y = (y == -1) ? tid.y : y; - z = (z == -1) ? 
tid.z : z; - return load_detail(x, y, z); - } - - /** - @brief Load a vector from the shared memory cache - @param[in] x The x index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load_x(int x = -1) const - { - auto tid = target::thread_idx(); - x = (x == -1) ? tid.x : x; - return load_detail(x, tid.y, tid.z); - } - - /** - @brief Load a vector from the shared memory cache - @param[in] y The y index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load_y(int y = -1) const - { - auto tid = target::thread_idx(); - y = (y == -1) ? tid.y : y; - return load_detail(tid.x, y, tid.z); - } - - /** - @brief Load a vector from the shared memory cache - @param[in] z The z index to use - @return The value at coordinates (x,y,z) - */ - __device__ __host__ inline T load_z(int z = -1) const - { - auto tid = target::thread_idx(); - z = (z == -1) ? tid.z : z; - return load_detail(tid.x, tid.y, z); - } - - /** - @brief Synchronize the cache - */ - __device__ __host__ void sync() const { target::dispatch(); } - - /** - @brief Cast operator to allow cache objects to be used where T - is expected - */ - __device__ __host__ operator T() const { return load(); } - - /** - @brief Assignment operator to allow cache objects to be used on - the lhs where T is otherwise expected. - */ - __device__ __host__ void operator=(const T &src) const { save(src); } - }; - -} // namespace quda - -// include overloads #include "../generic/shared_memory_cache_helper.h" diff --git a/include/targets/cuda/shared_memory_helper.h b/include/targets/cuda/shared_memory_helper.h new file mode 100644 index 0000000000..bc9bd7c66b --- /dev/null +++ b/include/targets/cuda/shared_memory_helper.h @@ -0,0 +1,89 @@ +#pragma once + +#include + +/** + @file shared_memory_helper.h + + Target specific helper for allocating and accessing shared memory. + */ + +namespace quda +{ + + /** + @brief Class which is used to allocate and access shared memory. + The shared memory is treated as an array of type T, with the + number of elements given by a call to the static member + S::size(target::block_dim()). The byte offset from the beginning + of the total shared memory block is given by the static member + O::shared_mem_size(target::block_dim()), or 0 if O is void. + */ + template class SharedMemory + { + public: + using value_type = T; + + private: + T *data; + + /** + @brief This is a dummy instantiation for the host compiler + */ + template struct cache_dynamic { + T *operator()(unsigned int) + { + static T *cache_; + return cache_; + } + }; + + /** + @brief This is the handle to the dynamic shared memory + @return Shared memory pointer + */ + template struct cache_dynamic { + __device__ inline T *operator()(unsigned int offset) + { + extern __shared__ int cache_[]; + return reinterpret_cast(reinterpret_cast(cache_) + offset); + } + }; + + __device__ __host__ inline T *cache(unsigned int offset) const { return target::dispatch(offset); } + + public: + /** + @brief Byte offset for this shared memory object. + */ + static constexpr unsigned int get_offset(dim3 block) + { + unsigned int o = 0; + if constexpr (!std::is_same_v) { o = O::shared_mem_size(block); } + return o; + } + + /** + @brief Shared memory size in bytes. + */ + static constexpr unsigned int shared_mem_size(dim3 block) { return get_offset(block) + S::size(block) * sizeof(T); } + + /** + @brief Constructor for SharedMemory object. 
+ */ + constexpr SharedMemory() : data(cache(get_offset(target::block_dim()))) { } + + /** + @brief Return this SharedMemory object. + */ + constexpr auto sharedMem() const { return *this; } + + /** + @brief Subscripting operator returning a reference to element. + @param[in] i The index to use. + @return Reference to value stored at that index. + */ + __device__ __host__ T &operator[](const int i) const { return data[i]; } + }; + +} // namespace quda diff --git a/include/targets/cuda/thread_array.h b/include/targets/cuda/thread_array.h index 4fe1bb33f6..1c4d7f3244 100644 --- a/include/targets/cuda/thread_array.h +++ b/include/targets/cuda/thread_array.h @@ -1,49 +1,18 @@ #pragma once -#include "shared_memory_cache_helper.h" - -namespace quda -{ - #ifndef _NVHPC_CUDA - /** - @brief Class that provides indexable per-thread storage. On CUDA - this maps to using assigning each thread a unique window of - shared memory using the SharedMemoryCache object. - */ - template struct thread_array { - SharedMemoryCache, 1, 1, false> device_array; - int offset; - array host_array; - array &array_; - - __device__ __host__ constexpr thread_array() : - offset((target::thread_idx().z * target::block_dim().y + target::thread_idx().y) * target::block_dim().x - + target::thread_idx().x), - array_(target::is_device() ? *(device_array.data() + offset) : host_array) - { - array_ = array(); // call default constructor - } - - template - __device__ __host__ constexpr thread_array(T first, const Ts... other) : - offset((target::thread_idx().z * target::block_dim().y + target::thread_idx().y) * target::block_dim().x - + target::thread_idx().x), - array_(target::is_device() ? *(device_array.data() + offset) : host_array) - { - array_ = array {first, other...}; - } - - __device__ __host__ T &operator[](int i) { return array_[i]; } - __device__ __host__ const T &operator[](int i) const { return array_[i]; } - }; +#include "../generic/thread_array.h" #else +#include + +namespace quda +{ template struct thread_array : array { + static constexpr unsigned int shared_mem_size(dim3 block) { return 0; } }; +} // namespace quda #endif - -} // namespace quda diff --git a/include/targets/cuda/thread_local_cache.h b/include/targets/cuda/thread_local_cache.h new file mode 100644 index 0000000000..dd4cd863fc --- /dev/null +++ b/include/targets/cuda/thread_local_cache.h @@ -0,0 +1 @@ +#include "../generic/thread_local_cache.h" diff --git a/include/targets/cuda/tunable_kernel.h b/include/targets/cuda/tunable_kernel.h index d7936eb497..7306ab355c 100644 --- a/include/targets/cuda/tunable_kernel.h +++ b/include/targets/cuda/tunable_kernel.h @@ -45,6 +45,7 @@ namespace quda std::enable_if_t(), qudaError_t> launch_device(const kernel_t &kernel, const TuneParam &tp, const qudaStream_t &stream, const Arg &arg) { + checkSharedBytes(tp); #ifdef JITIFY launch_error = launch_jitify(kernel.name, tp, stream, arg); #else @@ -62,6 +63,7 @@ namespace quda std::enable_if_t(), qudaError_t> launch_device(const kernel_t &kernel, const TuneParam &tp, const qudaStream_t &stream, const Arg &arg) { + checkSharedBytes(tp); #ifdef JITIFY // note we do the copy to constant memory after the kernel has been compiled in launch_jitify launch_error = launch_jitify(kernel.name, tp, stream, arg); @@ -83,6 +85,7 @@ namespace quda template
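Illustrative usage notes (not part of the patch). The CacheDims structs added to block_transpose.cuh and color_spinor_pack.cuh move the cache geometry from a run-time constructor argument into a compile-time policy, so the SharedMemoryCache type can derive its footprint from the launch block by itself. A minimal host-side sketch of that pattern, assuming a CUDA-style dim3 stand-in and a hypothetical PadXCollapseZ policy (neither name is in the patch):

  #include <cstdio>

  struct dim3_t { unsigned x, y, z; }; // host stand-in for CUDA's dim3

  // Compile-time policy mirroring BlockTransposeKernel::CacheDims: pad x by one
  // element to avoid shared-memory bank conflicts and collapse z, which the
  // transpose cache does not use.
  struct PadXCollapseZ {
    static constexpr dim3_t dims(dim3_t block)
    {
      block.x += 1;
      block.z = 1;
      return block;
    }
  };

  // A cache consumer sizes its allocation from the policy rather than from a
  // constructor argument, so the footprint is known when the kernel is tuned.
  template <typename T, typename Dims> constexpr unsigned cache_bytes(dim3_t launch)
  {
    dim3_t b = Dims::dims(launch);
    return b.x * b.y * b.z * sizeof(T);
  }

  int main() { printf("%u bytes\n", cache_bytes<double, PadXCollapseZ>({32, 8, 2})); } // (32+1)*8*1*8 = 2112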
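The storeCoarseSharedAtomic_impl change in coarse_op_kernel.cuh replaces two fixed-size __shared__ arrays with a single SharedMemoryCache whose element type is the whole per-block array and whose static dims request two such elements, so the allocation is counted by the shared-memory size machinery instead of being invisible to it. A sketch of the resulting access pattern; the template arguments shown are assumptions rather than verbatim from the patch:

  // Inside storeCoarseSharedAtomic_impl::operator() (sketch):
  Cache<Arg> cache;            // SharedMemoryCache over CacheT<Arg> with DimsStatic<2, 1, 1>
  auto &X = cache.data()[0];   // replaces the first __shared__ complex array
  auto &Y = cache.data()[1];   // replaces the second
  // ... threads accumulate their vuv contributions into X and Y ...
  cache.sync();                // replaces the bare __syncthreads()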
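gauge_stout.cuh, gauge_wilson_flow.cuh and hisq_paths_force.cuh now obtain per-thread Link storage from ThreadLocalCache rather than carving per-thread windows out of a SharedMemoryCache by hand. The sketch below uses only operations that appear in the diff (save with a slot index, operator[], conversion to the element type); the template parameters are assumptions, in particular passing decltype of the previous cache as the offset argument is inferred from the "offset by Stap type" comment:

  // Per-thread staple/rectangle scratch in a gauge-smearing kernel (sketch):
  ThreadLocalCache<Link> Stap;                     // one Link per thread
  ThreadLocalCache<Link, 0, decltype(Stap)> Rect;  // offset by Stap so the two allocations do not overlap
  computeStapleRectangle(arg, x, arg.E, parity, dir, Stap, Rect, Arg::wflow_dim);
  Link Z = arg.coeff1x1 * static_cast<Link>(Stap) + arg.coeff2x1 * static_cast<Link>(Rect);

  // Indexed per-thread slots in the HISQ force kernel (sketch):
  ThreadLocalCache<Link, 3> Matrix_cache;          // slots: 0 = Uab, 1 = P5, 2 = force_sig
  Matrix_cache.save(Uab, 0);                       // store into a slot
  Link Uab_back = Matrix_cache[0];                 // read a slot back

The computeStapleOps and computeStapleRectangleOps aliases added in gauge_utils.cuh expose the thread_array scratch those routines use, presumably so callers can fold it into this same offset chain when sizing their own caches.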
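load_store.h gains an atom_t alias: the element type into which a larger type is decomposed for coalesced storage. The standalone reconstruction below is an assumption modeled on how the removed SharedMemoryCache chose its atom type, namely the widest int vector that evenly divides sizeof(T); int2_t and int4_t stand in for CUDA's int2 and int4:

  #include <type_traits>

  struct int2_t { int x, y; };
  struct int4_t { int x, y, z, w; };

  // Assumed rule: pick the widest int vector that divides sizeof(T), so that
  // shared-memory loads and stores stay coalesced.
  template <typename T>
  using atom_t = std::conditional_t<sizeof(T) % 16 == 0, int4_t,
                 std::conditional_t<sizeof(T) % 8 == 0, int2_t, int>>;

  static_assert(std::is_same_v<atom_t<float>, int>, "4-byte types map to int");
  static_assert(std::is_same_v<atom_t<double>, int2_t>, "8-byte types map to int2");
  static_assert(std::is_same_v<atom_t<double[2]>, int4_t>, "16-byte types map to int4");

  int main() { }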
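The new shared_memory_helper.h places each allocation using two static policies: S::size(block) gives the element count, and O::shared_mem_size(block) gives the byte offset of the allocation that precedes it (0 when O is void); this is presumably the requirement that the added checkSharedBytes(tp) call in tunable_kernel.h validates against the tuned launch. The host-side model below reproduces that arithmetic so the chaining can be checked outside a kernel; SizeA, SizeB and SharedMemModel are illustrative names, not QUDA types:

  #include <cstdio>
  #include <type_traits>

  struct dim3_t { unsigned x, y, z; };

  struct SizeA { static constexpr unsigned size(dim3_t b) { return b.x * b.y * b.z; } };
  struct SizeB { static constexpr unsigned size(dim3_t b) { return b.x; } };

  // Mirrors SharedMemory<T, S, O>::get_offset / shared_mem_size from the new header.
  template <typename T, typename S, typename O = void> struct SharedMemModel {
    static constexpr unsigned get_offset(dim3_t block)
    {
      if constexpr (std::is_same_v<O, void>) return 0;
      else return O::shared_mem_size(block);
    }
    static constexpr unsigned shared_mem_size(dim3_t block)
    {
      return get_offset(block) + S::size(block) * sizeof(T);
    }
  };

  int main()
  {
    using First = SharedMemModel<double, SizeA>;        // starts at byte 0
    using Second = SharedMemModel<float, SizeB, First>; // placed after First's allocation
    dim3_t block {64, 2, 1};
    printf("Second starts at byte %u, total %u bytes\n",
           Second::get_offset(block), Second::shared_mem_size(block)); // 1024 and 1280
  }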