lattice · maddyscientist · Sep 22, 2023 · Sep 20, 2023 · Sep 20, 2023
@@ -10,6 +10,7 @@ namespace quda
 
     /**
       @brief A constexpr function that returns the maximum dynamic shared memory per block.
+        See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#feature-availability
      */
     constexpr int maximum_dynamic_shared_memory()
     {
@@ -37,5 +38,27 @@ namespace quda
 #endif
       }
     }
+
+    /**
+      @brief Compile-time lookup of the maximum number of resident threads per SM, selected by the value of __COMPUTE_CAPABILITY__.
+        See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#feature-availability
+     */
+    constexpr unsigned int maximum_resident_threads()
+    {
+#if (__COMPUTE_CAPABILITY__ < 750)
+      return 2048; // also taken if __COMPUTE_CAPABILITY__ is undefined, since an undefined identifier evaluates to 0 in #if
+#elif (__COMPUTE_CAPABILITY__ == 750)
+      return 1024;
+#elif (__COMPUTE_CAPABILITY__ == 800)
+      return 2048;
+#elif ((__COMPUTE_CAPABILITY__ > 800) && (__COMPUTE_CAPABILITY__ < 900))
+      return 1536;
+#elif (__COMPUTE_CAPABILITY__ == 900)
+      return 2048;
+#else
+      return 0; // values 751-799 and anything above 900 land here; NOTE(review): with 0 any `x <= maximum_resident_threads()` check fails for positive x - confirm 0 is meant as "unsupported"
+#endif
+    }
+
   } // namespace device
 } // namespace quda
@@ -10,6 +10,10 @@
 #include <kernels/dslash_mdw_fused.cuh>
 #include <dslash_mdw_fused.hpp>
 
+#ifdef QUDA_MMA_AVAILABLE
+#include <device.hpp>
+#endif
+
 namespace quda
 {
 
@@ -87,7 +91,11 @@ namespace quda
 
       int blockStep() const { return 16; } // always 16; presumably the tuner's increment for the block size - confirm against the base class
       int blockMin() const { return 16; } // always 16; maxGridSize() also uses this value as its divisor
-      unsigned int maxBlockSize(const TuneParam &) const { return 32; }
+      unsigned int maxBlockSize(const TuneParam &param) const // upper bound (presumably for block.x - confirm) given the current block.y, block.z and aux.y
+      {
+        unsigned int m = std::min(device::max_threads_per_block() / (param.block.y * param.block.z), 32u); // never above 32, and never above max_threads_per_block() / (block.y * block.z)
+        return std::min(m, device::maximum_resident_threads() / (param.block.y * param.block.z * param.aux.y)); // keeps result * block.y * block.z * aux.y <= maximum_resident_threads(); NOTE(review): divides by aux.y - assumes aux.y != 0, confirm
+      }
 
       int gridStep() const { return device::processor_count(); } // step equals device::processor_count(); presumably the tuner's grid-size increment - confirm against the base class
       unsigned int maxGridSize() const { return (volume_4d_cb_active + blockMin() - 1) / blockMin(); } // volume_4d_cb_active divided by blockMin(), rounded up
@@ -116,7 +124,9 @@ namespace quda
           param.aux.x++;
           aux_advanced = true;
         } else {
-          if (param.aux.y < 3) { // second see if aux.y
+          if (param.aux.y < 3
+              && (param.aux.y + 1) * param.block.x * param.block.y
+                <= device::maximum_resident_threads()) { // second see if aux.y
             param.aux.y++;
             aux_advanced = true;
             param.aux.x = 0;
@@ -182,8 +192,14 @@ namespace quda
       template <int block_dim_x, int min_blocks, bool reload, MdwfFusedDslashType type>
       void apply(const TuneParam &tp, const qudaStream_t &stream)
       {
-        launch_cuda<FusedMobiusDslash>(tp, stream, Arg<type, Ls, block_dim_x, min_blocks, reload>
-                                               (out, in, U, y, x, m_f, m_5, b_5, c_5, parity, shift, halo_shift));
+        if constexpr (block_dim_x * Ls * min_blocks <= device::maximum_resident_threads()) { // checked at compile time: launch only when this template combination fits within maximum_resident_threads()
+          launch_cuda<FusedMobiusDslash>(tp, stream,
+                                         Arg<type, Ls, block_dim_x, min_blocks, reload>(out, in, U, y, x, m_f, m_5, b_5,
+                                                                                        c_5, parity, shift, halo_shift));
+        } else { // combination exceeds the limit: report it through errorQuda instead of launching
+          errorQuda("Maximum number of resident_threads reached: %d * %d * %d > %u\n", block_dim_x, Ls, min_blocks,
+                    device::maximum_resident_threads());
+        }
       }
 
       template <int block_dim_x, bool reload, MdwfFusedDslashType type>