Feature/power #1479

Merged
merged 11 commits on Jul 9, 2024
1 change: 0 additions & 1 deletion CMakeLists.txt
@@ -280,7 +280,6 @@ mark_as_advanced(QUDA_RECONSTRUCT)
mark_as_advanced(QUDA_CLOVER_CHOLESKY_PROMOTE)
mark_as_advanced(QUDA_MULTIGRID_DSLASH_PROMOTE)
mark_as_advanced(QUDA_CTEST_SEP_DSLASH_POLICIES)
mark_as_advanced(QUDA_OPENMP)

mark_as_advanced(QUDA_BACKWARDS)

20 changes: 20 additions & 0 deletions include/device.h
@@ -1,5 +1,6 @@
#pragma once

#include <chrono>
#include <quda_api.h>

namespace quda
@@ -11,6 +12,7 @@ namespace quda
/**
@brief Create the device context. Called by initQuda when
initializing the library.
@param[in] dev Device ordinal for which to initialize
*/
void init(int dev);

@@ -21,6 +23,24 @@ namespace quda
*/
void init_thread();

/**
@brief Struct that is used to record the state of the device
(or host in the future). At present this is used for storing
the power, clock rate and temperature at a given point in time,
but can be expanded as necessary in the future.
*/
struct state_t {
std::chrono::time_point<std::chrono::high_resolution_clock> time;
float power;
unsigned int clock;
unsigned int temp;
};

/**
@brief Record the present state of the GPU (power, temperature, clock)
*/
state_t get_state();

/**
@brief Get number of devices present on node
*/
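To illustrate how the new `state_t` might be consumed, here is a minimal sketch (not part of this PR) that estimates the energy used by a region of code from two `get_state()` samples. The helper name `measure_energy_joules` is hypothetical, and a two-point trapezoid is only a coarse estimate of the true power draw:

```cpp
// Hypothetical usage sketch (not part of this PR): estimate the energy
// consumed by a region of code from two device::get_state() samples.
#include <chrono>
#include <functional>
#include <device.h>

double measure_energy_joules(const std::function<void()> &work)
{
  auto s0 = quda::device::get_state();
  work();
  auto s1 = quda::device::get_state();

  // seconds elapsed between the two samples
  std::chrono::duration<double> elapsed = s1.time - s0.time;

  // trapezoidal estimate: average power (watts) times elapsed time
  return 0.5 * (s0.power + s1.power) * elapsed.count();
}
```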
16 changes: 7 additions & 9 deletions include/kernels/spin_taste.cuh
@@ -19,16 +19,14 @@ namespace quda
F out; /** output vector field */
const F in; /** input vector field */

SpinTasteArg(ColorSpinorField &out_, const ColorSpinorField &in_) :
kernel_param(dim3(in_.VolumeCB(), in_.SiteSubset(), 1)), out(out_), in(in_)
SpinTasteArg(ColorSpinorField &out, const ColorSpinorField &in) :
kernel_param(dim3(in.VolumeCB(), in.SiteSubset(), 1)), out(out), in(in)
{
checkOrder(out_, in_); // check all orders match
checkPrecision(out_, in_); // check all precisions match
checkLocation(out_, in_); // check all locations match
if (!in_.isNative()) errorQuda("Unsupported field order colorspinor= %d \n", in_.FieldOrder());
if (!out_.isNative()) errorQuda("Unsupported field order colorspinor= %d \n", out_.FieldOrder());
#pragma unroll
for (int i = 0; i < 4; i++) { X[i] = in_.X()[i]; }
checkOrder(out, in); // check all orders match
checkPrecision(out, in); // check all precisions match
checkLocation(out, in); // check all locations match
checkNative(out, in);
for (int i = 0; i < 4; i++) { X[i] = in.X()[i]; }
}
};

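The constructor cleanup above folds the two per-field `isNative()` checks into a single `checkNative` call, matching the existing `checkOrder`/`checkPrecision`/`checkLocation` helpers. For illustration, a variadic helper of roughly this shape would do the job (a sketch only; QUDA's actual `checkNative` may be defined differently):

```cpp
// Illustrative sketch; QUDA's real checkNative may differ.
// Fold per-field isNative() checks into a single variadic call.
template <typename... Args>
inline void checkNative(const ColorSpinorField &a, const Args &...args)
{
  if (!a.isNative()) errorQuda("Unsupported field order colorspinor = %d", a.FieldOrder());
  if constexpr (sizeof...(args) > 0) checkNative(args...);
}
```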
26 changes: 26 additions & 0 deletions include/monitor.h
@@ -0,0 +1,26 @@
namespace quda
{

namespace monitor
{

/**
@brief Initialize device monitoring if supported. On CUDA this
uses NVML-based monitoring.
*/
void init();

/**
@brief Tear down any state associated with device monitoring
*/
void destroy();

/**
@brief Serialize the monitor state history to disk. If
QUDA_RESOURCE_PATH is not defined, no action is taken
*/
void serialize();

} // namespace monitor

} // namespace quda
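Taken together, the three entry points suggest the following lifecycle; presumably `initQuda`/`endQuda` drive these calls internally, so this standalone sketch is illustrative only:

```cpp
// Illustrative lifecycle sketch based on the declarations above.
#include <monitor.h>

void run()
{
  quda::monitor::init();      // begin NVML-based device monitoring (no-op if unsupported)

  // ... run solves and kernels; device state is recorded while they execute ...

  quda::monitor::serialize(); // write the recorded history if QUDA_RESOURCE_PATH is set
  quda::monitor::destroy();   // tear down monitoring state
}
```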
2 changes: 1 addition & 1 deletion include/targets/cuda/atomic_helper.h
@@ -81,7 +81,7 @@ namespace quda
template <bool is_device> struct atomic_fetch_abs_max_impl {
template <typename T> inline void operator()(T *addr, T val)
{
#pragma omp atomic update
#pragma omp critical
*addr = std::max(*addr, val);
}
};
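The change from `#pragma omp atomic update` to `#pragma omp critical` is needed because OpenMP's `atomic update` accepts only simple expression statements such as `x = x op expr` or `x op= expr`; a function call like `std::max(*addr, val)` is not a valid atomic update expression, so a critical section is the portable fallback. On a compiler with OpenMP 5.1 support, the same update could in principle be expressed lock-free with `atomic compare` (a sketch only, assuming such compiler support):

```cpp
// Sketch only: relies on OpenMP 5.1 'atomic compare' support in the host compiler.
template <typename T> inline void atomic_fetch_max_51(T *addr, T val)
{
#pragma omp atomic compare
  if (*addr < val) { *addr = val; }
}
```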
4 changes: 4 additions & 0 deletions include/targets/cuda/device.in.hpp
@@ -33,6 +33,8 @@ namespace quda
return 100 * 1024;
#elif (__COMPUTE_CAPABILITY__ == 900)
return 228 * 1024;
#elif (__COMPUTE_CAPABILITY__ == 1000)
return 228 * 1024;
#else
return 0;
#endif
@@ -55,6 +57,8 @@ namespace quda
return 1536;
#elif (__COMPUTE_CAPABILITY__ == 900)
return 2048;
#elif (__COMPUTE_CAPABILITY__ == 1000)
return 2048;
#else
return 0;
#endif
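The two new `__COMPUTE_CAPABILITY__ == 1000` branches extend the compile-time tables (shared memory per SM and maximum resident threads per SM) to compute capability 10.0, reusing the values already in place for 9.0. These constants can be cross-checked at runtime against the CUDA device attributes, for instance:

```cpp
// Sketch (not part of the PR): query the runtime attributes that these
// compile-time constants mirror.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
  int dev = 0, smem = 0, threads = 0;
  cudaDeviceGetAttribute(&smem, cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev);
  cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerMultiProcessor, dev);
  printf("shared memory per SM = %d bytes, max threads per SM = %d\n", smem, threads);
  return 0;
}
```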
1 change: 1 addition & 0 deletions include/targets/generic/block_reduction_kernel_host.h
@@ -5,6 +5,7 @@ namespace quda
{
Functor<Arg> t(arg);
dim3 block(0, 0, 0);
#pragma omp parallel for
for (block.y = 0; block.y < arg.grid_dim.y; block.y++) {
for (block.x = 0; block.x < arg.grid_dim.x; block.x++) { t(block, dim3(0, 0, 0)); }
}
3 changes: 3 additions & 0 deletions include/targets/generic/kernel_host.h
@@ -6,12 +6,14 @@ namespace quda
template <template <typename> class Functor, typename Arg> void Kernel1D_host(const Arg &arg)
{
Functor<Arg> f(const_cast<Arg &>(arg));
#pragma omp parallel for
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { f(i); }
}

template <template <typename> class Functor, typename Arg> void Kernel2D_host(const Arg &arg)
{
Functor<Arg> f(const_cast<Arg &>(arg));
#pragma omp parallel for
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) {
for (int j = 0; j < static_cast<int>(arg.threads.y); j++) { f(i, j); }
}
@@ -20,6 +22,7 @@ namespace quda
template <template <typename> class Functor, typename Arg> void Kernel3D_host(const Arg &arg)
{
Functor<Arg> f(const_cast<Arg &>(arg));
#pragma omp parallel for
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) {
for (int j = 0; j < static_cast<int>(arg.threads.y); j++) {
for (int k = 0; k < static_cast<int>(arg.threads.z); k++) { f(i, j, k); }
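In each host kernel only the outermost loop is parallelized; the functor `f` is shared by all threads, which is safe as long as its `operator()` writes only per-index data. Where that holds, the nested loops could equally be collapsed into one parallel region (a variant sketch, not part of the PR):

```cpp
// Variant sketch: collapse both loops of Kernel2D_host into a single parallel
// region, assuming the functor's operator() touches only per-(i,j) data.
template <template <typename> class Functor, typename Arg> void Kernel2D_host_collapsed(const Arg &arg)
{
  Functor<Arg> f(const_cast<Arg &>(arg));
#pragma omp parallel for collapse(2)
  for (int i = 0; i < static_cast<int>(arg.threads.x); i++) {
    for (int j = 0; j < static_cast<int>(arg.threads.y); j++) { f(i, j); }
  }
}
```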
16 changes: 12 additions & 4 deletions include/targets/generic/reduction_kernel_host.h
@@ -11,7 +11,7 @@ namespace quda
Functor<Arg> t(arg);

reduce_t value = t.init();

#pragma omp parallel for collapse(2) reduction(Functor <Arg>::apply : value)
for (int j = 0; j < static_cast<int>(arg.threads.y); j++) {
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { value = t(value, i, j); }
}
@@ -21,16 +21,24 @@

template <template <typename> class Functor, typename Arg> auto MultiReduction_host(const Arg &arg)
{
#pragma omp declare reduction(multi_reduce \
: typename Functor <Arg>::reduce_t \
: omp_out = Functor <Arg>::apply(omp_out, omp_in)) \
initializer(omp_priv = Functor <Arg>::init())

using reduce_t = typename Functor<Arg>::reduce_t;
Functor<Arg> t(arg);

std::vector<reduce_t> value(arg.threads.z);
std::vector<reduce_t> value(arg.threads.z, t.init());
for (int k = 0; k < static_cast<int>(arg.threads.z); k++) {
value[k] = t.init();
auto val = t.init();

#pragma omp parallel for collapse(2) reduction(multi_reduce : val)
for (int j = 0; j < static_cast<int>(arg.threads.y); j++) {
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { value[k] = t(value[k], i, j, k); }
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { val = t(val, i, j, k); }
}

value[k] = val;
}

return value;
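The `declare reduction` directive is the least familiar piece here: it registers a named reduction (`multi_reduce`) whose combiner is `Functor<Arg>::apply` and whose per-thread identity is `Functor<Arg>::init()`, so each OpenMP thread accumulates a private partial result and the partials are merged when the threads join. A self-contained toy version of the same pattern (using a plain `double` for clarity):

```cpp
// Self-contained illustration of the user-defined reduction pattern above.
#include <algorithm>
#include <cmath>
#include <cstdio>

struct MaxAbs {
  static double init() { return 0.0; }
  static double apply(double a, double b) { return std::max(a, b); }
};

// Register a named reduction: MaxAbs::apply combines partials, and
// MaxAbs::init() seeds each thread's private copy.
#pragma omp declare reduction(max_abs : double : omp_out = MaxAbs::apply(omp_out, omp_in)) \
  initializer(omp_priv = MaxAbs::init())

int main()
{
  double value = MaxAbs::init();
#pragma omp parallel for reduction(max_abs : value)
  for (int i = -1000; i < 1000; i++) { value = MaxAbs::apply(value, std::fabs(0.001 * i)); }
  printf("max |x| = %f\n", value); // prints 1.000000
  return 0;
}
```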
2 changes: 1 addition & 1 deletion include/targets/hip/atomic_helper.h
@@ -48,7 +48,7 @@ namespace quda
template <bool is_device> struct atomic_fetch_abs_max_impl {
template <typename T> inline void operator()(T *addr, T val)
{
#pragma omp atomic update
#pragma omp critical
*addr = std::max(*addr, val);
}
};
16 changes: 16 additions & 0 deletions include/tune_quda.h
@@ -43,6 +43,22 @@ namespace quda {
*/
const std::map<TuneKey, TuneParam> &getTuneCache();

/**
@brief Return a string encoding the QUDA version
*/
const std::string get_quda_version();

/**
@brief Return a string encoding the git hash
*/
const std::string get_quda_hash();

/**
@brief Return the resource path (directory where QUDA reads and
writes the tunecache and other internal info)
*/
const std::string get_resource_path();

class Tunable {

friend TuneParam tuneLaunch(Tunable &, QudaTune, QudaVerbosity);
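These accessors centralize strings that interface_quda.cpp previously built locally; the stringization macro removed further down in this diff suggests what `get_quda_version` presumably looks like. A plausible implementation sketch only, the actual definition lives with the tuning code:

```cpp
// Plausible implementation sketch, mirroring the macro removed from
// interface_quda.cpp below (assumes <string> and the version macros are visible).
#define STR_(x) #x
#define STR(x) STR_(x)

const std::string get_quda_version()
{
  return STR(QUDA_VERSION_MAJOR) "." STR(QUDA_VERSION_MINOR) "." STR(QUDA_VERSION_SUBMINOR);
}

#undef STR
#undef STR_
```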
2 changes: 1 addition & 1 deletion lib/CMakeLists.txt
@@ -17,7 +17,7 @@ endif()

set (QUDA_OBJS
# cmake-format: sortable
dirac_coarse.cpp dslash_coarse.cpp
monitor.cpp dirac_coarse.cpp dslash_coarse.cpp
coarse_op.cpp coarsecoarse_op.cpp
coarse_op_preconditioned.cpp staggered_coarse_op.cpp
eig_iram.cpp eig_trlm.cpp eig_block_trlm.cpp vector_io.cpp
4 changes: 1 addition & 3 deletions lib/dslash_gamma_helper.cu
@@ -32,7 +32,6 @@ namespace quda {

void preTune() { out.backup(); }
void postTune() { out.restore(); }
long long flops() const { return 0; }
long long bytes() const { return out.Bytes() + in.Bytes(); }
};

@@ -86,12 +85,11 @@ namespace quda {
void apply(const qudaStream_t &stream)
{
TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
launch<TwistGamma>(tp, stream, GammaArg<Float, nColor>(out, in, d, kappa, mu, epsilon, dagger, type));
launch<TwistGamma>(tp, stream, GammaArg<Float, nColor>(out, in, d, 0, kappa, mu, epsilon, dagger, type));
}

void preTune() { out.backup(); }
void postTune() { out.restore(); }
long long flops() const { return 0; }
long long bytes() const { return out.Bytes() + in.Bytes(); }
};

17 changes: 5 additions & 12 deletions lib/interface_quda.cpp
@@ -425,12 +425,6 @@ static void init_default_comms()
}


#define STR_(x) #x
#define STR(x) STR_(x)
static const std::string quda_version = STR(QUDA_VERSION_MAJOR) "." STR(QUDA_VERSION_MINOR) "." STR(QUDA_VERSION_SUBMINOR);
#undef STR
#undef STR_

extern char* gitversion;

/*
@@ -447,9 +441,9 @@ void initQudaDevice(int dev)
profileInit.TPSTART(QUDA_PROFILE_INIT);

#ifdef GITVERSION
logQuda(QUDA_SUMMARIZE, "QUDA %s (git %s)\n", quda_version.c_str(), gitversion);
logQuda(QUDA_SUMMARIZE, "QUDA %s (git %s)\n", get_quda_version().c_str(), gitversion);
#else
logQuda(QUDA_SUMMARIZE, "QUDA %s\n", quda_version.c_str());
logQuda(QUDA_SUMMARIZE, "QUDA %s\n", get_quda_version().c_str());
#endif

#ifdef MULTI_GPU
@@ -1377,6 +1371,9 @@ void endQuda(void)

initialized = false;

assertAllMemFree();
device::destroy();

comm_finalize();
comms_initialized = false;
}
@@ -1426,10 +1423,6 @@ void endQuda(void)
printPeakMemUsage();
printfQuda("\n");
}

assertAllMemFree();

device::destroy();
}

