Feature/power #1479

Merged
merged 11 commits on Jul 9, 2024
1 change: 0 additions & 1 deletion CMakeLists.txt
@@ -280,7 +280,6 @@ mark_as_advanced(QUDA_RECONSTRUCT)
mark_as_advanced(QUDA_CLOVER_CHOLESKY_PROMOTE)
mark_as_advanced(QUDA_MULTIGRID_DSLASH_PROMOTE)
mark_as_advanced(QUDA_CTEST_SEP_DSLASH_POLICIES)
mark_as_advanced(QUDA_OPENMP)

mark_as_advanced(QUDA_BACKWARDS)

20 changes: 20 additions & 0 deletions include/device.h
@@ -1,5 +1,6 @@
#pragma once

#include <chrono>
#include <quda_api.h>

namespace quda
@@ -11,6 +12,7 @@ namespace quda
/**
@brief Create the device context. Called by initQuda when
initializing the library.
@param[in] dev Device ordinal for which to initialize
*/
void init(int dev);

@@ -21,6 +23,24 @@ namespace quda
*/
void init_thread();

/**
@brief Struct that is used to record the state of the device
(or host in the future). At present this is used for storing
the power, clock rate and temperature at a given point in time,
but can be expanded as necessary in the future.
*/
struct state_t {
std::chrono::time_point<std::chrono::high_resolution_clock> time;
float power;
unsigned int clock;
unsigned int temp;
};

/**
@brief Record the present state of the GPU (power, temperature, clock)
*/
state_t get_state();

/**
@brief Get number of devices present on node
*/
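To illustrate how the new `state_t` might be consumed, here is a minimal sketch (not part of this PR) that estimates the energy used by a region of code from two `get_state()` samples. The helper name `measure_energy_joules` is hypothetical, and a two-point trapezoid is only a coarse estimate of the true power draw:

```cpp
// Hypothetical usage sketch (not part of this PR): estimate the energy
// consumed by a region of code from two device::get_state() samples.
#include <chrono>
#include <functional>
#include <device.h>

double measure_energy_joules(const std::function<void()> &work)
{
  auto s0 = quda::device::get_state();
  work();
  auto s1 = quda::device::get_state();

  // seconds elapsed between the two samples
  std::chrono::duration<double> elapsed = s1.time - s0.time;

  // trapezoidal estimate: average power (watts) times elapsed time
  return 0.5 * (s0.power + s1.power) * elapsed.count();
}
```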
16 changes: 7 additions & 9 deletions include/kernels/spin_taste.cuh
@@ -19,16 +19,14 @@ namespace quda
F out; /** output vector field */
const F in; /** input vector field */

SpinTasteArg(ColorSpinorField &out_, const ColorSpinorField &in_) :
kernel_param(dim3(in_.VolumeCB(), in_.SiteSubset(), 1)), out(out_), in(in_)
SpinTasteArg(ColorSpinorField &out, const ColorSpinorField &in) :
kernel_param(dim3(in.VolumeCB(), in.SiteSubset(), 1)), out(out), in(in)
{
checkOrder(out_, in_); // check all orders match
checkPrecision(out_, in_); // check all precisions match
checkLocation(out_, in_); // check all locations match
if (!in_.isNative()) errorQuda("Unsupported field order colorspinor= %d \n", in_.FieldOrder());
if (!out_.isNative()) errorQuda("Unsupported field order colorspinor= %d \n", out_.FieldOrder());
#pragma unroll
for (int i = 0; i < 4; i++) { X[i] = in_.X()[i]; }
checkOrder(out, in); // check all orders match
checkPrecision(out, in); // check all precisions match
checkLocation(out, in); // check all locations match
checkNative(out, in);
for (int i = 0; i < 4; i++) { X[i] = in.X()[i]; }
}
};

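The constructor cleanup above folds the two per-field `isNative()` checks into a single `checkNative` call, matching the existing `checkOrder`/`checkPrecision`/`checkLocation` helpers. For illustration, a variadic helper of roughly this shape would do the job (a sketch only; QUDA's actual `checkNative` may be defined differently):

```cpp
// Illustrative sketch; QUDA's real checkNative may differ.
// Fold per-field isNative() checks into a single variadic call.
template <typename... Args>
inline void checkNative(const ColorSpinorField &a, const Args &...args)
{
  if (!a.isNative()) errorQuda("Unsupported field order colorspinor = %d", a.FieldOrder());
  if constexpr (sizeof...(args) > 0) checkNative(args...);
}
```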
26 changes: 26 additions & 0 deletions include/monitor.h
@@ -0,0 +1,26 @@
namespace quda
{

namespace monitor
{

/**
@brief Initialize device monitoring if supported. On CUDA this
uses NVML-based monitoring.
*/
void init();

/**
@brief Tear down any state associated with device monitoring
*/
void destroy();

/**
@brief Serialize the monitor state history to disk. If
QUDA_RESOURCE_PATH is not defined, no action is taken
*/
void serialize();

} // namespace monitor

} // namespace quda
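Taken together, the three entry points suggest the following lifecycle; presumably `initQuda`/`endQuda` drive these calls internally, so this standalone sketch is illustrative only:

```cpp
// Illustrative lifecycle sketch based on the declarations above.
#include <monitor.h>

void run()
{
  quda::monitor::init();      // begin NVML-based device monitoring (no-op if unsupported)

  // ... run solves and kernels; device state is recorded while they execute ...

  quda::monitor::serialize(); // write the recorded history if QUDA_RESOURCE_PATH is set
  quda::monitor::destroy();   // tear down monitoring state
}
```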
2 changes: 1 addition & 1 deletion include/targets/cuda/atomic_helper.h
@@ -81,7 +81,7 @@ namespace quda
template <bool is_device> struct atomic_fetch_abs_max_impl {
template <typename T> inline void operator()(T *addr, T val)
{
#pragma omp atomic update
#pragma omp critical
*addr = std::max(*addr, val);
}
};
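The change from `#pragma omp atomic update` to `#pragma omp critical` is needed because OpenMP's `atomic update` accepts only simple expression statements such as `x = x op expr` or `x op= expr`; a function call like `std::max(*addr, val)` is not a valid atomic update expression, so a critical section is the portable fallback. On a compiler with OpenMP 5.1 support, the same update could in principle be expressed lock-free with `atomic compare` (a sketch only, assuming such compiler support):

```cpp
// Sketch only: relies on OpenMP 5.1 'atomic compare' support in the host compiler.
template <typename T> inline void atomic_fetch_max_51(T *addr, T val)
{
#pragma omp atomic compare
  if (*addr < val) { *addr = val; }
}
```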
4 changes: 4 additions & 0 deletions include/targets/cuda/device.in.hpp
@@ -33,6 +33,8 @@ namespace quda
return 100 * 1024;
#elif (__COMPUTE_CAPABILITY__ == 900)
return 228 * 1024;
#elif (__COMPUTE_CAPABILITY__ == 1000)
return 228 * 1024;
#else
return 0;
#endif
@@ -55,6 +57,8 @@ namespace quda
return 1536;
#elif (__COMPUTE_CAPABILITY__ == 900)
return 2048;
#elif (__COMPUTE_CAPABILITY__ == 1000)
return 2048;
#else
return 0;
#endif
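The two new `__COMPUTE_CAPABILITY__ == 1000` branches extend the compile-time tables (shared memory per SM and maximum resident threads per SM) to compute capability 10.0, reusing the values already in place for 9.0. These constants can be cross-checked at runtime against the CUDA device attributes, for instance:

```cpp
// Sketch (not part of the PR): query the runtime attributes that these
// compile-time constants mirror.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
  int dev = 0, smem = 0, threads = 0;
  cudaDeviceGetAttribute(&smem, cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev);
  cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerMultiProcessor, dev);
  printf("shared memory per SM = %d bytes, max threads per SM = %d\n", smem, threads);
  return 0;
}
```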
1 change: 1 addition & 0 deletions include/targets/generic/block_reduction_kernel_host.h
@@ -5,6 +5,7 @@ namespace quda
{
Functor<Arg> t(arg);
dim3 block(0, 0, 0);
#pragma omp parallel for
for (block.y = 0; block.y < arg.grid_dim.y; block.y++) {
for (block.x = 0; block.x < arg.grid_dim.x; block.x++) { t(block, dim3(0, 0, 0)); }
}
3 changes: 3 additions & 0 deletions include/targets/generic/kernel_host.h
@@ -6,12 +6,14 @@ namespace quda
template <template <typename> class Functor, typename Arg> void Kernel1D_host(const Arg &arg)
{
Functor<Arg> f(const_cast<Arg &>(arg));
#pragma omp parallel for
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { f(i); }
}

template <template <typename> class Functor, typename Arg> void Kernel2D_host(const Arg &arg)
{
Functor<Arg> f(const_cast<Arg &>(arg));
#pragma omp parallel for
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) {
for (int j = 0; j < static_cast<int>(arg.threads.y); j++) { f(i, j); }
}
@@ -20,6 +22,7 @@ namespace quda
template <template <typename> class Functor, typename Arg> void Kernel3D_host(const Arg &arg)
{
Functor<Arg> f(const_cast<Arg &>(arg));
#pragma omp parallel for
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) {
for (int j = 0; j < static_cast<int>(arg.threads.y); j++) {
for (int k = 0; k < static_cast<int>(arg.threads.z); k++) { f(i, j, k); }
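In each host kernel only the outermost loop is parallelized; the functor `f` is shared by all threads, which is safe as long as its `operator()` writes only per-index data. Where that holds, the nested loops could equally be collapsed into one parallel region (a variant sketch, not part of the PR):

```cpp
// Variant sketch: collapse both loops of Kernel2D_host into a single parallel
// region, assuming the functor's operator() touches only per-(i,j) data.
template <template <typename> class Functor, typename Arg> void Kernel2D_host_collapsed(const Arg &arg)
{
  Functor<Arg> f(const_cast<Arg &>(arg));
#pragma omp parallel for collapse(2)
  for (int i = 0; i < static_cast<int>(arg.threads.x); i++) {
    for (int j = 0; j < static_cast<int>(arg.threads.y); j++) { f(i, j); }
  }
}
```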
16 changes: 12 additions & 4 deletions include/targets/generic/reduction_kernel_host.h
@@ -11,7 +11,7 @@ namespace quda
Functor<Arg> t(arg);

reduce_t value = t.init();

#pragma omp parallel for collapse(2) reduction(Functor <Arg>::apply : value)
for (int j = 0; j < static_cast<int>(arg.threads.y); j++) {
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { value = t(value, i, j); }
}
@@ -21,16 +21,24 @@

template <template <typename> class Functor, typename Arg> auto MultiReduction_host(const Arg &arg)
{
#pragma omp declare reduction(multi_reduce \
: typename Functor <Arg>::reduce_t \
: omp_out = Functor <Arg>::apply(omp_out, omp_in)) \
initializer(omp_priv = Functor <Arg>::init())

using reduce_t = typename Functor<Arg>::reduce_t;
Functor<Arg> t(arg);

std::vector<reduce_t> value(arg.threads.z);
std::vector<reduce_t> value(arg.threads.z, t.init());
for (int k = 0; k < static_cast<int>(arg.threads.z); k++) {
value[k] = t.init();
auto val = t.init();

#pragma omp parallel for collapse(2) reduction(multi_reduce : val)
for (int j = 0; j < static_cast<int>(arg.threads.y); j++) {
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { value[k] = t(value[k], i, j, k); }
for (int i = 0; i < static_cast<int>(arg.threads.x); i++) { val = t(val, i, j, k); }
}

value[k] = val;
}

return value;
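The `declare reduction` directive is the least familiar piece here: it registers a named reduction (`multi_reduce`) whose combiner is `Functor<Arg>::apply` and whose per-thread identity is `Functor<Arg>::init()`, so each OpenMP thread accumulates a private partial result and the partials are merged when the threads join. A self-contained toy version of the same pattern (using a plain `double` for clarity):

```cpp
// Self-contained illustration of the user-defined reduction pattern above.
#include <algorithm>
#include <cmath>
#include <cstdio>

struct MaxAbs {
  static double init() { return 0.0; }
  static double apply(double a, double b) { return std::max(a, b); }
};

// Register a named reduction: MaxAbs::apply combines partials, and
// MaxAbs::init() seeds each thread's private copy.
#pragma omp declare reduction(max_abs : double : omp_out = MaxAbs::apply(omp_out, omp_in)) \
  initializer(omp_priv = MaxAbs::init())

int main()
{
  double value = MaxAbs::init();
#pragma omp parallel for reduction(max_abs : value)
  for (int i = -1000; i < 1000; i++) { value = MaxAbs::apply(value, std::fabs(0.001 * i)); }
  printf("max |x| = %f\n", value); // prints 1.000000
  return 0;
}
```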
2 changes: 1 addition & 1 deletion include/targets/hip/atomic_helper.h
@@ -48,7 +48,7 @@ namespace quda
template <bool is_device> struct atomic_fetch_abs_max_impl {
template <typename T> inline void operator()(T *addr, T val)
{
#pragma omp atomic update
#pragma omp critical
*addr = std::max(*addr, val);
}
};
16 changes: 16 additions & 0 deletions include/tune_quda.h
@@ -43,6 +43,22 @@ namespace quda {
*/
const std::map<TuneKey, TuneParam> &getTuneCache();

/**
@brief Return a string encoding the QUDA version
*/
const std::string get_quda_version();

/**
@brief Return a string encoding the git hash
*/
const std::string get_quda_hash();

/**
@brief Return the resource path (directory where QUDA reads and
writes the tunecache and other internal info)
*/
const std::string get_resource_path();

class Tunable {

friend TuneParam tuneLaunch(Tunable &, QudaTune, QudaVerbosity);
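These accessors centralize strings that interface_quda.cpp previously built locally; the stringization macro removed further down in this diff suggests what `get_quda_version` presumably looks like. A plausible implementation sketch only, the actual definition lives with the tuning code:

```cpp
// Plausible implementation sketch, mirroring the macro removed from
// interface_quda.cpp below (assumes <string> and the version macros are visible).
#define STR_(x) #x
#define STR(x) STR_(x)

const std::string get_quda_version()
{
  return STR(QUDA_VERSION_MAJOR) "." STR(QUDA_VERSION_MINOR) "." STR(QUDA_VERSION_SUBMINOR);
}

#undef STR
#undef STR_
```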
2 changes: 1 addition & 1 deletion lib/CMakeLists.txt
@@ -17,7 +17,7 @@ endif()

set (QUDA_OBJS
# cmake-format: sortable
dirac_coarse.cpp dslash_coarse.cpp
monitor.cpp dirac_coarse.cpp dslash_coarse.cpp
coarse_op.cpp coarsecoarse_op.cpp
coarse_op_preconditioned.cpp staggered_coarse_op.cpp
eig_iram.cpp eig_trlm.cpp eig_block_trlm.cpp vector_io.cpp
4 changes: 1 addition & 3 deletions lib/dslash_gamma_helper.cu
@@ -32,7 +32,6 @@ namespace quda {

void preTune() { out.backup(); }
void postTune() { out.restore(); }
long long flops() const { return 0; }
long long bytes() const { return out.Bytes() + in.Bytes(); }
};

@@ -86,12 +85,11 @@ namespace quda {
void apply(const qudaStream_t &stream)
{
TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
launch<TwistGamma>(tp, stream, GammaArg<Float, nColor>(out, in, d, kappa, mu, epsilon, dagger, type));
launch<TwistGamma>(tp, stream, GammaArg<Float, nColor>(out, in, d, 0, kappa, mu, epsilon, dagger, type));
}

void preTune() { out.backup(); }
void postTune() { out.restore(); }
long long flops() const { return 0; }
long long bytes() const { return out.Bytes() + in.Bytes(); }
};

17 changes: 5 additions & 12 deletions lib/interface_quda.cpp
@@ -425,12 +425,6 @@ static void init_default_comms()
}


#define STR_(x) #x
#define STR(x) STR_(x)
static const std::string quda_version = STR(QUDA_VERSION_MAJOR) "." STR(QUDA_VERSION_MINOR) "." STR(QUDA_VERSION_SUBMINOR);
#undef STR
#undef STR_

extern char* gitversion;

/*
@@ -447,9 +441,9 @@ void initQudaDevice(int dev)
profileInit.TPSTART(QUDA_PROFILE_INIT);

#ifdef GITVERSION
logQuda(QUDA_SUMMARIZE, "QUDA %s (git %s)\n", quda_version.c_str(), gitversion);
logQuda(QUDA_SUMMARIZE, "QUDA %s (git %s)\n", get_quda_version().c_str(), gitversion);
#else
logQuda(QUDA_SUMMARIZE, "QUDA %s\n", quda_version.c_str());
logQuda(QUDA_SUMMARIZE, "QUDA %s\n", get_quda_version().c_str());
#endif

#ifdef MULTI_GPU
@@ -1377,6 +1371,9 @@ void endQuda(void)

initialized = false;

assertAllMemFree();
device::destroy();

comm_finalize();
comms_initialized = false;
}
@@ -1426,10 +1423,6 @@ void endQuda(void)
printPeakMemUsage();
printfQuda("\n");
}

assertAllMemFree();

device::destroy();
}

