-
Notifications
You must be signed in to change notification settings - Fork 45
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Summary: Add two components for integrating RDC (https://rocm.docs.amd.com/projects/rdc) into dynolog: (1) add the RdcWrapper class to open-source dynolog, which contains the functions necessary for enabling RDC metrics collection; (2) add the DynoRdcSingleton class for Meta-specific requirements, as well as for handling the instance lifetime in a dynolog process. Reviewed By: jj10306 Differential Revision: D59832676
- Loading branch information
1 parent
9c1d250
commit f9d346f
Showing
3 changed files
with
284 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. | ||
|
||
#include "dynolog/src/gpumon/amd/RdcWrapper.h" | ||
|
||
#include <glog/logging.h> | ||
#include <mutex> | ||
|
||
namespace dynolog { | ||
namespace gpumon { | ||
|
||
// File-local configuration for the embedded RDC instance. All values are
// constexpr so they are immutable and have internal linkage (no symbol clashes
// with other translation units).

// An id to identify the GPU group. Since we only have one GPU group, this can
// be any arbitrary string.
constexpr const char* kGpuGroupName = "RdcGroupInfo";
// An id to identify the field group. Since we only have one field group, this
// can be any arbitrary string.
constexpr const char* kFieldGroupName = "RdcFieldGroup";
// How often RDC should query the underlying sources (amd-smi, rocprofiler...).
constexpr std::chrono::milliseconds kUpdateInterval{500};
// How long the buffer should keep historical data. Since kMaxKeepSamples == 1,
// this should be of no use as long as it is sufficiently large.
constexpr std::chrono::seconds kMaxKeepAge{2};
// How many samples to keep in the RDC cache; we only use the latest value.
constexpr int kMaxKeepSamples = 1;
|
||
// check the RDC API reference | ||
// https://rocm.docs.amd.com/projects/rdc/en/docs-6.2.0/reference/api_ref.html | ||
// for more information | ||
|
||
// Constructs the wrapper and immediately initializes RDC for the given
// metrics. The context write lock is acquired first and passed to init_();
// any exception raised by init_() propagates to the caller.
RdcWrapper::RdcWrapper(std::vector<rdc_field_t> enabledMetrics) {
  RdcRuntimeContextWithWLock lockedContext = context_.wlock();
  init_(std::move(enabledMetrics), lockedContext);
}
|
||
// Shuts RDC down through clean() when the wrapper is destroyed.
RdcWrapper::~RdcWrapper() {
  this->clean();
}
|
||
void RdcWrapper::init(std::vector<rdc_field_t> enabledMetrics) { | ||
LOG(INFO) << "initializing RDC in embedded mode"; | ||
auto contextWlocked = context_.wlock(); | ||
init_(std::move(enabledMetrics), contextWlocked); | ||
} | ||
|
||
void RdcWrapper::init_( | ||
std::vector<rdc_field_t> enabledMetrics, | ||
RdcRuntimeContextWithWLock& context) { | ||
context.data.enabledMetrics_ = std::move(enabledMetrics); | ||
if (context.data.enabledMetrics_.empty()) { | ||
throw std::runtime_error("enabledMetrics is empty"); | ||
} | ||
rdc_status_t result = rdc_init(0); | ||
if (result != RDC_ST_OK) { | ||
throw std::runtime_error( | ||
"rdc_init() failed with error code: " + std::to_string(result)); | ||
} | ||
result = | ||
rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &context.data.rdcHandle_); | ||
if (result != RDC_ST_OK) { | ||
throw std::runtime_error( | ||
"rdc_start_embedded() failed with error code: " + | ||
std::to_string(result)); | ||
} | ||
LOG(INFO) << "adding all GPUs to the group " << kGpuGroupName; | ||
result = rdc_group_gpu_create( | ||
context.data.rdcHandle_, | ||
RDC_GROUP_DEFAULT, | ||
kGpuGroupName, | ||
&context.data.gpuGroupId_); | ||
if (result != RDC_ST_OK) { | ||
throw std::runtime_error( | ||
"rdc_group_gpu_create() failed with error code: " + | ||
std::to_string(result)); | ||
} | ||
|
||
for (rdc_field_t metric : context.data.enabledMetrics_) { | ||
LOG(INFO) << "adding metric: " << metric; | ||
} | ||
result = rdc_group_field_create( | ||
context.data.rdcHandle_, | ||
context.data.enabledMetrics_.size(), | ||
context.data.enabledMetrics_.data(), | ||
kFieldGroupName, | ||
&context.data.fieldGroupId_); | ||
if (result != RDC_ST_OK) { | ||
throw std::runtime_error( | ||
"rdc_group_field_create() failed with error code: " + | ||
std::to_string(result)); | ||
} | ||
result = rdc_field_watch( | ||
context.data.rdcHandle_, | ||
context.data.gpuGroupId_, | ||
context.data.fieldGroupId_, | ||
std::chrono::microseconds(kUpdateInterval).count(), | ||
std::chrono::seconds(kMaxKeepAge).count(), | ||
kMaxKeepSamples); | ||
if (result != RDC_ST_OK) { | ||
throw std::runtime_error( | ||
"rdc_field_watch() failed with error code: " + std::to_string(result)); | ||
} | ||
|
||
LOG(INFO) << "RDC is initialized"; | ||
} | ||
|
||
void RdcWrapper::clean() { | ||
LOG(INFO) << "shutting down RDC"; | ||
auto contextWlocked = context_.wlock(); | ||
clean_(contextWlocked); | ||
} | ||
|
||
// Tears RDC down. The caller must hold the context write lock (`context`).
//
// Clears enabledMetrics_ (an empty list makes getRdcMetricsForDevice() return
// early), stops the embedded RDC agent that init_() started, and finally shuts
// the RDC library down. Failures are logged as warnings, never thrown, because
// this runs from the destructor.
void RdcWrapper::clean_(RdcRuntimeContextWithWLock& context) {
  RdcRuntimeContext& data = context.data;
  data.enabledMetrics_.clear();

  // Release the handle obtained from rdc_start_embedded() before shutting the
  // library down. The handle is reset afterwards so a second clean() (e.g.
  // explicit clean() followed by the destructor) does not stop it twice.
  // NOTE(review): assumes rdcHandle_ holds either a handle returned by
  // rdc_start_embedded() or the value-initialized (null) handle — confirm.
  if (data.rdcHandle_ != rdc_handle_t{}) {
    const rdc_status_t stopResult = rdc_stop_embedded(data.rdcHandle_);
    if (stopResult != RDC_ST_OK) {
      LOG(WARNING) << "rdc_stop_embedded() failed with error code: "
                   << stopResult;
    }
    data.rdcHandle_ = rdc_handle_t{};
  }

  const rdc_status_t result = rdc_shutdown();
  if (result != RDC_ST_OK) {
    LOG(WARNING) << "rdc_shutdown() failed with error code: " << result;
  }
}
|
||
// Returns the latest value of every enabled metric for GPU index `device`.
//
// Holds the context read lock for the duration of the call. Returns an empty
// map when no metrics are enabled. A metric whose query fails, whose sample
// carries a non-OK status, or whose value type is neither DOUBLE nor INTEGER
// is logged and left out of the result.
RdcMetricsMap RdcWrapper::getRdcMetricsForDevice(size_t device) {
  RdcMetricsMap latest;
  auto lockedContext = context_.rlock();
  const auto& metrics = lockedContext.data.enabledMetrics_;
  if (metrics.empty()) {
    return {};
  }

  for (const auto field : metrics) {
    rdc_field_value sample = {};
    const rdc_status_t queryStatus = rdc_field_get_latest_value(
        lockedContext.data.rdcHandle_, device, field, &sample);

    // The query itself failed.
    if (queryStatus != RDC_ST_OK) {
      LOG(ERROR) << "failed to get metric " << field << " for device "
                 << device << " with error code: " << queryStatus;
      continue;
    }
    // The query succeeded but the sample is flagged as invalid.
    if (sample.status != RDC_ST_OK) {
      LOG(ERROR) << "metric " << field << " returned by device " << device
                 << " has a non RDC_ST_OK status: " << sample.status;
      continue;
    }

    // Store the member that matches the reported value type.
    if (sample.type == rdc_field_type_t::DOUBLE) {
      latest[field] = sample.value.dbl;
    } else if (sample.type == rdc_field_type_t::INTEGER) {
      latest[field] = sample.value.l_int;
    } else {
      LOG(ERROR) << "unsupported type returned by rdc: " << sample.type;
    }
  }
  return latest;
}
|
||
} // namespace gpumon | ||
} // namespace dynolog |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. | ||
|
||
#include <cstdint> | ||
#include <memory> | ||
#include <mutex> | ||
#include <shared_mutex> | ||
#include <unordered_map> | ||
#include <variant> | ||
#include <vector> | ||
|
||
#include "rdc/rdc.h" | ||
|
||
#pragma once | ||
|
||
namespace dynolog { | ||
namespace gpumon { | ||
|
||
using RdcMetricsValue = std::variant<double, int64_t>; | ||
using RdcMetricsMap = std::unordered_map<rdc_field_t, RdcMetricsValue>; | ||
|
||
struct RdcRuntimeContext { | ||
std::vector<rdc_field_t> enabledMetrics_; | ||
rdc_handle_t rdcHandle_; | ||
rdc_gpu_group_t gpuGroupId_; | ||
rdc_field_grp_t fieldGroupId_; | ||
}; | ||
|
||
struct RdcRuntimeContextWithWLock { | ||
RdcRuntimeContextWithWLock( | ||
RdcRuntimeContext& data, | ||
std::shared_mutex& sharedDataLock) | ||
: data{data} { | ||
lockGuard_ = std::unique_lock<std::shared_mutex>(sharedDataLock); | ||
} | ||
RdcRuntimeContext& data; | ||
|
||
private: | ||
std::unique_lock<std::shared_mutex> lockGuard_; | ||
}; | ||
|
||
struct RdcRuntimeContextWithRLock { | ||
RdcRuntimeContextWithRLock( | ||
RdcRuntimeContext& data, | ||
std::shared_mutex& sharedDataLock) | ||
: data{data} { | ||
lockGuard_ = std::shared_lock<std::shared_mutex>(sharedDataLock); | ||
} | ||
RdcRuntimeContext& data; | ||
|
||
private: | ||
std::shared_lock<std::shared_mutex> lockGuard_; | ||
}; | ||
|
||
class RdcRuntimeContextSynchronized { | ||
public: | ||
RdcRuntimeContextWithRLock rlock() { | ||
return RdcRuntimeContextWithRLock(data_, sharedDataLock_); | ||
} | ||
RdcRuntimeContextWithWLock wlock() { | ||
return RdcRuntimeContextWithWLock(data_, sharedDataLock_); | ||
} | ||
|
||
protected: | ||
RdcRuntimeContext data_; | ||
std::shared_mutex sharedDataLock_; | ||
}; | ||
|
||
class RdcWrapper { | ||
public: | ||
RdcWrapper(std::vector<rdc_field_t> enabledMetrics); | ||
~RdcWrapper(); | ||
void init(std::vector<rdc_field_t> enabledMetrics); | ||
void clean(); | ||
RdcMetricsMap getRdcMetricsForDevice(size_t device); | ||
|
||
protected: | ||
void init_( | ||
std::vector<rdc_field_t> enabledMetrics, | ||
RdcRuntimeContextWithWLock& context); | ||
void clean_(RdcRuntimeContextWithWLock& context); | ||
|
||
RdcRuntimeContextSynchronized context_; | ||
}; | ||
|
||
} // namespace gpumon | ||
} // namespace dynolog |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. | ||
|
||
#include "dynolog/src/gpumon/amd/RdcWrapper.h"

#include <gtest/gtest.h>
#include <mutex>
#include <shared_mutex>
#include <thread>
|
||
using namespace ::testing; | ||
using namespace ::dynolog::gpumon; | ||
|
||
class RdcRuntimeContextSynchronizedTest : public RdcRuntimeContextSynchronized { | ||
public: | ||
std::shared_mutex& getMutex() { | ||
return sharedDataLock_; | ||
} | ||
}; | ||
|
||
// Verifies that rlock()/wlock() really hold the underlying shared mutex in
// the expected mode, and release it when the returned view goes out of scope.
//
// The lock state is probed from a separate thread: calling try_lock() or
// try_lock_shared() on a std::shared_mutex from a thread that already owns it
// (in any mode) is undefined behavior, so the probe must not run on the
// thread that holds the view.
TEST(RdcWrapperTest, testRdcRuntimeContextSynchronized) {
  RdcRuntimeContextSynchronizedTest t;

  // Returns whether another thread can currently take a shared lock.
  const auto canLockShared = [&t]() {
    bool acquired = false;
    std::thread prober([&t, &acquired] {
      std::shared_lock<std::shared_mutex> probe(t.getMutex(), std::try_to_lock);
      acquired = probe.owns_lock();
    });
    prober.join();
    return acquired;
  };

  // Returns whether another thread can currently take an exclusive lock.
  const auto canLockExclusive = [&t]() {
    bool acquired = false;
    std::thread prober([&t, &acquired] {
      std::unique_lock<std::shared_mutex> probe(t.getMutex(), std::try_to_lock);
      acquired = probe.owns_lock();
    });
    prober.join();
    return acquired;
  };

  {
    // two readers
    auto a = t.rlock();
    EXPECT_TRUE(canLockShared());
  }

  {
    // two writers
    auto a = t.wlock();
    EXPECT_FALSE(canLockExclusive());
  }

  {
    // reader then writer
    auto a = t.rlock();
    EXPECT_FALSE(canLockExclusive());
  }

  { auto a = t.rlock(); }
  // lock released outside of scope
  EXPECT_TRUE(canLockExclusive());
}