Skip to content

Commit

Permalink
rdc wrapper (#301)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #301

Add two components for integrating RDC (https://rocm.docs.amd.com/projects/rdc) into dynolog:

Add the RdcWrapper class to open-source dynolog. It contains the functions needed to enable RDC metrics collection.

Add the DynoRdcSingleton class, which covers Meta-specific requirements and manages the instance lifetime within a dynolog process.

Reviewed By: jj10306

Differential Revision: D59832676

fbshipit-source-id: 7dcfbc8b0d19a38aa0b979a9a377f5f291372352
  • Loading branch information
Alston Tang authored and facebook-github-bot committed Sep 26, 2024
1 parent 9c1d250 commit 14145a5
Show file tree
Hide file tree
Showing 3 changed files with 284 additions and 0 deletions.
155 changes: 155 additions & 0 deletions dynolog/src/gpumon/amd/RdcWrapper.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#include "dynolog/src/gpumon/amd/RdcWrapper.h"

#include <glog/logging.h>
#include <mutex>

namespace dynolog {
namespace gpumon {

// An id to identify the GPU group. Since we only have one GPU group, this can
// be any string.
constexpr const char* kGpuGroupName = "RdcGroupInfo";
// An id to identify the field group. Since we only have one field group, this
// can be any string.
constexpr const char* kFieldGroupName = "RdcFieldGroup";
// How often RDC should query the underlying sources (amd-smi, rocprofiler...).
constexpr std::chrono::milliseconds kUpdateInterval{500};
// How long RDC keeps historical samples in its buffer. Since
// kMaxKeepSamples == 1 only the newest sample is used, so the exact value
// should not matter as long as it is sufficiently large.
constexpr std::chrono::seconds kMaxKeepAge{2};
// How many samples to keep in the RDC cache; we only use the latest value.
constexpr int kMaxKeepSamples = 1;

// check the RDC API reference
// https://rocm.docs.amd.com/projects/rdc/en/docs-6.2.0/reference/api_ref.html
// for more information

// Constructs the wrapper and immediately starts RDC for `enabledMetrics`.
// The context is write-locked for the whole initialization; any exception
// raised by init_() propagates to the caller.
RdcWrapper::RdcWrapper(std::vector<rdc_field_t> enabledMetrics) {
  auto lockedContext = context_.wlock();
  init_(std::move(enabledMetrics), lockedContext);
}

// Releases RDC by delegating to clean(), which takes the write lock on the
// context before shutting RDC down.
RdcWrapper::~RdcWrapper() {
clean();
}

// Public entry point for (re)starting RDC with the given metrics. Logs the
// attempt, takes the write lock on the context, and hands off to init_().
void RdcWrapper::init(std::vector<rdc_field_t> enabledMetrics) {
  LOG(INFO) << "initializing RDC in embedded mode";
  auto lockedContext = context_.wlock();
  init_(std::move(enabledMetrics), lockedContext);
}

void RdcWrapper::init_(
std::vector<rdc_field_t> enabledMetrics,
RdcRuntimeContextWithWLock& context) {
context.data.enabledMetrics_ = std::move(enabledMetrics);
if (context.data.enabledMetrics_.empty()) {
throw std::runtime_error("enabledMetrics is empty");
}
rdc_status_t result = rdc_init(0);
if (result != RDC_ST_OK) {
throw std::runtime_error(
"rdc_init() failed with error code: " + std::to_string(result));
}
result =
rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &context.data.rdcHandle_);
if (result != RDC_ST_OK) {
throw std::runtime_error(
"rdc_start_embedded() failed with error code: " +
std::to_string(result));
}
LOG(INFO) << "adding all GPUs to the group " << kGpuGroupName;
result = rdc_group_gpu_create(
context.data.rdcHandle_,
RDC_GROUP_DEFAULT,
kGpuGroupName,
&context.data.gpuGroupId_);
if (result != RDC_ST_OK) {
throw std::runtime_error(
"rdc_group_gpu_create() failed with error code: " +
std::to_string(result));
}

for (rdc_field_t metric : context.data.enabledMetrics_) {
LOG(INFO) << "adding metric: " << metric;
}
result = rdc_group_field_create(
context.data.rdcHandle_,
context.data.enabledMetrics_.size(),
context.data.enabledMetrics_.data(),
kFieldGroupName,
&context.data.fieldGroupId_);
if (result != RDC_ST_OK) {
throw std::runtime_error(
"rdc_group_field_create() failed with error code: " +
std::to_string(result));
}
result = rdc_field_watch(
context.data.rdcHandle_,
context.data.gpuGroupId_,
context.data.fieldGroupId_,
std::chrono::microseconds(kUpdateInterval).count(),
std::chrono::seconds(kMaxKeepAge).count(),
kMaxKeepSamples);
if (result != RDC_ST_OK) {
throw std::runtime_error(
"rdc_field_watch() failed with error code: " + std::to_string(result));
}

LOG(INFO) << "RDC is initialized";
}

// Public teardown entry point: logs the shutdown, takes the write lock on the
// context, and delegates the actual work to clean_().
void RdcWrapper::clean() {
  LOG(INFO) << "shutting down RDC";
  auto lockedContext = context_.wlock();
  clean_(lockedContext);
}

// Tears down the RDC session held in `context`.
//
// Clears enabledMetrics_ (which makes getRdcMetricsForDevice() return an empty
// map), stops the embedded session started by rdc_start_embedded() if a handle
// is stored, and finally calls rdc_shutdown(). Failures are logged as warnings
// and never thrown, so this is safe to reach from the destructor.
//
// `context` must be the write-locked view of this wrapper's runtime context.
// NOTE(review): assumes rdcHandle_ is either null or a handle obtained from a
// successful rdc_start_embedded() call — confirm for any subclass that calls
// clean_() without a prior successful init_().
void RdcWrapper::clean_(RdcRuntimeContextWithWLock& context) {
  RdcRuntimeContext& ctx = context.data;
  ctx.enabledMetrics_.clear();

  // Stop the embedded session before shutting the library down; previously
  // the handle was never released.
  if (ctx.rdcHandle_ != nullptr) {
    const rdc_status_t stopResult = rdc_stop_embedded(ctx.rdcHandle_);
    if (stopResult != RDC_ST_OK) {
      LOG(WARNING) << "rdc_stop_embedded() failed with error code: "
                   << stopResult;
    }
    // Forget the handle so a repeated clean() does not stop it twice.
    ctx.rdcHandle_ = nullptr;
  }

  const rdc_status_t result = rdc_shutdown();
  if (result != RDC_ST_OK) {
    LOG(WARNING) << "rdc_shutdown() failed with error code: " << result;
  }
}

// Reads the latest cached value of every enabled metric for GPU `device`.
//
// Takes the read lock on the context for the duration of the call. Returns a
// map from metric id to value holding only the metrics that were read
// successfully; metrics whose query fails, whose value carries a non-OK
// status, or whose type is neither DOUBLE nor INTEGER are logged and skipped.
// Returns an empty map when no metrics are enabled.
RdcMetricsMap RdcWrapper::getRdcMetricsForDevice(size_t device) {
  RdcMetricsMap collected;
  auto locked = context_.rlock();
  const auto& metrics = locked.data.enabledMetrics_;
  if (metrics.empty()) {
    return {};
  }

  for (auto fieldId : metrics) {
    rdc_field_value sample = {};
    const rdc_status_t queryStatus = rdc_field_get_latest_value(
        locked.data.rdcHandle_, device, fieldId, &sample);

    if (queryStatus != RDC_ST_OK) {
      LOG(ERROR) << "failed to get metric " << fieldId << " for device "
                 << device << " with error code: " << queryStatus;
      continue;
    }
    if (sample.status != RDC_ST_OK) {
      LOG(ERROR) << "metric " << fieldId << " returned by device " << device
                 << " has a non RDC_ST_OK status: " << sample.status;
      continue;
    }

    // Store the value under the variant alternative matching its RDC type.
    if (sample.type == rdc_field_type_t::DOUBLE) {
      collected[fieldId] = sample.value.dbl;
    } else if (sample.type == rdc_field_type_t::INTEGER) {
      collected[fieldId] = sample.value.l_int;
    } else {
      LOG(ERROR) << "unsupported type returned by rdc: " << sample.type;
    }
  }
  return collected;
}

} // namespace gpumon
} // namespace dynolog
86 changes: 86 additions & 0 deletions dynolog/src/gpumon/amd/RdcWrapper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#include <cstdint>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <unordered_map>
#include <variant>
#include <vector>

#include "rdc/rdc.h"

#pragma once

namespace dynolog {
namespace gpumon {

// A single metric reading: either a floating-point or a 64-bit integer value.
using RdcMetricsValue = std::variant<double, int64_t>;
// Readings keyed by the RDC field id they were collected for.
using RdcMetricsMap = std::unordered_map<rdc_field_t, RdcMetricsValue>;

// State shared between RdcWrapper operations for one RDC session.
// All members are value-initialized so a context that has not been through a
// successful initialization never exposes indeterminate values.
struct RdcRuntimeContext {
  // Metric (field) ids being collected; empty means nothing is collected.
  std::vector<rdc_field_t> enabledMetrics_{};
  // Session handle filled in by rdc_start_embedded().
  rdc_handle_t rdcHandle_{};
  // GPU group id filled in by rdc_group_gpu_create().
  rdc_gpu_group_t gpuGroupId_{};
  // Field group id filled in by rdc_group_field_create().
  rdc_field_grp_t fieldGroupId_{};
};

struct RdcRuntimeContextWithWLock {
RdcRuntimeContextWithWLock(
RdcRuntimeContext& data,
std::shared_mutex& sharedDataLock)
: data{data} {
lockGuard_ = std::unique_lock<std::shared_mutex>(sharedDataLock);
}
RdcRuntimeContext& data;

private:
std::unique_lock<std::shared_mutex> lockGuard_;
};

struct RdcRuntimeContextWithRLock {
RdcRuntimeContextWithRLock(
RdcRuntimeContext& data,
std::shared_mutex& sharedDataLock)
: data{data} {
lockGuard_ = std::shared_lock<std::shared_mutex>(sharedDataLock);
}
RdcRuntimeContext& data;

private:
std::shared_lock<std::shared_mutex> lockGuard_;
};

// Couples an RdcRuntimeContext with the shared mutex that guards it. Access
// goes through rlock()/wlock(), which return views that hold the lock for as
// long as they are alive.
class RdcRuntimeContextSynchronized {
 public:
  // Returns a view holding the mutex in shared mode.
  RdcRuntimeContextWithRLock rlock() {
    return {data_, sharedDataLock_};
  }

  // Returns a view holding the mutex in exclusive mode.
  RdcRuntimeContextWithWLock wlock() {
    return {data_, sharedDataLock_};
  }

 protected:
  RdcRuntimeContext data_;
  std::shared_mutex sharedDataLock_;
};

// Wrapper around an embedded RDC session that collects a fixed set of metrics
// for the GPUs on this host. The session state lives in `context_` and is
// guarded by a shared mutex: readers take it shared, init/clean exclusively.
class RdcWrapper {
 public:
  // Starts RDC and begins watching `enabledMetrics`.
  // Throws std::runtime_error if `enabledMetrics` is empty or an RDC call
  // fails. `explicit` so a metric list is never converted to a wrapper (and
  // an RDC session started) implicitly.
  explicit RdcWrapper(std::vector<rdc_field_t> enabledMetrics);

  // Shuts RDC down via clean().
  ~RdcWrapper();

  // Starts RDC with `enabledMetrics` under the write lock.
  // Throws std::runtime_error if `enabledMetrics` is empty or an RDC call
  // fails.
  void init(std::vector<rdc_field_t> enabledMetrics);

  // Clears the enabled metrics and shuts RDC down under the write lock.
  // Shutdown failures are logged, not thrown.
  void clean();

  // Returns the latest value of each enabled metric for GPU `device`. Metrics
  // that cannot be read are logged and omitted from the result.
  RdcMetricsMap getRdcMetricsForDevice(size_t device);

 protected:
  // Implementation of init(); `context` must be the write-locked view of
  // `context_`.
  void init_(
      std::vector<rdc_field_t> enabledMetrics,
      RdcRuntimeContextWithWLock& context);

  // Implementation of clean(); `context` must be the write-locked view of
  // `context_`.
  void clean_(RdcRuntimeContextWithWLock& context);

  RdcRuntimeContextSynchronized context_;
};

} // namespace gpumon
} // namespace dynolog
43 changes: 43 additions & 0 deletions dynolog/tests/gpumon/amd/RdcWrapperTest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#include "dynolog/src/gpumon/amd/RdcWrapper.h"
#include <gtest/gtest.h>

#include <mutex>
#include <shared_mutex>
#include <thread>

using namespace ::testing;
using namespace ::dynolog::gpumon;

// Test-only subclass that exposes the protected mutex so tests can probe
// whether it is currently held.
struct RdcRuntimeContextSynchronizedTest : RdcRuntimeContextSynchronized {
  std::shared_mutex& getMutex() {
    return sharedDataLock_;
  }
};

namespace {

// Reports whether `mutex` can be acquired exclusively right now. The attempt
// runs on a separate thread because calling try_lock() on a std::shared_mutex
// that the calling thread already owns is undefined behavior. The lock is
// released again before returning.
bool canLockExclusiveFromOtherThread(std::shared_mutex& mutex) {
  bool acquired = false;
  std::thread prober([&mutex, &acquired] {
    acquired = mutex.try_lock();
    if (acquired) {
      mutex.unlock();
    }
  });
  prober.join();
  return acquired;
}

// Same as above, but for shared (reader) ownership.
bool canLockSharedFromOtherThread(std::shared_mutex& mutex) {
  bool acquired = false;
  std::thread prober([&mutex, &acquired] {
    acquired = mutex.try_lock_shared();
    if (acquired) {
      mutex.unlock_shared();
    }
  });
  prober.join();
  return acquired;
}

} // namespace

// Verifies that rlock()/wlock() hold the underlying shared mutex in the
// expected mode for exactly the lifetime of the returned view.
TEST(RdcWrapperTest, testRdcRuntimeContextSynchronized) {
  RdcRuntimeContextSynchronizedTest t;
  {
    // two readers
    auto a = t.rlock();
    EXPECT_TRUE(canLockSharedFromOtherThread(t.getMutex()));
  }

  {
    // two writers
    auto a = t.wlock();
    EXPECT_FALSE(canLockExclusiveFromOtherThread(t.getMutex()));
  }

  {
    // reader then writer
    auto a = t.rlock();
    EXPECT_FALSE(canLockExclusiveFromOtherThread(t.getMutex()));
  }

  { auto a = t.rlock(); }
  // lock released outside of scope
  EXPECT_TRUE(canLockExclusiveFromOtherThread(t.getMutex()));
}

0 comments on commit 14145a5

Please sign in to comment.