From f9d346f5852c6a00968e421a2b25b1e4f29b0a44 Mon Sep 17 00:00:00 2001 From: Alston Tang Date: Thu, 26 Sep 2024 14:25:14 -0700 Subject: [PATCH] rdc wrapper (#301) Summary: add two components for integrating RDC (https://rocm.docs.amd.com/projects/rdc) into dynolog add RdcWrapper class to open source dynolog. this contains necessary functions for enabling rdc metrics collection add DynoRdcSingleton class for adding meta specific requirement, as well as handling instance lifetime in a dynolog process Reviewed By: jj10306 Differential Revision: D59832676 --- dynolog/src/gpumon/amd/RdcWrapper.cpp | 155 ++++++++++++++++++++ dynolog/src/gpumon/amd/RdcWrapper.h | 86 +++++++++++ dynolog/tests/gpumon/amd/RdcWrapperTest.cpp | 43 ++++++ 3 files changed, 284 insertions(+) create mode 100644 dynolog/src/gpumon/amd/RdcWrapper.cpp create mode 100644 dynolog/src/gpumon/amd/RdcWrapper.h create mode 100644 dynolog/tests/gpumon/amd/RdcWrapperTest.cpp diff --git a/dynolog/src/gpumon/amd/RdcWrapper.cpp b/dynolog/src/gpumon/amd/RdcWrapper.cpp new file mode 100644 index 00000000..f3261efd --- /dev/null +++ b/dynolog/src/gpumon/amd/RdcWrapper.cpp @@ -0,0 +1,155 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include "dynolog/src/gpumon/amd/RdcWrapper.h" + +#include +#include + +namespace dynolog { +namespace gpumon { + +// an id to identify the gpu group, since we only have one gpu group, this can +// be any random string +const char* kGpuGroupName = "RdcGroupInfo"; +// an id to identify the field group, since we only have one field group, this +// can be any random string +const char* kFieldGroupName = "RdcFieldGroup"; +// how often RDC should query from underlying sources (amd-smi, rocprofiler...) 
+constexpr std::chrono::milliseconds kUpdateInterval{500}; +// how long the buffer should be to keep historical data, since kMaxKeepSamples +// == 1, this should be of no use when sufficiently large +constexpr std::chrono::seconds kMaxKeepAge{2}; +// how many samples to keep in the RDC cache, we only use the latest value +constexpr int kMaxKeepSamples = 1; + +// check the RDC API reference +// https://rocm.docs.amd.com/projects/rdc/en/docs-6.2.0/reference/api_ref.html +// for more information + +RdcWrapper::RdcWrapper(std::vector enabledMetrics) { + auto contextWlocked = context_.wlock(); + init_(std::move(enabledMetrics), contextWlocked); +} + +RdcWrapper::~RdcWrapper() { + clean(); +} + +void RdcWrapper::init(std::vector enabledMetrics) { + LOG(INFO) << "initializing RDC in embedded mode"; + auto contextWlocked = context_.wlock(); + init_(std::move(enabledMetrics), contextWlocked); +} + +void RdcWrapper::init_( + std::vector enabledMetrics, + RdcRuntimeContextWithWLock& context) { + context.data.enabledMetrics_ = std::move(enabledMetrics); + if (context.data.enabledMetrics_.empty()) { + throw std::runtime_error("enabledMetrics is empty"); + } + rdc_status_t result = rdc_init(0); + if (result != RDC_ST_OK) { + throw std::runtime_error( + "rdc_init() failed with error code: " + std::to_string(result)); + } + result = + rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &context.data.rdcHandle_); + if (result != RDC_ST_OK) { + throw std::runtime_error( + "rdc_start_embedded() failed with error code: " + + std::to_string(result)); + } + LOG(INFO) << "adding all GPUs to the group " << kGpuGroupName; + result = rdc_group_gpu_create( + context.data.rdcHandle_, + RDC_GROUP_DEFAULT, + kGpuGroupName, + &context.data.gpuGroupId_); + if (result != RDC_ST_OK) { + throw std::runtime_error( + "rdc_group_gpu_create() failed with error code: " + + std::to_string(result)); + } + + for (rdc_field_t metric : context.data.enabledMetrics_) { + LOG(INFO) << "adding metric: " << metric; + } + 
result = rdc_group_field_create( + context.data.rdcHandle_, + context.data.enabledMetrics_.size(), + context.data.enabledMetrics_.data(), + kFieldGroupName, + &context.data.fieldGroupId_); + if (result != RDC_ST_OK) { + throw std::runtime_error( + "rdc_group_field_create() failed with error code: " + + std::to_string(result)); + } + result = rdc_field_watch( + context.data.rdcHandle_, + context.data.gpuGroupId_, + context.data.fieldGroupId_, + std::chrono::microseconds(kUpdateInterval).count(), + std::chrono::seconds(kMaxKeepAge).count(), + kMaxKeepSamples); + if (result != RDC_ST_OK) { + throw std::runtime_error( + "rdc_field_watch() failed with error code: " + std::to_string(result)); + } + + LOG(INFO) << "RDC is initialized"; +} + +void RdcWrapper::clean() { + LOG(INFO) << "shutting down RDC"; + auto contextWlocked = context_.wlock(); + clean_(contextWlocked); +} + +void RdcWrapper::clean_(RdcRuntimeContextWithWLock& context) { + context.data.enabledMetrics_.clear(); + rdc_status_t result = rdc_shutdown(); + if (result != RDC_ST_OK) { + LOG(WARNING) << "rdc_shutdown() failed with error code: " << result; + } +} + +RdcMetricsMap RdcWrapper::getRdcMetricsForDevice(size_t device) { + RdcMetricsMap res; + auto context = context_.rlock(); + if (context.data.enabledMetrics_.empty()) { + return {}; + } + rdc_status_t result; + for (auto metric : context.data.enabledMetrics_) { + rdc_field_value value = {}; + result = rdc_field_get_latest_value( + context.data.rdcHandle_, device, metric, &value); + if (result != RDC_ST_OK) { + LOG(ERROR) << "failed to get metric " << metric << " for device " + << device << " with error code: " << result; + continue; + } + if (value.status != RDC_ST_OK) { + LOG(ERROR) << "metric " << metric << " returned by device " << device + << " has a non RDC_ST_OK status: " << value.status; + continue; + } + switch (value.type) { + case rdc_field_type_t::DOUBLE: + res[metric] = value.value.dbl; + break; + case rdc_field_type_t::INTEGER: + 
res[metric] = value.value.l_int; + break; + default: + LOG(ERROR) << "unsupported type returned by rdc: " << value.type; + break; + } + } + return res; +} + +} // namespace gpumon +} // namespace dynolog diff --git a/dynolog/src/gpumon/amd/RdcWrapper.h b/dynolog/src/gpumon/amd/RdcWrapper.h new file mode 100644 index 00000000..209dd128 --- /dev/null +++ b/dynolog/src/gpumon/amd/RdcWrapper.h @@ -0,0 +1,86 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include +#include +#include +#include +#include +#include +#include + +#include "rdc/rdc.h" + +#pragma once + +namespace dynolog { +namespace gpumon { + +using RdcMetricsValue = std::variant; +using RdcMetricsMap = std::unordered_map; + +struct RdcRuntimeContext { + std::vector enabledMetrics_; + rdc_handle_t rdcHandle_; + rdc_gpu_group_t gpuGroupId_; + rdc_field_grp_t fieldGroupId_; +}; + +struct RdcRuntimeContextWithWLock { + RdcRuntimeContextWithWLock( + RdcRuntimeContext& data, + std::shared_mutex& sharedDataLock) + : data{data} { + lockGuard_ = std::unique_lock(sharedDataLock); + } + RdcRuntimeContext& data; + + private: + std::unique_lock lockGuard_; +}; + +struct RdcRuntimeContextWithRLock { + RdcRuntimeContextWithRLock( + RdcRuntimeContext& data, + std::shared_mutex& sharedDataLock) + : data{data} { + lockGuard_ = std::shared_lock(sharedDataLock); + } + RdcRuntimeContext& data; + + private: + std::shared_lock lockGuard_; +}; + +class RdcRuntimeContextSynchronized { + public: + RdcRuntimeContextWithRLock rlock() { + return RdcRuntimeContextWithRLock(data_, sharedDataLock_); + } + RdcRuntimeContextWithWLock wlock() { + return RdcRuntimeContextWithWLock(data_, sharedDataLock_); + } + + protected: + RdcRuntimeContext data_; + std::shared_mutex sharedDataLock_; +}; + +class RdcWrapper { + public: + RdcWrapper(std::vector enabledMetrics); + ~RdcWrapper(); + void init(std::vector enabledMetrics); + void clean(); + RdcMetricsMap getRdcMetricsForDevice(size_t device); + + protected: + 
void init_( + std::vector enabledMetrics, + RdcRuntimeContextWithWLock& context); + void clean_(RdcRuntimeContextWithWLock& context); + + RdcRuntimeContextSynchronized context_; +}; + +} // namespace gpumon +} // namespace dynolog diff --git a/dynolog/tests/gpumon/amd/RdcWrapperTest.cpp b/dynolog/tests/gpumon/amd/RdcWrapperTest.cpp new file mode 100644 index 00000000..98b46513 --- /dev/null +++ b/dynolog/tests/gpumon/amd/RdcWrapperTest.cpp @@ -0,0 +1,43 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include "dynolog/src/gpumon/amd/RdcWrapper.h" +#include + +using namespace ::testing; +using namespace ::dynolog::gpumon; + +class RdcRuntimeContextSynchronizedTest : public RdcRuntimeContextSynchronized { + public: + std::shared_mutex& getMutex() { + return sharedDataLock_; + } +}; + +TEST(RdcWrapperTest, testRdcRuntimeContextSynchronized) { + RdcRuntimeContextSynchronizedTest t; + { + // two readers + auto a = t.rlock(); + std::shared_lock b(t.getMutex(), std::defer_lock); + EXPECT_TRUE(b.try_lock()); + } + + { + // two writers + auto a = t.wlock(); + std::unique_lock b(t.getMutex(), std::defer_lock); + EXPECT_FALSE(b.try_lock()); + } + + { + // reader then writer + auto a = t.rlock(); + std::unique_lock b(t.getMutex(), std::defer_lock); + EXPECT_FALSE(b.try_lock()); + } + + { auto a = t.rlock(); } + // lock released outside of scope + std::unique_lock b(t.getMutex(), std::defer_lock); + EXPECT_TRUE(b.try_lock()); +}