diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5246786..f54fc8d 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -72,6 +72,12 @@ jobs: rocm-smi-lib \ rocm-validation-suite \ rocprofiler-dev \ + rocprofiler-plugins \ + rocprofiler-register \ + rocprofiler-sdk \ + hip-dev \ + hip-runtime-amd \ + hipcc \ build-essential \ ccache \ cmake \ diff --git a/include/rdc_modules/rdc_rocp/RdcRocpBase.h b/include/rdc_modules/rdc_rocp/RdcRocpBase.h index 49b9c91..65be41c 100644 --- a/include/rdc_modules/rdc_rocp/RdcRocpBase.h +++ b/include/rdc_modules/rdc_rocp/RdcRocpBase.h @@ -22,7 +22,7 @@ THE SOFTWARE. #ifndef RDC_MODULES_RDC_ROCP_RDCROCPBASE_H_ #define RDC_MODULES_RDC_ROCP_RDCROCPBASE_H_ -#include +#include #include #include @@ -32,16 +32,11 @@ THE SOFTWARE. #include "rdc/rdc.h" #include "rdc_lib/RdcTelemetryLibInterface.h" +#include "rdc_modules/rdc_rocp/RdcRocpCounterSampler.h" namespace amd { namespace rdc { -typedef struct { - hsa_agent_t* agents; - unsigned count; - unsigned capacity; -} hsa_agent_arr_t; - /// Common interface for RocP tests and samples class RdcRocpBase { public: @@ -68,18 +63,16 @@ class RdcRocpBase { protected: private: typedef std::pair rdc_field_pair_t; - static const size_t buffer_length_k = 5; /** * @brief Tweak this to change for how long each metric is collected */ static const uint32_t collection_duration_us_k = 10000; - double read_feature(rocprofiler_t* context, uint32_t gpu_index); + double read_feature(rocprofiler_record_counter_t* record, uint32_t gpu_index); double run_profiler(uint32_t gpu_index, rdc_field_t field); - hsa_agent_arr_t agent_arr = {}; - std::vector queues; - std::map gpuid_to_feature; + std::vector agents = {}; + std::vector> samplers = {}; std::map field_to_metric = {}; // these fields must be divided by time passed @@ -89,9 +82,9 @@ class RdcRocpBase { }; /** - * @brief Convert from rocmtools status into RDC status + * @brief Convert from profiler status into RDC status */ - rdc_status_t Rocp2RdcError(hsa_status_t status); + rdc_status_t Rocp2RdcError(rocprofiler_status_t status); }; } // namespace rdc diff --git a/include/rdc_modules/rdc_rocp/RdcRocpCounterSampler.h b/include/rdc_modules/rdc_rocp/RdcRocpCounterSampler.h new file mode 100644 index 0000000..7553074 --- /dev/null +++ b/include/rdc_modules/rdc_rocp/RdcRocpCounterSampler.h @@ -0,0 +1,92 @@ +// MIT License +// +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef RDC_MODULES_RDC_ROCP_RDCROCPCOUNTERSAMPLER_H_ +#define RDC_MODULES_RDC_ROCP_RDCROCPCOUNTERSAMPLER_H_ + +#include +#include +#include + +#include +#include +#include +#include + +namespace amd { +namespace rdc { +class CounterSampler { + public: + // Setup system profiling for an agent + explicit CounterSampler(rocprofiler_agent_id_t agent); + + ~CounterSampler(); + + // Decode the counter name of a record + const std::string& decode_record_name(const rocprofiler_record_counter_t& rec) const; + + // Get the dimensions of a record (what CU/SE/etc the counter is for). High cost operation + // should be cached if possible. + std::unordered_map get_record_dimensions( + const rocprofiler_record_counter_t& rec); + + // Sample the counter values for a set of counters, returns the records in the out parameter. + void sample_counter_values(const std::vector& counters, + std::vector& out, uint64_t duration); + + rocprofiler_agent_id_t get_agent() const { return agent_; } + + // Get the supported counters for an agent + static std::unordered_map get_supported_counters( + rocprofiler_agent_id_t agent); + + // Get the available agents on the system + static std::vector get_available_agents(); + + static std::vector>& get_samplers(); + + private: + rocprofiler_agent_id_t agent_ = {}; + rocprofiler_context_id_t ctx_ = {}; + rocprofiler_buffer_id_t buf_ = {}; + rocprofiler_profile_config_id_t profile_ = {.handle = 0}; + + std::map, rocprofiler_profile_config_id_t> cached_profiles_; + std::map profile_sizes_; + + // Internal function used to set the profile for the agent when start_context is called + void set_profile(rocprofiler_context_id_t ctx, rocprofiler_agent_set_profile_callback_t cb) const; + + // Get the size of a counter in number of records + size_t get_counter_size(rocprofiler_counter_id_t counter); + + // Get the dimensions of a counter + std::vector get_counter_dimensions( + rocprofiler_counter_id_t counter); + + static std::vector> samplers_; +}; + +} // namespace rdc +} // namespace amd + +#endif // RDC_MODULES_RDC_ROCP_RDCROCPCOUNTERSAMPLER_H_ diff --git a/python_binding/RdcReader.py b/python_binding/RdcReader.py index 5fc4754..8a8f8f4 100644 --- a/python_binding/RdcReader.py +++ b/python_binding/RdcReader.py @@ -12,7 +12,8 @@ rdc_field_t.RDC_FI_POWER_USAGE, rdc_field_t.RDC_FI_GPU_CLOCK, rdc_field_t.RDC_FI_GPU_UTIL, - rdc_field_t.RDC_FI_GPU_TEMP + rdc_field_t.RDC_FI_GPU_TEMP, + rdc_field_t.RDC_FI_GPU_MEMORY_USAGE ] default_unit_coverter = { diff --git a/python_binding/rdc_collectd.py b/python_binding/rdc_collectd.py index 9ce8089..c890237 100644 --- a/python_binding/rdc_collectd.py +++ b/python_binding/rdc_collectd.py @@ -8,7 +8,8 @@ rdc_field_t.RDC_FI_POWER_USAGE, rdc_field_t.RDC_FI_GPU_CLOCK, rdc_field_t.RDC_FI_GPU_UTIL, - rdc_field_t.RDC_FI_GPU_TEMP + rdc_field_t.RDC_FI_GPU_TEMP, + rdc_field_t.RDC_FI_GPU_MEMORY_USAGE, ] diff --git a/python_binding/rdc_prometheus.py b/python_binding/rdc_prometheus.py index d75aa77..58a887a 100644 --- a/python_binding/rdc_prometheus.py +++ b/python_binding/rdc_prometheus.py @@ -13,6 +13,7 @@ rdc_field_t.RDC_FI_GPU_TEMP, rdc_field_t.RDC_FI_PROF_ACTIVE_CYCLES, rdc_field_t.RDC_FI_PROF_ACTIVE_WAVES, + rdc_field_t.RDC_FI_PROF_OCCUPANCY_PERCENT, ] class PrometheusReader(RdcReader): diff --git a/rdc_libs/rdc/src/RdcRocpLib.cc b/rdc_libs/rdc/src/RdcRocpLib.cc index c4cbff4..c26e527 100644 --- a/rdc_libs/rdc/src/RdcRocpLib.cc +++ b/rdc_libs/rdc/src/RdcRocpLib.cc @@ -168,7 +168,7 @@ std::string RdcRocpLib::get_rocm_path() { std::string line; while (getline(file, line)) { - size_t index_end = line.find("librocprofiler64.so"); + size_t index_end = line.find("librocprofiler-register.so"); size_t index_start = index_end; if (index_end == std::string::npos) { // no library on this line @@ -189,28 +189,27 @@ std::string RdcRocpLib::get_rocm_path() { } rdc_status_t RdcRocpLib::set_rocprofiler_path() { - // rocprofiler requires ROCP_METRICS to be set - std::string rocprofiler_metrics_path = - get_rocm_path() + "/libexec/rocprofiler/counters/derived_counters.xml"; + // rocprofiler requires ROCPROFILER_METRICS_PATH to be set + std::string rocprofiler_metrics_path = get_rocm_path() + "/share/rocprofiler-sdk/"; // set rocm prefix - int result = setenv("ROCP_METRICS", rocprofiler_metrics_path.c_str(), 0); + int result = setenv("ROCPROFILER_METRICS_PATH", rocprofiler_metrics_path.c_str(), 0); if (result != 0) { - RDC_LOG(RDC_ERROR, "setenv ROCP_METRICS failed! " << result); + RDC_LOG(RDC_ERROR, "setenv ROCPROFILER_METRICS_PATH failed! " << result); return RDC_ST_PERM_ERROR; } // check that env exists - const char* rocprofiler_metrics_env = getenv("ROCP_METRICS"); + const char* rocprofiler_metrics_env = getenv("ROCPROFILER_METRICS_PATH"); if (rocprofiler_metrics_env == nullptr) { - RDC_LOG(RDC_ERROR, "ROCP_METRICS is not set!"); + RDC_LOG(RDC_ERROR, "ROCPROFILER_METRICS_PATH is not set!"); return RDC_ST_NO_DATA; } // check that file can be accessed std::ifstream test_file(rocprofiler_metrics_env); if (!test_file.good()) { - RDC_LOG(RDC_ERROR, "failed to open ROCP_METRICS: " << rocprofiler_metrics_env); + RDC_LOG(RDC_ERROR, "failed to open ROCPROFILER_METRICS_PATH: " << rocprofiler_metrics_env); return RDC_ST_FILE_ERROR; } diff --git a/rdc_libs/rdc_modules/rdc_rocp/CMakeLists.txt b/rdc_libs/rdc_modules/rdc_rocp/CMakeLists.txt index 7db6d82..252749c 100644 --- a/rdc_libs/rdc_modules/rdc_rocp/CMakeLists.txt +++ b/rdc_libs/rdc_modules/rdc_rocp/CMakeLists.txt @@ -9,30 +9,31 @@ set(RDC_ROCP_LIB_COMPONENT "lib${RDC_ROCP_LIB}") set(RDC_ROCP_LIB_SRC_LIST "${BOOTSTRAP_LIB_SRC_DIR}/RdcLogger.cc" "${SRC_DIR}/RdcTelemetryLib.cc" + "${SRC_DIR}/RdcRocpCounterSampler.cc" "${SRC_DIR}/RdcRocpBase.cc") set(RDC_ROCP_LIB_INC_LIST "${PROJECT_SOURCE_DIR}/include/rdc/rdc.h" "${RDC_LIB_INC_DIR}/RdcDiagnosticLibInterface.h" "${RDC_LIB_INC_DIR}/rdc_common.h" "${RDC_LIB_INC_DIR}/RdcLogger.h" - "${INC_DIR}/RdcRocpBase.h") + "${INC_DIR}/RdcRocpBase.h" + "${INC_DIR}/RdcRocpCounterSampler.h") if(BUILD_PROFILER) message("Build librdc_rocp.so is enabled, make sure ROCmTools is installed.") message("RDC_ROCP_LIB_INC_LIST=${RDC_ROCP_LIB_INC_LIST}") - set(ROCPROFILER_LIB rocprofiler::rocprofiler) - # below provides rocprofiler::rocprofiler package - include(Findrocprofiler) - + find_package(rocprofiler-sdk + HINTS ${ROCM_DIR}/lib/cmake + CONFIGURE REQUIRED) find_package(hsa-runtime64 NAMES hsa-runtime64 HINTS ${ROCM_DIR}/lib/cmake CONFIGURE REQUIRED) set(RDC_LIB_MODULES ${RDC_LIB_MODULES} ${RDC_ROCP_LIB} PARENT_SCOPE) add_library(${RDC_ROCP_LIB} SHARED ${RDC_ROCP_LIB_SRC_LIST} ${RDC_ROCP_LIB_INC_LIST}) - target_link_libraries(${RDC_ROCP_LIB} PRIVATE ${RDC_LIB} ${BOOTSTRAP_LIB} hsa-runtime64::hsa-runtime64 rocprofiler::rocprofiler pthread dl) + target_link_libraries(${RDC_ROCP_LIB} PRIVATE ${RDC_LIB} ${BOOTSTRAP_LIB} hsa-runtime64::hsa-runtime64 rocprofiler-sdk::rocprofiler-sdk pthread dl) target_include_directories(${RDC_ROCP_LIB} PRIVATE "${PROJECT_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}/include" diff --git a/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc index bd45af7..38a9fad 100644 --- a/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc +++ b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc @@ -22,7 +22,9 @@ THE SOFTWARE. #include "rdc_modules/rdc_rocp/RdcRocpBase.h" -#include +#include +#include +#include #include #include @@ -33,199 +35,63 @@ THE SOFTWARE. #include #include #include -#include -#include +#include #include // #include "hsa.h" #include "rdc/rdc.h" #include "rdc_lib/RdcLogger.h" #include "rdc_lib/RdcTelemetryLibInterface.h" +#include "rdc_modules/rdc_rocp/RdcRocpCounterSampler.h" namespace amd { namespace rdc { -static hsa_status_t get_agent_handle_cb(hsa_agent_t agent, void* agent_arr) { - hsa_device_type_t type; - - assert(agent_arr != nullptr); - - hsa_agent_arr_t* agent_arr_ = (hsa_agent_arr_t*)agent_arr; - - hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); - if (status != HSA_STATUS_SUCCESS) { - return status; - } - - if (type == HSA_DEVICE_TYPE_GPU) { - if (agent_arr_->count >= agent_arr_->capacity) { - agent_arr_->capacity *= 2; - agent_arr_->agents = - (hsa_agent_t*)realloc(agent_arr_->agents, agent_arr_->capacity * sizeof(hsa_agent_t)); - // realloc might set agents to nullptr upon failure - assert(agent_arr_->agents != nullptr); - } - agent_arr_->agents[agent_arr_->count] = agent; - ++agent_arr_->count; - } - - return HSA_STATUS_SUCCESS; -} - -double RdcRocpBase::read_feature(rocprofiler_t* context, uint32_t gpu_index) { - hsa_status_t status = rocprofiler_read(context, 0); - assert(status == HSA_STATUS_SUCCESS); - status = rocprofiler_get_data(context, 0); - assert(status == HSA_STATUS_SUCCESS); - status = rocprofiler_get_metrics(context); - assert(status == HSA_STATUS_SUCCESS); - switch (gpuid_to_feature[gpu_index].data.kind) { - case ROCPROFILER_DATA_KIND_DOUBLE: - return gpuid_to_feature[gpu_index].data.result_double; - break; - case ROCPROFILER_DATA_KIND_INT32: - return static_cast(gpuid_to_feature[gpu_index].data.result_int32); - break; - case ROCPROFILER_DATA_KIND_INT64: - return static_cast(gpuid_to_feature[gpu_index].data.result_int64); - break; - case ROCPROFILER_DATA_KIND_FLOAT: - return static_cast(gpuid_to_feature[gpu_index].data.result_float); - break; - default: - RDC_LOG(RDC_ERROR, - "ERROR: Unexpected feature kind: " << gpuid_to_feature[gpu_index].data.kind); - } - return 0.0; -} +double RdcRocpBase::run_profiler(uint32_t gpu_index, rdc_field_t field) { + thread_local std::vector records; -static int get_agents(hsa_agent_arr_t* agent_arr) { - int errcode = 0; + // initialize hsa. hsa_init() will also load the profiler libs under the hood hsa_status_t status = HSA_STATUS_SUCCESS; - agent_arr->capacity = 1; - agent_arr->count = 0; - agent_arr->agents = (hsa_agent_t*)calloc(agent_arr->capacity, sizeof(hsa_agent_t)); - assert(agent_arr->agents); - - status = hsa_iterate_agents(get_agent_handle_cb, agent_arr); - if (status != HSA_STATUS_SUCCESS) { - errcode = -1; - - agent_arr->capacity = 0; - agent_arr->count = 0; - free(agent_arr->agents); + auto counter_sampler = CounterSampler::get_samplers()[gpu_index]; + if (!counter_sampler) { + RDC_LOG(RDC_ERROR, "Error: Counter sampler not found for GPU index " << gpu_index << std::endl); + throw std::runtime_error("Counter sampler not found"); } - return errcode; -} - -bool createHsaQueue(hsa_queue_t** queue, hsa_agent_t gpu_agent) { - // create a single-producer queue - hsa_status_t status = hsa_queue_create(gpu_agent, 64, HSA_QUEUE_TYPE_SINGLE, NULL, NULL, - UINT32_MAX, UINT32_MAX, queue); - if (status != HSA_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, "Queue creation failed"); + auto field_it = field_to_metric.find(field); + if (field_it == field_to_metric.end()) { + RDC_LOG(RDC_ERROR, + "Error: Field " << field << " not found in field_to_metric map." << std::endl); + throw std::out_of_range("Field not found in field_to_metric map"); } + const std::string& metric_id = field_it->second; - status = hsa_amd_queue_set_priority(*queue, HSA_AMD_QUEUE_PRIORITY_HIGH); - if (status != HSA_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, "HSA Queue Priority Set Failed"); + try { + counter_sampler->sample_counter_values({metric_id}, records, collection_duration_us_k); + } catch (const std::exception& e) { + RDC_LOG(RDC_ERROR, "Error while sampling counter values: " << e.what() << std::endl); + throw; } - return (status == HSA_STATUS_SUCCESS); -} - -double RdcRocpBase::run_profiler(uint32_t gpu_index, rdc_field_t field) { - // initialize hsa. hsa_init() will also load the profiler libs under the hood - hsa_status_t status = HSA_STATUS_SUCCESS; - - gpuid_to_feature[gpu_index].kind = (rocprofiler_feature_kind_t)ROCPROFILER_FEATURE_KIND_METRIC; - gpuid_to_feature[gpu_index].name = field_to_metric[field]; - - // rocprofiler_t* contexts[agent_arr.count] = {0}; - std::vector contexts; - contexts.reserve(agent_arr.count); - rocprofiler_properties_t properties = { - queues[gpu_index], - 64, - NULL, - NULL, - }; - int mode = (ROCPROFILER_MODE_STANDALONE | ROCPROFILER_MODE_SINGLEGROUP); - status = rocprofiler_open(agent_arr.agents[gpu_index], &gpuid_to_feature[gpu_index], 1, - &contexts[gpu_index], mode, &properties); - const char* error_string = nullptr; - rocprofiler_error_string(&error_string); - if (error_string != nullptr) { - if (error_string[0] != '\0') { - RDC_LOG(RDC_ERROR, error_string); - } + // Aggregate counter values. Rocprof v1/v2 summed values across dimensions. + double value = 0.0; + for (auto& record : records) { + value += record.counter_value; // Summing up values from all dimensions. } - assert(status == HSA_STATUS_SUCCESS); - - status = rocprofiler_start(contexts[gpu_index], 0); - assert(status == HSA_STATUS_SUCCESS); - - // this is the duration for which the counter increments from zero. - // TODO: Return error if sampling interval is lower than this value - usleep(collection_duration_us_k); - - status = rocprofiler_stop(contexts[gpu_index], 0); - assert(status == HSA_STATUS_SUCCESS); - - double value = read_feature(contexts[gpu_index], gpu_index); - - usleep(100); - - status = rocprofiler_close(contexts[gpu_index]); - assert(status == HSA_STATUS_SUCCESS); return value; } const char* RdcRocpBase::get_field_id_from_name(rdc_field_t field) { - return field_to_metric.at(field); -} - -// TODO - map RDC gpu_index to node_id -// use rocprofiler to check which metrics are supported -void check_metrics_supported(uint32_t node_id, std::vector& metrics_all, - std::vector& metrics_good) { - typedef struct { - std::vector* metrics_all_; - std::vector* metrics_good_; - uint32_t driver_node_id; - } payload_t; - // callback for rocprofiler to check which metrics are supported - auto info_callback = [](const rocprofiler_info_data_t info, void* data) { - payload_t* payload = reinterpret_cast(data); - if (info.agent_index == payload->driver_node_id) { - auto it = - std::find(payload->metrics_all_->begin(), payload->metrics_all_->end(), info.metric.name); - if (it != payload->metrics_all_->end()) { - payload->metrics_good_->push_back(info.metric.name); - RDC_LOG(RDC_DEBUG, " gpu-agent" << info.agent_index << " : " << info.metric.name << " : " - << info.metric.description); - if (info.metric.expr != NULL) // if it's a derived metric, print it's formula - RDC_LOG(RDC_DEBUG, " " << info.metric.name << " = " << info.metric.expr); - } - } - return HSA_STATUS_SUCCESS; - }; - - payload_t payload = {&metrics_all, &metrics_good, node_id}; - hsa_status_t status = - rocprofiler_iterate_info(NULL, ROCPROFILER_INFO_KIND_METRIC, info_callback, &payload); - if (status != HSA_STATUS_SUCCESS) { - const char* errstr = nullptr; - hsa_status_string(status, &errstr); - RDC_LOG(RDC_ERROR, "hsa error: " << std::to_string(status) << " " << errstr); - } else { - for (auto& iter : *(payload.metrics_good_)) { - RDC_LOG(RDC_DEBUG, iter << " : exists"); - } + auto it = field_to_metric.find(field); + if (it == field_to_metric.end()) { + RDC_LOG(RDC_ERROR, + "Error: Field ID " << field << " not found in field_to_metric map." << std::endl); + throw std::out_of_range("Field ID not found in field_to_metric map"); } + + return field_to_metric.at(field); } const std::vector RdcRocpBase::get_field_ids() { @@ -237,20 +103,14 @@ const std::vector RdcRocpBase::get_field_ids() { } RdcRocpBase::RdcRocpBase() { - hsa_status_t status = hsa_init(); - if (status != HSA_STATUS_SUCCESS) { - const char* errstr = nullptr; - hsa_status_string(status, &errstr); - throw std::runtime_error("hsa error code: " + std::to_string(status) + " " + errstr); - } - // all fields static const std::map temp_field_map_k = { {RDC_FI_PROF_OCCUPANCY_PERCENT, "OccupancyPercent"}, {RDC_FI_PROF_ACTIVE_CYCLES, "GRBM_GUI_ACTIVE"}, {RDC_FI_PROF_ACTIVE_WAVES, "SQ_WAVES"}, {RDC_FI_PROF_ELAPSED_CYCLES, "GRBM_COUNT"}, - {RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "MfmaUtil"}, // same as TENSOR_ACTIVE but available for more GPUs + {RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, + "MfmaUtil"}, // same as TENSOR_ACTIVE but available for more GPUs {RDC_FI_PROF_GPU_UTIL_PERCENT, "GPU_UTIL"}, // metrics below are divided by time passed {RDC_FI_PROF_EVAL_MEM_R_BW, "FETCH_SIZE"}, @@ -261,33 +121,41 @@ RdcRocpBase::RdcRocpBase() { {RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "ValuPipeIssueUtil"}, }; + hsa_status_t status = hsa_init(); + if (status != HSA_STATUS_SUCCESS) { + const char* errstr = nullptr; + hsa_status_string(status, &errstr); + throw std::runtime_error("hsa error code: " + std::to_string(status) + " " + errstr); + } + + // check rocprofiler + if (int rocp_status = 0; + rocprofiler_is_initialized(&rocp_status) == ROCPROFILER_STATUS_SUCCESS && rocp_status != 1) { + throw std::runtime_error("Rocprofiler is not initialized. status: " + + std::to_string(rocp_status)); + } + std::vector all_fields; std::vector checked_fields; + // populate list of agents + agents = CounterSampler::get_available_agents(); + RDC_LOG(RDC_DEBUG, "Agent count: " << agents.size()); + samplers = CounterSampler::get_samplers(); + + // populate fields for (auto& [k, v] : temp_field_map_k) { all_fields.push_back(v); } - // populate list of agents - int errcode = get_agents(&agent_arr); - if (errcode != 0) { - return; - } - RDC_LOG(RDC_DEBUG, "Agent count: " << agent_arr.count); - - uint32_t driver_node_id = 0; - for (uint32_t gpu_index = 0; gpu_index < agent_arr.count; gpu_index++) { - status = hsa_agent_get_info(agent_arr.agents[gpu_index], - static_cast(HSA_AMD_AGENT_INFO_DRIVER_NODE_ID), - &driver_node_id); - if (status != HSA_STATUS_SUCCESS) { - const char* errstr = nullptr; - hsa_status_string(status, &errstr); - RDC_LOG(RDC_ERROR, "hsa error: " << std::to_string(status) << " " << errstr); - } else { - RDC_LOG(RDC_DEBUG, "gpu_index[" << gpu_index << "] = node_id[" << driver_node_id << "]"); + // find intersection of supported and requested fields + for (uint32_t gpu_index = 0; gpu_index < agents.size(); gpu_index++) { + auto& cs = *samplers[gpu_index]; + RDC_LOG(RDC_DEBUG, + "gpu_index[" << gpu_index << "] = node_id[" << agents[gpu_index].node_id << "]"); + for (auto& [str, id] : cs.get_supported_counters(cs.get_agent())) { + checked_fields.emplace_back(str); } - check_metrics_supported(driver_node_id, all_fields, checked_fields); for (auto& [k, v] : temp_field_map_k) { auto found = std::find(checked_fields.begin(), checked_fields.end(), v); @@ -298,22 +166,6 @@ RdcRocpBase::RdcRocpBase() { } RDC_LOG(RDC_DEBUG, "Rocprofiler supports " << field_to_metric.size() << " fields"); - - for (uint32_t gpu_index = 0; gpu_index < agent_arr.count; gpu_index++) { - for (const auto& [k, v] : field_to_metric) { - rocprofiler_feature_t temp_feature; - temp_feature.kind = (rocprofiler_feature_kind_t)ROCPROFILER_FEATURE_KIND_METRIC; - temp_feature.name = v; - gpuid_to_feature.insert({gpu_index, temp_feature}); - } - } - - for (uint32_t gpu_index = 0; gpu_index < agent_arr.count; gpu_index++) { - queues.push_back(nullptr); - if (!createHsaQueue(&queues[gpu_index], agent_arr.agents[gpu_index])) { - RDC_LOG(RDC_ERROR, "can't create queues[" << gpu_index << "]\n"); - } - } } RdcRocpBase::~RdcRocpBase() { @@ -332,10 +184,6 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, double* value) return RDC_ST_BAD_PARAMETER; } - hsa_status_t status = HSA_STATUS_SUCCESS; - if (status != HSA_STATUS_SUCCESS) { - return Rocp2RdcError(status); - } const auto start_time = std::chrono::high_resolution_clock::now(); *value = run_profiler(gpu_index, field); const auto stop_time = std::chrono::high_resolution_clock::now(); @@ -343,8 +191,6 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, double* value) if (eval_fields.find(field) != eval_fields.end()) { const auto elapsed = std::chrono::duration_cast(stop_time - start_time).count(); - // RDC_LOG(RDC_DEBUG, "INDEX: " << gpu_index << " before[" << *value << "] after[" - // << (*value / elapsed) << "]"); *value = *value / elapsed; } // GPU_UTIL metric is available on more GPUs than ENGINE_ACTIVE. @@ -352,16 +198,7 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, double* value) if (field == RDC_FI_PROF_GPU_UTIL_PERCENT) { *value = *value / 100.0F; } - return Rocp2RdcError(status); -} - -rdc_status_t RdcRocpBase::Rocp2RdcError(hsa_status_t status) { - switch (status) { - case HSA_STATUS_SUCCESS: - return RDC_ST_OK; - default: - return RDC_ST_UNKNOWN_ERROR; - } + return RDC_ST_OK; } } // namespace rdc diff --git a/rdc_libs/rdc_modules/rdc_rocp/RdcRocpCounterSampler.cc b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpCounterSampler.cc new file mode 100644 index 0000000..9b593a3 --- /dev/null +++ b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpCounterSampler.cc @@ -0,0 +1,350 @@ +// MIT License +// +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "rdc_modules/rdc_rocp/RdcRocpCounterSampler.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rdc_lib/RdcLogger.h" + +template +void RocprofilerCall(Callable&& callable, const std::string& msg, const char* file, int line) { + auto result = callable(); + if (result != ROCPROFILER_STATUS_SUCCESS) { + std::string status_msg = rocprofiler_get_status_string(result); + RDC_LOG(RDC_ERROR, "[CALL][" << file << ":" << line << "] " << msg << " failed with error code " + << result << ": " << status_msg << std::endl); + std::stringstream errmsg{}; + errmsg << "[CALL][" << file << ":" << line << "] " << msg << " failure (" << status_msg << ")"; + throw std::runtime_error(errmsg.str()); + } +} + +namespace amd { +namespace rdc { + +std::vector> CounterSampler::samplers_; + +std::vector>& CounterSampler::get_samplers() { return samplers_; } + +CounterSampler::CounterSampler(rocprofiler_agent_id_t agent) : agent_(agent) { + // Setup context (should only be done once per agent) + auto client_thread = rocprofiler_callback_thread_t{}; + RocprofilerCall([&]() { return rocprofiler_create_context(&ctx_); }, "context creation failed", + __FILE__, __LINE__); + + RocprofilerCall( + [&]() { + // 4096 is total buffer size + // 2048 is size for callback buffer, not likely used as rdc reads results immediately, not + // from buffer feature will drop in future + return rocprofiler_create_buffer( + ctx_, 4096, 2048, ROCPROFILER_BUFFER_POLICY_LOSSLESS, + [](rocprofiler_context_id_t, rocprofiler_buffer_id_t, rocprofiler_record_header_t**, + size_t, void*, uint64_t) {}, + nullptr, &buf_); + }, + "buffer creation failed", __FILE__, __LINE__); + RocprofilerCall([&]() { return rocprofiler_create_callback_thread(&client_thread); }, + "failure creating callback thread", __FILE__, __LINE__); + RocprofilerCall([&]() { return rocprofiler_assign_callback_thread(buf_, client_thread); }, + "failed to assign thread for buffer", __FILE__, __LINE__); + + RocprofilerCall( + [&]() { + return rocprofiler_configure_device_counting_service( + ctx_, buf_, agent, + [](rocprofiler_context_id_t context_id, rocprofiler_agent_id_t, + rocprofiler_agent_set_profile_callback_t set_config, void* user_data) { + if (user_data) { + auto* sampler = static_cast(user_data); + sampler->set_profile(context_id, set_config); + } + }, + this); + }, + "Could not setup buffered service", __FILE__, __LINE__); +} + +CounterSampler::~CounterSampler() { rocprofiler_stop_context(ctx_); } + +const std::string& CounterSampler::decode_record_name( + const rocprofiler_record_counter_t& rec) const { + static auto roc_counters = [this]() { + auto name_to_id = CounterSampler::get_supported_counters(agent_); + std::map id_to_name; + for (const auto& [name, id] : name_to_id) { + id_to_name.emplace(id.handle, name); + } + return id_to_name; + }(); + rocprofiler_counter_id_t counter_id = {.handle = 0}; + rocprofiler_query_record_counter_id(rec.id, &counter_id); + + auto it = roc_counters.find(counter_id.handle); + if (it == roc_counters.end()) { + RDC_LOG(RDC_ERROR, "Error: Counter handle " << counter_id.handle + << " not found in roc_counters." << std::endl); + throw std::runtime_error("Counter handle not found in roc_counters"); + } + + return it->second; +} + +std::unordered_map CounterSampler::get_record_dimensions( + const rocprofiler_record_counter_t& rec) { + std::unordered_map out; + rocprofiler_counter_id_t counter_id = {.handle = 0}; + rocprofiler_query_record_counter_id(rec.id, &counter_id); + auto dims = get_counter_dimensions(counter_id); + + for (auto& dim : dims) { + size_t pos = 0; + rocprofiler_query_record_dimension_position(rec.id, dim.id, &pos); + out.emplace(dim.name, pos); + } + return out; +} + +void CounterSampler::sample_counter_values(const std::vector& counters, + std::vector& out, + uint64_t duration) { + auto profile_cached = cached_profiles_.find(counters); + if (profile_cached == cached_profiles_.end()) { + size_t expected_size = 0; + rocprofiler_profile_config_id_t profile = {}; + std::vector gpu_counters; + auto roc_counters = get_supported_counters(agent_); + for (const auto& counter : counters) { + auto it = roc_counters.find(counter); + if (it == roc_counters.end()) { + RDC_LOG(RDC_ERROR, "Counter " << counter << " not found\n"); + continue; + } + gpu_counters.push_back(it->second); + expected_size += get_counter_size(it->second); + } + RocprofilerCall( + [&]() { + return rocprofiler_create_profile_config(agent_, gpu_counters.data(), gpu_counters.size(), + &profile); + }, + "Could not create profile", __FILE__, __LINE__); + cached_profiles_.emplace(counters, profile); + profile_sizes_.emplace(profile.handle, expected_size); + profile_cached = cached_profiles_.find(counters); + } + + if (profile_sizes_.find(profile_cached->second.handle) == profile_sizes_.end()) { + RDC_LOG(RDC_ERROR, "Error: Profile handle " << profile_cached->second.handle + << " not found in profile_sizes_." << std::endl); + throw std::runtime_error("Profile handle not found in profile_sizes_"); + } + out.resize(profile_sizes_.at(profile_cached->second.handle)); + profile_ = profile_cached->second; + rocprofiler_start_context(ctx_); + size_t out_size = out.size(); + // Wait for sampling window to collect metrics + usleep(duration); + rocprofiler_sample_device_counting_service(ctx_, {}, ROCPROFILER_COUNTER_FLAG_NONE, out.data(), + &out_size); + rocprofiler_stop_context(ctx_); + out.resize(out_size); +} + +std::vector CounterSampler::get_available_agents() { + std::vector agents; + rocprofiler_query_available_agents_cb_t iterate_cb = [](rocprofiler_agent_version_t agents_ver, + const void** agents_arr, + size_t num_agents, void* udata) { + if (agents_ver != ROCPROFILER_AGENT_INFO_VERSION_0) + throw std::runtime_error{"unexpected rocprofiler agent version"}; + auto* agents_v = static_cast*>(udata); + for (size_t i = 0; i < num_agents; ++i) { + const auto* rocp_agent = static_cast(agents_arr[i]); + if (rocp_agent->type == ROCPROFILER_AGENT_TYPE_GPU) agents_v->emplace_back(*rocp_agent); + } + return ROCPROFILER_STATUS_SUCCESS; + }; + + RocprofilerCall( + [&]() { + return rocprofiler_query_available_agents( + ROCPROFILER_AGENT_INFO_VERSION_0, iterate_cb, sizeof(rocprofiler_agent_t), + const_cast(static_cast(&agents))); + }, + "query available agents", __FILE__, __LINE__); + return agents; +} + +void CounterSampler::set_profile(rocprofiler_context_id_t ctx, + rocprofiler_agent_set_profile_callback_t cb) const { + if (profile_.handle != 0) { + cb(ctx, profile_); + } +} + +size_t CounterSampler::get_counter_size(rocprofiler_counter_id_t counter) { + size_t size = 1; + rocprofiler_iterate_counter_dimensions( + counter, + [](rocprofiler_counter_id_t, const rocprofiler_record_dimension_info_t* dim_info, + size_t num_dims, void* user_data) { + size_t* s = static_cast(user_data); + for (size_t i = 0; i < num_dims; i++) { + *s *= dim_info[i].instance_size; + } + return ROCPROFILER_STATUS_SUCCESS; + }, + static_cast(&size)); + return size; +} + +std::unordered_map CounterSampler::get_supported_counters( + rocprofiler_agent_id_t agent) { + std::unordered_map out; + std::vector gpu_counters; + + RocprofilerCall( + [&]() { + return rocprofiler_iterate_agent_supported_counters( + agent, + [](rocprofiler_agent_id_t, rocprofiler_counter_id_t* counters, size_t num_counters, + void* user_data) { + std::vector* vec = + static_cast*>(user_data); + for (size_t i = 0; i < num_counters; i++) { + vec->push_back(counters[i]); + } + return ROCPROFILER_STATUS_SUCCESS; + }, + static_cast(&gpu_counters)); + }, + "Could not fetch supported counters", __FILE__, __LINE__); + for (auto& counter : gpu_counters) { + rocprofiler_counter_info_v0_t version; + RocprofilerCall( + [&]() { + return rocprofiler_query_counter_info(counter, ROCPROFILER_COUNTER_INFO_VERSION_0, + static_cast(&version)); + }, + "Could not query info for counter", __FILE__, __LINE__); + out.emplace(version.name, counter); + } + return out; +} + +std::vector CounterSampler::get_counter_dimensions( + rocprofiler_counter_id_t counter) { + std::vector dims; + rocprofiler_available_dimensions_cb_t cb = [](rocprofiler_counter_id_t, + const rocprofiler_record_dimension_info_t* dim_info, + size_t num_dims, void* user_data) { + std::vector* vec = + static_cast*>(user_data); + for (size_t i = 0; i < num_dims; i++) { + vec->push_back(dim_info[i]); + } + return ROCPROFILER_STATUS_SUCCESS; + }; + RocprofilerCall([&]() { return rocprofiler_iterate_counter_dimensions(counter, cb, &dims); }, + "Could not iterate counter dimensions", __FILE__, __LINE__); + return dims; +} + +int tool_init(rocprofiler_client_finalize_t, void*) { + // Get the agents available on the device + auto agents = CounterSampler::get_available_agents(); + if (agents.empty()) { + RDC_LOG(RDC_ERROR, "No agents found\n"); + return -1; + } + + for (auto agent : agents) { + CounterSampler::get_samplers().push_back(std::make_shared(agent.id)); + } + + // no errors + return 0; +} + +void tool_fini(void* user_data) { + auto* output_stream = static_cast(user_data); + *output_stream << std::flush; + if (output_stream != &std::cout && output_stream != &std::cerr) delete output_stream; +} + +extern "C" rocprofiler_tool_configure_result_t* rocprofiler_configure(uint32_t version, + const char* runtime_version, + uint32_t priority, + rocprofiler_client_id_t* id) { + // set the client name + id->name = "CounterClientSample"; + + // compute major/minor/patch version info + uint32_t major = version / 10000; + uint32_t minor = (version % 10000) / 100; + uint32_t patch = version % 100; + + // generate info string + auto info = std::stringstream{}; + info << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << major << "." + << minor << "." << patch << " (" << runtime_version << ")"; + + std::clog << info.str() << std::endl; + + std::ostream* output_stream = nullptr; + std::string filename = "counter_collection.log"; + if (auto* outfile = getenv("ROCPROFILER_SAMPLE_OUTPUT_FILE"); outfile) filename = outfile; + if (filename == "stdout") + output_stream = &std::cout; + else if (filename == "stderr") + output_stream = &std::cerr; + else + output_stream = new std::ofstream{filename}; + + // create configure data + static auto cfg = + rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), &tool_init, + &tool_fini, static_cast(output_stream)}; + + // return pointer to configure data + return &cfg; +} + +} // namespace rdc +} // namespace amd diff --git a/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc b/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc index 4db8e97..8863209 100644 --- a/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc +++ b/rdc_libs/rdc_modules/rdc_rocp/RdcTelemetryLib.cc @@ -52,7 +52,7 @@ bool is_rocp_disabled() { [&value_str](const char* val) { return value_str == val; }); } -rdc_status_t rdc_module_init(uint64_t /*flags*/) { +rdc_status_t rdc_module_init(uint64_t /*flags*/) { if (is_rocp_disabled()) { // rocprofiler does NOT work in gtest. // GTest starts up multiple instances of the progam under test,