diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 6bbf746..c3cae14 100755 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -126,6 +126,12 @@ set(HEALTH_EXAMPLE_EXE "health") add_executable(${HEALTH_EXAMPLE_EXE} "${HEALTH_EXAMPLE_SRC_LIST}") target_link_libraries(${HEALTH_EXAMPLE_EXE} pthread dl rdc_bootstrap) +set(CONFIG_EXAMPLE_SRC_LIST "config_example.cc") +cmake_print_variables(CONFIG_EXAMPLE_SRC_LIST) +set(CONFIG_EXAMPLE_EXE "config") +add_executable(${CONFIG_EXAMPLE_EXE} "${CONFIG_EXAMPLE_SRC_LIST}") +target_link_libraries(${CONFIG_EXAMPLE_EXE} pthread dl rdc_bootstrap) + message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") message(" Finished Cmake Example ") message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&") diff --git a/example/config_example.cc b/example/config_example.cc new file mode 100644 index 0000000..b05726f --- /dev/null +++ b/example/config_example.cc @@ -0,0 +1,156 @@ +#include + +#include +#include +#include + +#include "rdc/rdc.h" + +int main() { + rdc_gpu_group_t group_id; + rdc_status_t result; + bool standalone = false; + rdc_handle_t rdc_handle; + uint32_t count = 0; + rdc_config_setting_list_t settings_list; + rdc_config_setting_t setting; + uint64_t watts; + + char hostIpAddress[] = {"localhost:50051"}; + char group_name[] = {"group1"}; + + // Select the embedded mode and standalone mode dynamically. + std::cout << "Start rdci in: \n"; + std::cout << "0 - Embedded mode \n"; + std::cout << "1 - Standalone mode \n"; + while (!(std::cin >> standalone)) { + std::cout << "Invalid input.\n"; + std::cin.clear(); + std::cin.ignore(); + } + std::cout << std::endl; + std::cout << (standalone ? "Standalone mode selected.\n" : "Embedded mode selected.\n"); + + // Init the rdc + result = rdc_init(0); + + if (result != RDC_ST_OK) { + std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl; + goto cleanup; + } else { + std::cout << "RDC Initialized.\n"; + } + + if (standalone) { // standalone + result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr); + if (result != RDC_ST_OK) { + std::cout << "Error connecting to remote rdcd. Return: " << rdc_status_string(result) + << std::endl; + goto cleanup; + } + } else { // embedded + result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle); + if (result != RDC_ST_OK) { + std::cout << "Error starting embedded RDC engine. Return: " << rdc_status_string(result) + << std::endl; + goto cleanup; + } + } + + // Now we can use the same API for both standalone and embedded + // Get the list of devices in the system + uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; + result = rdc_device_get_all(rdc_handle, gpu_index_list, &count); + if (result != RDC_ST_OK) { + std::cout << "Error to find devices on the system. Return: " << rdc_status_string(result); + goto cleanup; + } + if (count == 0) { + std::cout << "No GPUs find on the sytem "; + goto cleanup; + } else { + std::cout << count << " GPUs found in the system.\n"; + } + + // Create the group + result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, group_name, &group_id); + if (result != RDC_ST_OK) { + std::cout << "Error creating group. Return: " << rdc_status_string(result); + goto cleanup; + } + std::cout << "Created the GPU group " << group_id << std::endl; + + // Add all GPUs to the group + for (uint32_t i = 0; i < count; i++) { + result = rdc_group_gpu_add(rdc_handle, group_id, gpu_index_list[i]); // Add GPU 0 + if (result != RDC_ST_OK) { + std::cout << "Error adding group. Return: " << rdc_status_string(result); + goto cleanup; + } + rdc_device_attributes_t attribute; + result = rdc_device_get_attributes(rdc_handle, gpu_index_list[i], &attribute); + if (result != RDC_ST_OK) { + std::cout << "Error get GPU attribute. Return: " << rdc_status_string(result); + goto cleanup; + } + std::cout << "Add GPU " << gpu_index_list[i] << ":" << attribute.device_name << " to group " + << group_id << std::endl; + } + + setting.type = RDC_CFG_POWER_LIMIT; + // Our targeted value is 195 Watts, which will be converted into Microwatts inside of + // rdc_config_set + setting.target_value = 195; + result = rdc_config_set(rdc_handle, group_id, setting); + if (result != RDC_ST_OK) { + std::cout << "Error set config RDC_CFG_POWER_LIMIT, Return: " << rdc_status_string(result) + << std::endl; + goto cleanup; + } + + result = rdc_config_get(rdc_handle, group_id, &settings_list); + if (result != RDC_ST_OK) { + std::cout << "Error get config, Return: " << rdc_status_string(result) << std::endl; + goto cleanup; + } + + // Prompt user to change amd-smi to other value, and watch rdc config change it back + + std::cout << "Config before wait:" << std::endl; + + result = rdc_config_get(rdc_handle, group_id, &settings_list); + if (result != RDC_ST_OK) { + std::cout << "Error get config, Return: " << rdc_status_string(result) << std::endl; + goto cleanup; + } + + std::cout << "The config will keep the power limit to 195 Watts" << std::endl; + std::cout << "You can change the power limit using amd-smi, the RDC config module should be able " + "to detect it and set it back" + << std::endl; + std::cout << "Waiting 3 minutes before exit ..." << std::endl; + std::this_thread::sleep_for(std::chrono::minutes(3)); + + result = rdc_config_clear(rdc_handle, group_id); + if (result != RDC_ST_OK) { + std::cout << "Error clear config, Return: " << rdc_status_string(result) << std::endl; + goto cleanup; + } + +//... clean up +cleanup: + std::cout << "Cleaning up.\n"; + + result = rdc_group_gpu_destroy(rdc_handle, group_id); + if (result != RDC_ST_OK) { + std::cout << "Error delete GPU group. Return: " << rdc_status_string(result); + } + std::cout << "Deleted the GPU group " << group_id << std::endl; + + if (standalone) + rdc_disconnect(rdc_handle); + else + rdc_stop_embedded(rdc_handle); + rdc_shutdown(); + return result; +} diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index b68025a..dc380ea 100644 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -147,6 +147,11 @@ typedef enum { INTEGER = 0, DOUBLE, STRING, BLOB } rdc_field_type_t; */ #define RDC_MAX_VERSION_STR_LENGTH 60 +/** + * @brief Max configuration can be collected using the configuration get + */ +#define RDC_MAX_CONFIG_SETTINGS 32 + /** * These enums are used to specify a particular field to be retrieved. */ @@ -182,11 +187,11 @@ typedef enum { /** * @brief GPU usage related fields */ - RDC_FI_GPU_UTIL = 500, //!< GPU Utilization - RDC_FI_GPU_MEMORY_USAGE, //!< Memory usage of the GPU instance - RDC_FI_GPU_MEMORY_TOTAL, //!< Total memory of the GPU instance - RDC_FI_GPU_MM_ENC_UTIL, //!< Multimedia encoder busy percentage - RDC_FI_GPU_MM_DEC_UTIL, //!< Multimedia decoder busy percentage + RDC_FI_GPU_UTIL = 500, //!< GPU Utilization + RDC_FI_GPU_MEMORY_USAGE, //!< Memory usage of the GPU instance + RDC_FI_GPU_MEMORY_TOTAL, //!< Total memory of the GPU instance + RDC_FI_GPU_MM_ENC_UTIL, //!< Multimedia encoder busy percentage + RDC_FI_GPU_MM_DEC_UTIL, //!< Multimedia decoder busy percentage RDC_FI_GPU_MEMORY_ACTIVITY, //!< Memory busy percentage /** @@ -251,16 +256,16 @@ typedef enum { RDC_FI_XGMI_6_READ_KB, //!< XGMI_6 accumulated data read size (KB) RDC_FI_XGMI_7_READ_KB, //!< XGMI_7 accumulated data read size (KB) - RDC_FI_XGMI_0_WRITE_KB, //!< XGMI_0 accumulated data write size (KB) - RDC_FI_XGMI_1_WRITE_KB, //!< XGMI_1 accumulated data write size (KB) - RDC_FI_XGMI_2_WRITE_KB, //!< XGMI_2 accumulated data write size (KB) - RDC_FI_XGMI_3_WRITE_KB, //!< XGMI_3 accumulated data write size (KB) - RDC_FI_XGMI_4_WRITE_KB, //!< XGMI_4 accumulated data write size (KB) - RDC_FI_XGMI_5_WRITE_KB, //!< XGMI_5 accumulated data write size (KB) - RDC_FI_XGMI_6_WRITE_KB, //!< XGMI_6 accumulated data write size (KB) - RDC_FI_XGMI_7_WRITE_KB, //!< XGMI_7 accumulated data write size (KB) - RDC_FI_XGMI_TOTAL_READ_KB, //!< XGMI_SUM accumulated data read size (KB) - RDC_FI_XGMI_TOTAL_WRITE_KB, //!< XGMI_SUM accumulated data write size (KB) + RDC_FI_XGMI_0_WRITE_KB, //!< XGMI_0 accumulated data write size (KB) + RDC_FI_XGMI_1_WRITE_KB, //!< XGMI_1 accumulated data write size (KB) + RDC_FI_XGMI_2_WRITE_KB, //!< XGMI_2 accumulated data write size (KB) + RDC_FI_XGMI_3_WRITE_KB, //!< XGMI_3 accumulated data write size (KB) + RDC_FI_XGMI_4_WRITE_KB, //!< XGMI_4 accumulated data write size (KB) + RDC_FI_XGMI_5_WRITE_KB, //!< XGMI_5 accumulated data write size (KB) + RDC_FI_XGMI_6_WRITE_KB, //!< XGMI_6 accumulated data write size (KB) + RDC_FI_XGMI_7_WRITE_KB, //!< XGMI_7 accumulated data write size (KB) + RDC_FI_XGMI_TOTAL_READ_KB, //!< XGMI_SUM accumulated data read size (KB) + RDC_FI_XGMI_TOTAL_WRITE_KB, //!< XGMI_SUM accumulated data write size (KB) /** * @brief ROC-profiler related fields @@ -340,14 +345,14 @@ typedef enum { /** * @brief RDC health related fields */ - RDC_HEALTH_XGMI_ERROR = 3000, //!< XGMI one or more errors detected - RDC_HEALTH_PCIE_REPLAY_COUNT, //!< Total PCIE replay count - RDC_HEALTH_RETIRED_PAGE_NUM, //!< Retired page number - RDC_HEALTH_PENDING_PAGE_NUM, //!< Pending page number - RDC_HEALTH_RETIRED_PAGE_LIMIT, //!< The threshold of retired page - RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT,//!< The threshold of uncorrectable page - RDC_HEALTH_POWER_THROTTLE_TIME, //!< Power throttle status counter - RDC_HEALTH_THERMAL_THROTTLE_TIME, //!< Total time in thermal throttle status (microseconds) + RDC_HEALTH_XGMI_ERROR = 3000, //!< XGMI one or more errors detected + RDC_HEALTH_PCIE_REPLAY_COUNT, //!< Total PCIE replay count + RDC_HEALTH_RETIRED_PAGE_NUM, //!< Retired page number + RDC_HEALTH_PENDING_PAGE_NUM, //!< Pending page number + RDC_HEALTH_RETIRED_PAGE_LIMIT, //!< The threshold of retired page + RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT, //!< The threshold of uncorrectable page + RDC_HEALTH_POWER_THROTTLE_TIME, //!< Power throttle status counter + RDC_HEALTH_THERMAL_THROTTLE_TIME, //!< Total time in thermal throttle status (microseconds) } rdc_field_t; // even and odd numbers are used for correctable and uncorrectable errors @@ -517,7 +522,7 @@ typedef enum { */ typedef enum { RDC_AMDMSI_COMPONENT - //If needed later, add them one by one + // If needed later, add them one by one } rdc_component_t; /** @@ -571,8 +576,8 @@ typedef struct { typedef void (*rdc_callback_t)(void*, void*); typedef struct { - rdc_callback_t callback; //!< Callback sends logs for running diagnostics - void* cookie; //!< Cookie is used to identify different callbacks and supply them with data + rdc_callback_t callback; //!< Callback sends logs for running diagnostics + void* cookie; //!< Cookie is used to identify different callbacks and supply them with data } rdc_diag_callback_t; /** @@ -599,7 +604,7 @@ typedef enum { RDC_POLICY_ACTION_NONE, RDC_POLICY_ACTION_GPU_RESET } rdc_policy_ */ typedef struct { rdc_policy_condition_t condition; //!< condition to meet - rdc_policy_action_t action; //!< Action to take + rdc_policy_action_t action; //!< Action to take } rdc_policy_t; typedef enum { @@ -663,12 +668,12 @@ typedef struct { * @brief type of health watches */ typedef enum { - RDC_HEALTH_WATCH_PCIE = 0x1, //!< PCIe system watches - RDC_HEALTH_WATCH_XGMI = 0x2, //!< XGMI system watches - RDC_HEALTH_WATCH_MEM = 0x4, //!< Memory watches - RDC_HEALTH_WATCH_INFOROM = 0x8, //!< Inforom watches - RDC_HEALTH_WATCH_THERMAL = 0x10, //!< Temperature watches - RDC_HEALTH_WATCH_POWER = 0x20, //!< Power watches + RDC_HEALTH_WATCH_PCIE = 0x1, //!< PCIe system watches + RDC_HEALTH_WATCH_XGMI = 0x2, //!< XGMI system watches + RDC_HEALTH_WATCH_MEM = 0x4, //!< Memory watches + RDC_HEALTH_WATCH_INFOROM = 0x8, //!< Inforom watches + RDC_HEALTH_WATCH_THERMAL = 0x10, //!< Temperature watches + RDC_HEALTH_WATCH_POWER = 0x20, //!< Power watches } rdc_health_system_t; /** @@ -708,32 +713,56 @@ typedef enum { * @brief details of the health errors */ typedef struct { - char msg[MAX_HEALTH_MSG_LENGTH]; //!< The test result details - uint32_t code; //!< The low level error code + char msg[MAX_HEALTH_MSG_LENGTH]; //!< The test result details + uint32_t code; //!< The low level error code } rdc_health_detail_t; /** * @brief details of the per health incidents */ typedef struct { - uint32_t gpu_index; //!< which GPU in this group have the issue - rdc_health_system_t component; //!< which components have the issue - rdc_health_result_t health; //!< health diagnosis of this incident - rdc_health_detail_t error; //!< The details of the error, rdc_health_error_code_t + uint32_t gpu_index; //!< which GPU in this group have the issue + rdc_health_system_t component; //!< which components have the issue + rdc_health_result_t health; //!< health diagnosis of this incident + rdc_health_detail_t error; //!< The details of the error, rdc_health_error_code_t } rdc_health_incidents_t; - #define HEALTH_MAX_ERROR_ITEMS 64 /** * @brief The health responses for test cases */ typedef struct { - rdc_health_result_t overall_health; //!< The overall health of this entire host - unsigned int incidents_count; //!< The number of health incidents reported in this struct - rdc_health_incidents_t incidents[HEALTH_MAX_ERROR_ITEMS]; //!< Report of the errors detected + rdc_health_result_t overall_health; //!< The overall health of this entire host + unsigned int incidents_count; //!< The number of health incidents reported in this struct + rdc_health_incidents_t incidents[HEALTH_MAX_ERROR_ITEMS]; //!< Report of the errors detected } rdc_health_response_t; +/** + * @brief property id's for the configuration set/get + */ +typedef enum { + RDC_CFG_GFX_CLOCK_LIMIT, + RDC_CFG_MEMORY_CLOCK_LIMIT, + RDC_CFG_POWER_LIMIT +} rdc_config_type_t; + +/** + * @brief Value mapped to rdc_config_type_t property id for the configuration set/get + */ +typedef struct { + rdc_config_type_t type; + uint64_t target_value; +} rdc_config_setting_t; + +/** + * @brief Array of properties collected using the configuration get + */ +typedef struct { + uint32_t total_settings; + rdc_config_setting_t settings[RDC_MAX_CONFIG_SETTINGS]; +} rdc_config_setting_list_t; + /** * @brief Initialize ROCm RDC. * @@ -972,7 +1001,8 @@ rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle, uint32_t gpu_i * * @retval ::RDC_ST_OK is returned upon successful call. */ -rdc_status_t rdc_device_get_component_version(rdc_handle_t p_rdc_handle, rdc_component_t component, rdc_component_version_t* p_rdc_compv); +rdc_status_t rdc_device_get_component_version(rdc_handle_t p_rdc_handle, rdc_component_t component, + rdc_component_version_t* p_rdc_compv); /** * @brief Create a group contains multiple GPUs @@ -1382,7 +1412,7 @@ typedef struct { unsigned int version; rdc_policy_condition_t condition; //!< the condition that is meet rdc_gpu_group_t group_id; //!< The group id trigger this callback - int64_t value; //!< The current value that meet the condition + int64_t value; //!< The current value that meet the condition } rdc_policy_callback_response_t; /** @@ -1514,6 +1544,50 @@ rdc_status_t rdc_device_topology_get(rdc_handle_t p_rdc_handle, uint32_t gpu_ind * @retval ::RDC_ST_OK is returned upon successful call. */ rdc_status_t rdc_link_status_get(rdc_handle_t p_rdc_handle, rdc_link_status_t* results); +/** + * @brief Set one configuration + * + * @details Set the given configuration to all nodes belong to the given group + * + * @param[in] p_rdc_handle Node handle + * + * @param[in] group_id Group id to which node belongs + * + * @param[in] setting Configuration to be set for the nodes + * + * @retval RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_config_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + rdc_config_setting_t setting); + +/** + * @brief Get the configrations + * + * @details Get all the configurations for all nodes belong to the given group + * + * @param[in] p_rdc_handle Node handle + * + * @param[in] group_id Group id to which nodes belong + * + * @param[out] settings List of configurations returned. + * + * @retval RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_config_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + rdc_config_setting_list_t* settings); + +/** + * @brief Clear the setting + * + * @details Clear all the configurations for the nodes belongs to the given group + * + * @param[in] p_rdc_handle Node handle + * + * @param[in] group_id Group id to which nodes belong + * + * @retval RDC_ST_OK is returned upon successful call. + */ +rdc_status_t rdc_config_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id); #ifdef __cplusplus } diff --git a/include/rdc_lib/RdcConfigSettings.h b/include/rdc_lib/RdcConfigSettings.h new file mode 100644 index 0000000..f7f1eb6 --- /dev/null +++ b/include/rdc_lib/RdcConfigSettings.h @@ -0,0 +1,50 @@ +/* +Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef INCLUDE_RDC_LIB_RDCCONFIGSETTINGS_H_ +#define INCLUDE_RDC_LIB_RDCCONFIGSETTINGS_H_ + +#include + +#include "rdc/rdc.h" + +namespace amd { +namespace rdc { + +class RdcConfigSettings { + public: + // Set one configure + virtual rdc_status_t rdc_config_set(rdc_gpu_group_t group_id, rdc_config_setting_t setting) = 0; + + // Get the setting + virtual rdc_status_t rdc_config_get(rdc_gpu_group_t group_id, + rdc_config_setting_list_t* settings) = 0; + + // Clear the setting + virtual rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) = 0; + + virtual ~RdcConfigSettings() {} +}; +typedef std::shared_ptr RdcConfigSettingsPtr; +} // namespace rdc +} // namespace amd + +#endif // INCLUDE_RDC_LIB_RDCCONFIGSETTINGS_H_ diff --git a/include/rdc_lib/RdcHandler.h b/include/rdc_lib/RdcHandler.h index 1edb8e7..e03a741 100644 --- a/include/rdc_lib/RdcHandler.h +++ b/include/rdc_lib/RdcHandler.h @@ -81,18 +81,21 @@ class RdcHandler { // Diagnostic API virtual rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level, const char* config, size_t config_size, - rdc_diag_response_t* response, rdc_diag_callback_t* callback) = 0; + rdc_diag_response_t* response, + rdc_diag_callback_t* callback) = 0; virtual rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case, const char* config, size_t config_size, - rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) = 0; + rdc_diag_test_result_t* result, + rdc_diag_callback_t* callback) = 0; // Control API virtual rdc_status_t rdc_field_update_all(uint32_t wait_for_update) = 0; // It is just a client interface under the GRPC framework and is not used as an RDC API. // The reason is that RdcEmbeddedHandler::get_mixed_component_version does not need to be called. - virtual rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) = 0; + virtual rdc_status_t get_mixed_component_version(mixed_component_t component, + mixed_component_version_t* p_mixed_compv) = 0; // Policy API virtual rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) = 0; @@ -110,13 +113,24 @@ class RdcHandler { // Health API virtual rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) = 0; virtual rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) = 0; - virtual rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) = 0; + virtual rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, + rdc_health_response_t* response) = 0; virtual rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) = 0; // topology API virtual rdc_status_t rdc_device_topology_get(uint32_t gpu_index, rdc_device_topology_t* results) = 0; virtual rdc_status_t rdc_link_status_get(rdc_link_status_t* results) = 0; + // Set one configure + virtual rdc_status_t rdc_config_set(rdc_gpu_group_t group_id, rdc_config_setting_t setting) = 0; + + // Get the setting + virtual rdc_status_t rdc_config_get(rdc_gpu_group_t group_id, + rdc_config_setting_list_t* settings) = 0; + + // Clear the setting + virtual rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) = 0; + virtual ~RdcHandler() {} }; diff --git a/include/rdc_lib/impl/RdcConfigSettingsImpl.h b/include/rdc_lib/impl/RdcConfigSettingsImpl.h new file mode 100644 index 0000000..a58017e --- /dev/null +++ b/include/rdc_lib/impl/RdcConfigSettingsImpl.h @@ -0,0 +1,73 @@ +/* +Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef INCLUDE_RDC_LIB_IMPL_RDCCONFIGSETTINGSIMPL_H_ +#define INCLUDE_RDC_LIB_IMPL_RDCCONFIGSETTINGSIMPL_H_ + +#include +#include +#include // NOLINT +#include +#include + +#include "rdc_lib/RdcConfigSettings.h" +#include "rdc_lib/RdcGroupSettings.h" + +namespace amd { +namespace rdc { + +class RdcConfigSettingsImpl : public RdcConfigSettings { + public: + // Set one configure + rdc_status_t rdc_config_set(rdc_gpu_group_t group_id, rdc_config_setting_t setting) override; + + // Get the setting + rdc_status_t rdc_config_get(rdc_gpu_group_t group_id, + rdc_config_setting_list_t* settings) override; + + // clear the setting + rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) override; + + explicit RdcConfigSettingsImpl(const RdcGroupSettingsPtr& group_settings); + + private: + RdcGroupSettingsPtr group_settings_; + std::unordered_map> + cached_group_settings_; + std::thread monitor_thread_; + std::mutex mutex_; // Mutex for cached_group_settings_ + std::atomic is_running_; // Bool for if the thread should keep running + std::condition_variable cv_; + + // monitorSettings() is kicked off from the RdcConfigSettingsImpl constructor as it's own thread + // Every minute, it will check if gpu settings from amdsmi are the same as inside + // cached_group_settings_ If not, it sets the mismatched values to the value in + // cached_group_settings_ + void monitorSettings(); + uint64_t wattsToMicrowatts(uint64_t watts) const; + uint64_t microwattsToWatts(int microwatts) const; + rdc_status_t get_group_info(rdc_gpu_group_t group_id, rdc_group_info_t* rdc_group_info); +}; + +} // namespace rdc +} // namespace amd + +#endif // INCLUDE_RDC_LIB_IMPL_RDCCONFIGSETTINGSIMPL_H_ diff --git a/include/rdc_lib/impl/RdcEmbeddedHandler.h b/include/rdc_lib/impl/RdcEmbeddedHandler.h index c54e07f..06c2342 100644 --- a/include/rdc_lib/impl/RdcEmbeddedHandler.h +++ b/include/rdc_lib/impl/RdcEmbeddedHandler.h @@ -25,6 +25,7 @@ THE SOFTWARE. #include // NOLINT(build/c++11) #include "rdc_lib/RdcCacheManager.h" +#include "rdc_lib/RdcConfigSettings.h" #include "rdc_lib/RdcGroupSettings.h" #include "rdc_lib/RdcHandler.h" #include "rdc_lib/RdcMetricFetcher.h" @@ -86,17 +87,20 @@ class RdcEmbeddedHandler final : public RdcHandler { // Diagnostic API rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level, const char* config, size_t config_size, - rdc_diag_response_t* response, rdc_diag_callback_t* callback) override; + rdc_diag_response_t* response, + rdc_diag_callback_t* callback) override; rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case, const char* config, size_t config_size, - rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) override; + rdc_diag_test_result_t* result, + rdc_diag_callback_t* callback) override; // Control API rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override; // It is just a client interface under the GRPC framework and is not used as an RDC API. // Pure virtual functions need to be overridden. - rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) override; + rdc_status_t get_mixed_component_version(mixed_component_t component, + mixed_component_version_t* p_mixed_compv) override; // Policy API rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) override; @@ -106,19 +110,30 @@ class RdcEmbeddedHandler final : public RdcHandler { rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id, rdc_policy_condition_type_t condition_type) override; - rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id, rdc_policy_register_callback callback) override; + rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id, + rdc_policy_register_callback callback) override; rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) override; // Health API rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) override; rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override; - rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override; + rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t* response) override; rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override; rdc_status_t rdc_device_topology_get(uint32_t gpu_index, rdc_device_topology_t* results) override; rdc_status_t rdc_link_status_get(rdc_link_status_t* results) override; + // Set one configure + rdc_status_t rdc_config_set(rdc_gpu_group_t group_id, rdc_config_setting_t setting) override; + + // Get the setting + rdc_status_t rdc_config_get(rdc_gpu_group_t group_id, + rdc_config_setting_list_t* settings) override; + + // Clear the setting + rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) override; + explicit RdcEmbeddedHandler(rdc_operation_mode_t op_mode); ~RdcEmbeddedHandler() final; @@ -132,6 +147,7 @@ class RdcEmbeddedHandler final : public RdcHandler { RdcWatchTablePtr watch_table_; RdcMetricsUpdaterPtr metrics_updater_; RdcPolicyPtr policy_; + RdcConfigSettingsPtr config_handler_; std::future updater_; RdcTopologyLinkPtr topologylink_; }; diff --git a/include/rdc_lib/impl/RdcStandaloneHandler.h b/include/rdc_lib/impl/RdcStandaloneHandler.h index 7d08ae3..b90f0ac 100644 --- a/include/rdc_lib/impl/RdcStandaloneHandler.h +++ b/include/rdc_lib/impl/RdcStandaloneHandler.h @@ -23,8 +23,8 @@ THE SOFTWARE. #define INCLUDE_RDC_LIB_IMPL_RDCSTANDALONEHANDLER_H_ #include -#include #include +#include #include #include "rdc.grpc.pb.h" // NOLINT @@ -82,21 +82,35 @@ class RdcStandaloneHandler : public RdcHandler { // Diagnostic API rdc_status_t rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level, const char* config, size_t config_size, - rdc_diag_response_t* response, rdc_diag_callback_t* callback) override; + rdc_diag_response_t* response, + rdc_diag_callback_t* callback) override; rdc_status_t rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case, const char* config, size_t config_size, - rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) override; + rdc_diag_test_result_t* result, + rdc_diag_callback_t* callback) override; // Control RdcAPI rdc_status_t rdc_field_update_all(uint32_t wait_for_update) override; + // Set one configure + rdc_status_t rdc_config_set(rdc_gpu_group_t group_id, rdc_config_setting_t setting) override; + + // Get the setting + rdc_status_t rdc_config_get(rdc_gpu_group_t group_id, + rdc_config_setting_list_t* settings) override; + + // Clear the setting + rdc_status_t rdc_config_clear(rdc_gpu_group_t group_id) override; + // It is just a client interface under the GRPC framework and is not used as an RDC API. // Pure virtual functions need to be overridden - rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) override; + rdc_status_t get_mixed_component_version(mixed_component_t component, + mixed_component_version_t* p_mixed_compv) override; // Policy API rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) override; - rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count, rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) override; + rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count, + rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) override; rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id, rdc_policy_condition_type_t condition_type) override; @@ -109,7 +123,7 @@ class RdcStandaloneHandler : public RdcHandler { // Health API rdc_status_t rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) override; rdc_status_t rdc_health_get(rdc_gpu_group_t group_id, unsigned int* components) override; - rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t *response) override; + rdc_status_t rdc_health_check(rdc_gpu_group_t group_id, rdc_health_response_t* response) override; rdc_status_t rdc_health_clear(rdc_gpu_group_t group_id) override; rdc_status_t rdc_device_topology_get(uint32_t gpu_index, rdc_device_topology_t* results) override; @@ -129,11 +143,10 @@ class RdcStandaloneHandler : public RdcHandler { struct policy_thread_context { bool start; - std::thread *t; + std::thread* t; }; std::map policy_threads_; - }; } // namespace rdc diff --git a/protos/rdc.proto b/protos/rdc.proto index 083b843..5907143 100755 --- a/protos/rdc.proto +++ b/protos/rdc.proto @@ -207,6 +207,16 @@ service RdcAPI { // rdc_gpu_group_t group_id, // rdc_policy_condition_t condition); rpc GetTopology(GetTopologyRequest) returns (GetTopologyResponse) {} + + //Set one configure + rpc SetConfig(SetConfigRequest) returns (SetConfigResponse) {} + + //Get the setting + rpc GetConfig(GetConfigRequest) returns (GetConfigResponse) {} + + //Clear the setting + rpc ClearConfig(ClearConfigRequest) returns (ClearConfigResponse) {} + } message Empty { @@ -720,3 +730,42 @@ message GetTopologyResponse { Topology toppology = 2; } + + +enum rdc_config_type { + RDC_CFG_GFX_CLOCK_LIMIT = 0; + RDC_CFG_MEMORY_CLOCK_LIMIT = 1; + RDC_CFG_POWER_LIMIT = 2; +} + +message rdc_config_setting { + rdc_config_type type = 1; + uint64 target_value = 2; +} + +message SetConfigRequest { + uint32 group_id = 1; + rdc_config_setting setting = 2; +} + +message SetConfigResponse { + uint32 status = 1; +} + +message GetConfigRequest { + uint32 group_id = 1; + uint32 num_of_settings = 2; +} + +message GetConfigResponse { + uint32 status = 1; + repeated rdc_config_setting settings = 2; +} + +message ClearConfigRequest { + uint32 group_id = 1; +} + +message ClearConfigResponse { + uint32 status = 1; +} diff --git a/rdc_libs/bootstrap/src/RdcBootStrap.cc b/rdc_libs/bootstrap/src/RdcBootStrap.cc index c69fae5..dac201b 100644 --- a/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -183,7 +183,8 @@ rdc_status_t rdc_device_get_attributes(rdc_handle_t p_rdc_handle, uint32_t gpu_i ->rdc_device_get_attributes(gpu_index, p_rdc_attr); } -rdc_status_t rdc_device_get_component_version(rdc_handle_t p_rdc_handle, rdc_component_t component, rdc_component_version_t* p_rdc_compv) { +rdc_status_t rdc_device_get_component_version(rdc_handle_t p_rdc_handle, rdc_component_t component, + rdc_component_version_t* p_rdc_compv) { if (!p_rdc_handle || !p_rdc_compv) { return RDC_ST_INVALID_HANDLER; } @@ -316,7 +317,8 @@ rdc_status_t rdc_diagnostic_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case, const char* config, - size_t config_size, rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) { + size_t config_size, rdc_diag_test_result_t* result, + rdc_diag_callback_t* callback) { if (!p_rdc_handle) { return RDC_ST_INVALID_HANDLER; } @@ -325,7 +327,8 @@ rdc_status_t rdc_test_case_run(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_ ->rdc_test_case_run(group_id, test_case, config, config_size, result, callback); } -rdc_status_t get_mixed_component_version(rdc_handle_t p_rdc_handle, mixed_component_t component, mixed_component_version_t* p_mixed_compv) { +rdc_status_t get_mixed_component_version(rdc_handle_t p_rdc_handle, mixed_component_t component, + mixed_component_version_t* p_mixed_compv) { if (!p_rdc_handle || !p_mixed_compv) { return RDC_ST_INVALID_HANDLER; } @@ -388,6 +391,32 @@ const char* rdc_diagnostic_result_string(rdc_diag_result_t result) { } } +rdc_status_t rdc_config_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + rdc_config_setting_t setting) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)->rdc_config_set(group_id, setting); +} + +rdc_status_t rdc_config_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, + rdc_config_setting_list_t* settings) { + if (!p_rdc_handle || settings == nullptr) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)->rdc_config_get(group_id, settings); +} + +rdc_status_t rdc_config_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id) { + if (!p_rdc_handle) { + return RDC_ST_INVALID_HANDLER; + } + + return static_cast(p_rdc_handle)->rdc_config_clear(group_id); +} + const char* field_id_string(rdc_field_t field_id) { amd::rdc::fld_id2name_map_t& field_id_to_descript = amd::rdc::get_field_id_description_from_id(); return field_id_to_descript.find(field_id)->second.label.c_str(); @@ -407,8 +436,7 @@ rdc_status_t rdc_health_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, return RDC_ST_INVALID_HANDLER; } - return static_cast(p_rdc_handle) - ->rdc_health_set(group_id, components); + return static_cast(p_rdc_handle)->rdc_health_set(group_id, components); } rdc_status_t rdc_health_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, @@ -417,18 +445,16 @@ rdc_status_t rdc_health_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, return RDC_ST_INVALID_HANDLER; } - return static_cast(p_rdc_handle) - ->rdc_health_get(group_id, components); + return static_cast(p_rdc_handle)->rdc_health_get(group_id, components); } rdc_status_t rdc_health_check(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, - rdc_health_response_t *response) { + rdc_health_response_t* response) { if (!p_rdc_handle) { return RDC_ST_INVALID_HANDLER; } - return static_cast(p_rdc_handle) - ->rdc_health_check(group_id, response); + return static_cast(p_rdc_handle)->rdc_health_check(group_id, response); } rdc_status_t rdc_health_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id) { @@ -436,8 +462,7 @@ rdc_status_t rdc_health_clear(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_i return RDC_ST_INVALID_HANDLER; } - return static_cast(p_rdc_handle) - ->rdc_health_clear(group_id); + return static_cast(p_rdc_handle)->rdc_health_clear(group_id); } char* strncpy_with_null(char* dest, const char* src, size_t n) { @@ -464,35 +489,32 @@ rdc_status_t rdc_policy_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, return RDC_ST_INVALID_HANDLER; } - return static_cast(p_rdc_handle)->rdc_policy_get(group_id, count, policies); + return static_cast(p_rdc_handle) + ->rdc_policy_get(group_id, count, policies); } - rdc_status_t rdc_policy_delete(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, - rdc_policy_condition_type_t condition_type){ + rdc_policy_condition_type_t condition_type) { if (!p_rdc_handle) { return RDC_ST_INVALID_HANDLER; } - return static_cast(p_rdc_handle)->rdc_policy_delete(group_id, condition_type); + return static_cast(p_rdc_handle) + ->rdc_policy_delete(group_id, condition_type); } - - rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, rdc_policy_register_callback callback) { if (!p_rdc_handle) { return RDC_ST_INVALID_HANDLER; } - return static_cast(p_rdc_handle) - ->rdc_policy_register(group_id, callback); + return static_cast(p_rdc_handle)->rdc_policy_register(group_id, callback); } rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id) { if (!p_rdc_handle) { return RDC_ST_INVALID_HANDLER; } - return static_cast(p_rdc_handle) - ->rdc_policy_unregister(group_id); + return static_cast(p_rdc_handle)->rdc_policy_unregister(group_id); } rdc_status_t rdc_device_topology_get(rdc_handle_t p_rdc_handle, uint32_t gpu_index, rdc_device_topology_t* results) { diff --git a/rdc_libs/rdc/CMakeLists.txt b/rdc_libs/rdc/CMakeLists.txt index 9d44381..2282802 100644 --- a/rdc_libs/rdc/CMakeLists.txt +++ b/rdc_libs/rdc/CMakeLists.txt @@ -25,6 +25,7 @@ set(RDC_LIB_SRC_LIST ${RDC_LIB_SRC_LIST} "${SRC_DIR}/RdcRVSLib.cc" "${SRC_DIR}/RdcSmiDiagnosticImpl.cc" "${SRC_DIR}/RdcSmiLib.cc" + "${SRC_DIR}/RdcConfigSettingsImpl.cc" "${SRC_DIR}/RdcTelemetryModule.cc" "${SRC_DIR}/RdcWatchTableImpl.cc" "${SRC_DIR}/SmiUtils.cc") diff --git a/rdc_libs/rdc/src/RdcConfigSettingsImpl.cc b/rdc_libs/rdc/src/RdcConfigSettingsImpl.cc new file mode 100644 index 0000000..213c480 --- /dev/null +++ b/rdc_libs/rdc/src/RdcConfigSettingsImpl.cc @@ -0,0 +1,364 @@ +/* +Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include "rdc_lib/impl/RdcConfigSettingsImpl.h" + +#include +#include +#include + +#include "amd_smi/amdsmi.h" +#include "rdc_lib/RdcLogger.h" +#include "rdc_lib/impl/SmiUtils.h" +#include "rdc_lib/rdc_common.h" + +namespace amd { +namespace rdc { + +RdcConfigSettingsImpl::RdcConfigSettingsImpl(const RdcGroupSettingsPtr& group_settings) + : group_settings_(group_settings), is_running_(false) {} + +// Monitoring thread of gpu settings +void RdcConfigSettingsImpl::monitorSettings() { + rdc_gpu_group_t group_id; + amdsmi_processor_handle processor_handle; + amdsmi_status_t status; + rdc_status_t rdc_status; + rdc_group_info_t rdc_group_info = {}; + amdsmi_power_cap_info_t cap_info = {}; + amdsmi_dev_perf_level_t perf_info = {}; + uint32_t od; + uint64_t cached_value; + + while (true) { + { // Scope block for mutex + std::unique_lock lock(mutex_); + cv_.wait_for(lock, std::chrono::minutes(1), + [this] { return !is_running_ || cached_group_settings_.empty(); }); + + if (!is_running_ || cached_group_settings_.empty()) { + break; // Stop if the thread is requested to stop or settings are empty + } + + for (const auto& group_pair : cached_group_settings_) { + group_id = group_pair.first; + const auto& cached_settings = group_pair.second; + rdc_status = get_group_info(group_id, &rdc_group_info); + if (rdc_status != RDC_ST_OK) { + // Error log handled in get_group_info + continue; + } + for (unsigned int i = 0; i < rdc_group_info.count; ++i) { + status = get_processor_handle_from_id(rdc_group_info.entity_ids[i], &processor_handle); + if (status != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::monitorSettings(): get_processor_handle_from_id faied: " + << status); + continue; + } + + // Power cap + status = amdsmi_get_power_cap_info(processor_handle, 0, &cap_info); + if (status != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_power_cap_info failed: " + << status); + continue; + } + + auto power_cap_it = cached_settings.find(RDC_CFG_POWER_LIMIT); + if (power_cap_it != cached_settings.end()) { + cached_value = power_cap_it->second.target_value; + if (microwattsToWatts(cap_info.power_cap) != cached_value) { + RDC_LOG( + RDC_INFO, + "RdcConfigSettingsImpl::monitorSettings(); Mismatched Power values, resetting"); + status = amdsmi_set_power_cap(processor_handle, 0, wattsToMicrowatts(cached_value)); + if (status != AMDSMI_STATUS_SUCCESS) { + RDC_LOG( + RDC_ERROR, + "RdcConfigSettingsImpl::monitorSettings(); amdsmi_set_power_cap_info failed: " + << status); + continue; + } + } + } + + // Mem clock + status = amdsmi_get_gpu_overdrive_level(processor_handle, &od); + if (status != AMDSMI_STATUS_SUCCESS) { + RDC_LOG( + RDC_ERROR, + "RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_gpu_overdrive_level failed: " + << status); + continue; + } + + auto mem_clk_it = cached_settings.find(RDC_CFG_MEMORY_CLOCK_LIMIT); + if (mem_clk_it != cached_settings.end()) { + cached_value = mem_clk_it->second.target_value; + if (od == cached_value) { + status = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM, + CLK_LIMIT_MAX, cached_value); + if (status != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::monitorSettings(); amdsmi_set_gpu_clk_limit failed " + "for mem clk: " + << status); + continue; + } + } + } + + // GFX clock + status = amdsmi_get_gpu_perf_level(processor_handle, &perf_info); + if (status != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::monitorSettings(); amdsmi_get_gpu_perf_level failed: " + << status); + continue; + } + + auto gfx_clk_it = cached_settings.find(RDC_CFG_GFX_CLOCK_LIMIT); + if (gfx_clk_it != cached_settings.end()) { + cached_value = gfx_clk_it->second.target_value; + if (od == cached_value) { + status = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX, + CLK_LIMIT_MAX, cached_value); + if (status != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::monitorSettings(); amdsmi_set_gpu_clk_limit failed " + "for gfx clk: " + << status); + continue; + } + } + } + } + } + } + } + RDC_LOG(RDC_INFO, "RdcConfigSettingsImpl Monitoring Thread Stopped"); +} + +uint64_t RdcConfigSettingsImpl::wattsToMicrowatts(uint64_t watts) const { + return watts * 1'000'000; +} + +uint64_t RdcConfigSettingsImpl::microwattsToWatts(int microwatts) const { + return microwatts / 1'000'000; +} + +rdc_status_t RdcConfigSettingsImpl::get_group_info(rdc_gpu_group_t group_id, + rdc_group_info_t* rdc_group_info) { + rdc_status_t status = group_settings_->rdc_group_gpu_get_info(group_id, rdc_group_info); + if (status != RDC_ST_OK) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_set(): rdc_group_gpu_get_info failed : " << status); + } + return status; +} + +// Set configuration setting +rdc_status_t RdcConfigSettingsImpl::rdc_config_set(rdc_gpu_group_t group_id, + rdc_config_setting_t setting) { + amdsmi_processor_handle processor_handle; + amdsmi_status_t amd_ret; + + // Get the group info for gpu_index list + rdc_group_info_t rdc_group_info; + if (get_group_info(group_id, &rdc_group_info) != RDC_ST_OK) { + return RDC_ST_UNKNOWN_ERROR; + } + + for (unsigned int i = 0; i < rdc_group_info.count; ++i) { + amd_ret = get_processor_handle_from_id(rdc_group_info.entity_ids[i], &processor_handle); + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG( + RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_set(): Failed to get processor handle : " << amd_ret); + break; + } + + if (setting.type == RDC_CFG_POWER_LIMIT) { + amd_ret = amdsmi_set_power_cap(processor_handle, 0, wattsToMicrowatts(setting.target_value)); + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_set: amdsmi_set_power_cap failed : " << amd_ret); + break; + } + } else if (setting.type == RDC_CFG_MEMORY_CLOCK_LIMIT) { + amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM, CLK_LIMIT_MAX, + setting.target_value); + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG( + RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_set: amdsmi_set_gpu_clk_limit failed : " << amd_ret); + break; + } + } else if (setting.type == RDC_CFG_GFX_CLOCK_LIMIT) { + amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX, CLK_LIMIT_MAX, + setting.target_value); + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG( + RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_set: amdsmi_set_gpu_clk_limit failed : " << amd_ret); + break; + } + } + } + + if (amd_ret == AMDSMI_STATUS_SUCCESS) { + std::lock_guard lock(mutex_); + cached_group_settings_[group_id][setting.type] = setting; + if (!is_running_) { + is_running_ = true; + monitor_thread_ = std::thread(&RdcConfigSettingsImpl::monitorSettings, this); + RDC_LOG(RDC_INFO, "RdcConfigSettingsImpl Monitoring Thread Started"); + } + return RDC_ST_OK; + } else { + return RDC_ST_UNKNOWN_ERROR; + } +} + +// Display user configured settings +rdc_status_t RdcConfigSettingsImpl::rdc_config_get(rdc_gpu_group_t group_id, + rdc_config_setting_list_t* settings) { + // Ensure group_id exists in the cache + std::lock_guard lock(mutex_); + auto group_iter = cached_group_settings_.find(group_id); + if (group_iter == cached_group_settings_.end()) { + RDC_LOG(RDC_ERROR, "rdc_config_get: group_id not found in cache: " << RDC_ST_NOT_FOUND); + return RDC_ST_NOT_FOUND; + } + + // Iterate through cached settings for this group + int i = 0; + for (const auto& setting_pair : group_iter->second) { + if (i >= RDC_MAX_CONFIG_SETTINGS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_get: more settings than RDC_MAX_CONFIG_SETTINGS: " + << RDC_ST_MAX_LIMIT); + return RDC_ST_MAX_LIMIT; + } + + settings->settings[i].type = setting_pair.first; + settings->settings[i].target_value = setting_pair.second.target_value; + ++i; + } + + settings->total_settings = i; + return RDC_ST_OK; +} + +// Clear cache of user configured settings +rdc_status_t RdcConfigSettingsImpl::rdc_config_clear(rdc_gpu_group_t group_id) { + amdsmi_status_t amd_ret = AMDSMI_STATUS_SUCCESS; + amdsmi_processor_handle processor_handle; + + // Check if group_id has any cached settings + std::unique_lock lock(mutex_); + auto group_iter = cached_group_settings_.find(group_id); + if (group_iter == cached_group_settings_.end()) { + // No cached settings for this group, nothing to clear + return RDC_ST_OK; + } + + rdc_group_info_t rdc_group_info; + if (get_group_info(group_id, &rdc_group_info) != RDC_ST_OK) { + return RDC_ST_UNKNOWN_ERROR; + } + // Iterate over each GPU in the group and clear only the cached settings + for (unsigned int i = 0; i < rdc_group_info.count; ++i) { + amd_ret = get_processor_handle_from_id(rdc_group_info.entity_ids[i], &processor_handle); + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_clear(): Failed to get processor handle : " + << amd_ret); + break; + } + + // Reset power cap if it was set + if (group_iter->second.find(RDC_CFG_POWER_LIMIT) != group_iter->second.end()) { + amdsmi_power_cap_info_t cap_info = {}; + amd_ret = amdsmi_get_power_cap_info(processor_handle, 0, &cap_info); + if (amd_ret == AMDSMI_STATUS_SUCCESS && cap_info.power_cap != cap_info.default_power_cap) { + amd_ret = amdsmi_set_power_cap(processor_handle, 0, cap_info.default_power_cap); + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "RdcConfigSettingsImpl::rdc_config_clear: Failed to reset power cap : " + << amd_ret); + break; + } + } + } + + // Reset GFX clock limit if it was set + if (group_iter->second.find(RDC_CFG_GFX_CLOCK_LIMIT) != group_iter->second.end()) { + amdsmi_dev_perf_level_t perf_info = {}; + amd_ret = amdsmi_get_gpu_perf_level(processor_handle, &perf_info); + if (amd_ret == AMDSMI_STATUS_SUCCESS && perf_info != AMDSMI_DEV_PERF_LEVEL_AUTO) { + amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_GFX, CLK_LIMIT_MAX, + AMDSMI_DEV_PERF_LEVEL_AUTO); + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_clear: Failed to reset GFX clock limit : " + << amd_ret); + break; + } + } + } + + // Reset memory clock limit if it was set + if (group_iter->second.find(RDC_CFG_MEMORY_CLOCK_LIMIT) != group_iter->second.end()) { + uint32_t od = 0; + amd_ret = amdsmi_get_gpu_overdrive_level(processor_handle, &od); + if (amd_ret == AMDSMI_STATUS_SUCCESS && od != 0) { + amd_ret = amdsmi_set_gpu_clk_limit(processor_handle, AMDSMI_CLK_TYPE_MEM, CLK_LIMIT_MAX, 0); + if (amd_ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, + "RdcConfigSettingsImpl::rdc_config_clear: Failed to reset memory clock limit:" + << amd_ret); + break; + } + } + } + } + + cached_group_settings_.erase(group_id); + + if (cached_group_settings_.empty()) { + is_running_ = false; + cv_.notify_all(); + lock.unlock(); + + if (monitor_thread_.joinable()) { + monitor_thread_.join(); // Wait for the thread to finish + } + + RDC_LOG(RDC_INFO, "RdcConfigSettingsImpl Monitoring Thread Stopped"); + } + + return (amd_ret == AMDSMI_STATUS_SUCCESS) ? RDC_ST_OK : RDC_ST_UNKNOWN_ERROR; +} + +} // namespace rdc +} // namespace amd diff --git a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index dc50d63..559ad16 100644 --- a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -30,6 +30,7 @@ THE SOFTWARE. #include "rdc_lib/RdcLogger.h" #include "rdc_lib/RdcNotification.h" #include "rdc_lib/impl/RdcCacheManagerImpl.h" +#include "rdc_lib/impl/RdcConfigSettingsImpl.h" #include "rdc_lib/impl/RdcGroupSettingsImpl.h" #include "rdc_lib/impl/RdcMetricFetcherImpl.h" #include "rdc_lib/impl/RdcMetricsUpdaterImpl.h" @@ -80,10 +81,12 @@ RdcEmbeddedHandler::RdcEmbeddedHandler(rdc_operation_mode_t mode) metric_fetcher_(new RdcMetricFetcherImpl()), rdc_module_mgr_(new RdcModuleMgrImpl(metric_fetcher_)), rdc_notif_(new RdcNotificationImpl()), - watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, metric_fetcher_, rdc_module_mgr_, rdc_notif_)), + watch_table_(new RdcWatchTableImpl(group_settings_, cache_mgr_, metric_fetcher_, + rdc_module_mgr_, rdc_notif_)), metrics_updater_(new RdcMetricsUpdaterImpl(watch_table_, METIC_UPDATE_FREQUENCY)), - policy_(new RdcPolicyImpl(group_settings_,metric_fetcher_)), - topologylink_(new RdcTopologyLinkImpl(group_settings_, metric_fetcher_)) { + policy_(new RdcPolicyImpl(group_settings_, metric_fetcher_)), + topologylink_(new RdcTopologyLinkImpl(group_settings_, metric_fetcher_)), + config_handler_(new RdcConfigSettingsImpl(group_settings_)) { if (mode == RDC_OPERATION_MODE_AUTO) { RDC_LOG(RDC_DEBUG, "Run RDC with RDC_OPERATION_MODE_AUTO"); metrics_updater_->start(); @@ -199,7 +202,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_attributes(uint32_t gpu_index, return status; } -rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version(rdc_component_t component, rdc_component_version_t* p_rdc_compv) { +rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version( + rdc_component_t component, rdc_component_version_t* p_rdc_compv) { if (!p_rdc_compv) { return RDC_ST_BAD_PARAMETER; } @@ -211,7 +215,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_device_get_component_version(rdc_component_ ret = amdsmi_get_lib_version(&ver); if (ret != AMDSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, "Failed to obtain the version of the server's amd-smi library. reason: " << (ret == AMDSMI_STATUS_INVAL ? "Invalid parameters" : "unknown")); + RDC_LOG(RDC_ERROR, "Failed to obtain the version of the server's amd-smi library. reason: " + << (ret == AMDSMI_STATUS_INVAL ? "Invalid parameters" : "unknown")); return RDC_ST_MSI_ERROR; } @@ -383,7 +388,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_unwatch(rdc_gpu_group_t group_id, rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_diag_level_t level, const char* config, size_t config_size, - rdc_diag_response_t* response, rdc_diag_callback_t* callback) { + rdc_diag_response_t* response, + rdc_diag_callback_t* callback) { if (!response) { return RDC_ST_BAD_PARAMETER; } @@ -400,7 +406,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_diagnostic_run(rdc_gpu_group_t group_id, rdc_status_t RdcEmbeddedHandler::rdc_test_case_run(rdc_gpu_group_t group_id, rdc_diag_test_cases_t test_case, const char* config, size_t config_size, - rdc_diag_test_result_t* result, rdc_diag_callback_t* callback) { + rdc_diag_test_result_t* result, + rdc_diag_callback_t* callback) { if (!result) { return RDC_ST_BAD_PARAMETER; } @@ -428,7 +435,8 @@ rdc_status_t RdcEmbeddedHandler::rdc_field_update_all(uint32_t wait_for_update) // It is just a client interface under the GRPC framework and is not used as an RDC API. // Just write an empty function to solve compilation errors -rdc_status_t RdcEmbeddedHandler::get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) { +rdc_status_t RdcEmbeddedHandler::get_mixed_component_version( + mixed_component_t component, mixed_component_version_t* p_mixed_compv) { (void)(component); (void)(p_mixed_compv); return RDC_ST_OK; @@ -463,8 +471,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_policy_unregister(rdc_gpu_group_t group_id) } // Health API -rdc_status_t RdcEmbeddedHandler::rdc_health_set(rdc_gpu_group_t group_id, - unsigned int components) { +rdc_status_t RdcEmbeddedHandler::rdc_health_set(rdc_gpu_group_t group_id, unsigned int components) { if (0 == components) { return RDC_ST_BAD_PARAMETER; } @@ -473,7 +480,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_health_set(rdc_gpu_group_t group_id, } rdc_status_t RdcEmbeddedHandler::rdc_health_get(rdc_gpu_group_t group_id, - unsigned int *components) { + unsigned int* components) { if (components == nullptr) { return RDC_ST_BAD_PARAMETER; } @@ -482,7 +489,7 @@ rdc_status_t RdcEmbeddedHandler::rdc_health_get(rdc_gpu_group_t group_id, } rdc_status_t RdcEmbeddedHandler::rdc_health_check(rdc_gpu_group_t group_id, - rdc_health_response_t *response) { + rdc_health_response_t* response) { if (response == nullptr) { return RDC_ST_BAD_PARAMETER; } @@ -491,7 +498,6 @@ rdc_status_t RdcEmbeddedHandler::rdc_health_check(rdc_gpu_group_t group_id, } rdc_status_t RdcEmbeddedHandler::rdc_health_clear(rdc_gpu_group_t group_id) { - return watch_table_->rdc_health_clear(group_id); } @@ -504,5 +510,22 @@ rdc_status_t RdcEmbeddedHandler::rdc_link_status_get(rdc_link_status_t* results) return topologylink_->rdc_link_status_get(results); } +// Set one configure +rdc_status_t RdcEmbeddedHandler::rdc_config_set(rdc_gpu_group_t group_id, + rdc_config_setting_t setting) { + return config_handler_->rdc_config_set(group_id, setting); +} + +// Get the setting +rdc_status_t RdcEmbeddedHandler::rdc_config_get(rdc_gpu_group_t group_id, + rdc_config_setting_list_t* settings) { + return config_handler_->rdc_config_get(group_id, settings); +} + +// Clear the setting +rdc_status_t RdcEmbeddedHandler::rdc_config_clear(rdc_gpu_group_t group_id) { + return config_handler_->rdc_config_clear(group_id); +} + } // namespace rdc } // namespace amd diff --git a/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc b/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc index a08edae..e587f71 100644 --- a/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc +++ b/rdc_libs/rdc_client/src/RdcStandaloneHandler.cc @@ -702,6 +702,69 @@ rdc_status_t RdcStandaloneHandler::rdc_field_update_all(uint32_t wait_for_update return error_handle(status, reply.status()); } +// Set one configure +rdc_status_t RdcStandaloneHandler::rdc_config_set(rdc_gpu_group_t group_id, + rdc_config_setting_t setting) { + ::rdc::SetConfigRequest request; + ::rdc::SetConfigResponse reply; + ::grpc::ClientContext context; + + request.set_group_id(group_id); + + ::rdc::rdc_config_setting* setting_ref = (::rdc::rdc_config_setting*)request.mutable_setting(); + setting_ref->set_type(static_cast<::rdc::rdc_config_type>(setting.type)); + setting_ref->set_target_value(setting.target_value); + + ::grpc::Status status = stub_->SetConfig(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + + return err_status; +} + +// Get the setting +rdc_status_t RdcStandaloneHandler::rdc_config_get(rdc_gpu_group_t group_id, + rdc_config_setting_list_t* settings) { + int i = 0; + ::rdc::GetConfigRequest request; + ::rdc::GetConfigResponse reply; + ::grpc::ClientContext context; + + request.set_group_id(group_id); + ::grpc::Status status = stub_->GetConfig(&context, request, &reply); + rdc_status_t err_status = error_handle(status, reply.status()); + if (err_status != RDC_ST_OK) return err_status; + + auto res = reply.settings(); + if (reply.settings_size() > RDC_MAX_CONFIG_SETTINGS) return RDC_ST_MAX_LIMIT; + + for (i = 0; i < reply.settings_size() && i < RDC_MAX_CONFIG_SETTINGS; ++i) { + const ::rdc::rdc_config_setting& result = reply.settings(i); + settings->settings[i].type = static_cast(result.type()); + settings->settings[i].target_value = result.target_value(); + } + + settings->total_settings = (reply.settings_size() >= RDC_MAX_CONFIG_SETTINGS) + ? RDC_MAX_CONFIG_SETTINGS + : reply.settings_size(); + err_status = error_handle(status, reply.status()); + return err_status; +} + +// Clear the setting +rdc_status_t RdcStandaloneHandler::rdc_config_clear(rdc_gpu_group_t group_id) { + ::rdc::ClearConfigRequest request; + ::rdc::ClearConfigResponse reply; + ::grpc::ClientContext context; + + request.set_group_id(group_id); + + ::grpc::Status status = stub_->ClearConfig(&context, request, &reply); + + rdc_status_t err_status = error_handle(status, reply.status()); + + return err_status; +} + // It is only an interface for the client under the GRPC framework and is not used as an RDC API. rdc_status_t RdcStandaloneHandler::get_mixed_component_version( mixed_component_t component, mixed_component_version_t* p_mixed_compv) { @@ -804,7 +867,7 @@ rdc_status_t RdcStandaloneHandler::rdc_policy_register(rdc_gpu_group_t group_id, } // no registered callback, start the thread to read the stream from rdcd - struct policy_thread_context ctx = {true,nullptr}; + struct policy_thread_context ctx = {true, nullptr}; ctx.t = new std::thread([this, group_id, callback]() { // call rdcd @@ -905,7 +968,7 @@ rdc_status_t RdcStandaloneHandler::rdc_health_get(rdc_gpu_group_t group_id, } rdc_status_t RdcStandaloneHandler::rdc_health_check(rdc_gpu_group_t group_id, - rdc_health_response_t *response) { + rdc_health_response_t* response) { if (!response) { return RDC_ST_BAD_PARAMETER; } @@ -931,7 +994,7 @@ rdc_status_t RdcStandaloneHandler::rdc_health_check(rdc_gpu_group_t group_id, to_result.component = static_cast(result.component()); to_result.health = static_cast(result.health()); - //set error + // set error to_result.error.code = result.error().code(); strncpy_with_null(to_result.error.msg, result.error().msg().c_str(), MAX_HEALTH_MSG_LENGTH); } diff --git a/rdci/CMakeLists.txt b/rdci/CMakeLists.txt index ad56a63..0888c31 100644 --- a/rdci/CMakeLists.txt +++ b/rdci/CMakeLists.txt @@ -69,6 +69,7 @@ set(RDCI_SRC_LIST "${SRC_DIR}/RdciStatsSubSystem.cc" "${SRC_DIR}/RdciPolicySubSystem.cc" "${SRC_DIR}/RdciHealthSubSystem.cc" + "${SRC_DIR}/RdciConfigSubSystem.cc" "${SRC_DIR}/RdciSubSystem.cc" "${SRC_DIR}/RdciTopologyLinkSubSystem.cc" "${SRC_DIR}/rdci.cc") diff --git a/rdci/include/RdciConfigSubSystem.h b/rdci/include/RdciConfigSubSystem.h new file mode 100644 index 0000000..145e8cb --- /dev/null +++ b/rdci/include/RdciConfigSubSystem.h @@ -0,0 +1,59 @@ +/* +Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#ifndef RDCI_INCLUDE_RDCICONFIGSUBSYSTEM_H_ +#define RDCI_INCLUDE_RDCICONFIGSUBSYSTEM_H_ + +#include "RdciSubSystem.h" + +namespace amd { +namespace rdc { + +class RdciConfigSubSystem : public RdciSubSystem { + public: + RdciConfigSubSystem(); + ~RdciConfigSubSystem() override; + void parse_cmd_opts(int argc, char** argv) override; + void process() override; + typedef enum { + CONFIG_COMMAND_NONE = 0, + CONFIG_COMMAND_SET, + CONFIG_COMMAND_GET, + CONFIG_COMMAND_CLEAR, + CONFIG_COMMAND_HELP, + } config_command_type_t; + + private: + void show_help() const; + void display_config_settings(rdc_config_setting_list_t& rdc_configs_list); + config_command_type_t config_cmd_; + static constexpr rdc_field_grp_t JOB_FIELD_ID = 1; + uint32_t group_id_; + uint32_t power_limit_; + uint64_t gfx_max_clock_; + uint64_t memory_max_clock_; + rdc_field_grp_t fgid_; +}; + +} // namespace rdc +} // namespace amd + +#endif // RDCI_INCLUDE_RDCICONFIGSUBSYSTEM_H_ diff --git a/rdci/src/RdciConfigSubSystem.cc b/rdci/src/RdciConfigSubSystem.cc new file mode 100644 index 0000000..7a9c4d4 --- /dev/null +++ b/rdci/src/RdciConfigSubSystem.cc @@ -0,0 +1,391 @@ +/* +Copyright (c) 2024 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include "RdciConfigSubSystem.h" + +#include +#include + +#include +#include +#include +#include +#include + +#include "common/rdc_utils.h" +#include "rdc/rdc.h" +#include "rdc_lib/RdcException.h" +#include "rdc_lib/rdc_common.h" + +static constexpr uint32_t TABLE_COLUMN_WIDTH = 20; + +namespace amd { +namespace rdc { + +RdciConfigSubSystem::RdciConfigSubSystem() + : config_cmd_(CONFIG_COMMAND_NONE), + power_limit_(0), + gfx_max_clock_(0), + memory_max_clock_(0), + fgid_(0) {} + +RdciConfigSubSystem::~RdciConfigSubSystem() { + if (fgid_ != JOB_FIELD_ID) { + rdc_field_unwatch(rdc_handle_, group_id_, fgid_); + rdc_group_field_destroy(rdc_handle_, fgid_); + fgid_ = JOB_FIELD_ID; + } +} + +void RdciConfigSubSystem::parse_cmd_opts(int argc, char** argv) { + const int JSON_OPTIONS = 1001; + const struct option long_options[] = {{"set", no_argument, nullptr, 's'}, + {"get", no_argument, nullptr, 't'}, + {"clear", no_argument, nullptr, 'c'}, + {"unauth", optional_argument, nullptr, 'u'}, + {"group", required_argument, nullptr, 'g'}, + {"powerlimit", required_argument, nullptr, 'p'}, + {"gfxmaxclk", required_argument, nullptr, 'x'}, + {"memmaxclk", required_argument, nullptr, 'm'}, + {"help", optional_argument, nullptr, 'h'}, + {"json", optional_argument, nullptr, JSON_OPTIONS}, + {nullptr, 0, nullptr, 0}}; + + int option_index = 0; + int opt = 0; + config_cmd_ = CONFIG_COMMAND_NONE; + bool group_id_set = false; // ensure set, get, and clear have a group associated with them + + while ((opt = getopt_long(argc, argv, "stcuhg:p:x:m", long_options, &option_index)) != -1) { + switch (opt) { + case 's': + config_cmd_ = CONFIG_COMMAND_SET; + break; + case 't': + config_cmd_ = CONFIG_COMMAND_GET; + break; + case 'c': + config_cmd_ = CONFIG_COMMAND_CLEAR; + break; + case 'u': + use_auth_ = false; + break; + case 'h': + config_cmd_ = CONFIG_COMMAND_HELP; + break; + case 'g': + group_id_ = std::stoi(optarg); + group_id_set = true; + break; + case 'p': + power_limit_ = std::stoul(optarg); + break; + case 'x': + gfx_max_clock_ = std::stoul(optarg); + break; + case 'm': + memory_max_clock_ = std::stoul(optarg); + break; + case JSON_OPTIONS: + set_json_output(true); + break; + default: + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command line options"); + } + } + + if (config_cmd_ == CONFIG_COMMAND_NONE) { + show_help(); + throw RdcException(RDC_ST_BAD_PARAMETER, "Must specify a valid operations"); + } + + // Enforce a mandatory group id for set, get, and clear + if ((config_cmd_ == CONFIG_COMMAND_SET || config_cmd_ == CONFIG_COMMAND_GET || + config_cmd_ == CONFIG_COMMAND_CLEAR) && + !group_id_set) { + show_help(); + throw RdcException( + RDC_ST_BAD_PARAMETER, + "Must specify a group ID (-g or --group) for set, get, and clear operations"); + } +} + +void RdciConfigSubSystem::show_help() const { + if (is_json_output()) return; + std::cout << " config -- Used to configure GPU to have configuration across workloads and across " + "devices.\n\n"; + std::cout << "Usage\n"; + std::cout << " rdci config --help\n"; + std::cout << " rdci config [-g ] --set [--powerlimit ] [--gfxmaxclk " + "] [--memmaxclk ]\n"; + std::cout << " rdci config [-g ] --get\n"; + std::cout << " rdci config [-g ] --clear\n"; + show_common_usage(); +} + +void RdciConfigSubSystem::process() { + rdc_status_t result = RDC_ST_UNKNOWN_ERROR; + std::ostringstream json_ss; + + switch (config_cmd_) { + case CONFIG_COMMAND_SET: { + rdc_config_setting_t setting; + if (gfx_max_clock_ != 0) { + setting.type = RDC_CFG_GFX_CLOCK_LIMIT; + setting.target_value = gfx_max_clock_; + result = rdc_config_set(rdc_handle_, group_id_, setting); + } + + if (power_limit_ != 0) { + setting.type = RDC_CFG_POWER_LIMIT; + setting.target_value = power_limit_; + result = rdc_config_set(rdc_handle_, group_id_, setting); + } + + if (memory_max_clock_ != 0) { + setting.type = RDC_CFG_MEMORY_CLOCK_LIMIT; + setting.target_value = memory_max_clock_; + result = rdc_config_set(rdc_handle_, group_id_, setting); + } + if (result == RDC_ST_OK) { + if (is_json_output()) { + json_ss << "{" + << "\"group_id\": \"" << group_id_ << "\", \"status\": \"ok\"" + << "}"; + } else { + std::cout << "Successfully configured GPU Id belongs to group: " << group_id_ + << std::endl; + } + std::cout << json_ss.str() << std::endl; + return; + } + break; + } + case CONFIG_COMMAND_GET: { + // Add the default job stats fields + std::vector job_fields = {RDC_FI_GPU_CLOCK, RDC_FI_MEM_CLOCK, + RDC_FI_POWER_USAGE}; + static const char job_field_group[] = "RdciConfigSubSystem"; + + result = rdc_group_field_create(rdc_handle_, job_fields.size(), job_fields.data(), + job_field_group, &fgid_); + + // Start watch + const double max_keep_age = 30060; // Length of time to keep data in field in seconds + const int max_keep_samples = 10; + const int update_frequency = 1000000; // Once per minute + result = rdc_field_watch(rdc_handle_, group_id_, fgid_, update_frequency, max_keep_age, + max_keep_samples); + + rdc_config_setting_list_t settings = {0, {}}; + result = rdc_config_get(rdc_handle_, group_id_, &settings); + if (result == RDC_ST_OK) { + display_config_settings(settings); + } else if (result == RDC_ST_NOT_FOUND) { + std::cout << "Get config information failed, cache empty " << std::endl; + } else { + std::cout << "Get config information failed " << std::endl; + } + + // Stop watching the field group + result = rdc_field_unwatch(rdc_handle_, group_id_, fgid_); + if (result != RDC_ST_OK) { + std::cout << "Error stop watch fields. Return: " << rdc_status_string(result); + } + std::cout << "Stop watch group:" << group_id_ << ", field_group:" << fgid_ << std::endl; + + // Delete the field group and GPU group + result = rdc_group_field_destroy(rdc_handle_, fgid_); + if (result != RDC_ST_OK) { + std::cout << "Error delete field group. Return: " << rdc_status_string(result); + } + std::cout << "Deleted the field group " << fgid_ << std::endl; + + break; + } + case CONFIG_COMMAND_CLEAR: { + result = rdc_config_clear(rdc_handle_, group_id_); + if (result == RDC_ST_OK) { + if (is_json_output()) { + json_ss << "\"group_id\": \"" << group_id_ << "\", \"status\": \"ok\""; + } else { + std::cout << "Successfully cleared all configurationbelongs for group: " << group_id_ + << std::endl; + } + std::cout << json_ss.str() << std::endl; + return; + } + break; + } + case CONFIG_COMMAND_HELP: + show_help(); + result = RDC_ST_OK; + break; + case CONFIG_COMMAND_NONE: + default: + throw RdcException(RDC_ST_BAD_PARAMETER, "Unknown command"); + } + if (result != RDC_ST_OK) { + throw RdcException(result, rdc_status_string(result)); + } +} + +void RdciConfigSubSystem::display_config_settings(rdc_config_setting_list_t& rdc_configs_list) { + rdc_status_t result = RDC_ST_OK; + std::stringstream ss, json_ss; + rdc_group_info_t rdc_group_info = {0, "", {0}}; + uint32_t gpu_index = 0; + uint32_t index = 0; + + ss << std::setw(TABLE_COLUMN_WIDTH) << std::left << "configure" << std::setw(TABLE_COLUMN_WIDTH) + << std::left << "gpu_index" << std::setw(TABLE_COLUMN_WIDTH) << std::left << "config_limit" + << std::setw(TABLE_COLUMN_WIDTH) << std::left << "current_value" << std::endl; + json_ss << "\"group_id\": " << group_id_ << "," + << "\"config_list\" : ["; + + result = rdc_group_gpu_get_info(rdc_handle_, group_id_, &rdc_group_info); + if (result == RDC_ST_OK) { + std::vector group_index_array(rdc_group_info.entity_ids, + rdc_group_info.entity_ids + rdc_group_info.count); + sort(begin(group_index_array), end(group_index_array)); + for (uint32_t i = 0; i < rdc_configs_list.total_settings && i < RDC_GROUP_MAX_ENTITIES; ++i) { + auto type = rdc_configs_list.settings[i].type; + auto config_value = rdc_configs_list.settings[i].target_value; + + for (gpu_index = 0; gpu_index < rdc_group_info.count && gpu_index < RDC_GROUP_MAX_ENTITIES; + ++gpu_index) { + json_ss << "{\"gpu_index\": " << group_index_array[gpu_index] << ","; + + rdc_field_value value; + switch (type) { + case RDC_CFG_GFX_CLOCK_LIMIT: { + json_ss << "\"GFX Clock Limit\":" << config_value; + ss << std::setw(TABLE_COLUMN_WIDTH) << std::left << "gfx_clock_limit" + << std::setw(TABLE_COLUMN_WIDTH) << std::left << group_index_array[gpu_index] + << std::setw(TABLE_COLUMN_WIDTH) << std::left << config_value + << std::setw(TABLE_COLUMN_WIDTH) << std::left; + + result = rdc_field_get_latest_value(rdc_handle_, group_index_array[gpu_index], + RDC_FI_GPU_CLOCK, &value); + json_ss << ",\"GFX Clock Current Value\":"; + if (result != RDC_ST_OK) { + ss << "N/A"; + json_ss << "\"N/A\""; + } else { + if (value.type == INTEGER) { + ss << value.value.l_int; + json_ss << value.value.l_int; + } else if (value.type == DOUBLE) { + ss << std::fixed << std::setprecision(3) << value.value.dbl; + json_ss << value.value.dbl; + } else { + ss << value.value.str; + json_ss << value.value.str; + } + } + ss << std::endl; + break; + } + case RDC_CFG_MEMORY_CLOCK_LIMIT: + json_ss << "\"Memory Clock Limit\":" << config_value; + ss << std::setw(TABLE_COLUMN_WIDTH) << std::left << "memory_clock_limit" + << std::setw(TABLE_COLUMN_WIDTH) << std::left << group_index_array[gpu_index] + << std::setw(TABLE_COLUMN_WIDTH) << std::left << config_value + << std::setw(TABLE_COLUMN_WIDTH) << std::left; + + result = rdc_field_get_latest_value(rdc_handle_, group_index_array[gpu_index], + RDC_FI_MEM_CLOCK, &value); + json_ss << ",\"Memory Clock Current Value\":"; + if (result != RDC_ST_OK) { + ss << "N/A"; + json_ss << "\"N/A\""; + } else { + if (value.type == INTEGER) { + ss << value.value.l_int; + json_ss << value.value.l_int; + } else if (value.type == DOUBLE) { + ss << std::fixed << std::setprecision(3) << value.value.dbl; + json_ss << value.value.dbl; + } else { + ss << value.value.str; + json_ss << value.value.str; + } + } + ss << std::endl; + break; + case RDC_CFG_POWER_LIMIT: + json_ss << "\"Power Limit\":" << config_value; + ss << std::setw(TABLE_COLUMN_WIDTH) << std::left << "power_limit" + << std::setw(TABLE_COLUMN_WIDTH) << std::left << group_index_array[gpu_index] + << std::setw(TABLE_COLUMN_WIDTH) << std::left << config_value + << std::setw(TABLE_COLUMN_WIDTH) << std::left; + + result = rdc_field_get_latest_value(rdc_handle_, group_index_array[gpu_index], + RDC_FI_POWER_USAGE, &value); + json_ss << ",\"Power Current Value\":"; + if (result != RDC_ST_OK) { + ss << "N/A"; + json_ss << "\"N/A\""; + } else { + if (value.type == INTEGER) { + double watts = static_cast(value.value.l_int) / 1'000'000; + ss << std::fixed << std::setprecision(3) << watts; + json_ss << watts; + } else if (value.type == DOUBLE) { + double watts = value.value.dbl / 1'000'000; + ss << std::fixed << std::setprecision(3) << watts; + json_ss << watts; + } else { + ss << value.value.str; + json_ss << value.value.str; + } + } + ss << std::endl; + break; + default: + break; + } + // Set the json seperator + json_ss << "}"; + if ((gpu_index + 1) != rdc_group_info.count) { + json_ss << ","; + } + } + if (rdc_group_info.count != 0 && i < rdc_configs_list.total_settings - 1 && + i < RDC_GROUP_MAX_ENTITIES - 1) { + json_ss << ","; + } + } + } + if (index != 0) { + json_ss << "}"; + } + json_ss << "]"; + if (is_json_output()) { + std::cout << json_ss.str() << std::endl; + } else { + std::cout << ss.str() << std::endl; + } +} + +} // namespace rdc +} // namespace amd diff --git a/rdci/src/rdci.cc b/rdci/src/rdci.cc index aa500dd..f4c5b0c 100644 --- a/rdci/src/rdci.cc +++ b/rdci/src/rdci.cc @@ -20,19 +20,21 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include -#include +#include "RdciConfigSubSystem.h" #include "RdciDiagSubSystem.h" #include "RdciDiscoverySubSystem.h" #include "RdciDmonSubSystem.h" #include "RdciFieldGroupSubSystem.h" #include "RdciGroupSubSystem.h" -#include "RdciStatsSubSystem.h" -#include "RdciPolicySubSystem.h" #include "RdciHealthSubSystem.h" #include "RdciTopologyLinkSubSystem.h" +#include "RdciPolicySubSystem.h" +#include "RdciStatsSubSystem.h" #include "rdc/rdc.h" #include "rdc_lib/RdcException.h" #include "rdc_lib/rdc_common.h" @@ -42,8 +44,11 @@ THE SOFTWARE. #define RDC_CLIENT_VERSION_RELEASE 0 #define RDC_CLIENT_VERSION_CREATE_STRING(MAJOR, MINOR, RELEASE) (#MAJOR "." #MINOR "." #RELEASE) -#define RDC_CLIENT_VERSION_EXPAND_PARTS(MAJOR_STR, MINOR_STR, RELEASE_STR) RDC_CLIENT_VERSION_CREATE_STRING(MAJOR_STR, MINOR_STR, RELEASE_STR) -#define RDC_CLIENT_VERSION_STRING RDC_CLIENT_VERSION_EXPAND_PARTS(RDC_CLIENT_VERSION_MAJOR, RDC_CLIENT_VERSION_MINOR, RDC_CLIENT_VERSION_RELEASE) +#define RDC_CLIENT_VERSION_EXPAND_PARTS(MAJOR_STR, MINOR_STR, RELEASE_STR) \ + RDC_CLIENT_VERSION_CREATE_STRING(MAJOR_STR, MINOR_STR, RELEASE_STR) +#define RDC_CLIENT_VERSION_STRING \ + RDC_CLIENT_VERSION_EXPAND_PARTS(RDC_CLIENT_VERSION_MAJOR, RDC_CLIENT_VERSION_MINOR, \ + RDC_CLIENT_VERSION_RELEASE) #define Q(x) #x #define QUOTE(x) Q(x) @@ -52,7 +57,7 @@ int main(int argc, char** argv) { const std::string usage_help = "Usage:\trdci |\n" "subsystem: \n" - " discovery, dmon, group, fieldgroup, stats, diag, policy, health, topo\n" + " discovery, dmon, group, fieldgroup, stats, diag, config, policy, health, topo\n" "options: \n" " -v(--version) : Print client version information only\n"; @@ -63,11 +68,12 @@ int main(int argc, char** argv) { if (strcmp(argv[1], "-v") == 0 || strcmp(argv[1], "--version") == 0) { #ifdef CURRENT_GIT_HASH - std::cout << "RDCI : " << RDC_CLIENT_VERSION_STRING << "+" << QUOTE(CURRENT_GIT_HASH) << std::endl; + std::cout << "RDCI : " << RDC_CLIENT_VERSION_STRING << "+" << QUOTE(CURRENT_GIT_HASH) + << std::endl; #else - std::cout << "RDCI : " << RDC_CLIENT_VERSION_STRING << std::endl; + std::cout << "RDCI : " << RDC_CLIENT_VERSION_STRING << std::endl; #endif - exit(0); + exit(0); } amd::rdc::RdciSubSystemPtr subsystem; @@ -91,6 +97,8 @@ int main(int argc, char** argv) { subsystem.reset(new amd::rdc::RdciStatsSubSystem()); } else if (subsystem_name == "policy") { subsystem.reset(new amd::rdc::RdciPolicySubSystem()); + } else if (subsystem_name == "config") { + subsystem.reset(new amd::rdc::RdciConfigSubSystem()); } else { std::cout << usage_help; exit(0); diff --git a/server/include/rdc/rdc_api_service.h b/server/include/rdc/rdc_api_service.h index bf423de..afafcfe 100644 --- a/server/include/rdc/rdc_api_service.h +++ b/server/include/rdc/rdc_api_service.h @@ -22,10 +22,10 @@ THE SOFTWARE. #ifndef SERVER_INCLUDE_RDC_RDC_API_SERVICE_H_ #define SERVER_INCLUDE_RDC_RDC_API_SERVICE_H_ -#include - #include +#include + #include "rdc.grpc.pb.h" // NOLINT #include "rdc/rdc.h" @@ -157,12 +157,10 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service { const ::rdc::GetTopologyRequest* request, ::rdc::GetTopologyResponse* reply) override; - ::grpc::Status SetHealth(::grpc::ServerContext* context, - const ::rdc::SetHealthRequest* request, + ::grpc::Status SetHealth(::grpc::ServerContext* context, const ::rdc::SetHealthRequest* request, ::rdc::SetHealthResponse* reply) override; - ::grpc::Status GetHealth(::grpc::ServerContext* context, - const ::rdc::GetHealthRequest* request, + ::grpc::Status GetHealth(::grpc::ServerContext* context, const ::rdc::GetHealthRequest* request, ::rdc::GetHealthResponse* reply) override; ::grpc::Status CheckHealth(::grpc::ServerContext* context, @@ -173,6 +171,16 @@ class RdcAPIServiceImpl final : public ::rdc::RdcAPI::Service { const ::rdc::ClearHealthRequest* request, ::rdc::ClearHealthResponse* reply) override; + ::grpc::Status SetConfig(::grpc::ServerContext* context, const ::rdc::SetConfigRequest* request, + ::rdc::SetConfigResponse* reply) override; + + ::grpc::Status GetConfig(::grpc::ServerContext* context, const ::rdc::GetConfigRequest* request, + ::rdc::GetConfigResponse* reply) override; + + ::grpc::Status ClearConfig(::grpc::ServerContext* context, + const ::rdc::ClearConfigRequest* request, + ::rdc::ClearConfigResponse* reply) override; + private: bool copy_gpu_usage_info(const rdc_gpu_usage_info_t& src, ::rdc::GpuUsageInfo* target); rdc_handle_t rdc_handle_; diff --git a/server/src/rdc_api_service.cc b/server/src/rdc_api_service.cc index 5f0acbd..66bad6d 100644 --- a/server/src/rdc_api_service.cc +++ b/server/src/rdc_api_service.cc @@ -1009,7 +1009,7 @@ ::grpc::Status RdcAPIServiceImpl::CheckHealth(::grpc::ServerContext* context, to_incidents->set_component(incident.component); to_incidents->set_health(incident.health); - //error + // error auto to_error = to_incidents->mutable_error(); to_error->set_code(incident.error.code); to_error->set_msg(incident.error.msg); @@ -1062,6 +1062,47 @@ ::grpc::Status RdcAPIServiceImpl::GetTopology(::grpc::ServerContext* context, static_cast<::rdc::TopologyLinkInfo_LinkType>(topology_results.link_infos[i].link_type)); linkinfos->set_p2p_accessible(topology_results.link_infos[i].is_p2p_accessible); } + return ::grpc::Status::OK; +} + +::grpc::Status RdcAPIServiceImpl::SetConfig(::grpc::ServerContext* context, + const ::rdc::SetConfigRequest* request, + ::rdc::SetConfigResponse* reply) { + (void)(context); + rdc_config_setting_t setting; + ::rdc::rdc_config_setting setting_ref = request->setting(); + setting.type = static_cast(setting_ref.type()); + setting.target_value = setting_ref.target_value(); + + rdc_status_t status = rdc_config_set(rdc_handle_, request->group_id(), setting); + reply->set_status(static_cast<::uint32_t>(status)); + return ::grpc::Status::OK; +} + +::grpc::Status RdcAPIServiceImpl::GetConfig(::grpc::ServerContext* context, + const ::rdc::GetConfigRequest* request, + ::rdc::GetConfigResponse* reply) { + (void)(context); + rdc_config_setting_list_t settings; + + rdc_status_t status = rdc_config_get(rdc_handle_, request->group_id(), &settings); + + reply->set_status(status); + for (uint32_t i = 0; i < settings.total_settings && i < RDC_MAX_CONFIG_SETTINGS; ++i) { + auto result = reply->add_settings(); + result->set_type(static_cast<::rdc::rdc_config_type>(settings.settings[i].type)); + result->set_target_value(settings.settings[i].target_value); + } + + return ::grpc::Status::OK; +} + +::grpc::Status RdcAPIServiceImpl::ClearConfig(::grpc::ServerContext* context, + const ::rdc::ClearConfigRequest* request, + ::rdc::ClearConfigResponse* reply) { + (void)(context); + rdc_status_t status = rdc_config_clear(rdc_handle_, request->group_id()); + reply->set_status(status); return ::grpc::Status::OK; }