Skip to content

Commit

Permalink
Enable RDC policy feature
Browse files Browse the repository at this point in the history
1. Add policy APIs
2. Add policy example for policy API usage

Change-Id: I14deb7c809d0b865b7bb083842092fc37868025e
Signed-off-by: Chao Fei <[email protected]>
  • Loading branch information
Chao Fei authored and Chao Fei committed Oct 24, 2024
1 parent 4bd31b6 commit 345ac64
Show file tree
Hide file tree
Showing 19 changed files with 1,382 additions and 18 deletions.
2 changes: 2 additions & 0 deletions common/rdc_field.data
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ FLD_DESC_ENT(RDC_FI_GPU_MM_ENC_UTIL, "Mutilmedia encoder busy percentage",
FLD_DESC_ENT(RDC_FI_GPU_MM_DEC_UTIL, "Mutilmedia decoder busy percentage", "GPU_MM_DEC_UTIL", true)
FLD_DESC_ENT(RDC_FI_GPU_MEMORY_ACTIVITY, "Memory busy percentage", "GPU_MEM_UTIL", true)

// GPU page related fields
FLD_DESC_ENT(RDC_FI_GPU_PAGE_RETRIED, "Retried page of the GPU instance", "GPU_PAGE_RETRIED", true)

// ECC totals
FLD_DESC_ENT(RDC_FI_ECC_CORRECT_TOTAL, "Accumulated Single Error Correction", "ECC_CORRECT", true)
FLD_DESC_ENT(RDC_FI_ECC_UNCORRECT_TOTAL, "Accumulated Double Error Detection", "ECC_UNCORRECT", true)
Expand Down
7 changes: 7 additions & 0 deletions example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,13 @@ set(ROCPROFILER_EXAMPLE_EXE "rocprofiler")
add_executable(${ROCPROFILER_EXAMPLE_EXE} "${ROCPROFILER_EXAMPLE_SRC_LIST}")
target_link_libraries(${ROCPROFILER_EXAMPLE_EXE} pthread dl rdc_bootstrap)


# Policy API example: builds policy_example.cc into the "policy" executable and
# links it against pthread, dl and rdc_bootstrap, like the example target above.
set(POLICY_EXAMPLE_SRC_LIST "policy_example.cc")
cmake_print_variables(POLICY_EXAMPLE_SRC_LIST)
set(POLICY_EXAMPLE_EXE "policy")
add_executable(${POLICY_EXAMPLE_EXE} "${POLICY_EXAMPLE_SRC_LIST}")
target_link_libraries(${POLICY_EXAMPLE_EXE} pthread dl rdc_bootstrap)

message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" Finished Cmake Example ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
195 changes: 195 additions & 0 deletions example/policy_example.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
#include <unistd.h>

#include <ctime>
#include <iostream>
#include <limits>

#include "rdc/rdc.h"

// Translate a policy condition type into the human readable label used in the
// example's output. Any value without a dedicated label maps to "Unknown_Type".
static const char* condition_type_to_str(rdc_policy_condition_type_t type) {
  switch (type) {
    case RDC_POLICY_COND_MAX_PAGE_RETRIED:
      return "Retried Page Limit";
    case RDC_POLICY_COND_THERMAL:
      return "Temperature Limit";
    case RDC_POLICY_COND_POWER:
      return "Power Limit";
    default:
      return "Unknown_Type";
  }
}

// Timestamp of the most recent notification that was actually printed.
static time_t last_time = 0;

// Policy callback registered with rdc_policy_register() by this example.
// Prints which condition was exceeded together with its threshold and the
// current value, but at most once every 5 seconds so the console is not flooded.
// Returns 1 when no response data was supplied, 0 otherwise.
int rdc_policy_callback(rdc_policy_callback_response_t* userData) {
  if (!userData) {
    std::cerr << "The rdc_policy_callback returns null data\n";
    return 1;
  }

  const time_t current = time(NULL);
  const bool rate_limited = difftime(current, last_time) < 5;
  if (!rate_limited) {
    const char* label = condition_type_to_str(userData->condition.type);
    std::cout << "The " << label << " exceeds the threshold " << userData->condition.value
              << " with the value " << userData->value << std::endl;
    last_time = current;  // remember when we last printed
  }
  return 0;
}

int main() {
rdc_gpu_group_t group_id;
rdc_status_t result;
bool standalone = false;
rdc_handle_t rdc_handle;
uint32_t count = 0;

char hostIpAddress[] = {"localhost:50051"};
char group_name[] = {"group1"};

// Select the embedded mode and standalone mode dynamically.
std::cout << "Start rdci in: \n";
std::cout << "0 - Embedded mode \n";
std::cout << "1 - Standalone mode \n";
while (!(std::cin >> standalone)) {
std::cout << "Invalid input.\n";
std::cin.clear();
std::cin.ignore();
}
std::cout << std::endl;
std::cout << (standalone ? "Standalone mode selected.\n" : "Embedded mode selected.\n");

// Init the rdc
result = rdc_init(0);

if (result != RDC_ST_OK) {
std::cout << "Error initializing RDC. Return: " << rdc_status_string(result) << std::endl;
goto cleanup;
} else {
std::cout << "RDC Initialized.\n";
}

if (standalone) { // standalone
result = rdc_connect(hostIpAddress, &rdc_handle, nullptr, nullptr, nullptr);
if (result != RDC_ST_OK) {
std::cout << "Error connecting to remote rdcd. Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}
} else { // embedded
result = rdc_start_embedded(RDC_OPERATION_MODE_AUTO, &rdc_handle);
if (result != RDC_ST_OK) {
std::cout << "Error starting embedded RDC engine. Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}
}

// Now we can use the same API for both standalone and embedded
// Get the list of devices in the system
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES];
result = rdc_device_get_all(rdc_handle, gpu_index_list, &count);
if (result != RDC_ST_OK) {
std::cout << "Error to find devices on the system. Return: " << rdc_status_string(result);
goto cleanup;
}
if (count == 0) {
std::cout << "No GPUs find on the sytem ";
goto cleanup;
} else {
std::cout << count << " GPUs found in the system.\n";
}

// Create the group
result = rdc_group_gpu_create(rdc_handle, RDC_GROUP_EMPTY, group_name, &group_id);
if (result != RDC_ST_OK) {
std::cout << "Error creating group. Return: " << rdc_status_string(result);
goto cleanup;
}
std::cout << "Created the GPU group " << group_id << std::endl;

// Add all GPUs to the group
for (uint32_t i = 0; i < count; i++) {
result = rdc_group_gpu_add(rdc_handle, group_id, gpu_index_list[i]); // Add GPU 0
if (result != RDC_ST_OK) {
std::cout << "Error adding group. Return: " << rdc_status_string(result);
goto cleanup;
}
rdc_device_attributes_t attribute;
result = rdc_device_get_attributes(rdc_handle, gpu_index_list[i], &attribute);
if (result != RDC_ST_OK) {
std::cout << "Error get GPU attribute. Return: " << rdc_status_string(result);
goto cleanup;
}
std::cout << "Add GPU " << gpu_index_list[i] << ":" << attribute.device_name << " to group "
<< group_id << std::endl;
}

// Define a policy to print out message when temperature is above 30 degree
// or power usage is more than 150W
rdc_policy_t policy;
policy.condition = {RDC_POLICY_COND_THERMAL, 30 * 1000}; // convert to milli degree
policy.action = RDC_POLICY_ACTION_NONE; // Notify only
result = rdc_policy_set(rdc_handle, group_id, policy);
if (result != RDC_ST_OK) {
std::cout << "Error set policy RDC_POLICY_COND_THERMAL, Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}

policy.condition = {RDC_POLICY_COND_POWER, 150000}; // convert to milli degree
policy.action = RDC_POLICY_ACTION_NONE; // Notify only
result = rdc_policy_set(rdc_handle, group_id, policy);
if (result != RDC_ST_OK) {
std::cout << "Error set policy RDC_POLICY_COND_POWER, Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}

policy.condition = {RDC_POLICY_COND_MAX_PAGE_RETRIED, 100}; // convert to milli degree
policy.action = RDC_POLICY_ACTION_NONE; // Notify only
result = rdc_policy_set(rdc_handle, group_id, policy);
if (result != RDC_ST_OK) {
std::cout << "Error set policy RDC_POLICY_COND_MAX_PAGE_RETRIED, Return: " << rdc_status_string(result)
<< std::endl;
goto cleanup;
}

rdc_policy_t policy_get[RDC_MAX_POLICY_SETTINGS];
result = rdc_policy_get(rdc_handle, group_id, &count, policy_get);
if (result != RDC_ST_OK) {
std::cout << "Error get policy, Return: " << rdc_status_string(result) << std::endl;
goto cleanup;
}

// Register a function to listen to the events
result = rdc_policy_register(rdc_handle, group_id, rdc_policy_callback);
if (result != RDC_ST_OK) {
std::cout << "Error register policy, Return: " << rdc_status_string(result) << std::endl;
goto cleanup;
}

std::cout << "Wait 30 seconds for the events happening ...\n" << std::endl;

// If the events happening, the callback rdc_policy_register_callback will be called.
usleep(30 * 1000000); // sleep 30 seconds

// Un-register the events
result = rdc_policy_unregister(rdc_handle, group_id);
if (result != RDC_ST_OK) {
std::cout << "Error unregister policy, Return: " << rdc_status_string(result) << std::endl;
goto cleanup;
}

// clear the events
rdc_policy_condition_type_t condition_type;
condition_type = RDC_POLICY_COND_THERMAL;
result = rdc_policy_delete(rdc_handle, group_id, condition_type);
if (result != RDC_ST_OK) {
std::cout << "Error clear policy, Return: " << rdc_status_string(result) << std::endl;
goto cleanup;
}

//... clean up
cleanup:
std::cout << "Cleaning up.\n";
if (standalone)
rdc_disconnect(rdc_handle);
else
rdc_stop_embedded(rdc_handle);
rdc_shutdown();
return result;
}
130 changes: 130 additions & 0 deletions include/rdc/rdc.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,10 @@ typedef enum {
RDC_FI_GPU_MM_DEC_UTIL, //!< Multimedia decoder busy percentage
RDC_FI_GPU_MEMORY_ACTIVITY, //!< Memory busy percentage

/**
* @brief GPU page related fields
*/
RDC_FI_GPU_PAGE_RETRIED = 550, //!< Retried page of the GPU instance
/**
* @brief ECC related fields
*/
Expand Down Expand Up @@ -552,6 +556,31 @@ typedef struct {
rdc_diag_test_result_t diag_info[MAX_TEST_CASES];
} rdc_diag_response_t;

/**
* @brief The policy condition types supported by RDC. The type selects what the
* threshold in rdc_policy_condition_t::value refers to and, for the thermal and
* power types, its unit.
*/
typedef enum {
RDC_POLICY_COND_MAX_PAGE_RETRIED, //!< Max number of pages retired (NOTE(review): identifier says RETRIED, confirm the intended term)
RDC_POLICY_COND_THERMAL, //!< Temperature threshold, millidegree Celsius
RDC_POLICY_COND_POWER, //!< Power threshold, unit milliwatt
RDC_POLICY_COND_MAX //!< Number of condition types above; presumably a sentinel, not a usable condition (confirm)
} rdc_policy_condition_type_t;

/**
* @brief A policy condition: a condition type paired with its threshold value.
*/
typedef struct {
rdc_policy_condition_type_t type; //!< Which kind of condition this is
int64_t value; //!< Threshold; unit depends on type (see rdc_policy_condition_type_t)
} rdc_policy_condition_t;

/**
* @brief The action associated with a policy (see rdc_policy_t::action).
* The policy example uses RDC_POLICY_ACTION_NONE for notify-only policies.
* RDC_POLICY_ACTION_GPU_RESET presumably requests a GPU reset (confirm against
* the implementation).
*/
typedef enum { RDC_POLICY_ACTION_NONE, RDC_POLICY_ACTION_GPU_RESET } rdc_policy_action_t;

/**
* @brief The structure to define a policy to enforce on a GPU group: the
* condition to watch for and the action associated with it.
*/
typedef struct {
rdc_policy_condition_t condition; //!< Condition (type and threshold) to meet
rdc_policy_action_t action; //!< Action to take
} rdc_policy_t;

/**
* @brief Initialize ROCm RDC.
*
Expand Down Expand Up @@ -1131,6 +1160,107 @@ rdc_field_t get_field_id_from_name(const char* name);
*/
const char* rdc_diagnostic_result_string(rdc_diag_result_t result);

/**
* @brief Set an RDC policy on a GPU group.
*
* @details A group can hold multiple policies; they are set one at a time with
* this API. Calling this API again will override the existing policy.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id.
*
* @param[in] policy The policy (condition and action) to set.
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_policy_set(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_policy_t policy);

/**
* @brief Capacity of the policies array passed to rdc_policy_get().
*/
#define RDC_MAX_POLICY_SETTINGS 32

/**
* @brief Get the RDC policies of a GPU group.
*
* @details Retrieves the policies of the group into a caller-provided array
* with room for RDC_MAX_POLICY_SETTINGS entries.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id.
*
* @param[out] count The size of policies array (presumably the number of
* entries written to policies; confirm against the implementation).
*
* @param[out] policies The array that receives the policies.
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_policy_get(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id, uint32_t* count,
rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]);

/**
* @brief Delete the RDC policy for this group based on condition type.
*
* @details Clears the RDC policy of this group that matches the condition type.
* In a GPU group, only one policy can be set for a specific
* rdc_policy_condition_type_t, so the condition type identifies the policy.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id.
*
* @param[in] condition_type The condition type of the policy to delete.
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/

rdc_status_t rdc_policy_delete(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_policy_condition_type_t condition_type);

/**
* @brief The structure passed to the RDC policy callback.
*/
typedef struct {
unsigned int version; //!< Structure version (NOTE(review): semantics not documented here, confirm)
rdc_policy_condition_t condition; //!< The condition that is met
rdc_gpu_group_t group_id; //!< The id of the group that triggered this callback
int64_t value; //!< The current value that meets the condition
} rdc_policy_callback_response_t;

/**
* @brief Signature of the policy callback passed to rdc_policy_register().
* The user data is the rdc_policy_callback_response_t describing the condition.
* NOTE(review): the meaning of the int return value is not documented here;
* the policy example returns 0 normally and 1 when userData is null.
*/
typedef int (*rdc_policy_register_callback)(rdc_policy_callback_response_t* userData);

/**
* @brief Register a function to be called when a policy condition is met.
*
* @details Register the RDC policy callback for a GPU group.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id.
*
* @param[in] callback The callback function to be called when a condition is met.
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_policy_register(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id,
rdc_policy_register_callback callback);

/**
* @brief Un-register the policy callback function of a GPU group.
*
* @details Un-register the policy callback previously registered for the group.
*
* @param[in] p_rdc_handle The RDC handler.
*
* @param[in] group_id The GPU group id.
*
* @retval ::RDC_ST_OK is returned upon successful call.
*/
rdc_status_t rdc_policy_unregister(rdc_handle_t p_rdc_handle, rdc_gpu_group_t group_id);

#ifdef __cplusplus
}
#endif // __cplusplus
Expand Down
13 changes: 13 additions & 0 deletions include/rdc_lib/RdcHandler.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,19 @@ class RdcHandler {
// It is just a client interface under the GRPC framework and is not used as an RDC API.
// The reason is that RdcEmbeddedHandler::get_mixed_component_version does not need to be called.
virtual rdc_status_t get_mixed_component_version(mixed_component_t component, mixed_component_version_t* p_mixed_compv) = 0;
// Policy API
// Handler-side counterparts of the rdc_policy_* functions declared in rdc/rdc.h;
// the signatures are the same minus the leading rdc_handle_t parameter.

// Set one policy on the group.
virtual rdc_status_t rdc_policy_set(rdc_gpu_group_t group_id, rdc_policy_t policy) = 0;

// Retrieve the group's policies into an array of RDC_MAX_POLICY_SETTINGS entries.
virtual rdc_status_t rdc_policy_get(rdc_gpu_group_t group_id, uint32_t* count,
rdc_policy_t policies[RDC_MAX_POLICY_SETTINGS]) = 0;

// Delete the group's policy for the given condition type.
virtual rdc_status_t rdc_policy_delete(rdc_gpu_group_t group_id,
rdc_policy_condition_type_t condition_type) = 0;

// Register the callback for the group's policies.
virtual rdc_status_t rdc_policy_register(rdc_gpu_group_t group_id,
rdc_policy_register_callback callback) = 0;

// Un-register the group's policy callback.
virtual rdc_status_t rdc_policy_unregister(rdc_gpu_group_t group_id) = 0;

virtual ~RdcHandler() {}
};
Expand Down
Loading

0 comments on commit 345ac64

Please sign in to comment.