Skip to content

Commit

Permalink
Merge amd-staging into amd-mainline 20241219
Browse files Browse the repository at this point in the history
  • Loading branch information
Arif, Maisam authored Dec 19, 2024
2 parents 30f795b + 35eb8e7 commit e52cee3
Show file tree
Hide file tree
Showing 74 changed files with 5,493 additions and 298 deletions.
181 changes: 181 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# Workflow: build the RDC packages in a ROCm build container, then install and
# smoke-test them in a second job.
name: Build RDC

# Runs for pull requests targeting the listed branches, or when started manually.
on:
pull_request:
branches: [ 'dgalants/ci', 'amd-staging', 'amd-mainline' ]
workflow_dispatch:

# Workflow-level environment, visible to every step of every job below.
env:
DEBIAN_FRONTEND: noninteractive
DEBCONF_NONINTERACTIVE_SEEN: true
# Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
BUILD_TYPE: RelWithDebInfo
ROCM_DIR: /opt/rocm
# Use vars for internal URLs
JOB_NAME: ${{ vars.JOB_NAME }}
AMDGPU_REPO_DEB: ${{ vars.AMDGPU_REPO_DEB }}
AMDGPU_REPO_URL: ${{ vars.AMDGPU_REPO_URL }}
ROCM_CI_URL: ${{ vars.ROCM_CI_URL }}
# Set env vars to values of config vars
# NOTE(review): env_var is not referenced by any step in this workflow - confirm it is still needed.
env_var: ${{ vars.ENV_CONTEXT_VAR }}

jobs:
# Build job: compiles RDC and uploads the resulting .deb packages as an artifact.
build:
runs-on: lstt
container: rocm/rocm-build-ubuntu-22.04:6.2
# Re-export the values computed by the build_number step as job outputs.
outputs:
BUILD_NUM: ${{ steps.build_number.outputs.BUILD_NUM }}
TODAY: ${{ steps.build_number.outputs.TODAY }}

steps:
- uses: actions/checkout@v3

# Prepare apt inside the build container: install helper tooling, enable the
# apt-fast PPA, and install the amdgpu-repo package used by the next step.
- name: Set up apt repos
  run: |
    # Fail fast when the repository URL variable is not configured.
    # `[ -z ... ]` is POSIX; `test ... == ""` is a bashism that dash rejects
    # with an error, which silently disabled this guard under plain sh.
    if [ -z "$AMDGPU_REPO_URL" ]; then
      echo "Error! AMDGPU_REPO_URL is EMPTY!" >&2
      exit 1
    fi
    cat /etc/os-release
    apt update -y
    # provides add-apt-repository and support for caching actions
    apt install -y software-properties-common jq nodejs
    add-apt-repository -y ppa:apt-fast/stable
    apt update -y
    apt install -y apt-fast
    # provides amdgpu-repo
    wget "$AMDGPU_REPO_URL/$AMDGPU_REPO_DEB"
    apt-fast install -y "./$AMDGPU_REPO_DEB"
# Look up the latest stable ROCm CI build, point amdgpu-repo at it, and export
# BUILD_NUM / TODAY both as environment variables and as step outputs.
- name: Get latest build number
  id: build_number
  run: |
    # -f makes curl fail on HTTP errors instead of saving an error page
    curl -fLs "${ROCM_CI_URL}/${JOB_NAME}/lastStableBuild/api/json?depth=1" -o /tmp/build_info.json
    BUILD_NUM="$(jq '.actions[] | .buildsByBranchName."refs/remotes/origin/amd-master".buildNumber | select(. != null)' /tmp/build_info.json)"
    # Refuse an empty, multi-line or non-numeric result: it would corrupt
    # GITHUB_ENV / GITHUB_OUTPUT and hand amdgpu-repo a bogus build path.
    case "$BUILD_NUM" in
      ''|*[!0-9]*)
        echo "Error! Could not determine a single build number for $JOB_NAME" >&2
        exit 1
        ;;
    esac
    echo "BUILD_NUM=$BUILD_NUM" >> "$GITHUB_ENV"
    echo "BUILD_NUM=$BUILD_NUM" >> "$GITHUB_OUTPUT"
    amdgpu-repo --rocm-build="$JOB_NAME"/"$BUILD_NUM"
    apt-fast update -y
    # useful for date-based caches
    TODAY="$(date +%Y_%m_%d)"
    echo "TODAY=$TODAY" >> "$GITHUB_ENV"
    echo "TODAY=$TODAY" >> "$GITHUB_OUTPUT"
# Install the ROCm packages and the generic build/archive tooling in one apt call.
- name: Get apt packages
run: |
apt install -y \
rocm-core \
amd-smi-lib \
rocblas \
rocblas-dev \
rocm-developer-tools \
rocm-device-libs \
rocm-smi-lib \
rocm-validation-suite \
rocprofiler-dev \
build-essential \
ccache \
cmake \
curl \
git \
gzip \
jq \
libcap-dev \
tar \
unzip \
wget \
zip \
zstd
# Restore/save the ccache directory. The key includes the ref name and the
# TODAY value exported by the build_number step; restore-keys fall back to
# progressively less specific prefixes when there is no exact match.
- name: Cache .ccache
uses: actions/cache@v4
with:
path: ~/.cache/ccache
# only create one cache per day to save time during upload
key: ${{ runner.os }}-ccache-${{ github.ref_name }}-${{ env.TODAY }}
restore-keys: |
${{ runner.os }}-ccache-${{ github.ref_name }}-
${{ runner.os }}-ccache-
# Configure into ./build with ccache as the C/C++ compiler launcher, build
# everything, then build the `package` target (CPACK_GENERATOR is set to DEB).
# BUILD_RVS is explicitly OFF here while runtime, profiler and tests are ON.
- name: Build RDC
run: |
pwd
cmake \
-B build \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DGRPC_ROOT=/usr/grpc \
-DBUILD_RUNTIME=ON \
-DBUILD_PROFILER=ON \
-DBUILD_RVS=OFF \
-DBUILD_TESTS=ON \
-DCPACK_GENERATOR="DEB" \
-DCMAKE_INSTALL_PREFIX=${ROCM_DIR}
make -C build -j $(nproc)
make -C build -j $(nproc) package
# Run the `install` target of the build tree. /opt is listed before and after,
# presumably so the log shows what the install added.
- name: Install RDC
run: |
echo "pre: "
ls -lah /opt
make -C build -j $(nproc) install
echo "post: "
ls -lah /opt
# Upload the generated build/rdc*.deb files as the "rdc" artifact (kept for
# 5 days); the step errors out if the glob matches no files.
# important to use v3 because v4 doesn't work with act:
# https://github.com/nektos/act/issues/329
- name: Package RDC
uses: actions/upload-artifact@v3
with:
name: rdc
path: build/rdc*.deb
if-no-files-found: error
retention-days: 5

# Test job: runs after `build`, in the same container image, and installs the
# packages produced by the build job.
test:
needs: build
runs-on: lstt
container: rocm/rocm-build-ubuntu-22.04:6.2

steps:
- name: Set up apt repos
run: |
cat /etc/os-release
apt update -y
# provides add-apt-repository and support for caching actions
apt install -y software-properties-common jq nodejs
# Fetch the "rdc" artifact uploaded by the build job into /opt/.
# (Previously named "Package RDC", copied from the upload step, although this
# step downloads rather than packages.)
- name: Download RDC packages
  uses: actions/download-artifact@v3
  with:
    name: rdc
    path: /opt/

# Install the downloaded .deb packages and verify that the expected binaries
# and shared libraries exist under $ROCM_DIR. Needs bash (arrays, `local`).
- name: Test RDC installation
  shell: bash
  run: |
    # exactly two rdc*.deb packages are expected in the downloaded artifact
    COUNT=$(find /opt/ -iname 'rdc*.deb' | wc -l)
    test "$COUNT" -eq '2'
    dpkg --force-all -i /opt/rdc*.deb

    # confirm binaries are installed
    # `find` exits 0 even when nothing matches, so the check has to look at
    # whether it printed a path rather than at its exit status.
    MISSING_BINS=()
    check_installed() {
      # $1: file name (matched case-insensitively)
      # $2...: find start directory followed by any extra find options
      local name="$1"
      shift
      local hits
      hits="$(find "$@" -iname "$name" || true)"
      if test -n "$hits"; then
        echo "$hits"
      else
        MISSING_BINS+=("$name")
      fi
    }
    check_installed rdcd "$ROCM_DIR/bin" -maxdepth 1
    check_installed rdci "$ROCM_DIR/bin" -maxdepth 1
    check_installed rdctst "$ROCM_DIR/share/rdc"

    # confirm that libraries are installed
    MISSING_LIBS=()
    for lib in librdc.so librdc_bootstrap.so librdc_client.so; do
      test -e "$ROCM_DIR/lib/$lib" || MISSING_LIBS+=("$lib")
    done
    for lib in librdc_rocr.so librdc_rocp.so; do
      test -e "$ROCM_DIR/lib/rdc/$lib" || MISSING_LIBS+=("$lib")
    done

    # report everything that is missing before failing the step
    FAILED=0
    if test "${#MISSING_BINS[@]}" != "0"; then
      echo "Missing binaries found!"
      for bin in "${MISSING_BINS[@]}"; do
        echo "- $bin"
      done
      FAILED=1
    fi
    if test "${#MISSING_LIBS[@]}" != "0"; then
      echo "Missing libs found!"
      for lib in "${MISSING_LIBS[@]}"; do
        echo "- $lib"
      done
      FAILED=1
    else
      echo "No missing libs found!"
    fi
    exit "$FAILED"
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,7 @@ docs/_doxygen/
# misc
__pycache__/
authentication/CA/

# act
act.variables
act.secrets
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@

Full documentation for RDC is available at [ROCm DataCenter Tool User Guide](https://rocm.docs.amd.com/projects/rdc/en/latest/).

## RDC for ROCm 6.3.0

### Added

- [RVS](https://github.com/ROCm/ROCmValidationSuite) integration
- Real-time logging for diagnostic command
- `--version` command
- `XGMI_TOTAL_READ_KB` and `XGMI_TOTAL_WRITE_KB` monitoring metrics

## RDC for ROCm 6.2.0

- Added [rocprofiler](https://github.com/ROCm/rocprofiler) dmon metrics
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ option(BUILD_PROFILER "Build targets for librdc_rocp.so" OFF)

# When cmake -DBUILD_RVS=off, it will not build the librdc_rvs.so
# which requires the RocmValidationSuite
option(BUILD_RVS "Build targets for librdc_rvs.so" OFF)
option(BUILD_RVS "Build targets for librdc_rvs.so" ON)

# When cmake -DBUILD_TESTS=off, it will not build RDC tests.
option(BUILD_TESTS "Build test suite" OFF)
Expand Down
3 changes: 2 additions & 1 deletion cmake_modules/Findrocprofiler.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ set(NAME rocprofiler)
if(NOT DEFINED ROCM_DIR)
set(ROCM_DIR "/opt/rocm")
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_DIR})

find_library(
${NAME}_LIBRARY
NAMES ${NAME} ${NAME}64
HINTS "${ROCM_DIR}"
REQUIRED
REGISTRY_VIEW BOTH
PATH_SUFFIXES lib)

Expand Down
17 changes: 15 additions & 2 deletions cmake_modules/Findrvs.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ set(NAME rvs)
if(NOT DEFINED ROCM_DIR)
set(ROCM_DIR "/opt/rocm")
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_DIR})

find_library(
${NAME}_LIBRARY
NAMES ${NAME} ${NAME}64
HINTS "${ROCM_DIR}"
NAMES ${NAME} ${NAME}64 ${NAME}lib # RVS is special and is named librvslib.so
REQUIRED
REGISTRY_VIEW BOTH
PATH_SUFFIXES lib)

Expand All @@ -35,4 +36,16 @@ if(${NAME}_FOUND AND NOT TARGET ${NAME}::${NAME})
IMPORTED_LOCATION "${${NAME}_LIBRARY}"
INTERFACE_COMPILE_OPTIONS "${PC_${NAME}_CFLAGS_OTHER}"
INTERFACE_INCLUDE_DIRECTORIES "${${NAME}_INCLUDE_DIR}")
# Resolve the library and packages that are attached below as INTERFACE link
# dependencies of the imported ${NAME}::${NAME} target. Every lookup is
# REQUIRED, so configuration stops if any of them cannot be found.
find_library(rocm-core
NAMES rocm-core
REQUIRED)
find_package(yaml-cpp REQUIRED)
find_package(rocblas REQUIRED)
find_package(hipblaslt REQUIRED)
find_package(hsakmt REQUIRED)
find_package(hip REQUIRED)
find_package(hsa-runtime64 REQUIRED)
find_package(amd_smi REQUIRED)
# ${rocm-core} expands to the result of the find_library() call above.
# NOTE(review): yaml-cpp and amd_smi are linked by plain name rather than a
# namespaced imported target - confirm these names resolve to targets exported
# by their packages and not to bare -l flags.
target_link_libraries(${NAME}::${NAME} INTERFACE
${rocm-core} yaml-cpp roc::rocblas roc::hipblaslt hsakmt::hsakmt hip::amdhip64 hsa-runtime64::hsa-runtime64 amd_smi)
endif()
14 changes: 13 additions & 1 deletion common/rdc_field.data
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ FLD_DESC_ENT(RDC_FI_GPU_TEMP, "GPU temperature in millidegrees Celsiu
FLD_DESC_ENT(RDC_FI_POWER_USAGE, "Power usage in microwatts", "POWER_USAGE", true)
FLD_DESC_ENT(RDC_FI_PCIE_TX, "PCIe Tx utilization in bytes/second", "PCIE_TX", true)
FLD_DESC_ENT(RDC_FI_PCIE_RX, "PCIe Rx utilization in bytes/second", "PCIE_RX", true)
FLD_DESC_ENT(RDC_FI_PCIE_BANDWIDTH, "PCIe bandwidth in GB/sec", "PCIE_BANDWIDTH", true)
FLD_DESC_ENT(RDC_FI_PCIE_BANDWIDTH, "PCIe bandwidth in Mbps", "PCIE_BANDWIDTH", true)

FLD_DESC_ENT(RDC_FI_GPU_UTIL, "GPU busy percentage", "GPU_UTIL", true)
FLD_DESC_ENT(RDC_FI_GPU_MEMORY_USAGE, "Memory usage of the GPU instance in bytes", "GPU_MEMORY_USAGE", true)
Expand All @@ -53,6 +53,8 @@ FLD_DESC_ENT(RDC_FI_GPU_MM_ENC_UTIL, "Mutilmedia encoder busy percentage",
FLD_DESC_ENT(RDC_FI_GPU_MM_DEC_UTIL, "Mutilmedia decoder busy percentage", "GPU_MM_DEC_UTIL", true)
FLD_DESC_ENT(RDC_FI_GPU_MEMORY_ACTIVITY, "Memory busy percentage", "GPU_MEM_UTIL", true)

FLD_DESC_ENT(RDC_FI_GPU_PAGE_RETRIED, "Retried page of the GPU instance", "GPU_PAGE_RETRIED", true)

// ECC totals
FLD_DESC_ENT(RDC_FI_ECC_CORRECT_TOTAL, "Accumulated Single Error Correction", "ECC_CORRECT", true)
FLD_DESC_ENT(RDC_FI_ECC_UNCORRECT_TOTAL, "Accumulated Double Error Detection", "ECC_UNCORRECT", true)
Expand Down Expand Up @@ -135,6 +137,7 @@ FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", false)
FLD_DESC_ENT(RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "Percent of Active Pipe VALU", "VALU_UTILIZATION", false)

// Events
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
Expand All @@ -159,3 +162,12 @@ FLD_DESC_ENT(RDC_EVNT_NOTIF_THERMAL_THROTTLE, "Clk freq decrease due to temp",
FLD_DESC_ENT(RDC_EVNT_NOTIF_PRE_RESET, "GPU reset is about to occur", "GPU_PRE_RESET", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_POST_RESET, "GPU reset just occurred", "GPU_POST_RESET", false)
FLD_DESC_ENT(RDC_EVNT_NOTIF_RING_HANG, "GPU ring hang just occured", "RING_HANG", false)

// RDC health related fields
FLD_DESC_ENT(RDC_HEALTH_XGMI_ERROR, "XGMI one or more errors detected", "XGMI_ERROR", true)
FLD_DESC_ENT(RDC_HEALTH_PCIE_REPLAY_COUNT, "Total PCIE replay count", "PCIE_REPLAY_COUNT", true)
FLD_DESC_ENT(RDC_HEALTH_PENDING_PAGE_NUM, "Pending page number", "PENDING_PAGE_NUM", true)
FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_LIMIT, "Retired page limit", "RETIRED_PAGE_LIMIT", false)
FLD_DESC_ENT(RDC_HEALTH_UNCORRECTABLE_PAGE_LIMIT, "Uncorrectable page limit", "UNCORRECTABLE_PAGE_LIMIT", false)
FLD_DESC_ENT(RDC_HEALTH_POWER_THROTTLE_TIME, "Power throttle status counter", "POWER_THROTTLE_TIME", false)
FLD_DESC_ENT(RDC_HEALTH_THERMAL_THROTTLE_TIME, "Total time(ms) in thermal throttle status", "THERMAL_THROTTLE_TIME", false)
13 changes: 13 additions & 0 deletions example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,19 @@ set(ROCPROFILER_EXAMPLE_EXE "rocprofiler")
add_executable(${ROCPROFILER_EXAMPLE_EXE} "${ROCPROFILER_EXAMPLE_SRC_LIST}")
target_link_libraries(${ROCPROFILER_EXAMPLE_EXE} pthread dl rdc_bootstrap)


# Example executable "policy", built from policy_example.cc.
set(POLICY_EXAMPLE_SRC_LIST "policy_example.cc")
cmake_print_variables(POLICY_EXAMPLE_SRC_LIST)
set(POLICY_EXAMPLE_EXE "policy")
add_executable(${POLICY_EXAMPLE_EXE} "${POLICY_EXAMPLE_SRC_LIST}")
# PRIVATE: an executable has no consumers, and the explicit keyword avoids the
# legacy keyword-less signature of target_link_libraries().
target_link_libraries(${POLICY_EXAMPLE_EXE} PRIVATE pthread dl rdc_bootstrap)

# Example executable "health", built from health_example.cc.
set(HEALTH_EXAMPLE_SRC_LIST "health_example.cc")
cmake_print_variables(HEALTH_EXAMPLE_SRC_LIST)
set(HEALTH_EXAMPLE_EXE "health")
add_executable(${HEALTH_EXAMPLE_EXE} "${HEALTH_EXAMPLE_SRC_LIST}")
# PRIVATE: an executable has no consumers, and the explicit keyword avoids the
# legacy keyword-less signature of target_link_libraries().
target_link_libraries(${HEALTH_EXAMPLE_EXE} PRIVATE pthread dl rdc_bootstrap)

message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
message(" Finished Cmake Example ")
message("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
2 changes: 1 addition & 1 deletion example/diagnostic_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ int main(int, char**) {
std::cout << " ============== Run individual diagnostic test ===========\n";
rdc_diag_test_result_t test_result;
result =
rdc_test_case_run(rdc_handle, group_id, RDC_DIAG_COMPUTE_PROCESS, nullptr, 0, &test_result);
rdc_test_case_run(rdc_handle, group_id, RDC_DIAG_COMPUTE_PROCESS, nullptr, 0, &test_result, nullptr);

if (result != RDC_ST_OK) {
std::cout << "Error run RDC_DIAG_COMPUTE_PROCESS diagnostic. Return: "
Expand Down
Loading

0 comments on commit e52cee3

Please sign in to comment.