From 5c7e14193ab184ab4663af34b29236b00ca142f6 Mon Sep 17 00:00:00 2001
From: Alexey Rivkin
Date: Sun, 14 Apr 2024 19:08:35 +0300
Subject: [PATCH] AZP/TEST: ucx_perftest over MAD RTE

---
 buildlib/pr/mad_tests.yml  | 112 +++++++++++++++++++++++++++++++++++++
 buildlib/pr/main.yml       |   9 +++
 buildlib/tools/test_mad.sh |  98 ++++++++++++++++++++++++++++++++
 3 files changed, 219 insertions(+)
 create mode 100644 buildlib/pr/mad_tests.yml
 create mode 100755 buildlib/tools/test_mad.sh

diff --git a/buildlib/pr/mad_tests.yml b/buildlib/pr/mad_tests.yml
new file mode 100644
index 00000000000..da9af77f858
--- /dev/null
+++ b/buildlib/pr/mad_tests.yml
@@ -0,0 +1,112 @@
+jobs:
+  - job: SetupServer
+    displayName: Setup Server
+    pool:
+      name: MLNX
+      demands: mad_server
+    workspace:
+      clean: outputs
+    steps:
+      - checkout: self
+        clean: true
+        fetchDepth: 100
+        retryCountOnTaskFailure: 5
+      - task: Bash@3
+        name: Set_Vars
+        inputs:
+          targetType: "inline"
+          script: |
+            source ./buildlib/tools/test_mad.sh
+            set_vars
+        displayName: Set Vars
+      - bash: |
+          source ./buildlib/tools/test_mad.sh
+          build_ucx_in_docker
+          docker_run_srv
+        displayName: Setup Server
+
+  - job: SetupClient
+    displayName: Setup Client
+    pool:
+      name: MLNX
+      demands: mad_client
+    workspace:
+      clean: outputs
+    steps:
+      - checkout: self
+        clean: true
+        fetchDepth: 100
+        retryCountOnTaskFailure: 5
+      - bash: |
+          source ./buildlib/tools/test_mad.sh
+          build_ucx
+        displayName: Setup Client
+
+  - job: TestLid
+    dependsOn:
+      - SetupServer
+      - SetupClient
+    displayName: Test Lid
+    timeoutInMinutes: 10
+    pool:
+      name: MLNX
+      demands: mad_client
+    variables:
+      LID: $[ dependencies.SetupServer.outputs['Set_Vars.LID'] ]
+      HCA: $[ dependencies.SetupServer.outputs['Set_Vars.HCA'] ]
+    steps:
+      - checkout: none
+      - bash: |
+          source ./buildlib/tools/test_mad.sh
+          run_mad_test lid:$(LID)
+        env:
+          HCA: $(HCA)
+        displayName: Test LID
+
+  - job: ServerRestart
+    dependsOn: TestLid
+    displayName: Server Restart
+    pool:
+      name: MLNX
+      demands: mad_server
+    steps:
+      - checkout: none
+      - bash: |
+          source ./buildlib/tools/test_mad.sh
+          docker_run_srv
+        displayName: Server Restart
+
+  - job: TestGuid
+    dependsOn:
+      - SetupServer
+      - ServerRestart
+    displayName: Test Guid
+    timeoutInMinutes: 10
+    pool:
+      name: MLNX
+      demands: mad_client
+    variables:
+      GUID: $[ dependencies.SetupServer.outputs['Set_Vars.GUID'] ]
+      HCA: $[ dependencies.SetupServer.outputs['Set_Vars.HCA'] ]
+    steps:
+      - checkout: none
+      - bash: |
+          source ./buildlib/tools/test_mad.sh
+          run_mad_test guid:$(GUID)
+        env:
+          HCA: $(HCA)
+        displayName: Test GUID
+
+  - job: ServerStop
+    dependsOn: TestGuid
+    displayName: Server Stop
+    condition: always()
+    pool:
+      name: MLNX
+      demands: mad_server
+    steps:
+      - checkout: none
+      - bash: |
+          source ./buildlib/tools/test_mad.sh
+          docker_stop_srv
+        displayName: Server Stop
diff --git a/buildlib/pr/main.yml b/buildlib/pr/main.yml
index 2503e6357f7..abf7de095ea 100644
--- a/buildlib/pr/main.yml
+++ b/buildlib/pr/main.yml
@@ -255,6 +255,15 @@ stages:
           long_test: $(long_test)
           test_static: $(test_static)
 
+  - stage: ucx_perftest_mad_rte
+    dependsOn: [Static_check]
+    displayName: ucx_perftest over MAD RTE
+    lockBehavior: sequential
+    variables:
+      - group: concurrency_lock
+    jobs:
+      - template: mad_tests.yml
+
   - stage: WireCompat
     dependsOn: [Static_check]
     jobs:
diff --git a/buildlib/tools/test_mad.sh b/buildlib/tools/test_mad.sh
new file mode 100755
index 00000000000..6bcee6965eb
--- /dev/null
+++ b/buildlib/tools/test_mad.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+# Helpers for the Azure "ucx_perftest over MAD RTE" jobs (see buildlib/pr/mad_tests.yml).
+# Meant to be sourced; sourcing changes directory to $BUILD_SOURCESDIRECTORY.
+set -exE -o pipefail
+
+IMAGE="rdmz-harbor.rdmz.labs.mlnx/ucx/x86_64/rhel8.2/builder:mofed-5.0-1.0.0.0"
+
+if [ -z "$BUILD_SOURCESDIRECTORY" ]; then
+    echo "Not running in Azure"
+    exit 1
+fi
+cd "$BUILD_SOURCESDIRECTORY"
+
+# Configure, build and install a release UCX with MAD support into ./install
+build_ucx() {
+    ./autogen.sh
+    ./contrib/configure-release \
+        --prefix="$PWD"/install \
+        --with-mad \
+        --without-valgrind \
+        --without-go \
+        --without-java
+    make -s -j"$(nproc)"
+    make install
+}
+
+# Run build_ucx inside the $IMAGE container, with the source tree bind-mounted
+build_ucx_in_docker() {
+    docker run --rm \
+        --name ucx_build_"$BUILD_BUILDID" \
+        -e BUILD_SOURCESDIRECTORY="$BUILD_SOURCESDIRECTORY" \
+        -v "$PWD":"$PWD" -w "$PWD" \
+        -v /hpc/local:/hpc/local \
+        "$IMAGE" \
+        bash -c "source ./buildlib/tools/test_mad.sh && build_ucx"
+
+    # Presumably the container writes files as another user; take ownership back
+    sudo chown -R swx-azure-svc:ecryptfs "$PWD"
+}
+
+# (Re)start a detached container running the ucx_perftest server on the first active HCA port
+docker_run_srv() {
+    HCA=$(detect_hca)
+    docker_stop_srv
+    docker run --rm \
+        --detach \
+        --net=host \
+        --name ucx_perftest_"$BUILD_BUILDID" \
+        -e BUILD_SOURCESDIRECTORY="$BUILD_SOURCESDIRECTORY" \
+        -v "$PWD":"$PWD" -w "$PWD" \
+        -v /hpc/local:/hpc/local \
+        --ulimit memlock=-1:-1 --device=/dev/infiniband/ \
+        "$IMAGE" \
+        bash -c "${PWD}/install/bin/ucx_perftest -K ${HCA}"
+}
+
+# Stop the perftest server container; a missing container is not an error
+docker_stop_srv() {
+    docker stop ucx_perftest_"$BUILD_BUILDID" || true
+}
+
+# Publish LID, GUID and HCA of the first active port as Azure output variables
+set_vars() {
+    # Presumably keeps xtrace from echoing the '##vso' commands below
+    set +x
+    HCA=$(detect_hca)
+    # Replace ':' with space for 'ibstat' format
+    HCA_DEV=${HCA/:/ }
+    # shellcheck disable=SC2086
+    LID=$(ibstat $HCA_DEV | grep Base | awk '{print $NF}')
+    # shellcheck disable=SC2086
+    GUID=$(ibstat $HCA_DEV | grep GUID | awk '{print $NF}')
+    echo "##vso[task.setvariable variable=LID;isOutput=true]$LID"
+    echo "##vso[task.setvariable variable=GUID;isOutput=true]$GUID"
+    echo "##vso[task.setvariable variable=HCA;isOutput=true]$HCA"
+    echo "LID: $LID"
+    echo "GUID: $GUID"
+    echo "HCA: $HCA"
+}
+
+# Run the tag_bw client against the server at $1 (e.g. lid:<LID> or guid:<GUID>);
+# the local device is taken from $HCA
+run_mad_test() {
+    local ib_address="$1"
+    sudo chmod 777 /dev/infiniband/umad*
+    "$PWD"/install/bin/ucx_perftest -t tag_bw -e -K "$HCA" -e "$ib_address"
+}
+
+# Print the first active HCA port as <hca>:<port>; fail if there is none
+detect_hca() {
+    # awk consumes all input instead of exiting at the first match, so that
+    # ibv_devinfo is never killed by SIGPIPE (fatal under 'set -o pipefail')
+    ibv_devinfo | awk '
+        /hca_id:/ {hca=$2}
+        /port:/ {port=$2}
+        /PORT_ACTIVE/ && !found {print hca ":" port; found=1}
+        END {if (!found) {print "No active HCA port found" > "/dev/stderr"; exit 1}}'
+}