Skip to content

Commit

Permalink
AZP/TEST: ucx_perftest over MAD RTE
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexey-Rivkin committed Apr 14, 2024
1 parent 1c25669 commit 5c7e141
Show file tree
Hide file tree
Showing 3 changed files with 202 additions and 0 deletions.
112 changes: 112 additions & 0 deletions buildlib/pr/mad_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
jobs:
# Server-side setup on the agent that satisfies the 'mad_server' demand:
# publishes the HCA/LID/GUID of the server as job output variables, builds UCX
# inside the builder container and starts the detached ucx_perftest server.
- job: SetupServer
  displayName: Setup Server
  pool:
    name: MLNX
    demands: mad_server
  workspace:
    clean: outputs
  steps:
    - checkout: self
      clean: true
      fetchDepth: 100
      retryCountOnTaskFailure: 5
    # The step name is part of the output-variable path used by the test jobs
    # (dependencies.SetupServer.outputs['Set_Vars.<NAME>']); do not rename it.
    - bash: |
        source ./buildlib/tools/test_mad.sh
        set_vars
      name: Set_Vars
      displayName: Set Vars
    - bash: |
        source ./buildlib/tools/test_mad.sh
        build_ucx_in_docker
        docker_run_srv
      displayName: Setup Server
# Client-side setup on the agent that satisfies the 'mad_client' demand:
# checks out the sources and builds/installs UCX directly on the host.
- job: SetupClient
  displayName: Setup Client
  pool:
    name: MLNX
    demands: mad_client
  workspace:
    clean: outputs
  steps:
    - checkout: self
      clean: true
      fetchDepth: 100
      retryCountOnTaskFailure: 5
    - bash: |
        source ./buildlib/tools/test_mad.sh
        build_ucx
      displayName: Setup Client
# Runs the client side of the perftest, addressing the server by its LID.
- job: TestLid
  displayName: Test Lid
  dependsOn:
    - SetupServer
    - SetupClient
  timeoutInMinutes: 10
  pool:
    name: MLNX
    demands: mad_client
  variables:
    # Values published by the 'Set_Vars' step of the SetupServer job.
    LID: $[ dependencies.SetupServer.outputs['Set_Vars.LID'] ]
    HCA: $[ dependencies.SetupServer.outputs['Set_Vars.HCA'] ]
  steps:
    # NOTE(review): no checkout here, yet the script is sourced from the
    # working tree - presumably this relies on the tree left behind by
    # SetupClient on the same agent; confirm.
    - checkout: none
    - bash: |
        source ./buildlib/tools/test_mad.sh
        run_mad_test lid:$(LID)
      env:
        HCA: $(HCA)
      displayName: Test LID
# Restarts the ucx_perftest server container once the LID test has finished.
# NOTE(review): presumably needed because the server does not keep serving
# after a test run - confirm.
- job: ServerRestart
  displayName: Server Restart
  dependsOn: TestLid
  pool:
    name: MLNX
    demands: mad_server
  steps:
    - checkout: none
    - bash: |
        source ./buildlib/tools/test_mad.sh
        docker_run_srv
      displayName: Server Restart
# Runs the client side of the perftest, addressing the server by its GUID.
- job: TestGuid
  displayName: Test Guid
  dependsOn:
    # SetupServer is listed explicitly so that its output variables can be
    # referenced below; ServerRestart provides the ordering.
    - SetupServer
    - ServerRestart
  timeoutInMinutes: 10
  pool:
    name: MLNX
    demands: mad_client
  variables:
    # Values published by the 'Set_Vars' step of the SetupServer job.
    GUID: $[ dependencies.SetupServer.outputs['Set_Vars.GUID'] ]
    HCA: $[ dependencies.SetupServer.outputs['Set_Vars.HCA'] ]
  steps:
    - checkout: none
    - bash: |
        source ./buildlib/tools/test_mad.sh
        run_mad_test guid:$(GUID)
      env:
        HCA: $(HCA)
      displayName: Test GUID
# Cleanup: stops the ucx_perftest server container.
- job: ServerStop
  displayName: Server Stop
  dependsOn: TestGuid
  # Run the cleanup regardless of the outcome of the preceding jobs.
  condition: always()
  pool:
    name: MLNX
    demands: mad_server
  steps:
    - checkout: none
    - bash: |
        source ./buildlib/tools/test_mad.sh
        docker_stop_srv
      displayName: Server Stop
9 changes: 9 additions & 0 deletions buildlib/pr/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,15 @@ stages:
long_test: $(long_test)
test_static: $(test_static)
# ucx_perftest over the MAD runtime environment; jobs come from mad_tests.yml.
- stage: ucx_perftest_mad_rte
  displayName: ucx_perftest over MAD RTE
  dependsOn: [Static_check]
  # NOTE(review): presumably paired with an exclusive-lock check attached to
  # the 'concurrency_lock' variable group so that runs of this stage are
  # serialized - confirm in the project settings.
  lockBehavior: sequential
  variables:
    - group: concurrency_lock
  jobs:
    - template: mad_tests.yml

- stage: WireCompat
dependsOn: [Static_check]
jobs:
Expand Down
81 changes: 81 additions & 0 deletions buildlib/tools/test_mad.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/bin/bash
# Helper functions for the ucx_perftest-over-MAD pipeline jobs. The file is
# meant to be sourced; sourcing it enables strict/traced shell mode and changes
# directory to the Azure sources directory.
set -exE -o pipefail

# Container image used for the UCX build and for the perftest server.
IMAGE="rdmz-harbor.rdmz.labs.mlnx/ucx/x86_64/rhel8.2/builder:mofed-5.0-1.0.0.0"

# BUILD_SOURCESDIRECTORY is required; bail out when it is empty or unset.
[ -n "$BUILD_SOURCESDIRECTORY" ] || {
    echo "Not running in Azure"
    exit 1
}
cd "$BUILD_SOURCESDIRECTORY"

# Configure, build and install a release UCX with MAD support into
# "$PWD"/install. Must be run from the top of the UCX source tree.
build_ucx() {
    local -a configure_opts=(
        --prefix="$PWD"/install
        --with-mad
        --without-valgrind
        --without-go
        --without-java
    )

    ./autogen.sh
    ./contrib/configure-release "${configure_opts[@]}"

    local jobs
    jobs=$(nproc)
    make -s -j"$jobs"
    make install
}

# Build UCX inside the builder container by re-sourcing this script there and
# calling build_ucx, then hand ownership of the work tree back to the CI user.
#
# Returns the exit status of the containerized build. Ownership is restored
# even when the build fails: under 'set -e' a failing 'docker run' would
# otherwise abort before the chown and leave container-created files behind in
# the agent's workspace.
build_ucx_in_docker() {
    local rc=0

    docker run --rm \
        --name ucx_build_"$BUILD_BUILDID" \
        -e BUILD_SOURCESDIRECTORY="$BUILD_SOURCESDIRECTORY" \
        -v "$PWD":"$PWD" -w "$PWD" \
        -v /hpc/local:/hpc/local \
        "$IMAGE" \
        bash -c "source ./buildlib/tools/test_mad.sh && build_ucx" || rc=$?

    sudo chown -R swx-azure-svc:ecryptfs "$PWD"

    return "$rc"
}

# (Re)start the detached ucx_perftest server container for this build on the
# first active HCA port. Any previous server container of the same build is
# stopped first. Sets the global HCA.
docker_run_srv() {
    HCA=$(detect_hca)
    docker_stop_srv

    local -a run_opts=(
        --rm
        --detach
        --net=host
        --name ucx_perftest_"$BUILD_BUILDID"
        -e BUILD_SOURCESDIRECTORY="$BUILD_SOURCESDIRECTORY"
        -v "$PWD":"$PWD" -w "$PWD"
        -v /hpc/local:/hpc/local
        --ulimit memlock=-1:-1 --device=/dev/infiniband/
    )
    local server_cmd="${PWD}/install/bin/ucx_perftest -K ${HCA}"

    docker run "${run_opts[@]}" "$IMAGE" bash -c "$server_cmd"
}

# Stop the perftest server container of this build. Best effort: a failure of
# 'docker stop' (e.g. no such container) is ignored.
docker_stop_srv() {
    local container=ucx_perftest_"$BUILD_BUILDID"
    docker stop "$container" || true
}

# Publish the HCA, LID and GUID of the first active port as Azure output
# variables (isOutput=true) and echo them to the log. Sets the globals HCA,
# HCA_DEV, LID and GUID.
set_vars() {
    # Turn command tracing off before printing the '##vso' logging commands.
    set +x

    HCA=$(detect_hca)
    # 'ibstat' takes "<device> <port>", so turn the ':' separator into a space
    HCA_DEV=${HCA/:/ }

    # HCA_DEV is left unquoted on purpose: it must split into two arguments.
    # shellcheck disable=SC2086
    LID=$(ibstat $HCA_DEV | grep Base | awk '{print $NF}')
    # shellcheck disable=SC2086
    GUID=$(ibstat $HCA_DEV | grep GUID | awk '{print $NF}')

    printf '%s\n' \
        "##vso[task.setvariable variable=LID;isOutput=true]$LID" \
        "##vso[task.setvariable variable=GUID;isOutput=true]$GUID" \
        "##vso[task.setvariable variable=HCA;isOutput=true]$HCA" \
        "LID: $LID" \
        "GUID: $GUID" \
        "HCA: $HCA"
}

# Run the client side of a tag_bw perftest against the given server address.
#   $1 - server address as passed by the pipeline: "lid:<LID>" or "guid:<GUID>"
# Reads the HCA to use from the HCA environment variable.
run_mad_test() {
    local server_addr=$1
    local perftest="${PWD}/install/bin/ucx_perftest"

    # Open up the umad device nodes before running the test binary.
    sudo chmod 777 /dev/infiniband/umad*
    "$perftest" -t tag_bw -e -K "$HCA" -e "$server_addr"
}

# Print the first active HCA port as "<hca_id>:<port>" (e.g. "mlx5_0:1").
#
# Returns non-zero, with a message on stderr, when no port is in PORT_ACTIVE
# state, so that callers running under 'set -e' stop instead of continuing
# with an empty HCA.
detect_hca() {
    # awk deliberately reads its input to the end instead of exiting on the
    # first match: an early exit can kill ibv_devinfo with SIGPIPE, which
    # 'set -o pipefail' would turn into a failure of a successful detection.
    ibv_devinfo | awk '
        /hca_id:/ { hca = $2 }
        /port:/ { port = $2 }
        /PORT_ACTIVE/ && !found { print hca ":" port; found = 1 }
        END {
            if (!found) {
                print "No active HCA port found" > "/dev/stderr"
                exit 1
            }
        }
    '
}

0 comments on commit 5c7e141

Please sign in to comment.