-
Notifications
You must be signed in to change notification settings - Fork 428
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AZP/TEST: Add MAD tests #9735
AZP/TEST: Add MAD tests #9735
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
jobs: | ||
# SetupServer: runs on an agent satisfying the 'mad_server' demand. It checks
# out the sources, publishes the server's LID/GUID/HCA as output variables,
# then builds UCX inside Docker and starts the perftest server container.
- job: SetupServer | ||
displayName: Setup Server | ||
pool: | ||
name: MLNX | ||
demands: mad_server | ||
workspace: | ||
clean: outputs | ||
steps: | ||
- checkout: self | ||
clean: true | ||
fetchDepth: 100 | ||
retryCountOnTaskFailure: 5 | ||
# The step name 'Set_Vars' is what dependent jobs use to address the output
# variables, e.g. dependencies.SetupServer.outputs['Set_Vars.LID'].
- task: Bash@3 | ||
name: Set_Vars | ||
inputs: | ||
targetType: "inline" | ||
script: | | ||
source ./buildlib/tools/test_mad.sh | ||
set_vars | ||
displayName: Set Vars | ||
# Build first, then start the server: docker_run_srv launches the perftest
# binary from the ./install tree.
- bash: | | ||
source ./buildlib/tools/test_mad.sh | ||
build_ucx_in_docker | ||
docker_run_srv | ||
displayName: Setup Server | ||
|
||
# SetupClient: runs on an agent satisfying the 'mad_client' demand. It checks
# out the sources and builds UCX directly on the agent (build_ucx, no Docker).
- job: SetupClient | ||
displayName: Setup Client | ||
pool: | ||
name: MLNX | ||
demands: mad_client | ||
workspace: | ||
clean: outputs | ||
steps: | ||
- checkout: self | ||
clean: true | ||
fetchDepth: 100 | ||
retryCountOnTaskFailure: 5 | ||
- bash: | | ||
source ./buildlib/tools/test_mad.sh | ||
build_ucx | ||
displayName: Setup Client | ||
|
||
# TestLid: after both setup jobs have finished, runs the MAD perftest from a
# 'mad_client' agent, addressing the server by its LID.
- job: TestLid | ||
dependsOn: | ||
- SetupServer | ||
- SetupClient | ||
displayName: Test Lid | ||
timeoutInMinutes: 10 | ||
pool: | ||
name: MLNX | ||
demands: mad_client | ||
# LID and HCA are the output variables published by SetupServer's Set_Vars step.
variables: | ||
LID: $[ dependencies.SetupServer.outputs['Set_Vars.LID'] ] | ||
HCA: $[ dependencies.SetupServer.outputs['Set_Vars.HCA'] ] | ||
steps: | ||
# No checkout here: the step sources the script and uses the build from the
# agent workspace, presumably the one left by SetupClient - TODO confirm that
# both jobs always land on the same agent.
- checkout: none | ||
- bash: | | ||
source ./buildlib/tools/test_mad.sh | ||
run_mad_test lid:$(LID) | ||
env: | ||
HCA: $(HCA) | ||
displayName: Test LID | ||
|
||
# ServerRestart: once TestLid is done, restarts the perftest server container
# on the 'mad_server' agent (docker_run_srv stops any running one first).
- job: ServerRestart | ||
dependsOn: TestLid | ||
displayName: Server Restart | ||
pool: | ||
name: MLNX | ||
demands: mad_server | ||
steps: | ||
# No checkout: relies on the sources and build already being in the agent
# workspace, presumably from SetupServer - TODO confirm.
- checkout: none | ||
- bash: | | ||
source ./buildlib/tools/test_mad.sh | ||
docker_run_srv | ||
displayName: Server Restart | ||
|
||
# TestGuid: after the server has been restarted, runs the MAD perftest from a
# 'mad_client' agent, addressing the server by its GUID.
- job: TestGuid | ||
# SetupServer is listed as a dependency as well because the variables below
# read its outputs.
dependsOn: | ||
- SetupServer | ||
- ServerRestart | ||
displayName: Test Guid | ||
timeoutInMinutes: 10 | ||
pool: | ||
name: MLNX | ||
demands: mad_client | ||
# GUID and HCA are the output variables published by SetupServer's Set_Vars step.
variables: | ||
GUID: $[ dependencies.SetupServer.outputs['Set_Vars.GUID'] ] | ||
HCA: $[ dependencies.SetupServer.outputs['Set_Vars.HCA'] ] | ||
steps: | ||
- checkout: none | ||
- bash: | | ||
source ./buildlib/tools/test_mad.sh | ||
run_mad_test guid:$(GUID) | ||
env: | ||
HCA: $(HCA) | ||
displayName: Test GUID | ||
|
||
# ServerStop: stops the perftest server container on the 'mad_server' agent
# once TestGuid has completed.
- job: ServerStop | ||
dependsOn: TestGuid | ||
displayName: Server Stop | ||
# always(): run this cleanup even when an earlier job failed or was canceled.
condition: always() | ||
pool: | ||
name: MLNX | ||
demands: mad_server | ||
steps: | ||
- checkout: none | ||
- bash: | | ||
source ./buildlib/tools/test_mad.sh | ||
docker_stop_srv | ||
displayName: Server Stop |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
#!/bin/bash | ||
# Helper functions for the MAD tests. The pipeline steps and the Docker build
# command source ./buildlib/tools/test_mad.sh and then call one function;
# presumably that path is this file - TODO confirm.
# -e: exit on error, -x: trace commands, -E: ERR traps are inherited by
# functions, pipefail: a pipeline fails if any of its stages fails.
set -exE -o pipefail | ||
|
||
# Docker image used both to build UCX and to run the perftest server.
IMAGE="rdmz-harbor.rdmz.labs.mlnx/ucx/x86_64/rhel8.2/builder:mofed-5.0-1.0.0.0" | ||
|
||
# Refuse to run when BUILD_SOURCESDIRECTORY is unset or empty; otherwise make
# it the working directory, since every function below uses $PWD-relative paths.
# NOTE: because this file is sourced, 'exit 1' terminates the sourcing shell.
if [ -z "$BUILD_SOURCESDIRECTORY" ]; then | ||
echo "Not running in Azure" | ||
exit 1 | ||
fi | ||
cd "$BUILD_SOURCESDIRECTORY" | ||
|
||
# Generate the build system, configure a release build of UCX with MAD support
# (valgrind, Go and Java disabled), then build and install it.
# Everything happens in the current directory; the install prefix is
# "$PWD"/install.
build_ucx() {
    local -a configure_opts=(
        --prefix="$PWD"/install
        --with-mad
        --without-valgrind
        --without-go
        --without-java
    )

    ./autogen.sh
    ./contrib/configure-release "${configure_opts[@]}"

    # Quiet, parallel build using every available processor
    make -s -j"$(nproc)"
    make install
}
|
||
# Build UCX inside the builder Docker image, then hand ownership of the
# workspace back to the Azure service account.
# Globals:
#   IMAGE, BUILD_BUILDID, BUILD_SOURCESDIRECTORY, PWD (read)
# Returns:
#   The exit status of the containerized build; if the build succeeded but
#   the ownership fix-up failed, a non-zero status.
build_ucx_in_docker() {
    local rc=0

    # Capture the status instead of letting 'set -e' abort here: the
    # ownership fix-up below must run even when the build fails.
    docker run --rm \
        --name ucx_build_"$BUILD_BUILDID" \
        -e BUILD_SOURCESDIRECTORY="$BUILD_SOURCESDIRECTORY" \
        -v "$PWD":"$PWD" -w "$PWD" \
        -v /hpc/local:/hpc/local \
        "$IMAGE" \
        bash -c "source ./buildlib/tools/test_mad.sh && build_ucx" || rc=$?

    # Files written by the container belong to the container's user; without
    # this the agent cannot clean the workspace on the next run.
    if ! sudo chown -R swx-azure-svc:ecryptfs "$PWD"; then
        [ "$rc" -ne 0 ] || rc=1
    fi

    return "$rc"
}
|
||
# Start (or restart) the ucx_perftest server for this build in a detached
# Docker container using host networking.
# Globals:
#   HCA (written) - "device:port" of the first active port, from detect_hca
#   IMAGE, BUILD_BUILDID, BUILD_SOURCESDIRECTORY, PWD (read)
# Returns:
#   Non-zero, before any container is stopped or started, when no active HCA
#   port is detected; otherwise the status of 'docker run'.
docker_run_srv() {
    HCA=$(detect_hca)
    # The container is detached, so a server started with an empty -K value
    # would fail unnoticed and only show up later as a client timeout.
    if [ -z "$HCA" ]; then
        echo "No active HCA port detected, cannot start ucx_perftest server" >&2
        return 1
    fi

    # Make sure a server left over from an earlier step is gone first
    docker_stop_srv
    docker run --rm \
        --detach \
        --net=host \
        --name ucx_perftest_"$BUILD_BUILDID" \
        -e BUILD_SOURCESDIRECTORY="$BUILD_SOURCESDIRECTORY" \
        -v "$PWD":"$PWD" -w "$PWD" \
        -v /hpc/local:/hpc/local \
        --ulimit memlock=-1:-1 --device=/dev/infiniband/ \
        "$IMAGE" \
        bash -c "${PWD}/install/bin/ucx_perftest -K ${HCA}"
}
|
||
# Stop this build's ucx_perftest server container (named after BUILD_BUILDID).
# A failing 'docker stop' is ignored via '|| true' so callers running under
# 'set -e' are not aborted - presumably for the case where no such container
# is running.
# docker_run_srv starts the container with 'docker run --rm', so Docker
# removes it once it stops.
docker_stop_srv() { | ||
docker stop ucx_perftest_"$BUILD_BUILDID" || true | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. stop is not enough we should remove the container as well, we dont want to have dangling containers all the time There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have a daily cleanup of Docker resources. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So we will have many stopped containers every day until they are cleaned? this is bad practice, each run should do its best to clean after it self, the daily cleanup is extra cleanup not instead There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Check out the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Containers get cleaned once stopped. |
||
} | ||
|
||
# Detect the first active HCA port on this host and publish its LID, GUID and
# "device:port" name as Azure Pipelines output variables (LID, GUID, HCA),
# then print the same values to the log.
# Globals written: HCA, HCA_DEV, LID, GUID (none are declared local).
set_vars() { | ||
# Tracing is switched off, presumably so the "##vso[...]" logging commands
# below are not also echoed by xtrace - TODO confirm.
set +x | ||
yosefe marked this conversation as resolved.
Show resolved
Hide resolved
|
||
HCA=$(detect_hca) | ||
# Replace ':' with space for 'ibstat' format | ||
HCA_DEV=${HCA/:/ } | ||
# HCA_DEV is left unquoted on purpose: it must split into two arguments
# (device and port) for ibstat. LID is the last field of the 'Base' line.
# shellcheck disable=SC2086 | ||
LID=$(ibstat $HCA_DEV | grep Base | awk '{print $NF}') | ||
# shellcheck disable=SC2086 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why disable? its only adding double quotes on $HCA There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Adding double quotes breaks the functionality |
||
GUID=$(ibstat $HCA_DEV | grep GUID | awk '{print $NF}') | ||
# isOutput=true makes the variables readable from other jobs through
# dependencies.<job>.outputs['<step name>.<variable>'].
echo "##vso[task.setvariable variable=LID;isOutput=true]$LID" | ||
echo "##vso[task.setvariable variable=GUID;isOutput=true]$GUID" | ||
echo "##vso[task.setvariable variable=HCA;isOutput=true]$HCA" | ||
echo "LID: $LID" | ||
echo "GUID: $GUID" | ||
echo "HCA: $HCA" | ||
} | ||
|
||
# Run the ucx_perftest tag_bw test against the server.
# Globals:
#   HCA (read) - local "device:port" passed to -K
# Arguments:
#   $1 - server IB address, passed by the pipeline as lid:<value> or
#        guid:<value>
# Returns:
#   Non-zero without running anything when HCA or the address is missing;
#   otherwise the exit status of ucx_perftest.
run_mad_test() {
    local ib_address="$1"

    # Both values come from pipeline output variables, which expand to an
    # empty string when the producing step did not set them; an address
    # ending in ':' is a prefix without its value.
    case "$ib_address" in
    "" | *:)
        echo "run_mad_test: missing or incomplete IB address '$ib_address'" >&2
        return 1
        ;;
    esac
    if [ -z "$HCA" ]; then
        echo "run_mad_test: HCA is not set" >&2
        return 1
    fi

    sudo chmod 777 /dev/infiniband/umad*
    "$PWD"/install/bin/ucx_perftest -t tag_bw -e -K "$HCA" -e "$ib_address"
}
|
||
# Print "<hca_id>:<port>" for the first port that ibv_devinfo reports as
# PORT_ACTIVE; print nothing when no port is active.
detect_hca() {
    # Detect first active HCA port
    # awk reads the whole input instead of exiting at the first match: an
    # early exit closes the pipe, ibv_devinfo can then die from SIGPIPE, and
    # under 'set -o pipefail' that fails the whole pipeline.
    # Keys are compared on the first field so that "transport:" is not taken
    # for a "port:" line.
    ibv_devinfo | awk '
        $1 == "hca_id:"         { hca = $2 }
        $1 == "port:"           { port = $2 }
        /PORT_ACTIVE/ && !found { print hca ":" port; found = 1 }
    '
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would like to see separate setup and build stages instead of hiding them in some test stage
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done