diff --git a/buildlib/pr/mad_tests.yml b/buildlib/pr/mad_tests.yml
new file mode 100644
index 00000000000..84c05fda9db
--- /dev/null
+++ b/buildlib/pr/mad_tests.yml
@@ -0,0 +1,101 @@
+# Azure Pipelines job template: ucx_perftest MAD connectivity test.
+#
+# Job order: Server_Up -> Test_LID -> Server_Restart -> Test_GUID ->
+# Server_Stop. Server jobs need an agent with the mad_server capability,
+# client jobs one with mad_client.
+#
+# Parameters:
+#   HCA - passed to contrib/test_mad.sh as $HCA; the caller in main.yml
+#         uses the "<device>:<port>" form, e.g. "mlx5_0:1".
+parameters:
+  HCA: ""
+
+jobs:
+  # Builds UCX, starts the ucx_perftest service and publishes the LID and
+  # GUID output variables consumed by the client jobs.
+  - job: Server_Up
+    pool:
+      name: MLNX
+      demands: mad_server
+    workspace:
+      clean: outputs
+    steps:
+      - checkout: self
+        clean: true
+        fetchDepth: 100
+        retryCountOnTaskFailure: 5
+      - bash: ./contrib/test_mad.sh run_mad_server
+        env:
+          HCA: ${{ parameters.HCA }}
+        displayName: Server Up
+
+      # The step name is the prefix used in the outputs[...] lookups below.
+      - bash: ./contrib/test_mad.sh set_vars
+        name: Set_Vars
+        env:
+          HCA: ${{ parameters.HCA }}
+        displayName: Set Vars
+
+  # Client: connects to the server addressed by LID.
+  - job: Test_LID
+    dependsOn: Server_Up
+    timeoutInMinutes: 10
+    pool:
+      name: MLNX
+      demands: mad_client
+    variables:
+      LID: $[ dependencies.Server_Up.outputs['Set_Vars.LID'] ]
+    steps:
+      - checkout: self
+        clean: true
+        fetchDepth: 100
+        retryCountOnTaskFailure: 5
+      - bash: ./contrib/test_mad.sh run_mad_test_lid
+        env:
+          HCA: ${{ parameters.HCA }}
+          LID: $(LID)
+        displayName: Test LID
+
+  # No checkout: assumes the sources checked out by Server_Up are still in
+  # the workspace of the server agent - TODO confirm for multi-agent pools.
+  - job: Server_Restart
+    dependsOn: Test_LID
+    pool:
+      name: MLNX
+      demands: mad_server
+    steps:
+      - checkout: none
+      - bash: ./contrib/test_mad.sh srv_restart
+        displayName: Server Restart
+
+  # Client: connects to the server addressed by GUID. Server_Up has to be a
+  # direct dependency for its output variables to be readable here.
+  # No checkout and no build: assumes the tree built by Test_LID is still in
+  # the workspace of the client agent - TODO confirm for multi-agent pools.
+  - job: Test_GUID
+    dependsOn:
+      - Server_Up
+      - Server_Restart
+    timeoutInMinutes: 10
+    pool:
+      name: MLNX
+      demands: mad_client
+    variables:
+      GUID: $[ dependencies.Server_Up.outputs['Set_Vars.GUID'] ]
+    steps:
+      - checkout: none
+      - bash: ./contrib/test_mad.sh run_mad_test_guid
+        env:
+          HCA: ${{ parameters.HCA }}
+          GUID: $(GUID)
+        displayName: Test GUID
+
+  # Runs even when an earlier job failed, so the service is not left up.
+  - job: Server_Stop
+    dependsOn: Test_GUID
+    condition: always()
+    pool:
+      name: MLNX
+      demands: mad_server
+    steps:
+      - checkout: none
+      - bash: ./contrib/test_mad.sh srv_stop
+        displayName: Server Stop
diff --git a/buildlib/pr/main.yml b/buildlib/pr/main.yml
index 2503e6357f7..2a0ba7d9165 100644
--- a/buildlib/pr/main.yml
+++ b/buildlib/pr/main.yml
@@ -188,6 +188,13 @@ stages:
     jobs:
       - template: static_checks.yml
 
+  - stage: MAD_Tests
+    dependsOn: [Static_check]
+    jobs:
+      - template: mad_tests.yml
+        parameters:
+          HCA: "mlx5_0:1"
+
   - stage: Build
     dependsOn: [Static_check]
     jobs:
diff --git a/buildlib/tools/ucx_perftest.template b/buildlib/tools/ucx_perftest.template
new file mode 100644
index 00000000000..604a9bbde5b
--- /dev/null
+++ b/buildlib/tools/ucx_perftest.template
@@ -0,0 +1,29 @@
+# systemd unit template for the ucx_perftest server of the MAD CI test.
+# contrib/test_mad.sh renders it with envsubst (PWD and HCA placeholders)
+# and installs the result as /etc/systemd/system/ucx_perftest.service.
+[Unit]
+Description=UCX Perftest running in agent mode
+Requires=multi-user.target
+After=multi-user.target
+
+[Service]
+Type=simple
+Environment=UCX_KEEPALIVE_INTERVAL=2
+Environment=UCX_DC_MLX5_TIMEOUT=5
+Environment=UCX_RC_VERBS_TIMEOUT=5
+Environment=UCX_RC_MLX5_TIMEOUT=5
+Environment=UCX_UD_VERBS_TIMEOUT=5
+Environment=UCX_UD_MLX5_TIMEOUT=5
+Environment=UCX_RDMA_CM_TIMEOUT=5
+Restart=always
+RestartSec=500ms
+User=root
+Group=root
+StartLimitInterval=10s
+StartLimitBurst=100
+# A bare path is not a valid value for these two settings. The "append:"
+# prefix (systemd >= 240) keeps a single log across service restarts; on
+# older systemd use "file:" (>= 236) instead.
+StandardOutput=append:${PWD}/ucx_perftest.log
+StandardError=append:${PWD}/ucx_perftest.log
+ExecStart=${PWD}/install/bin/ucx_perftest -e -K ${HCA}
diff --git a/contrib/test_mad.sh b/contrib/test_mad.sh
new file mode 100755
index 00000000000..7d117498951
--- /dev/null
+++ b/contrib/test_mad.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+#
+# Helper for the MAD CI jobs in buildlib/pr/mad_tests.yml.
+#
+# Usage: test_mad.sh <function> [args...]
+# The function named by the first argument is run from the source root.
+#
+# Environment:
+#   BUILD_SOURCESDIRECTORY  root of the checked-out sources
+#   HCA                     device and port, "<device>:<port>"
+#   LID, GUID               server port address (client functions only)
+set -exE -o pipefail
+
+cd "$BUILD_SOURCESDIRECTORY"
+
+# Build UCX, then install and start the ucx_perftest systemd service.
+run_mad_server() {
+    build_ucx
+    setup
+    srv_stop
+    funcname
+    # envsubst reads the placeholder values from the environment.
+    export HCA PWD
+    # Expand only the placeholders the template uses, and write the unit
+    # through "sudo tee" so that creating the file is what runs as root
+    # (no "sudo -E" environment pass-through needed).
+    # shellcheck disable=SC2016
+    envsubst '${PWD} ${HCA}' < buildlib/tools/ucx_perftest.template |
+        sudo tee /etc/systemd/system/ucx_perftest.service > /dev/null
+    sudo systemctl daemon-reload
+    sudo systemctl start ucx_perftest
+    # A non-zero status (service not active) fails the job via "set -e".
+    sudo systemctl status ucx_perftest
+}
+
+# Configure, build and install UCX with MAD support into ./install.
+build_ucx() {
+    funcname
+    ./autogen.sh
+    ./contrib/configure-release \
+        --prefix="$PWD"/install \
+        --with-mad \
+        --without-valgrind \
+        --without-go \
+        --without-java
+    make -s -j"$(nproc)"
+    make install
+}
+
+# Give every user read/write access to the umad device nodes.
+setup() {
+    funcname
+    # Device nodes need no execute bit, so a+rw replaces the former 777.
+    sudo chmod a+rw /dev/infiniband/umad*
+}
+
+# Publish the LID and GUID reported by ibstat for $HCA as output variables.
+set_vars() {
+    # xtrace is switched off here, presumably so that the "##vso" logging
+    # commands below show up in the log only once - TODO confirm.
+    set +x
+    HCA=${HCA/:/ } # Replace ':' with space
+    # shellcheck disable=SC2086
+    LID=$(ibstat $HCA | grep Base | awk '{print $NF}')
+    # shellcheck disable=SC2086
+    GUID=$(ibstat $HCA | grep GUID | awk '{print $NF}')
+    echo "LID: $LID"
+    echo "GUID: $GUID"
+    echo "##vso[task.setvariable variable=LID;isOutput=true]$LID"
+    echo "##vso[task.setvariable variable=GUID;isOutput=true]$GUID"
+}
+
+# Build UCX and run the client against the server addressed by LID.
+run_mad_test_lid() {
+    build_ucx
+    setup
+    funcname
+    "$PWD"/install/bin/ucx_perftest -t tag_bw -e -K "$HCA" lid:"$LID"
+}
+
+# Run the client against the server addressed by GUID. Does not build:
+# expects ./install to exist already.
+run_mad_test_guid() {
+    funcname
+    echo "GUID: $GUID"
+    "$PWD"/install/bin/ucx_perftest -t tag_bw -e -K "$HCA" guid:"$GUID"
+}
+
+# Restart the service and fail if it does not come back up.
+srv_restart() {
+    funcname
+    sudo systemctl stop ucx_perftest
+    sudo systemctl start ucx_perftest
+    sudo systemctl status ucx_perftest
+}
+
+# Best-effort stop: a failing status or stop command is ignored.
+srv_stop() {
+    funcname
+    sudo systemctl status ucx_perftest || true
+    sudo systemctl stop ucx_perftest || true
+}
+
+# Print a banner naming the calling function, without xtrace noise.
+funcname() {
+    set +x
+    echo "==== Running: ${FUNCNAME[1]} ===="
+    set -x
+}
+
+# Dispatch: run the function given on the command line.
+"$@"