Skip to content

Commit

Permalink
Merge branch 'amd/thread-count' into origin/master
Browse files Browse the repository at this point in the history
  • Loading branch information
ashleypittman committed Jan 16, 2024
2 parents 2e784f4 + 217aab9 commit 12b61a1
Show file tree
Hide file tree
Showing 221 changed files with 9,540 additions and 5,344 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/pylint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,16 @@ jobs:
uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
- uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Install core python packages
run: python3 -m pip install --requirement requirements.txt
- name: Install extra python packages
run: python3 -m pip install --requirement utils/cq/requirements.txt
- name: Install enchant
run: sudo apt-get update && sudo apt-get -y install python3-enchant
- name: Show versions
run: ./utils/cq/daos_pylint.py --version
- name: Run pylint check.
run: ./utils/cq/daos_pylint.py --git --output-format github
146 changes: 73 additions & 73 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -1040,78 +1040,78 @@ pipeline {
}
}
} // stage('Fault injection testing on EL 8.8')
// stage('Test RPMs on EL 8.6') {
// when {
// beforeAgent true
// expression { ! skipStage() }
// }
// agent {
// label params.CI_UNIT_VM1_LABEL
// }
// steps {
// job_step_update(
// testRpm(inst_repos: daosRepos(),
// daos_pkg_version: daosPackagesVersion(next_version))
// )
// }
// post {
// always {
// rpm_test_post(env.STAGE_NAME, env.NODELIST)
// }
// }
// } // stage('Test CentOS 7 RPMs')
// stage('Test RPMs on Leap 15.4') {
// when {
// beforeAgent true
// expression { ! skipStage() }
// }
// agent {
// label params.CI_UNIT_VM1_LABEL
// }
// steps {
// /* neither of these work as FTest strips the first node
// out of the pool requiring 2 node clusters at minimum
// * additionally for this use-case, can't override
// ftest_arg with this :-(
// script {
// 'Test RPMs on Leap 15.4': getFunctionalTestStage(
// name: 'Test RPMs on Leap 15.4',
// pragma_suffix: '',
// label: params.CI_UNIT_VM1_LABEL,
// next_version: next_version,
// stage_tags: '',
// default_tags: 'test_daos_management',
// nvme: 'auto',
// run_if_pr: true,
// run_if_landing: true,
// job_status: job_status_internal
// )
// }
// job_step_update(
// functionalTest(
// test_tag: 'test_daos_management',
// ftest_arg: '--yaml_extension single_host',
// inst_repos: daosRepos(),
// inst_rpms: functionalPackages(1, next_version, 'tests-internal'),
// test_function: 'runTestFunctionalV2'))
// }
// post {
// always {
// functionalTestPostV2()
// job_status_update()
// }
// } */
// job_step_update(
// testRpm(inst_repos: daosRepos(),
// daos_pkg_version: daosPackagesVersion(next_version))
// )
// }
// post {
// always {
// rpm_test_post(env.STAGE_NAME, env.NODELIST)
// }
// }
// } // stage('Test Leap 15 RPMs')
// RPM test stage for EL 8.6: runs testRpm() on an agent selected by
// params.CI_UNIT_VM1_LABEL unless skipStage() says to skip it.
stage('Test RPMs on EL 8.6') {
when {
// Evaluate the skip condition before an agent is allocated.
beforeAgent true
expression { ! skipStage() }
}
agent {
label params.CI_UNIT_VM1_LABEL
}
steps {
// The testRpm() result is handed to job_step_update().
job_step_update(
testRpm(inst_repos: daosRepos(),
daos_pkg_version: daosPackagesVersion(next_version))
)
}
post {
always {
// Runs whatever the outcome of the steps above.
rpm_test_post(env.STAGE_NAME, env.NODELIST)
}
}
} // stage('Test RPMs on EL 8.6')
// RPM test stage for Leap 15.4: runs testRpm() on an agent selected by
// params.CI_UNIT_VM1_LABEL unless skipStage() says to skip it.
stage('Test RPMs on Leap 15.4') {
when {
// Evaluate the skip condition before an agent is allocated.
beforeAgent true
expression { ! skipStage() }
}
agent {
label params.CI_UNIT_VM1_LABEL
}
steps {
// The block comment below keeps two disabled alternatives
// (getFunctionalTestStage / functionalTest) together with the reason
// they are disabled; only the testRpm() call after it is active.
/* neither of these work as FTest strips the first node
out of the pool requiring 2 node clusters at minimum
* additionally for this use-case, can't override
ftest_arg with this :-(
script {
'Test RPMs on Leap 15.4': getFunctionalTestStage(
name: 'Test RPMs on Leap 15.4',
pragma_suffix: '',
label: params.CI_UNIT_VM1_LABEL,
next_version: next_version,
stage_tags: '',
default_tags: 'test_daos_management',
nvme: 'auto',
run_if_pr: true,
run_if_landing: true,
job_status: job_status_internal
)
}
job_step_update(
functionalTest(
test_tag: 'test_daos_management',
ftest_arg: '--yaml_extension single_host',
inst_repos: daosRepos(),
inst_rpms: functionalPackages(1, next_version, 'tests-internal'),
test_function: 'runTestFunctionalV2'))
}
post {
always {
functionalTestPostV2()
job_status_update()
}
} */
// The testRpm() result is handed to job_step_update().
job_step_update(
testRpm(inst_repos: daosRepos(),
daos_pkg_version: daosPackagesVersion(next_version))
)
}
post {
always {
// Runs whatever the outcome of the steps above.
rpm_test_post(env.STAGE_NAME, env.NODELIST)
}
}
} // stage('Test RPMs on Leap 15.4')
} // parallel
} // stage('Test')
stage('Test Storage Prep on EL 8.8') {
Expand Down Expand Up @@ -1202,7 +1202,7 @@ pipeline {
stage_tags: 'hw,medium,provider',
default_tags: startedByTimer() ? 'pr daily_regression' : 'pr',
default_nvme: 'auto',
provider: 'ucx+dc_x',
provider: cachedCommitPragma('Test-provider-ucx', 'ucx+ud_x'),
run_if_pr: false,
run_if_landing: false,
job_status: job_status_internal
Expand Down
44 changes: 29 additions & 15 deletions ci/functional/test_main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,17 @@ test_tag="$TEST_TAG"
tnodes=$(echo "$NODELIST" | cut -d ',' -f 1-"$NODE_COUNT")
first_node=${NODELIST%%,*}

hardware_ok=false

cluster_reboot () {
# shellcheck disable=SC2029,SC2089
clush -B -S -o '-i ci_key' -l root -w "${tnodes}" reboot || true

# shellcheck disable=SC2029,SC2089
poll_cmd=( clush -B -S -o "-i ci_key" -l root -w "${tnodes}" )
poll_cmd+=( '"cat /etc/os-release"' )
reboot_timeout=900 # 15 minutes
poll_cmd+=( cat /etc/os-release )
# 20 minutes, HPE systems may take more than 15 minutes.
reboot_timeout=1200
retry_wait=10 # seconds
timeout=$((SECONDS + reboot_timeout))
while [ "$SECONDS" -lt "$timeout" ]; do
Expand All @@ -42,6 +45,8 @@ test_cluster() {
FIRST_NODE=${first_node} \
TEST_RPMS=${TEST_RPMS} \
NODELIST=${tnodes} \
BUILD_URL=\"$BUILD_URL\" \
STAGE_NAME=\"$STAGE_NAME\" \
$(cat ci/functional/test_main_prep_node.sh)"
}

Expand All @@ -50,8 +55,13 @@ clush -B -S -o '-i ci_key' -l root -w "${first_node}" \

if ! test_cluster; then
# Sometimes a cluster reboot will fix the issue so try it once.
cluster_reboot
test_cluster
if cluster_reboot; then
if test_cluster; then
hardware_ok=true
fi
fi
else
hardware_ok=true
fi

# collect the _results.xml files from test_main_prep_nodes before they
Expand Down Expand Up @@ -79,17 +89,20 @@ export DAOS_TARGET_OVERSUBSCRIBE=1
rm -rf install/lib/daos/TESTING/ftest/avocado ./*_results.xml

mkdir -p install/lib/daos/TESTING/ftest/avocado/job-results
if $TEST_RPMS; then
# shellcheck disable=SC2029
ssh -i ci_key -l jenkins "${first_node}" \
"TEST_TAG=\"$test_tag\" \
TNODES=\"$tnodes\" \
FTEST_ARG=\"${FTEST_ARG:-}\" \
WITH_VALGRIND=\"${WITH_VALGRIND:-}\" \
STAGE_NAME=\"$STAGE_NAME\" \
$(cat ci/functional/test_main_node.sh)"
else
./ftest.sh "$test_tag" "$tnodes" "$FTEST_ARG"

if "$hardware_ok"; then
if $TEST_RPMS; then
# shellcheck disable=SC2029
ssh -i ci_key -l jenkins "${first_node}" \
"TEST_TAG=\"$test_tag\" \
TNODES=\"$tnodes\" \
FTEST_ARG=\"${FTEST_ARG:-}\" \
WITH_VALGRIND=\"${WITH_VALGRIND:-}\" \
STAGE_NAME=\"$STAGE_NAME\" \
$(cat ci/functional/test_main_node.sh)"
else
./ftest.sh "$test_tag" "$tnodes" "$FTEST_ARG"
fi
fi

# Now rename the previously collected hardware test data for Jenkins
Expand All @@ -104,3 +117,4 @@ for node in ${tnodes//,/ }; do
mv "$old_name" "$new_name"
fi
done
"$hardware_ok"
7 changes: 5 additions & 2 deletions ci/functional/test_main_prep_node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ set -eux

: "${FIRST_NODE:=}"
: "${OPERATIONS_EMAIL:=}"
: "${STAGE_NAME:=Unknown}"
: "${BUILD_URL:=Unknown}"

result=0
mail_message=''
Expand Down Expand Up @@ -38,9 +40,10 @@ function do_mail {
return
fi
# shellcheck disable=SC2059
build_info="BUILD_URL = $BUILD_URL$nl STAGE = $STAGE_NAME$nl$nl"
mail -s "Hardware check failed after reboot!" \
-r "$HOSTNAME"@intel.com "$OPERATIONS_EMAIL" \
<<< "$mail_message"
<<< "$build_info$mail_message"
set -x
}

Expand Down Expand Up @@ -242,7 +245,7 @@ if [ -e /sys/class/net/ib1 ]; then
testcases+=" </testcase>$nl"

((testruns++)) || true
testcases+=" <testcase name=\"NVMe lsblk Count Node $mynodenum\">${nl}"
testcases+=" <testcase name=\"PMEM lsblk Count Node $mynodenum\">${nl}"
if [ "$lsblk_pmem" -ne "$dimm_rcount" ]; then
lsblk_pmem_msg="Only $lsblk_pmem of $dimm_rcount PMEM devices seen."
mail_message+="$nl$lsblk_pmem_msg$nl$(lsblk)$nl"
Expand Down
4 changes: 4 additions & 0 deletions ci/provisioning/post_provision_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ source ci/provisioning/post_provision_config_common_functions.sh
# shellcheck disable=SC1091
source ci/junit.sh


: "${MLNX_VER_NUM:=latest-5.8}"

: "${DISTRO:=EL_7}"
DSL_REPO_var="DAOS_STACK_${DISTRO}_LOCAL_REPO"
DSG_REPO_var="DAOS_STACK_${DISTRO}_GROUP_REPO"
Expand All @@ -44,6 +47,7 @@ if ! retry_cmd 2400 clush -B -S -l root -w "$NODESTRING" \
DISTRO=\"$DISTRO\"
DAOS_STACK_RETRY_DELAY_SECONDS=\"$DAOS_STACK_RETRY_DELAY_SECONDS\"
DAOS_STACK_RETRY_COUNT=\"$DAOS_STACK_RETRY_COUNT\"
MLNX_VER_NUM=\"$MLNX_VER_NUM\"
BUILD_URL=\"$BUILD_URL\"
STAGE_NAME=\"$STAGE_NAME\"
OPERATIONS_EMAIL=\"$OPERATIONS_EMAIL\"
Expand Down
3 changes: 0 additions & 3 deletions ci/provisioning/post_provision_config_common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,3 @@ case "$ID_LIKE" in
EXCLUDE_UPGRADE+=,fuse,fuse-libs,fuse-devel
;;
esac

# shellcheck disable=SC2034
MLNX_VER_NUM=5.8-3.0.7.0
12 changes: 7 additions & 5 deletions ci/provisioning/post_provision_config_nodes_EL_8.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,19 +55,21 @@ install_mofed() {
stream=false
gversion="$VERSION_ID"
if [ "$gversion" == "8" ]; then
gversion="8.6"
# Mellanox does not have a release for 8.9 yet.
gversion="8.8"
stream=true
elif [[ $gversion = *.*.* ]]; then
gversion="${gversion%.*}"
fi

# Add a repo to install MOFED RPMS
repo_url=https://artifactory.dc.hpdd.intel.com/artifactory/mlnx_ofed/"$MLNX_VER_NUM-rhel$gversion"-x86_64/
artifactory_base_url="https://artifactory.dc.hpdd.intel.com/artifactory/"
mellanox_proxy="${artifactory_base_url}mellanox-proxy/mlnx_ofed/"
mellanox_key_url="${artifactory_base_url}mlnx_ofed/RPM-GPG-KEY-Mellanox"
rpm --import "$mellanox_key_url"
repo_url="$mellanox_proxy$MLNX_VER_NUM/rhel$gversion/x86_64/"
dnf -y config-manager --add-repo="$repo_url"
curl -L -O "$repo_url"RPM-GPG-KEY-Mellanox
dnf -y config-manager --save --setopt="$(url_to_repo "$repo_url")".gpgcheck=1
rpm --import RPM-GPG-KEY-Mellanox
rm -f RPM-GPG-KEY-Mellanox
dnf repolist || true

time dnf -y install mlnx-ofed-basic ucx-cma ucx-ib ucx-knem ucx-rdmacm ucx-xpmem
Expand Down
1 change: 1 addition & 0 deletions src/bio/bio_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ struct bio_dev_health {
void *bdh_intel_smart_buf; /*Intel SMART attributes*/
uint64_t bdh_stat_age;
unsigned int bdh_inflights;
unsigned int bdh_stopping:1;
uint16_t bdh_vendor_id; /* PCI vendor ID */

/**
Expand Down
Loading

0 comments on commit 12b61a1

Please sign in to comment.