Skip to content

Commit

Permalink
DAOS-15615 test: Clear existing tmpfs mount points before running tes…
Browse files Browse the repository at this point in the history
…ts (#14295)

Some tests occasionally fail to start servers due to insufficient
available memory in CI caused by leftover DAOS mount points from a
previous test. Add an option to launch.py to provide a filter which,
if specified, will be used to unmount and remove the directory for
any mounted tmpfs filesystems matching the filter. When using --mode=ci
the filter will be set to /mnt/daos.

Signed-off-by: Phil Henderson <[email protected]>
  • Loading branch information
phender authored May 20, 2024
1 parent ed21e30 commit 6b06ee7
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 35 deletions.
33 changes: 32 additions & 1 deletion src/tests/ftest/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ def _run(self, args):
logger, self.result, self.repeat, self.slurm_setup, args.sparse, args.failfast,
not args.disable_stop_daos, args.archive, args.rename, args.jenkinslog, core_files,
args.logs_threshold, args.user_create, code_coverage, self.job_results_dir,
self.logdir)
self.logdir, args.clear_mounts)

# Convert the test status to a launch.py status
status |= summarize_run(logger, self.mode, test_status)
Expand Down Expand Up @@ -438,6 +438,28 @@ def __arg_type_find_size(val):
return val


def __arg_type_mount_point(val):
    """Parse a mount point argument.

    The mount point does not need to exist on this host.

    Args:
        val (str): the mount point to parse

    Raises:
        ArgumentTypeError: if the value is not a string starting with '/'

    Returns:
        str: the mount point
    """
    # Argparse always supplies a str, but guard against non-string input explicitly
    # instead of relying on a broad try/except around an AttributeError.
    if not isinstance(val, str) or not val.startswith(os.sep):
        raise ArgumentTypeError(f'Invalid mount point: {val}')
    return val


def main():
"""Launch DAOS functional tests."""
# Parse the command line arguments
Expand Down Expand Up @@ -507,6 +529,12 @@ def main():
"-a", "--archive",
action="store_true",
help="archive host log files in the avocado job-results directory")
parser.add_argument(
"-c", "--clear_mounts",
action="append",
default=[],
type=__arg_type_mount_point,
help="mount points to remove before running each test")
parser.add_argument(
"-dsd", "--disable_stop_daos",
action="store_true",
Expand Down Expand Up @@ -705,6 +733,9 @@ def main():
args.slurm_install = True
args.slurm_setup = True
args.user_create = True
args.clear_mounts.append("/mnt/daos")
args.clear_mounts.append("/mnt/daos0")
args.clear_mounts.append("/mnt/daos1")

# Setup the Launch object
launch = Launch(args.name, args.mode, args.slurm_install, args.slurm_setup)
Expand Down
24 changes: 0 additions & 24 deletions src/tests/ftest/scripts/setup_nodes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,30 +43,6 @@ cat /etc/security/limits.d/80_daos_limits.conf
ulimit -a
echo \"/var/tmp/core.%e.%t.%p\" > /proc/sys/kernel/core_pattern"
sudo rm -f /var/tmp/core.*
if [ "${HOSTNAME%%.*}" != "$FIRST_NODE" ]; then
if grep /mnt/daos\ /proc/mounts; then
sudo umount /mnt/daos
else
if [ ! -d /mnt/daos ]; then
sudo mkdir -p /mnt/daos
fi
fi

tmpfs_size=16777216
memsize="$(sed -ne '/MemTotal:/s/.* \([0-9][0-9]*\) kB/\1/p' \
/proc/meminfo)"
if [ "$memsize" -gt "32000000" ]; then
# make it twice as big on the hardware cluster
tmpfs_size=$((tmpfs_size*2))
fi
sudo ed <<EOF /etc/fstab
\$a
tmpfs /mnt/daos tmpfs rw,relatime,size=${tmpfs_size}k 0 0 # added by ftest.sh
.
wq
EOF
sudo mount /mnt/daos
fi

# make sure to set up for daos_agent. The test harness will take care of
# creating the /var/run/daos_{agent,server} directories when needed.
Expand Down
148 changes: 138 additions & 10 deletions src/tests/ftest/util/launch_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
(C) Copyright 2022-2023 Intel Corporation.
(C) Copyright 2022-2024 Intel Corporation.
SPDX-License-Identifier: BSD-2-Clause-Patent
"""
Expand All @@ -26,6 +26,8 @@
from util.user_utils import get_group_id, get_user_groups, groupadd, useradd, userdel
from util.yaml_utils import YamlUpdater, get_yaml_data

D_TM_SHARED_MEMORY_KEY = 0x10242048


class LaunchException(Exception):
"""Exception for launch.py execution."""
Expand Down Expand Up @@ -317,14 +319,19 @@ def __init__(self, avocado, launch_result, total_tests, total_repeats, tag_filte
self.local_host = get_local_host()

def prepare(self, logger, test_log_file, test, repeat, user_create, slurm_setup, control_host,
partition_hosts):
partition_hosts, clear_mounts):
"""Prepare the test for execution.
Args:
logger (Logger): logger for the messages produced by this method
test_log_file (str): the log file for this test
test (TestInfo): the test information
repeat (str): the test repetition sequence, e.g. '1/10'
user_create (bool): whether to create extra test users defined by the test
slurm_setup (bool): whether to setup slurm before running the test
control_host (NodeSet): slurm control hosts
partition_hosts (NodeSet): slurm partition hosts
clear_mounts (list): mount points to remove before the test
Returns:
int: status code: 0 = success, 128 = failure
Expand Down Expand Up @@ -354,6 +361,11 @@ def prepare(self, logger, test_log_file, test, repeat, user_create, slurm_setup,
if status:
return status

# Remove existing mount points on each test host
status = self._clear_mount_points(logger, test, clear_mounts)
if status:
return status

# Generate certificate files for the test
return self._generate_certs(logger)

Expand Down Expand Up @@ -457,9 +469,9 @@ def _setup_host_information(self, logger, test, slurm_setup, control_host, parti
Args:
logger (Logger): logger for the messages produced by this method
test (TestInfo): the test information
slurm_setup (bool):
control_host (NodeSet):
partition_hosts (NodeSet):
slurm_setup (bool): whether to setup slurm before running the test
control_host (NodeSet): slurm control hosts
partition_hosts (NodeSet): slurm partition hosts
Returns:
int: status code: 0 = success, 128 = failure
Expand Down Expand Up @@ -663,6 +675,117 @@ def _query_create_user(logger, hosts, user, gid=None, create=False):
if not useradd(logger, hosts, user, gid, test_env.user_dir, True).passed:
raise LaunchException(f'Error creating user {user}')

def _clear_mount_points(self, logger, test, clear_mounts):
    """Remove any existing tmpfs mount points on each test host.

    Args:
        logger (Logger): logger for the messages produced by this method
        test (TestInfo): the test information
        clear_mounts (list): mount points to remove before the test

    Returns:
        int: status code: 0 = success, 128 = failure
    """
    if not clear_mounts:
        return 0

    logger.debug("-" * 80)
    hosts = test.host_info.all_hosts
    logger.debug("Clearing existing mount points on %s: %s", hosts, clear_mounts)

    # Determine which of the requested tmpfs mount points exist, and on which hosts
    pattern = '|'.join(clear_mounts)
    command = f" df --type=tmpfs --output=target | grep -E '^({pattern})$'"
    detected = {}
    for data in run_remote(logger, hosts, command).output:
        if not data.passed:
            continue
        for mount_point in data.stdout:
            detected.setdefault(mount_point, NodeSet()).add(data.hosts)

    # First wipe the contents (superblocks) of each detected mount point
    for mount_point, mount_hosts in detected.items():
        if not self._remove_super_blocks(logger, mount_hosts, mount_point):
            message = "Error removing superblocks for existing mount points"
            self.test_result.fail_test(logger, "Prepare", message, sys.exc_info())
            return 128

    # Then remove any engine shared memory segments left behind
    if not self._remove_shared_memory_segments(logger, hosts):
        message = "Error removing shared memory segments for existing mount points"
        self.test_result.fail_test(logger, "Prepare", message, sys.exc_info())
        return 128

    # Finally unmount and remove each detected mount point directory
    for mount_point, mount_hosts in detected.items():
        if not self._remove_mount_point(logger, mount_hosts, mount_point):
            message = "Error removing existing mount points"
            self.test_result.fail_test(logger, "Prepare", message, sys.exc_info())
            return 128

    return 0

def _remove_super_blocks(self, logger, hosts, mount_point):
    """Remove the super blocks from the specified mount point.

    Args:
        logger (Logger): logger for the messages produced by this method
        hosts (NodeSet): hosts on which to remove the super blocks
        mount_point (str): mount point from which to remove the super blocks

    Returns:
        bool: True if successful; False otherwise
    """
    logger.debug("Clearing existing super blocks on %s", hosts)
    # Deleting everything under the mount point also removes any DAOS superblock files
    return run_remote(logger, hosts, f"sudo rm -fr {mount_point}/*").passed

def _remove_shared_memory_segments(self, logger, hosts):
    """Remove existing shared memory segments.

    Args:
        logger (Logger): logger for the messages produced by this method
        hosts (NodeSet): hosts on which to remove the shared memory segments

    Returns:
        bool: True if successful; False otherwise
    """
    logger.debug("Clearing existing shared memory segments on %s", hosts)
    # Engines use up to four consecutive telemetry shared memory keys
    engine_keys = {hex(D_TM_SHARED_MEMORY_KEY + offset) for offset in range(4)}
    keys_to_hosts = {}
    for data in run_remote(logger, hosts, "ipcs -m").output:
        if not data.passed:
            continue
        for line in data.stdout:
            fields = re.split(r"\s+", line)
            # Only process lines whose first column is a daos engine segment key
            if fields[0] in engine_keys:
                keys_to_hosts.setdefault(fields[0], NodeSet()).add(data.hosts)
    for key, key_hosts in keys_to_hosts.items():
        logger.debug("Clearing shared memory segment %s on %s:", key, key_hosts)
        if not run_remote(logger, key_hosts, f"sudo ipcrm -M {key}").passed:
            return False
    return True

def _remove_mount_point(self, logger, hosts, mount_point):
    """Unmount and remove the mount point directory.

    Args:
        logger (Logger): logger for the messages produced by this method
        hosts (NodeSet): hosts on which to remove the mount point
        mount_point (str): the mount point to unmount and remove

    Returns:
        bool: True if successful; False otherwise
    """
    logger.debug("Clearing mount point %s on %s:", mount_point, hosts)
    # Unmount first, then delete the directory; stop at the first failed command
    unmount_then_remove = (f"sudo umount -f {mount_point}", f"sudo rm -fr {mount_point}")
    return all(run_remote(logger, hosts, command).passed for command in unmount_then_remove)

def _generate_certs(self, logger):
"""Generate the certificates for the test.
Expand Down Expand Up @@ -1036,15 +1159,17 @@ def _setup_application_directory(self, logger, result):
run_local(logger, f"ls -al '{self._test_env.app_dir}'")
return 0

def run_tests(self, logger, result, repeat, setup, sparse, fail_fast, stop_daos, archive,
def run_tests(self, logger, result, repeat, slurm_setup, sparse, fail_fast, stop_daos, archive,
rename, jenkins_log, core_files, threshold, user_create, code_coverage,
job_results_dir, logdir):
job_results_dir, logdir, clear_mounts):
# pylint: disable=too-many-arguments
"""Run all the tests.
Args:
logger (Logger): logger for the messages produced by this method
mode (str): launch mode
result (Results): object tracking the result of the test
repeat (int): number of times to repeat the test
slurm_setup (bool): whether to setup slurm before running the test
sparse (bool): whether or not to display the shortened avocado test output
fail_fast (bool): whether or not to fail the avocado run command upon the first failure
stop_daos (bool): whether or not to stop daos servers/clients after the test
Expand All @@ -1055,6 +1180,9 @@ def run_tests(self, logger, result, repeat, setup, sparse, fail_fast, stop_daos,
threshold (str): optional upper size limit for test log files
user_create (bool): whether to create extra test users defined by the test
code_coverage (CodeCoverage): bullseye code coverage
job_results_dir (str): avocado job-results directory
logdir (str): base directory in which to place the log file
clear_mounts (list): mount points to remove before each test
Returns:
int: status code indicating any issues running tests
Expand Down Expand Up @@ -1084,8 +1212,8 @@ def run_tests(self, logger, result, repeat, setup, sparse, fail_fast, stop_daos,

# Prepare the hosts to run the tests
step_status = runner.prepare(
logger, test_log_file, test, loop, user_create, setup, self._control,
self._partition_hosts)
logger, test_log_file, test, loop, user_create, slurm_setup, self._control,
self._partition_hosts, clear_mounts)
if step_status:
# Do not run this test - update its failure status to interrupted
return_code |= step_status
Expand Down

0 comments on commit 6b06ee7

Please sign in to comment.