Skip to content

Commit

Permalink
DAOS-15615 test: Clear existing tmpfs mount points before running tes…
Browse files Browse the repository at this point in the history
…ts (#14295)

Some tests occasionally fail to start servers due to insufficient
available memory in CI caused by leftover DAOS mount points from a
previous test. Add an option to launch.py to provide a filter which,
if specified, will be used to unmount and remove the directory for
any mounted tmpfs filesystems matching the filter. When using --mode=ci
the filter will be set to /mnt/daos.

Signed-off-by: Phil Henderson <[email protected]>
  • Loading branch information
phender authored May 20, 2024
1 parent ed21e30 commit 6b06ee7
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 35 deletions.
33 changes: 32 additions & 1 deletion src/tests/ftest/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ def _run(self, args):
logger, self.result, self.repeat, self.slurm_setup, args.sparse, args.failfast,
not args.disable_stop_daos, args.archive, args.rename, args.jenkinslog, core_files,
args.logs_threshold, args.user_create, code_coverage, self.job_results_dir,
self.logdir)
self.logdir, args.clear_mounts)

# Convert the test status to a launch.py status
status |= summarize_run(logger, self.mode, test_status)
Expand Down Expand Up @@ -438,6 +438,28 @@ def __arg_type_find_size(val):
return val


def __arg_type_mount_point(val):
    """Parse a mount point argument.

    The mount point does not need to exist on this host.

    Args:
        val (str): the mount point to parse

    Raises:
        ArgumentTypeError: if the value is not a string starting with '/'

    Returns:
        str: the mount point
    """
    # Argparse always supplies a str, but guard against non-string input explicitly
    # instead of relying on a broad try/except around an AttributeError.
    if not isinstance(val, str) or not val.startswith(os.sep):
        raise ArgumentTypeError(f'Invalid mount point: {val}')
    return val


def main():
"""Launch DAOS functional tests."""
# Parse the command line arguments
Expand Down Expand Up @@ -507,6 +529,12 @@ def main():
"-a", "--archive",
action="store_true",
help="archive host log files in the avocado job-results directory")
parser.add_argument(
"-c", "--clear_mounts",
action="append",
default=[],
type=__arg_type_mount_point,
help="mount points to remove before running each test")
parser.add_argument(
"-dsd", "--disable_stop_daos",
action="store_true",
Expand Down Expand Up @@ -705,6 +733,9 @@ def main():
args.slurm_install = True
args.slurm_setup = True
args.user_create = True
args.clear_mounts.append("/mnt/daos")
args.clear_mounts.append("/mnt/daos0")
args.clear_mounts.append("/mnt/daos1")

# Setup the Launch object
launch = Launch(args.name, args.mode, args.slurm_install, args.slurm_setup)
Expand Down
24 changes: 0 additions & 24 deletions src/tests/ftest/scripts/setup_nodes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,30 +43,6 @@ cat /etc/security/limits.d/80_daos_limits.conf
ulimit -a
echo \"/var/tmp/core.%e.%t.%p\" > /proc/sys/kernel/core_pattern"
sudo rm -f /var/tmp/core.*
if [ "${HOSTNAME%%.*}" != "$FIRST_NODE" ]; then
if grep /mnt/daos\ /proc/mounts; then
sudo umount /mnt/daos
else
if [ ! -d /mnt/daos ]; then
sudo mkdir -p /mnt/daos
fi
fi

tmpfs_size=16777216
memsize="$(sed -ne '/MemTotal:/s/.* \([0-9][0-9]*\) kB/\1/p' \
/proc/meminfo)"
if [ "$memsize" -gt "32000000" ]; then
# make it twice as big on the hardware cluster
tmpfs_size=$((tmpfs_size*2))
fi
sudo ed <<EOF /etc/fstab
\$a
tmpfs /mnt/daos tmpfs rw,relatime,size=${tmpfs_size}k 0 0 # added by ftest.sh
.
wq
EOF
sudo mount /mnt/daos
fi

# make sure to set up for daos_agent. The test harness will take care of
# creating the /var/run/daos_{agent,server} directories when needed.
Expand Down
148 changes: 138 additions & 10 deletions src/tests/ftest/util/launch_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
(C) Copyright 2022-2023 Intel Corporation.
(C) Copyright 2022-2024 Intel Corporation.
SPDX-License-Identifier: BSD-2-Clause-Patent
"""
Expand All @@ -26,6 +26,8 @@
from util.user_utils import get_group_id, get_user_groups, groupadd, useradd, userdel
from util.yaml_utils import YamlUpdater, get_yaml_data

D_TM_SHARED_MEMORY_KEY = 0x10242048


class LaunchException(Exception):
"""Exception for launch.py execution."""
Expand Down Expand Up @@ -317,14 +319,19 @@ def __init__(self, avocado, launch_result, total_tests, total_repeats, tag_filte
self.local_host = get_local_host()

def prepare(self, logger, test_log_file, test, repeat, user_create, slurm_setup, control_host,
partition_hosts):
partition_hosts, clear_mounts):
"""Prepare the test for execution.
Args:
logger (Logger): logger for the messages produced by this method
test_log_file (str): the log file for this test
test (TestInfo): the test information
repeat (str): the test repetition sequence, e.g. '1/10'
user_create (bool): whether to create extra test users defined by the test
slurm_setup (bool): whether to setup slurm before running the test
control_host (NodeSet): slurm control hosts
partition_hosts (NodeSet): slurm partition hosts
clear_mounts (list): mount points to remove before the test
Returns:
int: status code: 0 = success, 128 = failure
Expand Down Expand Up @@ -354,6 +361,11 @@ def prepare(self, logger, test_log_file, test, repeat, user_create, slurm_setup,
if status:
return status

# Remove existing mount points on each test host
status = self._clear_mount_points(logger, test, clear_mounts)
if status:
return status

# Generate certificate files for the test
return self._generate_certs(logger)

Expand Down Expand Up @@ -457,9 +469,9 @@ def _setup_host_information(self, logger, test, slurm_setup, control_host, parti
Args:
logger (Logger): logger for the messages produced by this method
test (TestInfo): the test information
slurm_setup (bool):
control_host (NodeSet):
partition_hosts (NodeSet):
slurm_setup (bool): whether to setup slurm before running the test
control_host (NodeSet): slurm control hosts
partition_hosts (NodeSet): slurm partition hosts
Returns:
int: status code: 0 = success, 128 = failure
Expand Down Expand Up @@ -663,6 +675,117 @@ def _query_create_user(logger, hosts, user, gid=None, create=False):
if not useradd(logger, hosts, user, gid, test_env.user_dir, True).passed:
raise LaunchException(f'Error creating user {user}')

def _clear_mount_points(self, logger, test, clear_mounts):
    """Remove any existing tmpfs mount points on each test host.

    Args:
        logger (Logger): logger for the messages produced by this method
        test (TestInfo): the test information
        clear_mounts (list): mount points to remove before the test

    Returns:
        int: status code: 0 = success, 128 = failure
    """
    if not clear_mounts:
        return 0

    logger.debug("-" * 80)
    hosts = test.host_info.all_hosts
    logger.debug("Clearing existing mount points on %s: %s", hosts, clear_mounts)

    # Determine which of the requested tmpfs mount points exist, and on which hosts
    pattern = '|'.join(clear_mounts)
    command = f" df --type=tmpfs --output=target | grep -E '^({pattern})$'"
    detected = {}
    for data in run_remote(logger, hosts, command).output:
        if not data.passed:
            continue
        for mount_point in data.stdout:
            detected.setdefault(mount_point, NodeSet()).add(data.hosts)

    # First wipe the contents (superblocks) of each detected mount point
    for mount_point, mount_hosts in detected.items():
        if not self._remove_super_blocks(logger, mount_hosts, mount_point):
            message = "Error removing superblocks for existing mount points"
            self.test_result.fail_test(logger, "Prepare", message, sys.exc_info())
            return 128

    # Then remove any engine shared memory segments left behind
    if not self._remove_shared_memory_segments(logger, hosts):
        message = "Error removing shared memory segments for existing mount points"
        self.test_result.fail_test(logger, "Prepare", message, sys.exc_info())
        return 128

    # Finally unmount and remove each detected mount point directory
    for mount_point, mount_hosts in detected.items():
        if not self._remove_mount_point(logger, mount_hosts, mount_point):
            message = "Error removing existing mount points"
            self.test_result.fail_test(logger, "Prepare", message, sys.exc_info())
            return 128

    return 0

def _remove_super_blocks(self, logger, hosts, mount_point):
    """Remove the super blocks from the specified mount point.

    Args:
        logger (Logger): logger for the messages produced by this method
        hosts (NodeSet): hosts on which to remove the super blocks
        mount_point (str): mount point from which to remove the super blocks

    Returns:
        bool: True if successful; False otherwise
    """
    logger.debug("Clearing existing super blocks on %s", hosts)
    # Deleting everything under the mount point also removes any DAOS superblock files
    return run_remote(logger, hosts, f"sudo rm -fr {mount_point}/*").passed

def _remove_shared_memory_segments(self, logger, hosts):
    """Remove existing shared memory segments.

    Args:
        logger (Logger): logger for the messages produced by this method
        hosts (NodeSet): hosts on which to remove the shared memory segments

    Returns:
        bool: True if successful; False otherwise
    """
    logger.debug("Clearing existing shared memory segments on %s", hosts)
    # Engines use up to four consecutive telemetry shared memory keys
    engine_keys = {hex(D_TM_SHARED_MEMORY_KEY + offset) for offset in range(4)}
    keys_to_hosts = {}
    for data in run_remote(logger, hosts, "ipcs -m").output:
        if not data.passed:
            continue
        for line in data.stdout:
            fields = re.split(r"\s+", line)
            # Only process lines whose first column is a daos engine segment key
            if fields[0] in engine_keys:
                keys_to_hosts.setdefault(fields[0], NodeSet()).add(data.hosts)
    for key, key_hosts in keys_to_hosts.items():
        logger.debug("Clearing shared memory segment %s on %s:", key, key_hosts)
        if not run_remote(logger, key_hosts, f"sudo ipcrm -M {key}").passed:
            return False
    return True

def _remove_mount_point(self, logger, hosts, mount_point):
    """Unmount and remove the mount point directory.

    Args:
        logger (Logger): logger for the messages produced by this method
        hosts (NodeSet): hosts on which to remove the mount point
        mount_point (str): the mount point to unmount and remove

    Returns:
        bool: True if successful; False otherwise
    """
    logger.debug("Clearing mount point %s on %s:", mount_point, hosts)
    # Unmount first, then delete the directory; stop at the first failed command
    unmount_then_remove = (f"sudo umount -f {mount_point}", f"sudo rm -fr {mount_point}")
    return all(run_remote(logger, hosts, command).passed for command in unmount_then_remove)

def _generate_certs(self, logger):
"""Generate the certificates for the test.
Expand Down Expand Up @@ -1036,15 +1159,17 @@ def _setup_application_directory(self, logger, result):
run_local(logger, f"ls -al '{self._test_env.app_dir}'")
return 0

def run_tests(self, logger, result, repeat, setup, sparse, fail_fast, stop_daos, archive,
def run_tests(self, logger, result, repeat, slurm_setup, sparse, fail_fast, stop_daos, archive,
rename, jenkins_log, core_files, threshold, user_create, code_coverage,
job_results_dir, logdir):
job_results_dir, logdir, clear_mounts):
# pylint: disable=too-many-arguments
"""Run all the tests.
Args:
logger (Logger): logger for the messages produced by this method
mode (str): launch mode
result (Results): object tracking the result of the test
repeat (int): number of times to repeat the test
slurm_setup (bool): whether to setup slurm before running the test
sparse (bool): whether or not to display the shortened avocado test output
fail_fast (bool): whether or not to fail the avocado run command upon the first failure
stop_daos (bool): whether or not to stop daos servers/clients after the test
Expand All @@ -1055,6 +1180,9 @@ def run_tests(self, logger, result, repeat, setup, sparse, fail_fast, stop_daos,
threshold (str): optional upper size limit for test log files
user_create (bool): whether to create extra test users defined by the test
code_coverage (CodeCoverage): bullseye code coverage
job_results_dir (str): avocado job-results directory
logdir (str): base directory in which to place the log file
clear_mounts (list): mount points to remove before each test
Returns:
int: status code indicating any issues running tests
Expand Down Expand Up @@ -1084,8 +1212,8 @@ def run_tests(self, logger, result, repeat, setup, sparse, fail_fast, stop_daos,

# Prepare the hosts to run the tests
step_status = runner.prepare(
logger, test_log_file, test, loop, user_create, setup, self._control,
self._partition_hosts)
logger, test_log_file, test, loop, user_create, slurm_setup, self._control,
self._partition_hosts, clear_mounts)
if step_status:
# Do not run this test - update its failure status to interrupted
return_code |= step_status
Expand Down

0 comments on commit 6b06ee7

Please sign in to comment.