Merge pull request #1310 from billsacks/izumi_retry

run_sys_tests: add --retry option on izumi ### Description of changes Adds a `--retry` option to `run_sys_tests`, adds a mechanism to set a machine-dependent default value of this option, and sets the default to 1 for izumi. So, if you don't specify the `--retry` option to `run_sys_tests`, the `--retry` option to `create_test` will be set to 1 for izumi, 0 for cheyenne or other machines. You can override this with the `--retry` option to `run_sys_tests`. This value of this option is printed both to stdout and to the (increasingly poorly named) `SRCROOT_GIT_STATUS` file in your testroot directory. The motivation for this change was: Recently, nearly all of my test runs on izumi have a few tests fail with system errors. It started getting annoying to need to rerun them (and wait for the reruns to complete, then check them) before making a tag. With this change, they will be retried once. We could increase the retry to 2 or 3 if 1 doesn't seem sufficient, but I wanted to start out with just 1 retry because it could start to get confusing or wasteful of machine resources if true failures are rerun multiple times, failing in the same way each time – so I figured we could see if 1 is sufficient before increasing this further. If I understand correctly how this is implemented in create_test, it will mean that the create_test job (which, with run_sys_tests, is executed on a batch node) will remain active until all tests that it "owns" have completed. There's a chance that this will have negative repercussions in terms of job scheduling (if the scheduler sees that you already have these active jobs so gives priority to someone else), but since izumi tends to be pretty lightly loaded, I'm hopeful it will be okay. I also made a commit to this PR that fixes some pylint warnings with recent versions of pylint. ### Specific notes Contributors other than yourself, if any: none CTSM Issues Fixed (include github issue #): none Are answers expected to change (and if so in what way)? no Any User Interface Changes (namelist or namelist defaults changes)? no Testing performed, if any: - python tests - Ran full test suite on izumi via the new run_sys_tests to verify that it worked; this test suite had 3 tests that experienced system failures in their first runs but (as desired) were automatically rerun and passed the second time around - Ran a 2-test run via run_sys_tests on cheyenne to verify that I haven't broken that
ESCOMP · Mar 23, 2021 · 6fad071 · 6fad071
2 parents 3ac4305 + 853dc06
commit 6fad071
Show file tree

Hide file tree

Showing 9 changed files with 89 additions and 43 deletions.
diff --git a/python/ctsm/lilac_build_ctsm.py b/python/ctsm/lilac_build_ctsm.py
@@ -473,8 +473,8 @@ def _check_and_transform_os(os_type):
                   'cnl': 'CNL'}
     try:
         os_type_transformed = transforms[os_type]
-    except KeyError:
-        raise ValueError("Unknown OS: {}".format(os_type))
+    except KeyError as exc:
+        raise ValueError("Unknown OS: {}".format(os_type)) from exc
     return os_type_transformed
 
 def _get_case_dir(build_dir):

diff --git a/python/ctsm/lilac_download_input_data.py b/python/ctsm/lilac_download_input_data.py
@@ -5,10 +5,10 @@
 import os
 import re
 
-from ctsm.ctsm_logging import setup_logging_pre_config, add_logging_args, process_logging_args
-
 from CIME.case import Case # pylint: disable=import-error
 
+from ctsm.ctsm_logging import setup_logging_pre_config, add_logging_args, process_logging_args
+
 logger = logging.getLogger(__name__)
 
 # ========================================================================

diff --git a/python/ctsm/lilac_make_runtime_inputs.py b/python/ctsm/lilac_make_runtime_inputs.py
@@ -8,12 +8,12 @@
 from configparser import ConfigParser
 from configparser import NoSectionError, NoOptionError
 
+from CIME.buildnml import create_namelist_infile # pylint: disable=import-error
+
 from ctsm.ctsm_logging import setup_logging_pre_config, add_logging_args, process_logging_args
 from ctsm.path_utils import path_to_ctsm_root
 from ctsm.utils import abort
 
-from CIME.buildnml import create_namelist_infile # pylint: disable=import-error
-
 logger = logging.getLogger(__name__)
 
 # ========================================================================

diff --git a/python/ctsm/machine.py b/python/ctsm/machine.py
@@ -3,9 +3,9 @@
 
 import logging
 from collections import namedtuple
+from CIME.utils import get_project  # pylint: disable=import-error
 from ctsm.joblauncher.job_launcher_factory import \
     create_job_launcher, JOB_LAUNCHER_NOBATCH
-from CIME.utils import get_project  # pylint: disable=import-error
 
 logger = logging.getLogger(__name__)
 
@@ -23,11 +23,12 @@
 # user of the machine object to check for that possibility if need be.
 #
 # Similar notes apply to baseline_dir.
-Machine = namedtuple('Machine', ['name',           # str
-                                 'scratch_dir',    # str
-                                 'baseline_dir',   # str
-                                 'account',        # str or None
-                                 'job_launcher'])  # subclass of JobLauncherBase
+Machine = namedtuple('Machine', ['name',              # str
+                                 'scratch_dir',       # str
+                                 'baseline_dir',      # str
+                                 'account',           # str or None
+                                 'create_test_retry', # int
+                                 'job_launcher'])     # subclass of JobLauncherBase
 
 def create_machine(machine_name, defaults, job_launcher_type=None,
                    scratch_dir=None, account=None,
@@ -78,6 +79,7 @@ def create_machine(machine_name, defaults, job_launcher_type=None,
 
     mach_defaults = defaults.get(machine_name)
     baseline_dir = None
+    create_test_retry = 0
     if mach_defaults is not None:
         if job_launcher_type is None:
             job_launcher_type = mach_defaults.job_launcher_type
@@ -93,6 +95,10 @@ def create_machine(machine_name, defaults, job_launcher_type=None,
         # generation and comparison, or making a link in some temporary location that
         # points to the standard baselines).
         baseline_dir = mach_defaults.baseline_dir
+        # We also don't provide a way to override the default create_test_retry in the
+        # machine object: this will always give the default value for this machine, and
+        # other mechanisms will be given for overriding this in a particular case.
+        create_test_retry = mach_defaults.create_test_retry
         if account is None and mach_defaults.account_required and not allow_missing_entries:
             raise RuntimeError("Could not find an account code")
     else:
@@ -142,21 +148,23 @@ def create_machine(machine_name, defaults, job_launcher_type=None,
                    scratch_dir=scratch_dir,
                    baseline_dir=baseline_dir,
                    account=account,
+                   create_test_retry=create_test_retry,
                    job_launcher=job_launcher)
 
-def get_possibly_overridden_baseline_dir(machine, baseline_dir=None):
-    """Get the baseline directory to use here, or None
+def get_possibly_overridden_mach_value(machine, varname, value=None):
+    """Get the value to use for the given machine variable
 
-    If baseline_dir is provided (not None), use that. Otherwise use the baseline directory
-    from machine (which may be None).
+    If value is provided (not None), use that. Otherwise use the value of the given
+    variable from the provided machine object.
 
     Args:
     machine (Machine)
-    baseline_dir (str or None): gives the overriding baseline directory to use
+    varname (str): name of variable to get from the machine object
+    value: if not None, use this instead of fetching from the machine object
     """
-    if baseline_dir is not None:
-        return baseline_dir
-    return machine.baseline_dir
+    if value is not None:
+        return value
+    return getattr(machine, varname)
 
 def _get_account():
     account = get_project()

diff --git a/python/ctsm/machine_defaults.py b/python/ctsm/machine_defaults.py
@@ -12,6 +12,7 @@
                                                  'scratch_dir',
                                                  'baseline_dir',
                                                  'account_required',
+                                                 'create_test_retry',
                                                  'job_launcher_defaults'])
 # job_launcher_type: one of the JOB_LAUNCHERs defined in job_launcher_factory
 # scratch_dir: str
@@ -21,6 +22,7 @@
 #     have 0, 1 or multiple job_launcher_defaults. (It can be useful to have defaults even
 #     for the non-default job launcher for this machine, in case the user chooses a
 #     non-default launcher.)
+# create_test_retry: int: Default number of times to retry a create_test job on this machine
 # account_required: bool: whether an account number is required on this machine (not
 #     really a default, but used for error-checking)
 
@@ -40,6 +42,7 @@
         scratch_dir=os.path.join(os.path.sep, 'glade', 'scratch', get_user()),
         baseline_dir=os.path.join(os.path.sep, 'glade', 'p', 'cgd', 'tss', 'ctsm_baselines'),
         account_required=True,
+        create_test_retry=0,
         job_launcher_defaults={
             JOB_LAUNCHER_QSUB: QsubDefaults(
                 queue='regular',
@@ -56,6 +59,7 @@
         scratch_dir=os.path.join(os.path.sep, 'scratch', 'cluster', get_user()),
         baseline_dir=os.path.join(os.path.sep, 'fs', 'cgd', 'csm', 'ccsm_baselines'),
         account_required=False,
+        create_test_retry=0,
         job_launcher_defaults={
             JOB_LAUNCHER_QSUB: QsubDefaults(
                 queue='medium',
@@ -68,6 +72,9 @@
         scratch_dir=os.path.join(os.path.sep, 'scratch', 'cluster', get_user()),
         baseline_dir=os.path.join(os.path.sep, 'fs', 'cgd', 'csm', 'ccsm_baselines'),
         account_required=False,
+        # jobs on izumi experience a high frequency of failures, often at the very end of
+        # the job; so we'll automatically retry a failed job once before giving up on it
+        create_test_retry=1,
         job_launcher_defaults={
             JOB_LAUNCHER_QSUB: QsubDefaults(
                 queue='medium',

diff --git a/python/ctsm/run_sys_tests.py b/python/ctsm/run_sys_tests.py
@@ -8,17 +8,17 @@
 import subprocess
 from datetime import datetime
 
+from CIME.test_utils import get_tests_from_xml  # pylint: disable=import-error
+from CIME.cs_status_creator import create_cs_status  # pylint: disable=import-error
+
 from ctsm.ctsm_logging import setup_logging_pre_config, add_logging_args, process_logging_args
 from ctsm.machine_utils import get_machine_name
-from ctsm.machine import create_machine, get_possibly_overridden_baseline_dir
+from ctsm.machine import create_machine, get_possibly_overridden_mach_value
 from ctsm.machine_defaults import MACHINE_DEFAULTS
 from ctsm.os_utils import make_link
 from ctsm.path_utils import path_to_ctsm_root
 from ctsm.joblauncher.job_launcher_factory import JOB_LAUNCHER_NOBATCH
 
-from CIME.test_utils import get_tests_from_xml  # pylint: disable=import-error
-from CIME.cs_status_creator import create_cs_status  # pylint: disable=import-error
-
 logger = logging.getLogger(__name__)
 
 # Number of initial characters from the compiler name to use in a testid
@@ -72,6 +72,7 @@ def main(cime_path):
                   compare_name=args.compare, generate_name=args.generate,
                   baseline_root=args.baseline_root,
                   walltime=args.walltime, queue=args.queue,
+                  retry=args.retry,
                   extra_create_test_args=args.extra_create_test_args)
 
 def run_sys_tests(machine, cime_path,
@@ -85,6 +86,7 @@ def run_sys_tests(machine, cime_path,
                   compare_name=None, generate_name=None,
                   baseline_root=None,
                   walltime=None, queue=None,
+                  retry=None,
                   extra_create_test_args=''):
     """Implementation of run_sys_tests command
 
@@ -119,6 +121,8 @@ def run_sys_tests(machine, cime_path,
         determine it automatically)
     queue (str): queue to use for each test (if not provided, the test suite will
         determine it automatically)
+    retry (int): retry value to pass to create_test (if not provided, will use the default
+        for this machine)
     extra_create_test_args (str): any extra arguments to create_test, as a single,
         space-delimited string
     testlist: list of strings giving test names to run
@@ -137,17 +141,22 @@ def run_sys_tests(machine, cime_path,
     if not (skip_testroot_creation or rerun_existing_failures):
         _make_testroot(testroot, testid_base, dry_run)
     print("Testroot: {}\n".format(testroot))
+    retry_final = get_possibly_overridden_mach_value(machine,
+                                                     varname='create_test_retry',
+                                                     value=retry)
     if not skip_git_status:
-        _record_git_status(testroot, dry_run)
+        _record_git_status(testroot, retry_final, dry_run)
 
-    baseline_root_final = get_possibly_overridden_baseline_dir(machine,
-                                                               baseline_dir=baseline_root)
+    baseline_root_final = get_possibly_overridden_mach_value(machine,
+                                                             varname='baseline_dir',
+                                                             value=baseline_root)
     create_test_args = _get_create_test_args(compare_name=compare_name,
                                              generate_name=generate_name,
                                              baseline_root=baseline_root_final,
                                              account=machine.account,
                                              walltime=walltime,
                                              queue=queue,
+                                             retry=retry_final,
                                              rerun_existing_failures=rerun_existing_failures,
                                              extra_create_test_args=extra_create_test_args)
     if suite_name:
@@ -298,6 +307,11 @@ def _commandline_args():
                         help='Queue to which tests are submitted.\n'
                         'If not provided, uses machine default.')
 
+    parser.add_argument('--retry', type=int,
+                        help='Argument to create_test: Number of times to retry failed tests.\n'
+                        'Default for this machine: {}'.format(
+                            default_machine.create_test_retry))
+
     parser.add_argument('--extra-create-test-args', default='',
                         help='String giving extra arguments to pass to create_test\n'
                         '(To allow the argument parsing to accept this, enclose the string\n'
@@ -396,11 +410,13 @@ def _make_testroot(testroot, testid_base, dry_run):
         os.makedirs(testroot)
         make_link(testroot, _get_testdir_name(testid_base))
 
-def _record_git_status(testroot, dry_run):
+def _record_git_status(testroot, retry, dry_run):
     """Record git status and related information to stdout and a file"""
     output = ''
     ctsm_root = path_to_ctsm_root()
 
+    output += "create_test --retry: {}\n\n".format(retry)
+
     current_hash = subprocess.check_output(['git', 'show', '--no-patch',
                                             '--format=format:%h (%an, %ad) %s\n', 'HEAD'],
                                            cwd=ctsm_root,
@@ -440,7 +456,7 @@ def _record_git_status(testroot, dry_run):
             git_status_file.write(output)
 
 def _get_create_test_args(compare_name, generate_name, baseline_root,
-                          account, walltime, queue,
+                          account, walltime, queue, retry,
                           rerun_existing_failures,
                           extra_create_test_args):
     args = []
@@ -456,6 +472,7 @@ def _get_create_test_args(compare_name, generate_name, baseline_root,
         args.extend(['--walltime', walltime])
     if queue:
         args.extend(['--queue', queue])
+    args.extend(['--retry', str(retry)])
     if rerun_existing_failures:
         # In addition to --use-existing, we also need --allow-baseline-overwrite in this
         # case; otherwise, create_test throws an error saying that the baseline

diff --git a/python/ctsm/test/test_unit_machine.py b/python/ctsm/test/test_unit_machine.py
@@ -9,7 +9,7 @@
 from ctsm import add_cime_to_path # pylint: disable=unused-import
 from ctsm import unit_testing
 
-from ctsm.machine import create_machine, get_possibly_overridden_baseline_dir
+from ctsm.machine import create_machine, get_possibly_overridden_mach_value
 from ctsm.machine_utils import get_user
 from ctsm.machine_defaults import MACHINE_DEFAULTS, MachineDefaults, QsubDefaults
 from ctsm.joblauncher.job_launcher_no_batch import JobLauncherNoBatch
@@ -23,14 +23,16 @@
 class TestCreateMachine(unittest.TestCase):
     """Tests of create_machine"""
 
-    def assertMachineInfo(self, machine, name, scratch_dir, baseline_dir, account):
+    def assertMachineInfo(self, machine, name, scratch_dir, baseline_dir, account,
+                          create_test_retry=0):
         """Asserts that the basic machine info is as expected.
 
         This does NOT dive down into the job launcher"""
         self.assertEqual(machine.name, name)
         self.assertEqual(machine.scratch_dir, scratch_dir)
         self.assertEqual(machine.baseline_dir, baseline_dir)
         self.assertEqual(machine.account, account)
+        self.assertEqual(machine.create_test_retry, create_test_retry)
 
     def assertNoBatchInfo(self, machine, nice_level=None):
         """Asserts that the machine's launcher is of type JobLauncherNoBatch"""
@@ -62,6 +64,7 @@ def create_defaults(default_job_launcher=JOB_LAUNCHER_QSUB):
                 scratch_dir=os.path.join(os.path.sep, 'glade', 'scratch', get_user()),
                 baseline_dir=os.path.join(os.path.sep, 'my', 'baselines'),
                 account_required=True,
+                create_test_retry=2,
                 job_launcher_defaults={
                     JOB_LAUNCHER_QSUB: QsubDefaults(
                         queue='regular',
@@ -130,7 +133,8 @@ def test_knownMachine_defaults(self):
                                                         'scratch',
                                                         get_user()),
                                baseline_dir=os.path.join(os.path.sep, 'my', 'baselines'),
-                               account='a123')
+                               account='a123',
+                               create_test_retry=2)
         self.assertQsubInfo(machine=machine,
                             queue='regular',
                             walltime='06:00:00',
@@ -152,7 +156,8 @@ def test_knownMachine_argsExplicit(self):
                                name='cheyenne',
                                scratch_dir='/custom/path/to/scratch',
                                baseline_dir=os.path.join(os.path.sep, 'my', 'baselines'),
-                               account='a123')
+                               account='a123',
+                               create_test_retry=2)
         self.assertQsubInfo(machine=machine,
                             queue='custom_queue',
                             walltime='9:87:65',
@@ -161,29 +166,35 @@ def test_knownMachine_argsExplicit(self):
                             extra_args='--custom args')
 
     # ------------------------------------------------------------------------
-    # Tests of get_possibly_overridden_baseline_dir
+    # Tests of get_possibly_overridden_mach_value
     # ------------------------------------------------------------------------
 
     def test_baselineDir_overridden(self):
-        """Tests get_possibly_overridden_baseline_dir when baseline_dir is provided"""
+        """Tests get_possibly_overridden_mach_value when baseline_dir is provided"""
         defaults = self.create_defaults()
         machine = create_machine('cheyenne', defaults, account='a123')
-        baseline_dir = get_possibly_overridden_baseline_dir(machine, baseline_dir='mypath')
+        baseline_dir = get_possibly_overridden_mach_value(machine,
+                                                          varname='baseline_dir',
+                                                          value='mypath')
         self.assertEqual(baseline_dir, 'mypath')
 
     def test_baselineDir_default(self):
-        """Tests get_possibly_overridden_baseline_dir when baseline_dir is not provided"""
+        """Tests get_possibly_overridden_mach_value when baseline_dir is not provided"""
         defaults = self.create_defaults()
         machine = create_machine('cheyenne', defaults, account='a123')
-        baseline_dir = get_possibly_overridden_baseline_dir(machine, baseline_dir=None)
+        baseline_dir = get_possibly_overridden_mach_value(machine,
+                                                          varname='baseline_dir',
+                                                          value=None)
         self.assertEqual(baseline_dir, os.path.join(os.path.sep, 'my', 'baselines'))
 
     def test_baselineDir_noDefault(self):
-        """Tests get_possibly_overridden_baseline_dir when baseline_dir is not provided
+        """Tests get_possibly_overridden_mach_value when baseline_dir is not provided
         and there is no default"""
         machine = create_machine('unknown_test_machine', MACHINE_DEFAULTS,
                                  account='a123')
-        baseline_dir = get_possibly_overridden_baseline_dir(machine, baseline_dir=None)
+        baseline_dir = get_possibly_overridden_mach_value(machine,
+                                                          varname='baseline_dir',
+                                                          value=None)
         self.assertIsNone(baseline_dir)
 
 if __name__ == '__main__':