Skip to content

Commit

Permalink
Merge pull request #1310 from billsacks/izumi_retry
Browse files Browse the repository at this point in the history
run_sys_tests: add --retry option on izumi

### Description of changes

Adds a `--retry` option to `run_sys_tests`, adds a mechanism to set a machine-dependent default value of this option, and sets the default to 1 for izumi. So, if you don't specify the `--retry` option to `run_sys_tests`, the `--retry` option to `create_test` will be set to 1 for izumi, 0 for cheyenne or other machines. You can override this with the `--retry` option to `run_sys_tests`. This value of this option is printed both to stdout and to the (increasingly poorly named) `SRCROOT_GIT_STATUS` file in your testroot directory.

The motivation for this change was: Recently, nearly all of my test runs on izumi have a few tests fail with system errors. It started getting annoying to need to rerun them (and wait for the reruns to complete, then check them) before making a tag. With this change, they will be retried once. We could increase the retry to 2 or 3 if 1 doesn't seem sufficient, but I wanted to start out with just 1 retry because it could start to get confusing or wasteful of machine resources if true failures are rerun multiple times, failing in the same way each time – so I figured we could see if 1 is sufficient before increasing this further.

If I understand correctly how this is implemented in create_test, it will mean that the create_test job (which, with run_sys_tests, is executed on a batch node) will remain active until all tests that it "owns" have completed. There's a chance that this will have negative repercussions in terms of job scheduling (if the scheduler sees that you already have these active jobs so gives priority to someone else), but since izumi tends to be pretty lightly loaded, I'm hopeful it will be okay.

I also made a commit to this PR that fixes some pylint warnings with recent versions of pylint.

### Specific notes

Contributors other than yourself, if any: none

CTSM Issues Fixed (include github issue #): none

Are answers expected to change (and if so in what way)? no

Any User Interface Changes (namelist or namelist defaults changes)? no

Testing performed, if any:
- python tests
- Ran full test suite on izumi via the new run_sys_tests to verify that it worked; this test suite had 3 tests that experienced system failures in their first runs but (as desired) were automatically rerun and passed the second time around
- Ran a 2-test run via run_sys_tests on cheyenne to verify that I haven't broken that
  • Loading branch information
billsacks authored Mar 23, 2021
2 parents 3ac4305 + 853dc06 commit 6fad071
Show file tree
Hide file tree
Showing 9 changed files with 89 additions and 43 deletions.
4 changes: 2 additions & 2 deletions python/ctsm/lilac_build_ctsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,8 +473,8 @@ def _check_and_transform_os(os_type):
'cnl': 'CNL'}
try:
os_type_transformed = transforms[os_type]
except KeyError:
raise ValueError("Unknown OS: {}".format(os_type))
except KeyError as exc:
raise ValueError("Unknown OS: {}".format(os_type)) from exc
return os_type_transformed

def _get_case_dir(build_dir):
Expand Down
4 changes: 2 additions & 2 deletions python/ctsm/lilac_download_input_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
import os
import re

from ctsm.ctsm_logging import setup_logging_pre_config, add_logging_args, process_logging_args

from CIME.case import Case # pylint: disable=import-error

from ctsm.ctsm_logging import setup_logging_pre_config, add_logging_args, process_logging_args

logger = logging.getLogger(__name__)

# ========================================================================
Expand Down
4 changes: 2 additions & 2 deletions python/ctsm/lilac_make_runtime_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
from configparser import ConfigParser
from configparser import NoSectionError, NoOptionError

from CIME.buildnml import create_namelist_infile # pylint: disable=import-error

from ctsm.ctsm_logging import setup_logging_pre_config, add_logging_args, process_logging_args
from ctsm.path_utils import path_to_ctsm_root
from ctsm.utils import abort

from CIME.buildnml import create_namelist_infile # pylint: disable=import-error

logger = logging.getLogger(__name__)

# ========================================================================
Expand Down
36 changes: 22 additions & 14 deletions python/ctsm/machine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

import logging
from collections import namedtuple
from CIME.utils import get_project # pylint: disable=import-error
from ctsm.joblauncher.job_launcher_factory import \
create_job_launcher, JOB_LAUNCHER_NOBATCH
from CIME.utils import get_project # pylint: disable=import-error

logger = logging.getLogger(__name__)

Expand All @@ -23,11 +23,12 @@
# user of the machine object to check for that possibility if need be.
#
# Similar notes apply to baseline_dir.
Machine = namedtuple('Machine', ['name', # str
'scratch_dir', # str
'baseline_dir', # str
'account', # str or None
'job_launcher']) # subclass of JobLauncherBase
Machine = namedtuple('Machine', ['name', # str
'scratch_dir', # str
'baseline_dir', # str
'account', # str or None
'create_test_retry', # int
'job_launcher']) # subclass of JobLauncherBase

def create_machine(machine_name, defaults, job_launcher_type=None,
scratch_dir=None, account=None,
Expand Down Expand Up @@ -78,6 +79,7 @@ def create_machine(machine_name, defaults, job_launcher_type=None,

mach_defaults = defaults.get(machine_name)
baseline_dir = None
create_test_retry = 0
if mach_defaults is not None:
if job_launcher_type is None:
job_launcher_type = mach_defaults.job_launcher_type
Expand All @@ -93,6 +95,10 @@ def create_machine(machine_name, defaults, job_launcher_type=None,
# generation and comparison, or making a link in some temporary location that
# points to the standard baselines).
baseline_dir = mach_defaults.baseline_dir
# We also don't provide a way to override the default create_test_retry in the
# machine object: this will always give the default value for this machine, and
# other mechanisms will be given for overriding this in a particular case.
create_test_retry = mach_defaults.create_test_retry
if account is None and mach_defaults.account_required and not allow_missing_entries:
raise RuntimeError("Could not find an account code")
else:
Expand Down Expand Up @@ -142,21 +148,23 @@ def create_machine(machine_name, defaults, job_launcher_type=None,
scratch_dir=scratch_dir,
baseline_dir=baseline_dir,
account=account,
create_test_retry=create_test_retry,
job_launcher=job_launcher)

def get_possibly_overridden_baseline_dir(machine, baseline_dir=None):
"""Get the baseline directory to use here, or None
def get_possibly_overridden_mach_value(machine, varname, value=None):
"""Get the value to use for the given machine variable
If baseline_dir is provided (not None), use that. Otherwise use the baseline directory
from machine (which may be None).
If value is provided (not None), use that. Otherwise use the value of the given
variable from the provided machine object.
Args:
machine (Machine)
baseline_dir (str or None): gives the overriding baseline directory to use
varname (str): name of variable to get from the machine object
value: if not None, use this instead of fetching from the machine object
"""
if baseline_dir is not None:
return baseline_dir
return machine.baseline_dir
if value is not None:
return value
return getattr(machine, varname)

def _get_account():
account = get_project()
Expand Down
7 changes: 7 additions & 0 deletions python/ctsm/machine_defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
'scratch_dir',
'baseline_dir',
'account_required',
'create_test_retry',
'job_launcher_defaults'])
# job_launcher_type: one of the JOB_LAUNCHERs defined in job_launcher_factory
# scratch_dir: str
Expand All @@ -21,6 +22,7 @@
# have 0, 1 or multiple job_launcher_defaults. (It can be useful to have defaults even
# for the non-default job launcher for this machine, in case the user chooses a
# non-default launcher.)
# create_test_retry: int: Default number of times to retry a create_test job on this machine
# account_required: bool: whether an account number is required on this machine (not
# really a default, but used for error-checking)

Expand All @@ -40,6 +42,7 @@
scratch_dir=os.path.join(os.path.sep, 'glade', 'scratch', get_user()),
baseline_dir=os.path.join(os.path.sep, 'glade', 'p', 'cgd', 'tss', 'ctsm_baselines'),
account_required=True,
create_test_retry=0,
job_launcher_defaults={
JOB_LAUNCHER_QSUB: QsubDefaults(
queue='regular',
Expand All @@ -56,6 +59,7 @@
scratch_dir=os.path.join(os.path.sep, 'scratch', 'cluster', get_user()),
baseline_dir=os.path.join(os.path.sep, 'fs', 'cgd', 'csm', 'ccsm_baselines'),
account_required=False,
create_test_retry=0,
job_launcher_defaults={
JOB_LAUNCHER_QSUB: QsubDefaults(
queue='medium',
Expand All @@ -68,6 +72,9 @@
scratch_dir=os.path.join(os.path.sep, 'scratch', 'cluster', get_user()),
baseline_dir=os.path.join(os.path.sep, 'fs', 'cgd', 'csm', 'ccsm_baselines'),
account_required=False,
# jobs on izumi experience a high frequency of failures, often at the very end of
# the job; so we'll automatically retry a failed job once before giving up on it
create_test_retry=1,
job_launcher_defaults={
JOB_LAUNCHER_QSUB: QsubDefaults(
queue='medium',
Expand Down
35 changes: 26 additions & 9 deletions python/ctsm/run_sys_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@
import subprocess
from datetime import datetime

from CIME.test_utils import get_tests_from_xml # pylint: disable=import-error
from CIME.cs_status_creator import create_cs_status # pylint: disable=import-error

from ctsm.ctsm_logging import setup_logging_pre_config, add_logging_args, process_logging_args
from ctsm.machine_utils import get_machine_name
from ctsm.machine import create_machine, get_possibly_overridden_baseline_dir
from ctsm.machine import create_machine, get_possibly_overridden_mach_value
from ctsm.machine_defaults import MACHINE_DEFAULTS
from ctsm.os_utils import make_link
from ctsm.path_utils import path_to_ctsm_root
from ctsm.joblauncher.job_launcher_factory import JOB_LAUNCHER_NOBATCH

from CIME.test_utils import get_tests_from_xml # pylint: disable=import-error
from CIME.cs_status_creator import create_cs_status # pylint: disable=import-error

logger = logging.getLogger(__name__)

# Number of initial characters from the compiler name to use in a testid
Expand Down Expand Up @@ -72,6 +72,7 @@ def main(cime_path):
compare_name=args.compare, generate_name=args.generate,
baseline_root=args.baseline_root,
walltime=args.walltime, queue=args.queue,
retry=args.retry,
extra_create_test_args=args.extra_create_test_args)

def run_sys_tests(machine, cime_path,
Expand All @@ -85,6 +86,7 @@ def run_sys_tests(machine, cime_path,
compare_name=None, generate_name=None,
baseline_root=None,
walltime=None, queue=None,
retry=None,
extra_create_test_args=''):
"""Implementation of run_sys_tests command
Expand Down Expand Up @@ -119,6 +121,8 @@ def run_sys_tests(machine, cime_path,
determine it automatically)
queue (str): queue to use for each test (if not provided, the test suite will
determine it automatically)
retry (int): retry value to pass to create_test (if not provided, will use the default
for this machine)
extra_create_test_args (str): any extra arguments to create_test, as a single,
space-delimited string
testlist: list of strings giving test names to run
Expand All @@ -137,17 +141,22 @@ def run_sys_tests(machine, cime_path,
if not (skip_testroot_creation or rerun_existing_failures):
_make_testroot(testroot, testid_base, dry_run)
print("Testroot: {}\n".format(testroot))
retry_final = get_possibly_overridden_mach_value(machine,
varname='create_test_retry',
value=retry)
if not skip_git_status:
_record_git_status(testroot, dry_run)
_record_git_status(testroot, retry_final, dry_run)

baseline_root_final = get_possibly_overridden_baseline_dir(machine,
baseline_dir=baseline_root)
baseline_root_final = get_possibly_overridden_mach_value(machine,
varname='baseline_dir',
value=baseline_root)
create_test_args = _get_create_test_args(compare_name=compare_name,
generate_name=generate_name,
baseline_root=baseline_root_final,
account=machine.account,
walltime=walltime,
queue=queue,
retry=retry_final,
rerun_existing_failures=rerun_existing_failures,
extra_create_test_args=extra_create_test_args)
if suite_name:
Expand Down Expand Up @@ -298,6 +307,11 @@ def _commandline_args():
help='Queue to which tests are submitted.\n'
'If not provided, uses machine default.')

parser.add_argument('--retry', type=int,
help='Argument to create_test: Number of times to retry failed tests.\n'
'Default for this machine: {}'.format(
default_machine.create_test_retry))

parser.add_argument('--extra-create-test-args', default='',
help='String giving extra arguments to pass to create_test\n'
'(To allow the argument parsing to accept this, enclose the string\n'
Expand Down Expand Up @@ -396,11 +410,13 @@ def _make_testroot(testroot, testid_base, dry_run):
os.makedirs(testroot)
make_link(testroot, _get_testdir_name(testid_base))

def _record_git_status(testroot, dry_run):
def _record_git_status(testroot, retry, dry_run):
"""Record git status and related information to stdout and a file"""
output = ''
ctsm_root = path_to_ctsm_root()

output += "create_test --retry: {}\n\n".format(retry)

current_hash = subprocess.check_output(['git', 'show', '--no-patch',
'--format=format:%h (%an, %ad) %s\n', 'HEAD'],
cwd=ctsm_root,
Expand Down Expand Up @@ -440,7 +456,7 @@ def _record_git_status(testroot, dry_run):
git_status_file.write(output)

def _get_create_test_args(compare_name, generate_name, baseline_root,
account, walltime, queue,
account, walltime, queue, retry,
rerun_existing_failures,
extra_create_test_args):
args = []
Expand All @@ -456,6 +472,7 @@ def _get_create_test_args(compare_name, generate_name, baseline_root,
args.extend(['--walltime', walltime])
if queue:
args.extend(['--queue', queue])
args.extend(['--retry', str(retry)])
if rerun_existing_failures:
# In addition to --use-existing, we also need --allow-baseline-overwrite in this
# case; otherwise, create_test throws an error saying that the baseline
Expand Down
33 changes: 22 additions & 11 deletions python/ctsm/test/test_unit_machine.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from ctsm import add_cime_to_path # pylint: disable=unused-import
from ctsm import unit_testing

from ctsm.machine import create_machine, get_possibly_overridden_baseline_dir
from ctsm.machine import create_machine, get_possibly_overridden_mach_value
from ctsm.machine_utils import get_user
from ctsm.machine_defaults import MACHINE_DEFAULTS, MachineDefaults, QsubDefaults
from ctsm.joblauncher.job_launcher_no_batch import JobLauncherNoBatch
Expand All @@ -23,14 +23,16 @@
class TestCreateMachine(unittest.TestCase):
"""Tests of create_machine"""

def assertMachineInfo(self, machine, name, scratch_dir, baseline_dir, account):
def assertMachineInfo(self, machine, name, scratch_dir, baseline_dir, account,
create_test_retry=0):
"""Asserts that the basic machine info is as expected.
This does NOT dive down into the job launcher"""
self.assertEqual(machine.name, name)
self.assertEqual(machine.scratch_dir, scratch_dir)
self.assertEqual(machine.baseline_dir, baseline_dir)
self.assertEqual(machine.account, account)
self.assertEqual(machine.create_test_retry, create_test_retry)

def assertNoBatchInfo(self, machine, nice_level=None):
"""Asserts that the machine's launcher is of type JobLauncherNoBatch"""
Expand Down Expand Up @@ -62,6 +64,7 @@ def create_defaults(default_job_launcher=JOB_LAUNCHER_QSUB):
scratch_dir=os.path.join(os.path.sep, 'glade', 'scratch', get_user()),
baseline_dir=os.path.join(os.path.sep, 'my', 'baselines'),
account_required=True,
create_test_retry=2,
job_launcher_defaults={
JOB_LAUNCHER_QSUB: QsubDefaults(
queue='regular',
Expand Down Expand Up @@ -130,7 +133,8 @@ def test_knownMachine_defaults(self):
'scratch',
get_user()),
baseline_dir=os.path.join(os.path.sep, 'my', 'baselines'),
account='a123')
account='a123',
create_test_retry=2)
self.assertQsubInfo(machine=machine,
queue='regular',
walltime='06:00:00',
Expand All @@ -152,7 +156,8 @@ def test_knownMachine_argsExplicit(self):
name='cheyenne',
scratch_dir='/custom/path/to/scratch',
baseline_dir=os.path.join(os.path.sep, 'my', 'baselines'),
account='a123')
account='a123',
create_test_retry=2)
self.assertQsubInfo(machine=machine,
queue='custom_queue',
walltime='9:87:65',
Expand All @@ -161,29 +166,35 @@ def test_knownMachine_argsExplicit(self):
extra_args='--custom args')

# ------------------------------------------------------------------------
# Tests of get_possibly_overridden_baseline_dir
# Tests of get_possibly_overridden_mach_value
# ------------------------------------------------------------------------

def test_baselineDir_overridden(self):
"""Tests get_possibly_overridden_baseline_dir when baseline_dir is provided"""
"""Tests get_possibly_overridden_mach_value when baseline_dir is provided"""
defaults = self.create_defaults()
machine = create_machine('cheyenne', defaults, account='a123')
baseline_dir = get_possibly_overridden_baseline_dir(machine, baseline_dir='mypath')
baseline_dir = get_possibly_overridden_mach_value(machine,
varname='baseline_dir',
value='mypath')
self.assertEqual(baseline_dir, 'mypath')

def test_baselineDir_default(self):
"""Tests get_possibly_overridden_baseline_dir when baseline_dir is not provided"""
"""Tests get_possibly_overridden_mach_value when baseline_dir is not provided"""
defaults = self.create_defaults()
machine = create_machine('cheyenne', defaults, account='a123')
baseline_dir = get_possibly_overridden_baseline_dir(machine, baseline_dir=None)
baseline_dir = get_possibly_overridden_mach_value(machine,
varname='baseline_dir',
value=None)
self.assertEqual(baseline_dir, os.path.join(os.path.sep, 'my', 'baselines'))

def test_baselineDir_noDefault(self):
"""Tests get_possibly_overridden_baseline_dir when baseline_dir is not provided
"""Tests get_possibly_overridden_mach_value when baseline_dir is not provided
and there is no default"""
machine = create_machine('unknown_test_machine', MACHINE_DEFAULTS,
account='a123')
baseline_dir = get_possibly_overridden_baseline_dir(machine, baseline_dir=None)
baseline_dir = get_possibly_overridden_mach_value(machine,
varname='baseline_dir',
value=None)
self.assertIsNone(baseline_dir)

if __name__ == '__main__':
Expand Down
Loading

0 comments on commit 6fad071

Please sign in to comment.