diff --git a/config/cesm/config_archive.xml b/config/cesm/config_archive.xml index 903aaf05f64..f148d3c4e14 100644 --- a/config/cesm/config_archive.xml +++ b/config/cesm/config_archive.xml @@ -78,11 +78,11 @@ \.h.*.nc$|\.d[dovt]\. unset - rpointer.ocn.restart$NINST_STRING + rpointer.ocn$NINST_STRING.restart ./$CASE.pop$NINST_STRING.r.$DATENAME.nc,RESTART_FMT=nc - rpointer.ocn.ovf$NINST_STRING + rpointer.ocn$NINST_STRING.ovf ./$CASE.pop$NINST_STRING.ro.$DATENAME diff --git a/config/config_tests.xml b/config/config_tests.xml index d19cd1394bc..a24577b1ef0 100644 --- a/config/config_tests.xml +++ b/config/config_tests.xml @@ -245,6 +245,10 @@ NODEFAIL Tests restart upon detected node failure. Generates fake failu ndays 11 FALSE + $STOP_N / 2 + 1 + $STOP_OPTION + $STOP_OPTION + $STOP_N diff --git a/scripts/lib/CIME/SystemTests/erp.py b/scripts/lib/CIME/SystemTests/erp.py index 9cde51af190..05021bbda31 100644 --- a/scripts/lib/CIME/SystemTests/erp.py +++ b/scripts/lib/CIME/SystemTests/erp.py @@ -1,5 +1,5 @@ """ -CIME ERP test. This class inherits from SystemTestsCommon +CIME ERP test. This class inherits from SystemTestsCompareTwo This is a pes counts hybrid (open-MP/MPI) restart bfb test from startup. This is just like an ERS test but the pe-counts/threading @@ -8,132 +8,58 @@ (2) Do a restart test with half the number of tasks and threads (suffix rest) """ -import shutil from CIME.XML.standard_module_setup import * from CIME.case_setup import case_setup -import CIME.utils -from CIME.SystemTests.system_tests_common import SystemTestsCommon +from CIME.SystemTests.system_tests_compare_two import SystemTestsCompareTwo from CIME.check_lockedfiles import * logger = logging.getLogger(__name__) -class ERP(SystemTestsCommon): +class ERP(SystemTestsCompareTwo): def __init__(self, case): """ initialize a test object """ - SystemTestsCommon.__init__(self, case) + SystemTestsCompareTwo.__init__(self, case, + separate_builds = True, + run_two_suffix = 'rest', + run_one_description = 'initial', + run_two_description = 'restart') - def build_phase(self, sharedlib_only=False, model_only=False): - """ - Build two cases. Case one uses defaults, case2 uses half the number of threads - and tasks. This test will fail for components (e.g. pop) that do not reproduce exactly - with different numbers of mpi tasks. - """ + def _common_setup(self): self._case.set_value("BUILD_THREADED",True) - if sharedlib_only: - return self.build_indv(sharedlib_only=sharedlib_only, model_only=model_only) - - exeroot = self._case.get_value("EXEROOT") - cime_model = CIME.utils.get_model() - - # Make backup copies of the ORIGINAL env_mach_pes.xml and - # env_build.xml in LockedFiles if they are not there. If there - # are already copies there then simply copy them back to - # have the starting env_mach_pes.xml and env_build.xml - machpes1 = "env_mach_pes.ERP1.xml" - envbuild1 = "env_build.ERP1.xml" - if is_locked(machpes1): - restore(machpes1, newname="env_mach_pes.xml") - else: - lock_file("env_mach_pes.xml", newname=machpes1) - - if is_locked(envbuild1): - restore(envbuild1, newname="env_build.xml") - - # Build two executables, one using the original tasks and threads (ERP1) and - # one using the modified tasks and threads (ERP2) - # The reason we currently need two executables that CESM-CICE has a compile time decomposition - # For cases where ERP works, changing this decomposition will not affect answers, but it will - # affect the executable that is used - for bld in range(1,3): - logging.warn("Starting bld {}".format(bld)) - - if (bld == 2): - # halve the number of tasks and threads - for comp in self._case.get_values("COMP_CLASSES"): - ntasks = self._case.get_value("NTASKS_{}".format(comp)) - nthreads = self._case.get_value("NTHRDS_{}".format(comp)) - rootpe = self._case.get_value("ROOTPE_{}".format(comp)) - if ( nthreads > 1 ): - self._case.set_value("NTHRDS_{}".format(comp), nthreads/2) - if ( ntasks > 1 ): - self._case.set_value("NTASKS_{}".format(comp), ntasks/2) - self._case.set_value("ROOTPE_{}".format(comp), rootpe/2) - - # Note, some components, like CESM-CICE, have - # decomposition information in env_build.xml - # case_setup(self._case, test_mode=True, reset=True)that - # needs to be regenerated for the above new tasks and thread counts - case_setup(self._case, test_mode=True, reset=True) - - # Now rebuild the system, given updated information in env_build.xml - self.build_indv(sharedlib_only=sharedlib_only, model_only=model_only) - shutil.move("{}/{}.exe".format(exeroot,cime_model), - "{}/{}.ERP{}.exe".format(exeroot,cime_model,bld)) - - # Make copies of the new env_mach_pes.xml and the new - # env_build.xml to be used in the run phase - lock_file("env_mach_pes.xml", newname="env_mach_pes.ERP{}.xml".format(bld)) - lock_file("env_build.xml", newname="env_build.ERP{}.xml".format(bld)) - - def run_phase(self): - # run will have values 1,2 - for run in range(1,3): - - expect(is_locked("env_mach_pes.ERP{:d}.xml".format(run)), - "ERROR: LockedFiles/env_mach_pes.ERP{:d}.xml does not exist, run case.build".format(run )) - - # Use the second env_mach_pes.xml and env_build.xml files - restore("env_mach_pes.ERP{:d}.xml".format(run), newname="env_mach_pes.xml") - restore("env_build.ERP{:d}.xml".format(run), newname="env_build.xml") - - # update the case to use the new values - self._case.read_xml() - - # Use the second executable that was created - exeroot = self._case.get_value("EXEROOT") - cime_model = CIME.utils.get_model() - exefile = os.path.join(exeroot,"{}.exe".format(cime_model)) - exefile2 = os.path.join(exeroot,"{}.ERP{:d}.exe".format(cime_model,run)) - if (os.path.isfile(exefile)): - os.remove(exefile) - shutil.copy(exefile2, exefile) - - stop_n = self._case.get_value("STOP_N") - stop_option = self._case.get_value("STOP_OPTION") - - if run == 1: - expect(stop_n > 2, "ERROR: stop_n value {:d} too short".format(stop_n)) - rest_n = stop_n/2 + 1 - self._case.set_value("REST_N", rest_n) - self._case.set_value("REST_OPTION", stop_option) - self._case.set_value("HIST_N", stop_n) - self._case.set_value("HIST_OPTION", stop_option) - self._case.set_value("CONTINUE_RUN", False) - suffix = "base" - else: - rest_n = stop_n/2 + 1 - stop_new = stop_n - rest_n - expect(stop_new > 0, "ERROR: stop_n value {:d} too short {:d} {:d}".format(stop_new,stop_n,rest_n)) - self._case.set_value("STOP_N", stop_new) - self._case.set_value("CONTINUE_RUN", True) - self._case.set_value("REST_OPTION","never") - suffix = "rest" - - case_setup(self._case, test_mode=True, reset=True) - - self.run_indv(suffix=suffix) - self._component_compare_test("base", "rest") + def _case_one_setup(self): + stop_n = self._case.get_value("STOP_N") + + expect(stop_n > 2, "ERROR: stop_n value {:d} too short".format(stop_n)) + + def _case_two_setup(self): + # halve the number of tasks and threads + for comp in self._case.get_values("COMP_CLASSES"): + ntasks = self._case1.get_value("NTASKS_{}".format(comp)) + nthreads = self._case1.get_value("NTHRDS_{}".format(comp)) + rootpe = self._case1.get_value("ROOTPE_{}".format(comp)) + if ( nthreads > 1 ): + self._case.set_value("NTHRDS_{}".format(comp), nthreads/2) + if ( ntasks > 1 ): + self._case.set_value("NTASKS_{}".format(comp), ntasks/2) + self._case.set_value("ROOTPE_{}".format(comp), rootpe/2) + + stop_n = self._case1.get_value("STOP_N") + rest_n = self._case1.get_value("REST_N") + stop_new = stop_n - rest_n + expect(stop_new > 0, "ERROR: stop_n value {:d} too short {:d} {:d}".format(stop_new,stop_n,rest_n)) + self._case.set_value("STOP_N", stop_new) + self._case.set_value("HIST_N", stop_n) + self._case.set_value("CONTINUE_RUN", True) + self._case.set_value("REST_OPTION","never") + + # Note, some components, like CESM-CICE, have + # decomposition information in env_build.xml that + # needs to be regenerated for the above new tasks and thread counts + case_setup(self._case, test_mode=True, reset=True) + + def _case_one_custom_postrun_action(self): + self.copy_case1_restarts_to_case2() diff --git a/scripts/lib/CIME/SystemTests/system_tests_compare_two.py b/scripts/lib/CIME/SystemTests/system_tests_compare_two.py index 8c312df7d35..1ec2569443e 100644 --- a/scripts/lib/CIME/SystemTests/system_tests_compare_two.py +++ b/scripts/lib/CIME/SystemTests/system_tests_compare_two.py @@ -38,6 +38,7 @@ from CIME.SystemTests.system_tests_common import SystemTestsCommon from CIME.case import Case from CIME.case_submit import check_case +from CIME.case_st_archive import archive_last_restarts import shutil, os, glob @@ -235,6 +236,20 @@ def run_phase(self, success_change=False): # pylint: disable=arguments-differ self._component_compare_test(self._run_one_suffix, self._run_two_suffix, success_change=success_change) + def copy_case1_restarts_to_case2(self): + """ + Makes a copy (or symlink) of restart files and related files + (necessary history files, rpointer files) from case1 to case2. + + This is not done automatically, but can be called by individual + tests where case2 does a continue_run using case1's restart + files. + """ + rundir2 = self._case2.get_value("RUNDIR") + archive_last_restarts(case = self._case1, + archive_restdir = rundir2, + link_to_restart_files = True) + # ======================================================================== # Private methods # ======================================================================== diff --git a/scripts/lib/CIME/XML/env_archive.py b/scripts/lib/CIME/XML/env_archive.py index 366b2a5511e..1034644c34b 100644 --- a/scripts/lib/CIME/XML/env_archive.py +++ b/scripts/lib/CIME/XML/env_archive.py @@ -38,14 +38,19 @@ def __init__(self, case_root=None, infile="env_archive.xml"): def get_entries(self): return self.get_nodes('comp_archive_spec') + def get_entry(self, compname): + return self.get_optional_node('comp_archive_spec', attributes={"compname":compname}) + def get_entry_info(self, archive_entry): - compname = archive_entry.attrib['compname'] - compclass = archive_entry.attrib['compclass'] + compname = archive_entry.get('compname') + compclass = archive_entry.get('compclass') return compname,compclass def get_entry_value(self, name, archive_entry): node = self.get_optional_node(name, root=archive_entry) - return node.text + if node is not None: + return node.text + return None def get_rest_file_extensions(self, archive_entry): file_extensions = [] diff --git a/scripts/lib/CIME/case_st_archive.py b/scripts/lib/CIME/case_st_archive.py index 3f469655bbb..30fab631282 100644 --- a/scripts/lib/CIME/case_st_archive.py +++ b/scripts/lib/CIME/case_st_archive.py @@ -6,16 +6,30 @@ from CIME.XML.standard_module_setup import * from CIME.case_submit import submit -from CIME.XML.env_archive import EnvArchive -from CIME.utils import run_and_log_case_status, ls_sorted_by_mtime +from CIME.utils import run_and_log_case_status, ls_sorted_by_mtime, symlink_force from os.path import isdir, join import datetime logger = logging.getLogger(__name__) +############################################################################### +def _get_archive_file_fn(copy_only): +############################################################################### + """ + Returns the function to use for archiving some files + """ + return shutil.copyfile if copy_only else shutil.move + + ############################################################################### def _get_datenames(case, last_date=None): ############################################################################### + """ + Returns a list of datenames giving the dates of cpl restart files + + If there are no cpl restart files, this will return [] + """ + if last_date is not None: try: last = datetime.datetime.strptime(last_date, '%Y-%m-%d') @@ -26,8 +40,7 @@ def _get_datenames(case, last_date=None): expect(isdir(rundir), 'Cannot open directory {} '.format(rundir)) casename = case.get_value("CASE") files = sorted(glob.glob(os.path.join(rundir, casename + '.cpl.r*.nc'))) - if not files: - expect(False, 'Cannot find a {}.cpl.r.*.nc file in directory {} '.format(casename, rundir)) + datenames = [] for filename in files: names = filename.split('.') @@ -62,6 +75,22 @@ def _get_ninst_info(case, compclass): logger.debug("ninst and ninst_strings are: {} and {} for {}".format(ninst, ninst_strings, compclass)) return ninst, ninst_strings +############################################################################### +def _get_component_archive_entries(case, archive): +############################################################################### + """ + Each time this is generator function is called, it yields a tuple + (archive_entry, compname, compclass) for one component in this + case's compset components. + """ + compset_comps = case.get_compset_components() + compset_comps.append('cpl') + compset_comps.append('dart') + + for compname in compset_comps: + archive_entry = archive.get_entry(compname) + if archive_entry is not None: + yield(archive_entry, compname, archive_entry.get("compclass")) ############################################################################### def _archive_rpointer_files(case, archive, archive_entry, archive_restdir, @@ -222,16 +251,53 @@ def get_histfiles_for_restarts(case, archive, archive_entry, restfile): return histfiles ############################################################################### -def _archive_restarts(case, archive, archive_entry, - compclass, compname, datename, datename_is_last, - archive_file_fn): +def _archive_restarts_date(case, archive, + datename, datename_is_last, + archive_restdir, archive_file_fn, + link_to_last_restart_files=False): ############################################################################### + """ + Archive restart files for a single date + + Returns a dictionary of histfiles that need saving in the run + directory, indexed by compname + """ + logger.info('-------------------------------------------') + logger.info('Archiving restarts for date {}'.format(datename)) + logger.info('-------------------------------------------') + + histfiles_savein_rundir_by_compname = {} + + for (archive_entry, compname, compclass) in _get_component_archive_entries(case, archive): + logger.info('Archiving restarts for {} ({})'.format(compname, compclass)) + + # archive restarts + histfiles_savein_rundir = _archive_restarts_date_comp(case, archive, archive_entry, + compclass, compname, + datename, datename_is_last, + archive_restdir, archive_file_fn, + link_to_last_restart_files) + histfiles_savein_rundir_by_compname[compname] = histfiles_savein_rundir + + return histfiles_savein_rundir_by_compname + +############################################################################### +def _archive_restarts_date_comp(case, archive, archive_entry, + compclass, compname, datename, datename_is_last, + archive_restdir, archive_file_fn, + link_to_last_restart_files=False): +############################################################################### + """ + Archive restart files for a single date and single component + + If link_to_last_restart_files is True, then make a symlink to the + last set of restart files (i.e., the set with datename_is_last + True); if False (the default), copy them. (This has no effect on the + history files that are associated with these restart files.) + """ - # determine directory for archiving restarts based on datename - dout_s_root = case.get_value("DOUT_S_ROOT") rundir = case.get_value("RUNDIR") casename = case.get_value("CASE") - archive_restdir = join(dout_s_root, 'rest', datename) if datename_is_last or case.get_value('DOUT_S_SAVE_INTERIM_RESTART_FILES'): if not os.path.exists(archive_restdir): os.makedirs(archive_restdir) @@ -247,6 +313,14 @@ def _archive_restarts(case, archive, archive_entry, # copy latest restart files to archive restart directory histfiles_savein_rundir = [] + # determine function to use for last set of restart files + if link_to_last_restart_files: + last_restart_file_fn = symlink_force + last_restart_file_fn_msg = "linking" + else: + last_restart_file_fn = shutil.copy + last_restart_file_fn_msg = "copying" + # get file_extension suffixes for suffix in archive.get_rest_file_extensions(archive_entry): for i in range(ninst): @@ -291,8 +365,9 @@ def _archive_restarts(case, archive, archive_entry, if datename_is_last: srcfile = os.path.join(rundir, restfile) destfile = os.path.join(archive_restdir, restfile) - shutil.copy(srcfile, destfile) - logger.info("copying \n{} to \n{}".format(srcfile, destfile)) + last_restart_file_fn(srcfile, destfile) + logger.info("{} \n{} to \n{}".format( + last_restart_file_fn_msg, srcfile, destfile)) for histfile in histfiles_for_restart: srcfile = os.path.join(rundir, histfile) destfile = os.path.join(archive_restdir, histfile) @@ -341,50 +416,35 @@ def _archive_process(case, archive, last_date, archive_incomplete_logs, copy_onl """ logger.debug('In archive_process...') - compset_comps = case.get_compset_components() - compset_comps.append('cpl') - compset_comps.append('dart') - if copy_only is True: - archive_file_fn = shutil.copyfile - else: - archive_file_fn = shutil.move + archive_file_fn = _get_archive_file_fn(copy_only) # archive log files _archive_log_files(case, archive_incomplete_logs, archive_file_fn) - for archive_entry in archive.get_entries(): - # determine compname and compclass - compname, compclass = archive.get_entry_info(archive_entry) - - # check for validity of compname - if compname not in compset_comps: - continue - - # archive restarts and all necessary associated fields (e.g. rpointer files) - logger.info('-------------------------------------------') - logger.info('doing short term archiving for {} ({})'.format(compname, compclass)) - logger.info('-------------------------------------------') - datenames = _get_datenames(case, last_date) - for datename in datenames: - logger.info('Archiving for date %s' % datename) - datename_is_last = False - if datename == datenames[-1]: - datename_is_last = True - - # archive restarts - histfiles_savein_rundir = _archive_restarts(case, archive, archive_entry, - compclass, compname, - datename, datename_is_last, - archive_file_fn) - - # if the last datename for restart files, then archive history files - # for this compname - if datename_is_last: - logger.info("histfiles_savein_rundir {} ".format(histfiles_savein_rundir)) - _archive_history_files(case, archive, archive_entry, - compclass, compname, histfiles_savein_rundir, - archive_file_fn) + # archive restarts and all necessary associated files (e.g. rpointer files) + histfiles_savein_rundir_by_compname = {} + dout_s_root = case.get_value("DOUT_S_ROOT") + datenames = _get_datenames(case, last_date) + for datename in datenames: + datename_is_last = False + if datename == datenames[-1]: + datename_is_last = True + + archive_restdir = join(dout_s_root, 'rest', datename) + histfiles_savein_rundir_by_compname_this_date = _archive_restarts_date( + case, archive, datename, datename_is_last, archive_restdir, archive_file_fn) + if datename_is_last: + histfiles_savein_rundir_by_compname = histfiles_savein_rundir_by_compname_this_date + + # archive history files + for (archive_entry, compname, compclass) in _get_component_archive_entries(case, archive): + logger.info('Archiving history files for {} ({})'.format(compname, compclass)) + histfiles_savein_rundir = histfiles_savein_rundir_by_compname.get(compname, []) + logger.info("histfiles_savein_rundir {} ".format(histfiles_savein_rundir)) + _archive_history_files(case, archive, archive_entry, + compclass, compname, histfiles_savein_rundir, + archive_file_fn) ############################################################################### def restore_from_archive(case, rest_dir=None): @@ -409,6 +469,39 @@ def restore_from_archive(case, rest_dir=None): shutil.copy(item, rundir) +############################################################################### +def archive_last_restarts(case, archive_restdir, link_to_restart_files=False): +############################################################################### + """ + Convenience function for archiving just the last set of restart + files to a given directory. This also saves files attached to the + restart set, such as rpointer files and necessary history + files. However, it does not save other files that are typically + archived (e.g., history files, log files). + + Files are copied to the directory given by archive_restdir. + + If link_to_restart_files is True, then symlinks rather than copies + are done for the restart files. (This has no effect on the history + files that are associated with these restart files.) + """ + archive = case.get_env('archive') + datenames = _get_datenames(case) + expect(len(datenames) >= 1, "No restart dates found") + last_datename = datenames[-1] + + # Not currently used for anything if we're only archiving the last + # set of restart files, but needed to satisfy the following interface + archive_file_fn = _get_archive_file_fn(copy_only=False) + + _ = _archive_restarts_date(case=case, + archive=archive, + datename=last_datename, + datename_is_last=True, + archive_restdir=archive_restdir, + archive_file_fn=archive_file_fn, + link_to_last_restart_files=link_to_restart_files) + ############################################################################### def case_st_archive(case, last_date=None, archive_incomplete_logs=True, copy_only=False, no_resubmit=False): ############################################################################### @@ -434,7 +527,7 @@ def case_st_archive(case, last_date=None, archive_incomplete_logs=True, copy_onl logger.info("st_archive starting") - archive = EnvArchive(infile=os.path.join(caseroot, 'env_archive.xml')) + archive = case.get_env('archive') functor = lambda: _archive_process(case, archive, last_date, archive_incomplete_logs, copy_only) run_and_log_case_status(functor, "st_archive", caseroot=caseroot) diff --git a/scripts/lib/CIME/utils.py b/scripts/lib/CIME/utils.py index 93c2c5cd915..6709badf26b 100644 --- a/scripts/lib/CIME/utils.py +++ b/scripts/lib/CIME/utils.py @@ -2,7 +2,7 @@ Common functions used by cime python scripts Warning: you cannot use CIME Classes in this module as it causes circular dependencies """ -import logging, gzip, sys, os, time, re, shutil, glob, string, random, imp +import logging, gzip, sys, os, time, re, shutil, glob, string, random, imp, errno import stat as statlib import warnings from contextlib import contextmanager @@ -602,6 +602,21 @@ def safe_copy(src_dir, tgt_dir, file_map): os.remove(full_tgt) shutil.copy2(full_src, full_tgt) +def symlink_force(target, link_name): + """ + Makes a symlink from link_name to target. Unlike the standard + os.symlink, this will work even if link_name already exists (in + which case link_name will be overwritten). + """ + try: + os.symlink(target, link_name) + except OSError as e: + if e.errno == errno.EEXIST: + os.remove(link_name) + os.symlink(target, link_name) + else: + raise e + def find_proc_id(proc_name=None, children_only=False, of_parent=None):