Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mv erp test to compare two #1845

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions config/cesm/config_archive.xml
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,11 @@
<hist_file_extension>\.h.*.nc$|\.d[dovt]\.</hist_file_extension>
<rest_history_varname>unset</rest_history_varname>
<rpointer>
<rpointer_file>rpointer.ocn.restart$NINST_STRING</rpointer_file>
<rpointer_file>rpointer.ocn$NINST_STRING.restart</rpointer_file>
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is unrelated to this PR; it's a fix for test IRT.f09_g17.B1850.

<rpointer_content>./$CASE.pop$NINST_STRING.r.$DATENAME.nc,RESTART_FMT=nc</rpointer_content>
</rpointer>
<rpointer>
<rpointer_file>rpointer.ocn.ovf$NINST_STRING</rpointer_file>
<rpointer_file>rpointer.ocn$NINST_STRING.ovf</rpointer_file>
<rpointer_content>./$CASE.pop$NINST_STRING.ro.$DATENAME</rpointer_content>
</rpointer>
</comp_archive_spec>
Expand Down
4 changes: 4 additions & 0 deletions config/config_tests.xml
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,10 @@ NODEFAIL Tests restart upon detected node failure. Generates fake failu
<STOP_OPTION>ndays</STOP_OPTION>
<STOP_N>11</STOP_N>
<DOUT_S>FALSE</DOUT_S>
<REST_N>$STOP_N / 2 + 1 </REST_N>
<REST_OPTION>$STOP_OPTION</REST_OPTION>
<HIST_OPTION>$STOP_OPTION</HIST_OPTION>
<HIST_N>$STOP_N</HIST_N>
</test>

<test NAME="ERS">
Expand Down
172 changes: 56 additions & 116 deletions scripts/lib/CIME/SystemTests/erp.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
CIME ERP test. This class inherits from SystemTestsCommon
CIME ERP test. This class inherits from SystemTestsCompareTwo

This is a pes counts hybrid (open-MP/MPI) restart bfb test from
startup. This is just like an ERS test but the pe-counts/threading
Expand All @@ -8,132 +8,72 @@
(2) Do a restart test with half the number of tasks and threads (suffix rest)
"""

import shutil
from CIME.XML.standard_module_setup import *
from CIME.case_setup import case_setup
import CIME.utils
from CIME.SystemTests.system_tests_common import SystemTestsCommon
from CIME.SystemTests.system_tests_compare_two import SystemTestsCompareTwo
from CIME.check_lockedfiles import *
from CIME.case_st_archive import _get_datenames

logger = logging.getLogger(__name__)

class ERP(SystemTestsCommon):
class ERP(SystemTestsCompareTwo):

def __init__(self, case):
"""
initialize a test object
"""
SystemTestsCommon.__init__(self, case)
SystemTestsCompareTwo.__init__(self, case,
separate_builds = True,
run_two_suffix = 'rest',
run_one_description = 'initial',
run_two_description = 'restart')

def build_phase(self, sharedlib_only=False, model_only=False):
"""
Build two cases. Case one uses defaults, case2 uses half the number of threads
and tasks. This test will fail for components (e.g. pop) that do not reproduce exactly
with different numbers of mpi tasks.
"""
def _common_setup(self):
self._case.set_value("BUILD_THREADED",True)
if sharedlib_only:
return self.build_indv(sharedlib_only=sharedlib_only, model_only=model_only)

exeroot = self._case.get_value("EXEROOT")
cime_model = CIME.utils.get_model()

# Make backup copies of the ORIGINAL env_mach_pes.xml and
# env_build.xml in LockedFiles if they are not there. If there
# are already copies there then simply copy them back to
# have the starting env_mach_pes.xml and env_build.xml
machpes1 = "env_mach_pes.ERP1.xml"
envbuild1 = "env_build.ERP1.xml"
if is_locked(machpes1):
restore(machpes1, newname="env_mach_pes.xml")
else:
lock_file("env_mach_pes.xml", newname=machpes1)

if is_locked(envbuild1):
restore(envbuild1, newname="env_build.xml")

# Build two executables, one using the original tasks and threads (ERP1) and
# one using the modified tasks and threads (ERP2)
# The reason we currently need two executables that CESM-CICE has a compile time decomposition
# For cases where ERP works, changing this decomposition will not affect answers, but it will
# affect the executable that is used
for bld in range(1,3):
logging.warn("Starting bld {}".format(bld))

if (bld == 2):
# halve the number of tasks and threads
for comp in self._case.get_values("COMP_CLASSES"):
ntasks = self._case.get_value("NTASKS_{}".format(comp))
nthreads = self._case.get_value("NTHRDS_{}".format(comp))
rootpe = self._case.get_value("ROOTPE_{}".format(comp))
if ( nthreads > 1 ):
self._case.set_value("NTHRDS_{}".format(comp), nthreads/2)
if ( ntasks > 1 ):
self._case.set_value("NTASKS_{}".format(comp), ntasks/2)
self._case.set_value("ROOTPE_{}".format(comp), rootpe/2)

# Note, some components, like CESM-CICE, have
# decomposition information in env_build.xml
# case_setup(self._case, test_mode=True, reset=True)that
# needs to be regenerated for the above new tasks and thread counts
case_setup(self._case, test_mode=True, reset=True)

# Now rebuild the system, given updated information in env_build.xml
self.build_indv(sharedlib_only=sharedlib_only, model_only=model_only)
shutil.move("{}/{}.exe".format(exeroot,cime_model),
"{}/{}.ERP{}.exe".format(exeroot,cime_model,bld))

# Make copies of the new env_mach_pes.xml and the new
# env_build.xml to be used in the run phase
lock_file("env_mach_pes.xml", newname="env_mach_pes.ERP{}.xml".format(bld))
lock_file("env_build.xml", newname="env_build.ERP{}.xml".format(bld))

def run_phase(self):
# run will have values 1,2
for run in range(1,3):

expect(is_locked("env_mach_pes.ERP{:d}.xml".format(run)),
"ERROR: LockedFiles/env_mach_pes.ERP{:d}.xml does not exist, run case.build".format(run ))

# Use the second env_mach_pes.xml and env_build.xml files
restore("env_mach_pes.ERP{:d}.xml".format(run), newname="env_mach_pes.xml")
restore("env_build.ERP{:d}.xml".format(run), newname="env_build.xml")

# update the case to use the new values
self._case.read_xml()

# Use the second executable that was created
exeroot = self._case.get_value("EXEROOT")
cime_model = CIME.utils.get_model()
exefile = os.path.join(exeroot,"{}.exe".format(cime_model))
exefile2 = os.path.join(exeroot,"{}.ERP{:d}.exe".format(cime_model,run))
if (os.path.isfile(exefile)):
os.remove(exefile)
shutil.copy(exefile2, exefile)

stop_n = self._case.get_value("STOP_N")
stop_option = self._case.get_value("STOP_OPTION")

if run == 1:
expect(stop_n > 2, "ERROR: stop_n value {:d} too short".format(stop_n))
rest_n = stop_n/2 + 1
self._case.set_value("REST_N", rest_n)
self._case.set_value("REST_OPTION", stop_option)
self._case.set_value("HIST_N", stop_n)
self._case.set_value("HIST_OPTION", stop_option)
self._case.set_value("CONTINUE_RUN", False)
suffix = "base"
else:
rest_n = stop_n/2 + 1
stop_new = stop_n - rest_n
expect(stop_new > 0, "ERROR: stop_n value {:d} too short {:d} {:d}".format(stop_new,stop_n,rest_n))
self._case.set_value("STOP_N", stop_new)
self._case.set_value("CONTINUE_RUN", True)
self._case.set_value("REST_OPTION","never")
suffix = "rest"

case_setup(self._case, test_mode=True, reset=True)

self.run_indv(suffix=suffix)

self._component_compare_test("base", "rest")
def _case_one_setup(self):
    """
    Check the run length configured for the first case.

    Reads STOP_N from the case and hands expect() the condition
    "STOP_N is greater than 2" together with the error message to
    report when that condition does not hold.
    """
    run_length = self._case.get_value("STOP_N")
    long_enough = run_length > 2
    message = "ERROR: stop_n value {:d} too short".format(run_length)
    expect(long_enough, message)

def _case_two_setup(self):
    """
    Configure the second case from the settings of the first one.

    For every component class, the thread count, task count and root PE
    read from case one are halved (only when the thread/task count is
    greater than 1). STOP_N is reduced by case one's REST_N, HIST_N is
    set to case one's STOP_N, CONTINUE_RUN is switched on and restart
    writes are disabled. case_setup is then re-run with reset=True.

    NOTE(review): presumably self._case refers to case two while this
    hook runs -- confirm against SystemTestsCompareTwo.
    """
    # halve the number of tasks and threads
    for comp in self._case.get_values("COMP_CLASSES"):
        ntasks = self._case1.get_value("NTASKS_{}".format(comp))
        nthreads = self._case1.get_value("NTHRDS_{}".format(comp))
        rootpe = self._case1.get_value("ROOTPE_{}".format(comp))
        # Floor division keeps these values integers; "/" would store
        # floats (e.g. 1.5) when run under Python 3.
        if nthreads > 1:
            self._case.set_value("NTHRDS_{}".format(comp), nthreads // 2)
        if ntasks > 1:
            self._case.set_value("NTASKS_{}".format(comp), ntasks // 2)
            self._case.set_value("ROOTPE_{}".format(comp), rootpe // 2)

    # Shorten this run by the restart interval of case one.
    stop_n = self._case1.get_value("STOP_N")
    rest_n = self._case1.get_value("REST_N")
    stop_new = stop_n - rest_n
    expect(stop_new > 0, "ERROR: stop_n value {:d} too short {:d} {:d}".format(stop_new,stop_n,rest_n))
    self._case.set_value("STOP_N", stop_new)
    self._case.set_value("HIST_N", stop_n)
    self._case.set_value("CONTINUE_RUN", True)
    self._case.set_value("REST_OPTION","never")

    # Note, some components, like CESM-CICE, have
    # decomposition information in env_build.xml that
    # needs to be regenerated for the above new tasks and thread counts
    case_setup(self._case, test_mode=True, reset=True)

def _case_one_custom_postrun_action(self):
rundir1 = self._case1.get_value("RUNDIR")
rundir2 = self._case2.get_value("RUNDIR")
case = self._case1.get_value("CASE")
datenames = _get_datenames(self._case1)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_get_datenames should have the leading underscore removed since it is no longer private.

for file_ in glob.iglob(os.path.join(rundir1,"*")):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This copies the rpointer files and links the restart and hist restart files from the restart time to the case2 run directory.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for coming up with this solution that avoids running the full short-term archiver.

Could the bulk of this function be put in some shared location so that it can be reused by other tests if needed? This could go in scripts/lib/CIME/SystemTests/test_utils/.

logger.info("File is {}".format(file_))
if os.path.basename(file_).startswith("rpointer"):
logger.info("Copy {} to {}".format(file_, rundir2))
shutil.copy(file_, rundir2)
elif os.path.basename(file_).startswith(case) and datenames[0] in file_:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this guaranteed to pick up everything that's needed? There's a lot of complexity in _archive_restarts that isn't captured here. For the most part, I don't see what this might be missing, except that it seems not to capture any unfinished history files that the short-term archiver would capture with get_histfiles_for_restarts.

(This is why I was hoping we could still reuse pieces of the case_st_archive code....)

file_case2 = os.path.join(rundir2, os.path.basename(file_))
if not os.path.isfile(file_case2):
logger.info("Link {} to {}".format(file_, rundir2))
os.symlink(file_, file_case2)