Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite ERP with SystemTestsCompareTwo #1855

Merged
merged 14 commits into from
Aug 31, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions config/cesm/config_archive.xml
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,11 @@
<hist_file_extension>\.h.*.nc$|\.d[dovt]\.</hist_file_extension>
<rest_history_varname>unset</rest_history_varname>
<rpointer>
<rpointer_file>rpointer.ocn.restart$NINST_STRING</rpointer_file>
<rpointer_file>rpointer.ocn$NINST_STRING.restart</rpointer_file>
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changes in this file are unrelated fixes taken from @jedwards4b's #1845

<rpointer_content>./$CASE.pop$NINST_STRING.r.$DATENAME.nc,RESTART_FMT=nc</rpointer_content>
</rpointer>
<rpointer>
<rpointer_file>rpointer.ocn.ovf$NINST_STRING</rpointer_file>
<rpointer_file>rpointer.ocn$NINST_STRING.ovf</rpointer_file>
<rpointer_content>./$CASE.pop$NINST_STRING.ro.$DATENAME</rpointer_content>
</rpointer>
</comp_archive_spec>
Expand Down
4 changes: 4 additions & 0 deletions config/config_tests.xml
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,10 @@ NODEFAIL Tests restart upon detected node failure. Generates fake failu
<STOP_OPTION>ndays</STOP_OPTION>
<STOP_N>11</STOP_N>
<DOUT_S>FALSE</DOUT_S>
<REST_N>$STOP_N / 2 + 1 </REST_N>
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changes in this file match @jedwards4b's #1845

<REST_OPTION>$STOP_OPTION</REST_OPTION>
<HIST_OPTION>$STOP_OPTION</HIST_OPTION>
<HIST_N>$STOP_N</HIST_N>
</test>

<test NAME="ERS">
Expand Down
158 changes: 42 additions & 116 deletions scripts/lib/CIME/SystemTests/erp.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
CIME ERP test. This class inherits from SystemTestsCommon
CIME ERP test. This class inherits from SystemTestsCompareTwo

This is a pes counts hybrid (open-MP/MPI) restart bfb test from
startup. This is just like an ERS test but the pe-counts/threading
Expand All @@ -8,132 +8,58 @@
(2) Do a restart test with half the number of tasks and threads (suffix rest)
"""

import shutil
from CIME.XML.standard_module_setup import *
from CIME.case_setup import case_setup
import CIME.utils
from CIME.SystemTests.system_tests_common import SystemTestsCommon
from CIME.SystemTests.system_tests_compare_two import SystemTestsCompareTwo
from CIME.check_lockedfiles import *

logger = logging.getLogger(__name__)

class ERP(SystemTestsCommon):
class ERP(SystemTestsCompareTwo):

def __init__(self, case):
"""
initialize a test object
"""
SystemTestsCommon.__init__(self, case)
SystemTestsCompareTwo.__init__(self, case,
separate_builds = True,
run_two_suffix = 'rest',
run_one_description = 'initial',
run_two_description = 'restart')

def build_phase(self, sharedlib_only=False, model_only=False):
"""
Build two cases. Case one uses defaults, case2 uses half the number of threads
and tasks. This test will fail for components (e.g. pop) that do not reproduce exactly
with different numbers of mpi tasks.
"""
def _common_setup(self):
self._case.set_value("BUILD_THREADED",True)
if sharedlib_only:
return self.build_indv(sharedlib_only=sharedlib_only, model_only=model_only)

exeroot = self._case.get_value("EXEROOT")
cime_model = CIME.utils.get_model()

# Make backup copies of the ORIGINAL env_mach_pes.xml and
# env_build.xml in LockedFiles if they are not there. If there
# are already copies there then simply copy them back to
# have the starting env_mach_pes.xml and env_build.xml
machpes1 = "env_mach_pes.ERP1.xml"
envbuild1 = "env_build.ERP1.xml"
if is_locked(machpes1):
restore(machpes1, newname="env_mach_pes.xml")
else:
lock_file("env_mach_pes.xml", newname=machpes1)

if is_locked(envbuild1):
restore(envbuild1, newname="env_build.xml")

# Build two executables, one using the original tasks and threads (ERP1) and
# one using the modified tasks and threads (ERP2)
# The reason we currently need two executables that CESM-CICE has a compile time decomposition
# For cases where ERP works, changing this decomposition will not affect answers, but it will
# affect the executable that is used
for bld in range(1,3):
logging.warn("Starting bld {}".format(bld))

if (bld == 2):
# halve the number of tasks and threads
for comp in self._case.get_values("COMP_CLASSES"):
ntasks = self._case.get_value("NTASKS_{}".format(comp))
nthreads = self._case.get_value("NTHRDS_{}".format(comp))
rootpe = self._case.get_value("ROOTPE_{}".format(comp))
if ( nthreads > 1 ):
self._case.set_value("NTHRDS_{}".format(comp), nthreads/2)
if ( ntasks > 1 ):
self._case.set_value("NTASKS_{}".format(comp), ntasks/2)
self._case.set_value("ROOTPE_{}".format(comp), rootpe/2)

# Note, some components, like CESM-CICE, have
# decomposition information in env_build.xml
# case_setup(self._case, test_mode=True, reset=True)that
# needs to be regenerated for the above new tasks and thread counts
case_setup(self._case, test_mode=True, reset=True)

# Now rebuild the system, given updated information in env_build.xml
self.build_indv(sharedlib_only=sharedlib_only, model_only=model_only)
shutil.move("{}/{}.exe".format(exeroot,cime_model),
"{}/{}.ERP{}.exe".format(exeroot,cime_model,bld))

# Make copies of the new env_mach_pes.xml and the new
# env_build.xml to be used in the run phase
lock_file("env_mach_pes.xml", newname="env_mach_pes.ERP{}.xml".format(bld))
lock_file("env_build.xml", newname="env_build.ERP{}.xml".format(bld))

def run_phase(self):
# run will have values 1,2
for run in range(1,3):

expect(is_locked("env_mach_pes.ERP{:d}.xml".format(run)),
"ERROR: LockedFiles/env_mach_pes.ERP{:d}.xml does not exist, run case.build".format(run ))

# Use the second env_mach_pes.xml and env_build.xml files
restore("env_mach_pes.ERP{:d}.xml".format(run), newname="env_mach_pes.xml")
restore("env_build.ERP{:d}.xml".format(run), newname="env_build.xml")

# update the case to use the new values
self._case.read_xml()

# Use the second executable that was created
exeroot = self._case.get_value("EXEROOT")
cime_model = CIME.utils.get_model()
exefile = os.path.join(exeroot,"{}.exe".format(cime_model))
exefile2 = os.path.join(exeroot,"{}.ERP{:d}.exe".format(cime_model,run))
if (os.path.isfile(exefile)):
os.remove(exefile)
shutil.copy(exefile2, exefile)

stop_n = self._case.get_value("STOP_N")
stop_option = self._case.get_value("STOP_OPTION")

if run == 1:
expect(stop_n > 2, "ERROR: stop_n value {:d} too short".format(stop_n))
rest_n = stop_n/2 + 1
self._case.set_value("REST_N", rest_n)
self._case.set_value("REST_OPTION", stop_option)
self._case.set_value("HIST_N", stop_n)
self._case.set_value("HIST_OPTION", stop_option)
self._case.set_value("CONTINUE_RUN", False)
suffix = "base"
else:
rest_n = stop_n/2 + 1
stop_new = stop_n - rest_n
expect(stop_new > 0, "ERROR: stop_n value {:d} too short {:d} {:d}".format(stop_new,stop_n,rest_n))
self._case.set_value("STOP_N", stop_new)
self._case.set_value("CONTINUE_RUN", True)
self._case.set_value("REST_OPTION","never")
suffix = "rest"

case_setup(self._case, test_mode=True, reset=True)

self.run_indv(suffix=suffix)

self._component_compare_test("base", "rest")
def _case_one_setup(self):
    """
    Sanity-check the run length configured for case one.

    Reads STOP_N from the case and hands the condition ``STOP_N > 2``,
    together with an explanatory message, to ``expect``.
    """
    run_length = self._case.get_value("STOP_N")
    long_enough = run_length > 2
    expect(long_enough,
           "ERROR: stop_n value {:d} too short".format(run_length))

def _case_two_setup(self):
    """
    Configure case two as the restart leg of the test.

    For every entry of COMP_CLASSES, case one's thread count, task count
    and root PE are read; the thread count is halved when it is greater
    than one, and the task count and root PE are halved when the task
    count is greater than one.  The halved values are written to this
    case.

    The run length is then set to case one's ``STOP_N - REST_N`` (checked
    to be positive via ``expect``), HIST_N is set to case one's STOP_N,
    CONTINUE_RUN is set to True and REST_OPTION to "never".  Finally
    ``case_setup`` is re-run with ``reset=True``.
    """
    # halve the number of tasks and threads
    for comp in self._case.get_values("COMP_CLASSES"):
        ntasks_var = "NTASKS_{}".format(comp)
        nthrds_var = "NTHRDS_{}".format(comp)
        rootpe_var = "ROOTPE_{}".format(comp)

        ntasks = self._case1.get_value(ntasks_var)
        nthreads = self._case1.get_value(nthrds_var)
        rootpe = self._case1.get_value(rootpe_var)

        # Floor division keeps the halved values integral: on Python 3
        # "/" between two ints produces a float, which is not a valid
        # task/thread/root-PE count.  On Python 2 the result is the same
        # as the previous "/" for integer operands.
        if nthreads > 1:
            self._case.set_value(nthrds_var, nthreads // 2)
        if ntasks > 1:
            self._case.set_value(ntasks_var, ntasks // 2)
            self._case.set_value(rootpe_var, rootpe // 2)

    stop_n = self._case1.get_value("STOP_N")
    rest_n = self._case1.get_value("REST_N")
    # Case two only runs the part of the period after case one's restart.
    stop_new = stop_n - rest_n
    expect(stop_new > 0, "ERROR: stop_n value {:d} too short {:d} {:d}".format(stop_new,stop_n,rest_n))
    self._case.set_value("STOP_N", stop_new)
    self._case.set_value("HIST_N", stop_n)
    self._case.set_value("CONTINUE_RUN", True)
    self._case.set_value("REST_OPTION","never")

    # Note, some components, like CESM-CICE, have
    # decomposition information in env_build.xml that
    # needs to be regenerated for the above new tasks and thread counts
    case_setup(self._case, test_mode=True, reset=True)

def _case_one_custom_postrun_action(self):
self.copy_case1_restarts_to_case2()
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changes in this file basically match @jedwards4b's #1845, except for this line

15 changes: 15 additions & 0 deletions scripts/lib/CIME/SystemTests/system_tests_compare_two.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from CIME.SystemTests.system_tests_common import SystemTestsCommon
from CIME.case import Case
from CIME.case_submit import check_case
from CIME.case_st_archive import archive_last_restarts

import shutil, os, glob

Expand Down Expand Up @@ -235,6 +236,20 @@ def run_phase(self, success_change=False): # pylint: disable=arguments-differ

self._component_compare_test(self._run_one_suffix, self._run_two_suffix, success_change=success_change)

def copy_case1_restarts_to_case2(self):
    """
    Place case1's restart files and their companions (necessary
    history files, rpointer files) in case2's run directory, as
    copies or symlinks.

    Nothing calls this automatically; a test whose case2 performs a
    continue_run from case1's restart files should invoke it
    explicitly.
    """
    # The destination is case2's RUNDIR; link_to_restart_files=True is
    # passed through to archive_last_restarts.
    archive_last_restarts(case=self._case1,
                          archive_restdir=self._case2.get_value("RUNDIR"),
                          link_to_restart_files=True)

# ========================================================================
# Private methods
# ========================================================================
Expand Down
11 changes: 8 additions & 3 deletions scripts/lib/CIME/XML/env_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,19 @@ def __init__(self, case_root=None, infile="env_archive.xml"):
def get_entries(self):
    """Return the result of looking up all 'comp_archive_spec' nodes."""
    archive_specs = self.get_nodes('comp_archive_spec')
    return archive_specs

def get_entry(self, compname):
    """
    Look up the 'comp_archive_spec' node whose ``compname`` attribute
    equals ``compname``.

    The lookup goes through ``get_optional_node``; whatever that call
    returns is returned unchanged.
    """
    wanted = {"compname": compname}
    return self.get_optional_node('comp_archive_spec', attributes=wanted)

def get_entry_info(self, archive_entry):
compname = archive_entry.attrib['compname']
compclass = archive_entry.attrib['compclass']
compname = archive_entry.get('compname')
compclass = archive_entry.get('compclass')
return compname,compclass

def get_entry_value(self, name, archive_entry):
node = self.get_optional_node(name, root=archive_entry)
return node.text
if node is not None:
return node.text
return None

def get_rest_file_extensions(self, archive_entry):
file_extensions = []
Expand Down
Loading