diff --git a/config/cesm/machines/config_batch.xml b/config/cesm/machines/config_batch.xml
index 687ef843740..adfe9da9095 100644
--- a/config/cesm/machines/config_batch.xml
+++ b/config/cesm/machines/config_batch.xml
@@ -10,7 +10,7 @@
batch_redirect: Whether a redirect character is needed to submit jobs.
batch_directive: The string that prepends a batch directive for the batch system.
jobid_pattern: A perl regular expression used to filter out the returned job id from a
- queue submission.
+ queue submission.
===============================================================
batch_system
@@ -243,7 +243,12 @@
-S {{ shell }}
- -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}:mem=700GB:ngpus={{ ngpus_per_node }}
+ -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}
+
+
+
+ -S {{ shell }}
+ -l select={{ num_nodes }}:ncpus={{ max_tasks_per_node }}:mpiprocs={{ tasks_per_node }}:ompthreads={{ thread_count }}
@@ -423,7 +428,7 @@
-
ssh login1 cd $CASEROOT ; sbatch
@@ -468,8 +473,8 @@
+ We achieve this by setting both queues as the default queue but limiting it
+ to jobs that use 8 tasks or fewer; other jobs will land in sib2.9 -->
sib2.9
sib2.9,sky2.4
@@ -482,8 +487,8 @@
+ We achieve this by setting both queues as the default queue but limiting it
+ to jobs that use 8 tasks or fewer; other jobs will land in sky2.4 -->
sky2.4
sib2.9,sky2.4
@@ -721,7 +726,7 @@
regular
-
+
-env
diff --git a/config/cesm/machines/config_compilers.xml b/config/cesm/machines/config_compilers.xml
index 659b14c6e36..429c8042191 100644
--- a/config/cesm/machines/config_compilers.xml
+++ b/config/cesm/machines/config_compilers.xml
@@ -826,6 +826,10 @@ using a fortran linker.
$ENV{NETCDF}
gpfs
$ENV{PNETCDF}
+
+
+ -DHAVE_NANOTIME -DBIT64 -DHAVE_VPRINTF -DHAVE_BACKTRACE -DHAVE_SLASHPROC -DHAVE_COMM_F2C -DHAVE_TIMES -DHAVE_GETTIMEOFDAY
+
@@ -902,11 +906,15 @@ using a fortran linker.
- -qopt-report -xCORE_AVX2 -no-fma
+ -qopt-report -xCORE_AVX2 -no-fma
-qopt-report -xCORE_AVX2 -no-fma
+
+
+ -Wl,-rpath /glade/u/apps/dav/opt/gnu/9.1.0/lib64
+
-DPIO_ENABLE_LOGGING=ON
@@ -933,7 +941,7 @@ using a fortran linker.
- -fallow-argument-mismatch -fallow-invalid-boz
+
-ldl
@@ -954,8 +962,14 @@ using a fortran linker.
$ENV{CESMDATAROOT}/tools/pFUnit/pFUnit3.2.8_cheyenne_Intel17.0.1_MPI_openMP
TRUE
+
+
+ -std=c11 -gxx-name=/glade/u/apps/ch/opt/gnu/9.1.0/bin/g++ -Wl,-rpath,/glade/u/apps/ch/opt/gnu/9.1.0/lib64 -L/glade/u/apps/ch/opt/gnu/9.1.0/lib64 -lgcc
+
+
+
-llapack -lblas
diff --git a/config/cesm/machines/config_machines.xml b/config/cesm/machines/config_machines.xml
index 7d9b619fa70..21649dfec1c 100644
--- a/config/cesm/machines/config_machines.xml
+++ b/config/cesm/machines/config_machines.xml
@@ -358,7 +358,7 @@ This allows using a different mpirun command to launch unit tests
NCAR GPU platform, os is Linux, 36 pes/node, batch system is pbs
casper*
LINUX
- pgi,intel,nvhpc,pgi-gpu,nvhpc-gpu
+ pgi,intel,nvhpc,pgi-gpu,nvhpc-gpu,gnu
openmpi
/glade/scratch/$USER
$ENV{CESMDATAROOT}/inputdata
@@ -397,6 +397,8 @@ This allows using a different mpirun command to launch unit tests
ncarenv/1.3
cmake/3.18.2
+ python
+ /glade/p/cesmdata/cseg/PROGS/modulefiles/CrayLabs
pgi/20.4
@@ -410,9 +412,13 @@ This allows using a different mpirun command to launch unit tests
nvhpc/20.9
+
+ gnu/9.1.0
+
intel/19.1.1
mkl/2020.0.1
+ /glade/p/cesmdata/cseg/PROGS/modulefiles/CrayLabs
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/
@@ -422,23 +428,34 @@ This allows using a different mpirun command to launch unit tests
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/
esmf-8.2.0b11_casper-ncdfio-openmpi-O
-
+
+ /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/11.1.0/
+ esmf-8.2.0b12_casper-ncdfio-openmpi-g
+
+
+ /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/11.1.0/
+ esmf-8.2.0b12_casper-ncdfio-openmpi-O
+
+
+ /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/
+ esmf-8.2.0b13_casper-ncdfio-openmpi-g
+
+
+ /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/
+ esmf-8.2.0b13_casper-ncdfio-openmpi-O
+
+
openmpi/4.1.0
netcdf-mpi/4.8.0
pnetcdf/1.12.2
-
+
netcdf/4.8.0
- openmpi/4.1.0
- netcdf-mpi/4.7.4
- pnetcdf/1.12.2
+ netcdf-mpi/4.7.4
cuda/11.0.3
-
- netcdf/4.7.4
-
openmpi/4.1.0
netcdf-mpi/4.7.4
@@ -460,6 +477,10 @@ This allows using a different mpirun command to launch unit tests
openmpi/4.1.0
netcdf-mpi/4.7.4
pnetcdf/1.12.2
+ SmartRedis/0.1.1
+
+
+ SmartRedis/0.1.1
netcdf/4.7.4
@@ -475,6 +496,9 @@ This allows using a different mpirun command to launch unit tests
/glade/p/cesmdata/cseg
$ENV{NETCDF}
+
+ /glade/u/apps/dav/opt/gnu/9.1.0/bin/:$ENV{PATH}
+
-1
@@ -561,8 +585,9 @@ This allows using a different mpirun command to launch unit tests
ncarenv/1.3
- python/3.7.9
+ conda/latest
cmake
+ /glade/p/cesmdata/cseg/PROGS/modulefiles/CrayLabs
intel/19.1.1
@@ -570,76 +595,84 @@ This allows using a different mpirun command to launch unit tests
mkl
- gnu/10.1.0
- openblas/0.3.9
+ gnu/9.1.0
+ openblas/0.3.6
pgi/20.4
-
+
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/
- esmf-8.2.0b13-ncdfio-mpt-g
+ esmf-8.2.0b23-ncdfio-mpt-g
-
+
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/
- esmf-8.2.0b13-ncdfio-mpt-O
+ esmf-8.2.0b23-ncdfio-mpt-O
+
+
+ /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/
+ esmf-8.2.0b23-ncdfio-openmpi-g
+
+
+ /glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/
+ esmf-8.2.0b14-ncdfio-openmpi-O
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/
- esmf-8.2.0b13-ncdfio-mpiuni-g
+ esmf-8.2.0b23-ncdfio-mpiuni-g
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/intel/19.1.1/
- esmf-8.2.0b13-ncdfio-mpiuni-O
+ esmf-8.2.0b23-ncdfio-mpiuni-O
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/10.1.0/
- esmf-8.2.0b13-ncdfio-mpt-g
+ esmf-8.2.0b23-ncdfio-mpt-g
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/10.1.0/
- esmf-8.2.0b13-ncdfio-mpt-O
+ esmf-8.2.0b23-ncdfio-mpt-O
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/10.1.0/
- esmf-8.2.0b13-ncdfio-openmpi-g
+ esmf-8.2.0b23-ncdfio-openmpi-g
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/10.1.0/
- esmf-8.2.0b13-ncdfio-openmpi-O
+ esmf-8.2.0b23-ncdfio-openmpi-O
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/10.1.0/
- esmf-8.2.0b13-ncdfio-mpiuni-g
+ esmf-8.2.0b23-ncdfio-mpiuni-g
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/gnu/10.1.0/
- esmf-8.2.0b13-ncdfio-mpiuni-O
+ esmf-8.2.0b23-ncdfio-mpiuni-O
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/
- esmf-8.2.0b13-ncdfio-mpt-g
+ esmf-8.2.0b23-ncdfio-mpt-g
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/
- esmf-8.2.0b13-ncdfio-mpt-O
+ esmf-8.2.0b23-ncdfio-mpt-O
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/
- esmf-8.2.0b13-ncdfio-openmpi-g
+ esmf-8.2.0b23-ncdfio-openmpi-g
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/
- esmf-8.2.0b13-ncdfio-openmpi-O
+ esmf-8.2.0b23-ncdfio-openmpi-O
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/
- esmf-8.2.0b13-ncdfio-mpiuni-g
+ esmf-8.2.0b23-ncdfio-mpiuni-g
/glade/p/cesmdata/cseg/PROGS/modulefiles/esmfpkgs/pgi/20.4/
- esmf-8.2.0b13-ncdfio-mpiuni-O
+ esmf-8.2.0b23-ncdfio-mpiuni-O
mpt/2.21
@@ -661,8 +694,15 @@ This allows using a different mpirun command to launch unit tests
netcdf-mpi/4.7.4
- openmpi/3.1.4
- netcdf/4.7.1
+ openmpi/4.0.5
+ netcdf-mpi/4.7.4
+ SmartRedis/0.2.0
+
+
+ openmpi/4.1.0
+ netcdf-mpi/4.8.0
+ pnetcdf/1.12.2
+ SmartRedis/0.2.0
ncarcompilers/0.5.0
@@ -693,6 +733,9 @@ This allows using a different mpirun command to launch unit tests
PASSIVE
true
+
+ /glade/u/apps/ch/opt/gnu/9.1.0/lib64:$ENV{LD_LIBRARY_PATH}
+
false
@@ -1999,52 +2042,52 @@ This allows using a different mpirun command to launch unit tests
/fs/cgd/data0/modules/modulefiles/esmfpkgs/gfortran/9.3.0
- esmf-8.2.0b13-ncdfio-mvapich2-O
+ esmf-8.2.0b23-ncdfio-mvapich2-O
/fs/cgd/data0/modules/modulefiles/esmfpkgs/gfortran/9.3.0
- esmf-8.2.0b13-ncdfio-mvapich2-g
+ esmf-8.2.0b23-ncdfio-mvapich2-g
/fs/cgd/data0/modules/modulefiles/esmfpkgs/gfortran/9.3.0
- esmf-8.2.0b13-ncdfio-mpiuni-O
+ esmf-8.2.0b23-ncdfio-mpiuni-O
/fs/cgd/data0/modules/modulefiles/esmfpkgs/gfortran/9.3.0
- esmf-8.2.0b13-ncdfio-mpiuni-g
+ esmf-8.2.0b23-ncdfio-mpiuni-g
/fs/cgd/data0/modules/modulefiles/esmfpkgs/nag/6.2
- esmf-8.2.0b13-ncdfio-mvapich2-O
+ esmf-8.2.0b23-ncdfio-mvapich2-O
/fs/cgd/data0/modules/modulefiles/esmfpkgs/nag/6.2
- esmf-8.2.0b13-ncdfio-mvapich2-g
+ esmf-8.2.0b23-ncdfio-mvapich2-g
/fs/cgd/data0/modules/modulefiles/esmfpkgs/nag/6.2
- esmf-8.2.0b13-ncdfio-mpiuni-g
+ esmf-8.2.0b23-ncdfio-mpiuni-g
/fs/cgd/data0/modules/modulefiles/esmfpkgs/nag/6.2
- esmf-8.2.0b13-ncdfio-mpiuni-O
+ esmf-8.2.0b23-ncdfio-mpiuni-O
/fs/cgd/data0/modules/modulefiles/esmfpkgs/intel/20.0.1
- esmf-8.2.0b13-ncdfio-mpiuni-g
+ esmf-8.2.0b23-ncdfio-mpiuni-g
/fs/cgd/data0/modules/modulefiles/esmfpkgs/intel/20.0.1
- esmf-8.2.0b13-ncdfio-mpiuni-O
+ esmf-8.2.0b23-ncdfio-mpiuni-O
/fs/cgd/data0/modules/modulefiles/esmfpkgs/intel/20.0.1
- esmf-8.2.0b13-ncdfio-mvapich2-g
+ esmf-8.2.0b23-ncdfio-mvapich2-g
/fs/cgd/data0/modules/modulefiles/esmfpkgs/intel/20.0.1
- esmf-8.2.0b13-ncdfio-mvapich2-O
+ esmf-8.2.0b23-ncdfio-mvapich2-O
/fs/cgd/data0/modules/modulefiles/esmfpkgs/pgi/20.1
diff --git a/scripts/Tools/Makefile b/scripts/Tools/Makefile
index 2e699eec736..0cce88ae6e5 100644
--- a/scripts/Tools/Makefile
+++ b/scripts/Tools/Makefile
@@ -77,6 +77,10 @@ CPPDEFS := $(USER_CPPDEFS) -D$(OS)
include $(CASEROOT)/Macros.make
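+# When USE_SMARTSIM is TRUE, link the CrayLabs SmartRedis client library and
+# embed its location in the runtime search path.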
+ifeq ($(strip $(USE_SMARTSIM)), TRUE)
+ SLIBS += -L$(SMARTREDIS_LIB) -lsmartredis -Wl,-rpath $(SMARTREDIS_LIB)
+endif
+
# Unless DEBUG mode is enabled, use NDEBUG to turn off assert statements.
ifeq ($(strip $(DEBUG)),TRUE)
# e3sm still has components that cannot build with -DDEBUG
@@ -360,11 +364,11 @@ endif
#===============================================================================
ifeq ($(strip $(MPILIB)), mpi-serial)
- CC := $(SCC)
- FC := $(SFC)
- CXX := $(SCXX)
- MPIFC := $(SFC)
- MPICC := $(SCC)
+ CC := $(SCC)
+ FC := $(SFC)
+ CXX := $(SCXX)
+ MPIFC := $(SFC)
+ MPICC := $(SCC)
MPICXX := $(SCXX)
CONFIG_ARGS += MCT_PATH=$(SHAREDLIBROOT)/$(SHAREDPATH)/mct/mpi-serial
else
diff --git a/scripts/Tools/case.submit b/scripts/Tools/case.submit
index 49979d86237..68424ff14ec 100755
--- a/scripts/Tools/case.submit
+++ b/scripts/Tools/case.submit
@@ -118,7 +118,6 @@ def _main_func(description, test_args=False):
config.write(fd)
elif os.path.exists(config_file):
os.remove(config_file)
-
if not test_args:
with Case(caseroot, read_only=False, record=True) as case:
case.submit(job=job, no_batch=no_batch, prereq=prereq, allow_fail=allow_fail,
diff --git a/scripts/lib/CIME/buildlib.py b/scripts/lib/CIME/buildlib.py
index 9b75726e94c..2fdeb97517c 100644
--- a/scripts/lib/CIME/buildlib.py
+++ b/scripts/lib/CIME/buildlib.py
@@ -99,6 +99,11 @@ def run_gmake(case, compclass, compname, libroot, bldroot, libname="", user_cppd
if user_cppdefs:
cmd = cmd + "USER_CPPDEFS='{}'".format(user_cppdefs )
+ use_smartsim = case.get_value("USE_SMARTSIM")
+ if use_smartsim:
+ cmd = cmd + " USE_SMARTSIM=TRUE "
+ print("Building with smartredis library")
+
stat, out, err = run_cmd(cmd, combine_output=True)
print(out)
if stat:
diff --git a/scripts/lib/CIME/case/case_run.py b/scripts/lib/CIME/case/case_run.py
index 1ecd1715c92..3a140b770d7 100644
--- a/scripts/lib/CIME/case/case_run.py
+++ b/scripts/lib/CIME/case/case_run.py
@@ -199,6 +199,11 @@ def _run_model(case, lid, skip_pnl=False, da_cycle=0):
jobid = batch_jobid()
msg_func = lambda *args: jobid if jobid is not None else ""
+ if case.get_value("USE_SMARTSIM"):
+ logger.info("Give the SmartSim DB time to launch")
+ time.sleep(10)
+ logger.info("launching job")
+
return run_and_log_case_status(functor, "case.run",
custom_starting_msg_functor=msg_func,
custom_success_msg_functor=msg_func,
diff --git a/scripts/lib/CIME/test_scheduler.py b/scripts/lib/CIME/test_scheduler.py
index 665745f2c45..b2b209a658a 100644
--- a/scripts/lib/CIME/test_scheduler.py
+++ b/scripts/lib/CIME/test_scheduler.py
@@ -793,25 +793,30 @@ def _run_phase(self, test):
###########################################################################
test_dir = self._get_test_dir(test)
- case_opts = parse_test_name(test)[1]
+ _,case_opts,_,_,_,_,testmods = parse_test_name(test)
if case_opts is not None and "B" in case_opts: # pylint: disable=unsupported-membership-test
self._log_output(test, "{} SKIPPED for test '{}'".format(RUN_PHASE, test))
self._update_test_status_file(test, SUBMIT_PHASE, TEST_PASS_STATUS)
self._update_test_status_file(test, RUN_PHASE, TEST_PASS_STATUS)
- return True, "SKIPPED"
+ return True, "SKIPPED"
else:
- cmd = "./case.submit"
- if not self._allow_pnl:
- cmd += " --skip-preview-namelist"
- if self._no_batch:
- cmd += " --no-batch"
- if self._mail_user:
- cmd += " --mail-user={}".format(self._mail_user)
- if self._mail_type:
- cmd += " -M={}".format(",".join(self._mail_type))
- if self._chksum:
- cmd += " --chksum"
+ # SmartSim test needs a wrapper to submit
+ if testmods and "drv/smartsim" in testmods:
+ self._log_output(test, "Running smartsim launch script")
+ cmd = os.path.join(self._cime_root, "tools", "smartsim", "launch.py")
+ else:
+ cmd = "./case.submit"
+ if not self._allow_pnl:
+ cmd += " --skip-preview-namelist"
+ if self._no_batch:
+ cmd += " --no-batch"
+ if self._mail_user:
+ cmd += " --mail-user={}".format(self._mail_user)
+ if self._mail_type:
+ cmd += " -M={}".format(",".join(self._mail_type))
+ if self._chksum:
+ cmd += " --chksum"
return self._shell_cmd_for_phase(test, cmd, RUN_PHASE, from_dir=test_dir)
diff --git a/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90 b/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90
index f8480b58683..65fc4282fb0 100644
--- a/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90
+++ b/src/components/xcpl_comps_nuopc/xatm/src/atm_comp_nuopc.F90
@@ -356,7 +356,11 @@ end subroutine InitializeRealize
!===============================================================================
subroutine ModelAdvance(gcomp, rc)
-
+ !
+ ! Test for interface to the CrayLabs smartredis client
+ ! see cime/tools/smartsim/README.md for details
+ !
+ use nuopc_shr_methods, only : use_smartredis
! input/output variables
type(ESMF_GridComp) :: gcomp
integer, intent(out) :: rc
@@ -396,6 +400,9 @@ subroutine ModelAdvance(gcomp, rc)
!--------------------------------
! diagnostics
!--------------------------------
+ if(use_smartredis) then
+ call srtest()
+ endif
if (dbug > 1) then
call state_diagnose(exportState,subname//':ES',rc=rc)
@@ -527,4 +534,50 @@ subroutine ModelFinalize(gcomp, rc)
end if
end subroutine ModelFinalize
+ !
+ ! Test for interface to the CrayLabs smartredis client
+ ! see cime/tools/smartsim/README.md for details
+ ! this simply writes a tensor then reads it back and compares the two
+ ! if everything is working they will be the same
+ !
+ subroutine srtest()
+ use nuopc_shr_methods, only : sr_client
+ use iso_c_binding, only : c_double
+ use ESMF, only : ESMF_VMGetCurrent, ESMF_VMGet
+
+
+ type(ESMF_VM) :: vm
+ integer :: mytask
+ integer, parameter :: dim1 = 10
+ integer, parameter :: dim2 = 20
+ integer, parameter :: dim3 = 30
+ character(len=9) :: key_prefix
+ real(r8), dimension(dim1, dim2, dim3) :: recv_array_real_64
+ real(kind=c_double), dimension(dim1, dim2, dim3) :: true_array_real_64
+ integer :: rc
+
+
+ call ESMF_VMGetCurrent(vm, rc=rc)
+ if (chkerr(rc,__LINE__,u_FILE_u)) return
+ call ESMF_VMGet(vm, localPet=mytask, rc=rc)
+ if (chkerr(rc,__LINE__,u_FILE_u)) return
+
+ if (sr_client%isinitialized()) then
+ call random_number(true_array_real_64)
+ call random_number(recv_array_real_64)
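+ ! prefix the tensor key with the PE rank so each task writes and reads back only its own tensor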
+ write(key_prefix, "(A,I6.6)") "pe_",mytask
+ if(mastertask) write(logunit, *) 'putting tensor to smartsim database '
+ call sr_client%put_tensor(key_prefix//"true_array_real_64", true_array_real_64, &
+ shape(true_array_real_64))
+ if(mastertask) write(logunit, *) 'retrieving tensor from database '
+ call sr_client%unpack_tensor(key_prefix//"true_array_real_64", recv_array_real_64,&
+ shape(recv_array_real_64))
+ if (.not. all(true_array_real_64 == recv_array_real_64)) then
+ call shr_sys_abort('true_array_real_64: FAILED')
+ endif
+ else
+ call shr_sys_abort('Could not detect smartsim database')
+ endif
+
+ end subroutine srtest
end module atm_comp_nuopc
diff --git a/tools/MakeRedsky.sh b/tools/MakeRedsky.sh
deleted file mode 100755
index 663959de26a..00000000000
--- a/tools/MakeRedsky.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/csh
-
-source /usr/share/Modules/init/csh
-module purge
-module load intel/13.0
-module load openmpi-intel/1.6
-
-setenv NETCDFROOT /projects/ccsm/yellowstone/netcdf-4.3.2-intel-13.0-openmpi-1.6
-setenv PATH $NETCDFROOT/bin:$PATH
-setenv LD_LIBRARY_PATH $NETCDFROOT/lib:$LD_LIBRARY_PATH
-setenv NETCDF_INCLUDES $NETCDFROOT/include
-setenv NETCDF_LIBS $NETCDFROOT/lib
-
-setenv PNETCDFROOT $NETCDFROOT
-
-setenv USER_FC ifort
-
-gmake LIB_NETCDF=$NETCDF_LIBS INC_NETCDF=$NETCDF_INCLUDES NETCDF=$NETCDFROOT LDFLAGS="-L$NETCDF_LIBS -lnetcdff -lnetcdf"#! /bin/csh -f
diff --git a/tools/smartsim/README.md b/tools/smartsim/README.md
new file mode 100644
index 00000000000..7bde1f6ae8c
--- /dev/null
+++ b/tools/smartsim/README.md
@@ -0,0 +1,19 @@
+Running CESM with the CrayLabs SmartSim tool (https://github.com/CrayLabs/SmartSim).
+
+The tools provided here require PBS version 2021 or newer.
+PBS must support the create_resv_from_job option, and the user must be
+authorized to use that option.
+
+These tools let you submit multiple jobs to the queue and
+ensure that all of the jobs start and run concurrently.
+
+If --ngpus-per-node > 0, the SmartSim database is submitted
+to GPU nodes while the CESM case(s) run on CPU nodes.
+
+See also:
+https://github.com/ESCOMP/CESM/wiki/Using-HPE-SmartSim-with-CESM
+
+The SmartRedis interface (https://github.com/CrayLabs/SmartRedis) is built in share
+and is available to all components. If USE_SMARTSIM is FALSE, a stub library is built
+instead so that no cpp ifdefs are required.
+
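+A typical invocation of the launcher looks like the following (the CESM_ROOT path,
+node counts, and case directories are illustrative; run launch.py --help for the
+full option list):
+
+    export CESM_ROOT=/path/to/cesm          # checkout that contains cime/
+    cd $CESM_ROOT/cime/tools/smartsim
+    ./launch.py --db-nodes 3 --ngpus-per-node 1 --walltime 01:00:00 \
+        --caseroots /glade/scratch/$USER/case1 /glade/scratch/$USER/case2
+
+launch.py fills in the resv_job and launch_database_cluster templates, then submits
+the reservation job with qsub; pass --dryrun to generate the scripts without submitting.
+The cases named in --caseroots must already be built.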
diff --git a/tools/smartsim/launch.py b/tools/smartsim/launch.py
new file mode 100755
index 00000000000..566fd539fdd
--- /dev/null
+++ b/tools/smartsim/launch.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+import os, sys
+
+CESM_ROOT = os.getenv("CESM_ROOT")
+if CESM_ROOT is None:
+ raise SystemExit("ERROR: CESM_ROOT must be defined in the environment")
+
+_LIBDIR = os.path.join(CESM_ROOT,"cime","scripts","Tools")
+if not os.path.isdir(_LIBDIR):
+ raise SystemExit("ERROR: expected directory {} not found under CESM_ROOT".format(_LIBDIR))
+sys.path.append(_LIBDIR)
+_LIBDIR = os.path.join(CESM_ROOT,"cime","scripts","lib")
+if not os.path.isdir(_LIBDIR):
+ raise SystemExit("ERROR: expected directory {} not found under CESM_ROOT".format(_LIBDIR))
+sys.path.append(_LIBDIR)
+
+
+import argparse, subprocess
+from string import Template
+from CIME.utils import run_cmd, expect
+from CIME.case import Case
+
+def parse_command_line(args, description):
+ parser = argparse.ArgumentParser(description=description,
+ formatter_class=argparse.RawTextHelpFormatter)
+ parser.add_argument("--db-nodes", default=1,
+ help="Number of nodes for the SmartSim database, default=1")
+ parser.add_argument("--ngpus-per-node", default=0,
+ help="Number of gpus per SmartSim database node, default=0")
+ parser.add_argument("--walltime", default="00:30:00",
+ help="Total walltime for submitted job, default=00:30:00")
+ parser.add_argument("--member-nodes", default=1,
+ help="Number of nodes per ensemble member, default=1")
+ parser.add_argument("--account", default="P93300606",
+ help="Account ID")
+ parser.add_argument("--db-port", default=6780,
+ help="db port, default=6780")
+ parser.add_argument("--caseroots" , default=[os.getcwd()],nargs="*",
+ help="Case directory to reference.\n"
+ "Default is current directory.")
+ parser.add_argument("--logroot", default="/glade/scratch/{}".format(os.environ["USER"]),
+ help="Root directory under which SmartSimdb log files will be written")
+ parser.add_argument("--dryrun", action="store_true",
+ help="Create job scripts, but do not submit")
+
+
+ args = parser.parse_args(args[1:])
+ ngpus = ""
+ if int(args.ngpus_per_node) > 0:
+ ngpus = ":ngpus="+args.ngpus_per_node
+
+ expect(int(args.db_nodes) != 2, "db-nodes size of 2 not allowed, decrease to 1 or increase to 3 or more")
+
+
+ return {"db_nodes":args.db_nodes,
+ "caseroots" : ' '.join('"%s"' % x for x in args.caseroots),
+ "ngpus": ngpus,
+ "walltime": args.walltime,
+ "account" : args.account,
+ "db_port": args.db_port,
+ "cesmroot": CESM_ROOT,
+ "logroot" : args.logroot,
+ "python_sys_path": sys.path}, args.caseroots, args.dryrun
+
+def create_submit_files(templatevars):
+ template_files = ["resv_job.template", "launch_database_cluster.template"]
+ rootdir = os.path.dirname(os.path.realpath(__file__))
+ for template in template_files:
+ with open(os.path.join(rootdir,template)) as f:
+ src = Template(f.read())
+ result = src.safe_substitute(templatevars)
+ result_file = template.replace("template","sh")
+ with open(result_file, "w") as f:
+ f.write(result)
+
+def check_cases(caseroots, db_nodes):
+ """
+ Assume all caseroots use the same number of nodes
+ """
+ prevcasepes = -1
+ for caseroot in caseroots:
+ with Case(caseroot, read_only=False) as case:
+ expect(case.get_value("BUILD_COMPLETE"),"ERROR: case build not complete for {}".format(caseroot))
+ casepes = case.get_value("TOTALPES")
+ if prevcasepes > 0:
+ expect(prevcasepes == casepes, "Case {} pe layout mismatch".format(caseroot))
+ else:
+ prevcasepes = casepes
+ member_nodes = case.num_nodes
+ case.set_value("CREATE_SMARTSIM_CLUSTER", db_nodes > 1)
+
+ return member_nodes
+
+def _main_func(desc):
+ templatevars, caseroots, dryrun = parse_command_line(sys.argv, desc)
+ templatevars["member_nodes"] = check_cases(caseroots, int(templatevars["db_nodes"]))
+ templatevars["ensemble_size"] = len(caseroots)
+ templatevars["client_nodes"] = int(templatevars["member_nodes"])*len(caseroots)
+ print("Creating submit files")
+ create_submit_files(templatevars)
+ host = os.environ.get("NCAR_HOST")
+ if host == "cheyenne":
+ queue_name = "regular"
+ gpu_flag = ""
+ else:
+ queue_name = "casper"
+ gpu_flag = "-l gpu_type=vt100"
+
+ if not dryrun:
+ print("Submitting job")
+ _, o, e = run_cmd("qsub -q {} {} resv_job.sh ".format(queue_name, gpu_flag), verbose=True)
+ if e:
+ print("ERROR: {}".format(e))
+ if o:
+ print("{}".format(o))
+
+if __name__ == "__main__":
+ _main_func(__doc__)
diff --git a/tools/smartsim/launch_database_cluster.template b/tools/smartsim/launch_database_cluster.template
new file mode 100644
index 00000000000..df003b97316
--- /dev/null
+++ b/tools/smartsim/launch_database_cluster.template
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+#PBS -N smartsimtest
+#PBS -r n
+#PBS -j oe
+#PBS -V
+#PBS -l walltime=$walltime
+#PBS -A $account
+##PBS -q regular
+#PBS -V
+#PBS -l select=$db_nodes:ncpus=1:ompthreads=1:mpiprocs=1$ngpus
+##PBS -l gpu_type=v100
+
+import os, sys, time
+
+# The python environment is not passed properly to submitted jobs on casper
+_LIBDIR = $python_sys_path
+sys.path.extend(_LIBDIR)
+CESM_ROOT = "$cesmroot"
+if not os.path.isdir(CESM_ROOT):
+ raise SystemExit("ERROR: CESM_ROOT must be defined in environment")
+
+_LIBDIR = os.path.join(CESM_ROOT,"cime","scripts","Tools")
+sys.path.append(_LIBDIR)
+_LIBDIR = os.path.join(CESM_ROOT,"cime","scripts","lib")
+sys.path.append(_LIBDIR)
+from smartsim.utils.log import log_to_file
+
+import socket, subprocess
+import numpy as np
+from CIME.utils import run_cmd, expect
+from smartsim import Experiment, constants
+from smartsim.database import PBSOrchestrator
+log_to_file("$logroot/db_debug_log")
+
+"""
+Launch a distributed, in-memory database cluster and use the
+SmartRedis python client to send and receive some numpy arrays.
+
+This example runs in an interactive allocation with at least three
+nodes and 1 processor per node.
+
+e.g. qsub -l select=3:ncpus=1 -l walltime=00:10:00 -A <account> -q premium -I
+"""
+
+def collect_db_hosts(num_hosts):
+ """A simple method to collect hostnames, needed because we are using
+ openmpi (not needed for aprun (ALPS), Slurm, etc.).
+ """
+
+ hosts = []
+ if "PBS_NODEFILE" in os.environ:
+ node_file = os.environ["PBS_NODEFILE"]
+ with open(node_file, "r") as f:
+ for line in f.readlines():
+ host = line.split(".")[0]
+ hosts.append(host)
+ else:
+ raise Exception("could not parse interactive allocation nodes from PBS_NODEFILE")
+
+ # account for mpiprocs causing repeats in PBS_NODEFILE
+ hosts = list(set(hosts))
+ if len(hosts) >= num_hosts:
+ return hosts[:num_hosts]
+ else:
+ raise Exception("PBS_NODEFILE {} had {} hosts, not {}".format(node_file, len(hosts),num_hosts))
+
+
+def launch_cluster_orc(exp, db_hosts, port):
+ """Spin up a database cluster on the given hosts, check its status,
+ and return the orchestrator handle"""
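+ # Make sure the GNU 9.1.0 runtime libraries are on LD_LIBRARY_PATH before
+ # launching the database (matches the path added in config_machines.xml).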
+ ld_library_path = os.getenv("LD_LIBRARY_PATH")
+ os.environ["LD_LIBRARY_PATH"]="/glade/u/apps/ch/opt/gnu/9.1.0/lib64"
+ if ld_library_path:
+ os.environ["LD_LIBRARY_PATH"]="/glade/u/apps/ch/opt/gnu/9.1.0/lib64:{}".format(ld_library_path)
+
+ print(f"Starting Orchestrator on hosts: {db_hosts}")
+ # batch = False to launch on existing allocation
+ db = PBSOrchestrator(port=port, db_nodes=len(db_hosts), batch=False, interface="ib0",
+ run_command="mpirun", hosts=db_hosts)
+
+ # generate directories for output files
+ # pass in objects to make dirs for
+ exp.generate(db, overwrite=True)
+
+ # start the database
+ exp.start(db, block=True)
+
+ # get the status of the database
+ statuses = exp.get_status(db)
+ print(f"Status of all database nodes: {statuses}")
+
+ return db
+
+def monitor_client_jobs(rsvname):
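+ """Poll the reservation queue; when only this database job remains, the client runs are done."""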
+ jobs_done=False
+ while not jobs_done:
+ s, o, e = run_cmd("qstat -q {}".format(rsvname), verbose=True)
+ jobs_left = o.split()[-2:]
+ print("Jobs left: Running {} Queued {}".format(int(jobs_left[0]),int(jobs_left[1])))
+ if int(jobs_left[0]) + int(jobs_left[1]) == 1:
+ jobs_done = True
+# jobs_done = False
+ else:
+ time.sleep(60)
+
+
+
+
+
+# create the experiment and specify PBS because cheyenne is a PBS system
+logdir = os.path.join("$logroot","SmartSimdb.{}".format(os.environ['PBS_JOBID']))
+exp = Experiment(logdir, launcher="pbs")
+
+db_port = $db_port
+db_hosts = collect_db_hosts($db_nodes)
+# start the database
+db = launch_cluster_orc(exp, db_hosts, db_port)
+
+rsvname = os.environ["RSVNAME"]
+# stay alive until client jobs have completed
+monitor_client_jobs(rsvname)
+
+# shutdown the database because we don't need it anymore
+exp.stop(db)
+# delete the job reservation
+run_cmd("pbs_rdel {}".format(rsvname))
diff --git a/tools/smartsim/resv_job.template b/tools/smartsim/resv_job.template
new file mode 100644
index 00000000000..db0306f9194
--- /dev/null
+++ b/tools/smartsim/resv_job.template
@@ -0,0 +1,72 @@
+#!/bin/bash
+#PBS -N resv_job
+#PBS -l select=$db_nodes:ncpus=1:mpiprocs=1$ngpus+$client_nodes:ncpus=36:mpiprocs=36
+##PBS -l gpu_type=v100
+#PBS -l walltime=$walltime
+#PBS -W create_resv_from_job=true
+#PBS -j oe
+#PBS -k oed
+#PBS -A $account
+#PBS -V
+
+for rsv in $(qstat -Q|awk '$1 ~ /^R/{print $1}')
+do
+ parent_job=$(pbs_rstat -F $rsv|awk '$1 ~ /^reserve_job/{print $3}')
+ if [[ "${PBS_JOBID}" == "${parent_job}" ]] ; then
+ rsvname=$rsv
+ head_host=$(pbs_rstat -F $rsv|awk '$1 ~ /^resv_nodes/{print $3}' | cut -d: -f1-1)
+ head_host="${head_host:1}"
+ break
+ fi
+done
+if [ -z "$rsvname" ]; then echo "rsv is unset"; exit 1; else echo "rsv name is set to '$rsvname' head_host is $head_host"; fi
+
+me=$(whoami)
+pbs_ralter -U $me $rsvname
+
+if [[ "$NCAR_HOST" == "dav" ]]; then
+ gpu_type="-l gpu_type=vt100"
+else
+ gpu_type=""
+fi
+db_jobid=$(qsub -q $rsvname $gpu_type -vRSVNAME=$rsvname launch_database_cluster.sh)
+if [ -z "$db_jobid" ]; then echo "db_jobid is unset"; exit 1; fi
+
+#qstat -n $PBS_JOBID
+#head_host=$(qstat -n $PBS_JOBID | awk '$1 ~ /chadmi/{print $12}'|cut -d\/ -f1-1)
+
+if [ -z "$head_host" ]; then echo "head_host is unset, PBS_JOBID=$PBS_JOBID"; qdel $PBS_JOBID; exit 1; fi
+
+# This gets the ib network
+if [[ "$NCAR_HOST" == "dav" ]]; then
+ host_name=${head_host}-ib
+else
+ host_name=${head_host}-ib0
+fi
+echo "HOST NAME is $host_name, head_host is $head_host"
+# This gets the IP over the IB network and should be preferred
+SSDB="$(getent hosts ${host_name}|awk '{print $1}'):$db_port"
+# This gets the external network
+#SSDB="$(getent hosts ${head_host}.ucar.edu |awk '{print $1}'):$db_port"
+#db_jobid=$(qsub -q $rsvname $gpu_type -vSMARTSIM_LOG_LEVEL=debug -vRSVNAME=$rsvname launch_database_cluster.sh)
+
+export SSDB
+echo "gpu_type is $gpu_type, head_host is $head_host, db_jobid is $db_jobid, SSDB is $SSDB"
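+# Optional sanity check (assumes redis-cli is installed on the node): the database
+# should answer a ping at the host and port exported in SSDB.
+# redis-cli -h ${SSDB%:*} -p ${SSDB##*:} ping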
+for caseroot in $caseroots;
+do
+ cd $caseroot
+ testrun=`./xmlquery --value TEST`
+ if [[ "$testrun" == "TRUE" ]]; then
+ echo "Starting Test $caseroot"
+ ./xmlchange JOB_QUEUE=$rsvname --subgroup case.test --force
+ ./xmlchange JOB_WALLCLOCK_TIME=00:20:00 --subgroup case.test
+ else
+ echo "Starting run $caseroot"
+ ./xmlchange JOB_QUEUE=$rsvname --subgroup case.run --force
+ ./xmlchange JOB_WALLCLOCK_TIME=00:20:00 --subgroup case.run
+ fi
+ ./case.submit
+done
+
+
+