Skip to content

Commit

Permalink
Enable debugging features in CCPP physics caps, bug fixes and cleanup…
Browse files Browse the repository at this point in the history
… for CCPP variables and metadata, port model to XSEDE Expanse, switch WW3 from develop to dev/ufs-weather-model (#850)

This PR updates the submodule pointers for fv3atm (GFDL_atmos_cubed_sphere, ccpp-framework, ccpp-physics) and stochastic_physics for the changes described in detail in the associated PRs below:

It also ports the model to the XSEDE Expanse system (contributed by @MinsukJi-NOAA).

Switch WW3 from develop to dev/ufs-weather-model (i.e. pull #850 into this PR). This changes the results (metadata) for two regression tests, see #850 for more information. No change to input data.

Also: bug fix in run_compile.sh, replace UNIT_TEST with OPNREQ_TEST.

The bug fixes for issue #883 change the results of all regression tests using RUC LSM.

Co-authored-by: MinsukJi-NOAA <[email protected]>
Co-authored-by: aliabdolali <[email protected]>
Co-authored-by: [email protected] <[email protected]>
  • Loading branch information
4 people authored Oct 25, 2021
1 parent b87cdaa commit 74c57f1
Show file tree
Hide file tree
Showing 24 changed files with 2,614 additions and 2,298 deletions.
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
[submodule "WW3"]
path = WW3
url = https://github.com/NOAA-EMC/WW3
branch = develop
branch = dev/ufs-weather-model
[submodule "stochastic_physics"]
path = stochastic_physics
url = https://github.com/noaa-psd/stochastic_physics
Expand Down
2 changes: 1 addition & 1 deletion FV3
2 changes: 1 addition & 1 deletion NEMS
2 changes: 1 addition & 1 deletion WW3
Submodule WW3 updated 251 files
2 changes: 2 additions & 0 deletions cmake/configure_expanse.intel.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Build-option overrides for the expanse.intel platform configuration.
# Both options are written to the CMake cache as BOOL entries, and FORCE
# makes these values replace any value already present in the cache.
set(INLINE_POST ON CACHE BOOL "Enable inline post" FORCE)
set(PARALLEL_NETCDF ON CACHE BOOL "Enable parallel NetCDF" FORCE)
32 changes: 32 additions & 0 deletions modulefiles/ufs_expanse.intel
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#%Module

# Modulefile that sets up the build/run environment for the UFS Model on
# Expanse with the Intel compiler and Intel MPI.

proc ModulesHelp {} {
  puts stderr "\tcit - loads modules required for building and running UFS Model on Expanse/Intel"
}

module-whatis "loads UFS Model prerequisites for Expanse/Intel"

# Scheduler, CPU software stack, compiler, MPI and build tools.
module load slurm/expanse/20.02.3
module load cpu/0.15.4
module load intel/19.1.1.217
module load intel-mpi/2019.8.254
module load python/3.8.5
module load cmake/3.18.2

# Add the hpc-stack module tree to the module search path.
# (The original line repeated "module use", which passed the literal words
# "module" and "use" as additional directories.)
module use /expanse/lustre/scratch/domh/temp_project/hpc-stack-20210929/modulefiles/stack

module load hpc/1.2.0

# Compiler/MPI meta-modules from hpc-stack; versions match the modules loaded above.
module load hpc-intel/19.1.1.217
module load hpc-intel-mpi/2019.8.254

module load jasper/2.0.25
module load zlib/1.2.11
module load png/1.6.35

# NOTE(review): ufs_common presumably loads the libraries shared by all
# platforms -- it is not visible here, confirm against that modulefile.
module load ufs_common

# Compilers used by the build, and the platform name handed to CMake.
setenv CC mpiicc
setenv CXX mpiicpc
setenv FC mpiifort
setenv CMAKE_Platform expanse.intel
32 changes: 32 additions & 0 deletions modulefiles/ufs_expanse.intel_debug
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#%Module

# Modulefile that sets up the build/run environment for the UFS Model on
# Expanse with the Intel compiler and Intel MPI; this variant loads
# ufs_common_debug instead of ufs_common.

proc ModulesHelp {} {
  puts stderr "\tcit - loads modules required for building and running UFS Model on Expanse/Intel"
}

module-whatis "loads UFS Model prerequisites for Expanse/Intel"

# Scheduler, CPU software stack, compiler, MPI and build tools.
module load slurm/expanse/20.02.3
module load cpu/0.15.4
module load intel/19.1.1.217
module load intel-mpi/2019.8.254
module load python/3.8.5
module load cmake/3.18.2

# Add the hpc-stack module tree to the module search path.
# (The original line repeated "module use", which passed the literal words
# "module" and "use" as additional directories.)
module use /expanse/lustre/scratch/domh/temp_project/hpc-stack-20210929/modulefiles/stack

module load hpc/1.2.0

# Compiler/MPI meta-modules from hpc-stack; versions match the modules loaded above.
module load hpc-intel/19.1.1.217
module load hpc-intel-mpi/2019.8.254

module load jasper/2.0.25
module load zlib/1.2.11
module load png/1.6.35

# NOTE(review): ufs_common_debug presumably loads debug builds of the shared
# libraries -- it is not visible here, confirm against that modulefile.
module load ufs_common_debug

# Compilers used by the build, and the platform name handed to CMake.
setenv CC mpiicc
setenv CXX mpiicpc
setenv FC mpiifort
setenv CMAKE_Platform expanse.intel
2 changes: 1 addition & 1 deletion stochastic_physics
212 changes: 106 additions & 106 deletions tests/RegressionTests_cheyenne.gnu.log

Large diffs are not rendered by default.

675 changes: 354 additions & 321 deletions tests/RegressionTests_cheyenne.intel.log

Large diffs are not rendered by default.

580 changes: 290 additions & 290 deletions tests/RegressionTests_gaea.intel.log

Large diffs are not rendered by default.

216 changes: 108 additions & 108 deletions tests/RegressionTests_hera.gnu.log

Large diffs are not rendered by default.

588 changes: 294 additions & 294 deletions tests/RegressionTests_hera.intel.log

Large diffs are not rendered by default.

673 changes: 353 additions & 320 deletions tests/RegressionTests_jet.intel.log

Large diffs are not rendered by default.

691 changes: 362 additions & 329 deletions tests/RegressionTests_orion.intel.log

Large diffs are not rendered by default.

409 changes: 221 additions & 188 deletions tests/RegressionTests_wcoss_cray.log

Large diffs are not rendered by default.

695 changes: 364 additions & 331 deletions tests/RegressionTests_wcoss_dell_p3.log

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tests/ci/repo_check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ cice[branch]='emc/develop'
cice[dir]='CICE-interface/CICE'

ww3[repo]='https://github.com/NOAA-EMC/WW3'
ww3[branch]='develop'
ww3[branch]='dev/ufs-weather-model'
ww3[dir]='WW3'

stoch[repo]='https://github.com/noaa-psd/stochastic_physics'
Expand Down
34 changes: 34 additions & 0 deletions tests/default_vars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,40 @@ elif [[ $MACHINE_ID = stampede.* ]]; then
MPB_cdeps_025="0 39"; APB_cdeps_025="0 39"
OPB_cdeps_025="40 159"; IPB_cdeps_025="160 207"

elif [[ $MACHINE_ID = expanse.* ]]; then

TASKS_dflt=150 ; TPN_dflt=64 ; INPES_dflt=3 ; JNPES_dflt=8
TASKS_thrd=78 ; TPN_thrd=64 ; INPES_thrd=3 ; JNPES_thrd=4
TASKS_stretch=48 ; TPN_stretch=12 ; INPES_stretch=2 ; JNPES_stretch=4

TASKS_cpl_dflt=192; TPN_cpl_dflt=64; INPES_cpl_dflt=3; JNPES_cpl_dflt=8
THRD_cpl_dflt=1; WPG_cpl_dflt=6; MPB_cpl_dflt="0 143"; APB_cpl_dflt="0 149"
OPB_cpl_dflt="150 179"; IPB_cpl_dflt="180 191"

TASKS_cpl_dflt_wwav=384; TPN_cpl_dflt_wwav=64; INPES_cpl_dflt_wwav=3; JNPES_cpl_dflt_wwav=8
THRD_cpl_dflt_wwav=1; WPG_cpl_dflt_wwav=6; MPB_cpl_dflt_wwav="0 143"; APB_cpl_dflt_wwav="0 149"
OPB_cpl_dflt_wwav="150 179"; IPB_cpl_dflt_wwav="180 191"; WPB_cpl_dflt_wwav="192 383"

TASKS_cpl_thrd=120; TPN_cpl_thrd=64; INPES_cpl_thrd=3; JNPES_cpl_thrd=4
THRD_cpl_thrd=2; WPG_cpl_thrd=6; MPB_cpl_thrd="0 77"; APB_cpl_thrd="0 77"
OPB_cpl_thrd="78 107"; IPB_cpl_thrd="108 119"

TASKS_cpl_bmrk=480; TPN_cpl_bmrk=64; INPES_cpl_bmrk=6; JNPES_cpl_bmrk=8
THRD_cpl_bmrk=1; WPG_cpl_bmrk=24; MPB_cpl_bmrk="0 287"; APB_cpl_bmrk="0 311"
OPB_cpl_bmrk="312 431"; IPB_cpl_bmrk="432 479"

TASKS_cpl_wwav=640; TPN_cpl_wwav=64; INPES_cpl_wwav=6; JNPES_cpl_wwav=8
THRD_cpl_wwav=2; WPG_cpl_wwav=24; MPB_cpl_wwav="0 287"; APB_cpl_wwav="0 311"
OPB_cpl_wwav="312 431"; IPB_cpl_wwav="432 479"; WPB_cpl_wwav="480 639"

TASKS_cpl_c192=288; TPN_cpl_c192=64; INPES_cpl_c192=4; JNPES_cpl_c192=8
THRD_cpl_c192=1; WPG_cpl_c192=12; MPB_cpl_c192="0 191"; APB_cpl_c192="0 203"
OPB_cpl_c192="204 263"; IPB_cpl_c192="264 287"

TASKS_cpl_c384=318; TPN_cpl_c384=64; INPES_cpl_c384=3; JNPES_cpl_c384=8
THRD_cpl_c384=1; WPG_cpl_c384=6; MPB_cpl_c384="0 143"; APB_cpl_c384="0 149"
OPB_cpl_c384="150 269"; IPB_cpl_c384="270 317"

else

echo "Unknown MACHINE_ID ${MACHINE_ID}"
Expand Down
6 changes: 5 additions & 1 deletion tests/detect_machine.sh
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,17 @@ case $(hostname -f) in
login2.stampede2.tacc.utexas.edu) MACHINE_ID=stampede ;; ### stampede2
login3.stampede2.tacc.utexas.edu) MACHINE_ID=stampede ;; ### stampede3
login4.stampede2.tacc.utexas.edu) MACHINE_ID=stampede ;; ### stampede4

login01.expanse.sdsc.edu) MACHINE_ID=expanse ;; ### expanse1
login02.expanse.sdsc.edu) MACHINE_ID=expanse ;; ### expanse2
esac

# Overwrite auto-detect with RT_MACHINE if set
MACHINE_ID=${RT_MACHINE:-${MACHINE_ID}}

# Append compiler
if [ $MACHINE_ID = orion ] || [ $MACHINE_ID = hera ] || [ $MACHINE_ID = cheyenne ] || [ $MACHINE_ID = jet ] || [ $MACHINE_ID = gaea ] || [ $MACHINE_ID = stampede ] || [ $MACHINE_ID = s4 ] ; then
# On these machines the machine ID is suffixed with the compiler name
# (e.g. "hera" becomes "hera.${RT_COMPILER}").
# MACHINE_ID is quoted so that an empty/unset value yields a false comparison
# instead of a "[: =: unary operator expected" error, and every "]" is
# separated from its operand by a space ("s4]" made the s4 test fail with
# "[: missing `]'", so s4 never received its compiler suffix).
if [ "$MACHINE_ID" = orion ] || [ "$MACHINE_ID" = hera ] || [ "$MACHINE_ID" = cheyenne ] || [ "$MACHINE_ID" = jet ] || \
   [ "$MACHINE_ID" = gaea ] || [ "$MACHINE_ID" = stampede ] || [ "$MACHINE_ID" = s4 ] || [ "$MACHINE_ID" = expanse ] ; then
  MACHINE_ID=${MACHINE_ID}.${RT_COMPILER}
fi

Expand Down
32 changes: 32 additions & 0 deletions tests/fv3_conf/fv3_slurm.IN_expanse
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
#SBATCH --job-name="@[JBNME]"
#SBATCH -o out
#SBATCH -e err
#SBATCH -A @[ACCNR]
#SBATCH -p @[QUEUE]
#SBATCH --nodes=@[NODES]
#SBATCH --ntasks-per-node=@[TPN]
#SBATCH -t @[WLCLK]

# Slurm job template that runs fv3.exe on Expanse.
# NOTE(review): the @[NAME] tokens are placeholders; presumably the
# regression-test scripts substitute them before submission -- confirm
# against the code that consumes fv3_conf/fv3_slurm.IN.

set -eux
# Record the job start time (seconds since the epoch); the end time is
# appended to the same file after the model has run.
echo -n " $( date +%s )," > job_timestamp.txt

# Turn command tracing off while the module environment is set up.
set +x
source ./module-setup.sh
module use $( pwd -P )
module load modules.fv3
module list

set -x

ulimit -s unlimited

echo "Model started: "`date`

# OMP_STACKSIZE is the variable defined by the OpenMP specification; the
# previous spelling OMP_STACK_SIZE is not recognised by the runtime, so the
# requested per-thread stack size was never applied.
export OMP_STACKSIZE=512M
export OMP_NUM_THREADS=@[THRD]
# Point Intel MPI at Slurm's PMI library so that srun can launch the ranks.
export I_MPI_PMI_LIBRARY=/cm/shared/apps/slurm/current/lib64/libpmi.so
srun -n @[TASKS] ./fv3.exe

echo "Model ended: " `date`
echo -n " $( date +%s )," >> job_timestamp.txt
2 changes: 1 addition & 1 deletion tests/rt.conf
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ RUN | control_thompson_extdiag_debug
RUN | regional_control_debug | | fv3 |
RUN | regional_quilt_debug | | fv3 |
RUN | fv3_gsd_debug | | fv3 |
#RUN | fv3_gsd_diag_debug | | fv3 |
RUN | fv3_gsd_diag_debug | | fv3 |
RUN | fv3_rrfs_v1beta_debug | | fv3 |
RUN | fv3_rrfs_v1alpha_debug | | fv3 |

Expand Down
17 changes: 16 additions & 1 deletion tests/rt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,21 @@ elif [[ $MACHINE_ID = stampede.* ]]; then
MPIEXECOPTS=
cp fv3_conf/fv3_slurm.IN_stampede fv3_conf/fv3_slurm.IN

elif [[ $MACHINE_ID = expanse.* ]]; then

export PYTHONPATH=
ECFLOW_START=
QUEUE=compute
COMPILE_QUEUE=shared
PARTITION=
ACCNR=TG-EES200015
dprefix=/expanse/lustre/scratch/$USER/temp_project/run
DISKNM=/expanse/lustre/scratch/domh/temp_project/RT
STMP=$dprefix
PTMP=$dprefix
SCHEDULER=slurm
cp fv3_conf/fv3_slurm.IN_expanse fv3_conf/fv3_slurm.IN

else
die "Unknown machine ID, please edit detect_machine.sh file"
fi
Expand Down Expand Up @@ -445,7 +460,7 @@ if [[ $TESTS_FILE =~ '35d' ]]; then
TEST_35D=true
fi

BL_DATE=20211019
BL_DATE=20211022
if [[ $MACHINE_ID = hera.* ]] || [[ $MACHINE_ID = orion.* ]] || [[ $MACHINE_ID = cheyenne.* ]] || [[ $MACHINE_ID = gaea.* ]] || [[ $MACHINE_ID = jet.* ]] || [[ $MACHINE_ID = s4.* ]]; then
RTPWD=${RTPWD:-$DISKNM/NEMSfv3gfs/develop-${BL_DATE}/${RT_COMPILER^^}}
else
Expand Down
4 changes: 2 additions & 2 deletions tests/run_compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ cleanup() {
}

write_fail_test() {
if [[ ${UNIT_TEST} == true ]]; then
echo compile ${COMPILE_NR} >> $PATHRT/fail_unit_test
if [[ ${OPNREQ_TEST} == true ]]; then
echo compile ${COMPILE_NR} >> $PATHRT/fail_opnreq_test
else
echo "compile_${COMPILE_NR} failed in run_compile" >> $PATHRT/fail_test
fi
Expand Down

0 comments on commit 74c57f1

Please sign in to comment.