Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin' into feature/aero_prepobs
Browse files Browse the repository at this point in the history
* origin:
  Change GRIB2 parameter names and vertical levels for ocean/ice post (NOAA-EMC#2611)
  Add atmensanlfv3inc job (NOAA-EMC#2592)
  Global-workflow (AR) Generic updates for Gaea C5 (NOAA-EMC#2515)
  Update STMP and PTMP settings in host file for Orion and Hercules  (NOAA-EMC#2614)
  • Loading branch information
ypwang19 committed May 29, 2024
2 parents 8beebee + bb58e06 commit 68d3e2c
Show file tree
Hide file tree
Showing 53 changed files with 605 additions and 242 deletions.
2 changes: 1 addition & 1 deletion ci/cases/yamls/atmaerosnowDA_defaults_ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ defaults:
base:
DOIAU: "NO"
DO_JEDISNOWDA: "YES"
ACCOUNT: {{ 'SLURM_ACCOUNT' | getenv }}
ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }}
2 changes: 1 addition & 1 deletion ci/cases/yamls/gefs_ci_defaults.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
defaults:
!INC {{ HOMEgfs }}/parm/config/gefs/yaml/defaults.yaml
base:
ACCOUNT: {{ 'SLURM_ACCOUNT' | getenv }}
HPC_ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }}
2 changes: 1 addition & 1 deletion ci/cases/yamls/gfs_defaults_ci.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
defaults:
!INC {{ HOMEgfs }}/parm/config/gfs/yaml/defaults.yaml
base:
ACCOUNT: {{ 'SLURM_ACCOUNT' | getenv }}
ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }}
2 changes: 1 addition & 1 deletion ci/cases/yamls/gfs_extended_ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ defaults:
!INC {{ HOMEgfs }}/parm/config/gfs/yaml/defaults.yaml

base:
ACCOUNT: {{ 'SLURM_ACCOUNT' | getenv }}
ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }}
DO_GOES: "YES"
DO_BUFRSND: "YES"
DO_GEMPAK: "YES"
Expand Down
2 changes: 1 addition & 1 deletion ci/cases/yamls/soca_gfs_defaults_ci.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
defaults:
!INC {{ HOMEgfs }}/parm/config/gfs/yaml/defaults.yaml
base:
ACCOUNT: {{ 'SLURM_ACCOUNT' | getenv }}
ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }}
DO_JEDIOCNVAR: "YES"
2 changes: 1 addition & 1 deletion ci/cases/yamls/ufs_hybatmDA_defaults.ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ base:
DOIAU: "NO"
DO_JEDIATMVAR: "YES"
DO_JEDIATMENS: "YES"
ACCOUNT: {{ 'SLURM_ACCOUNT' | getenv }}
ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }}
atmanl:
LAYOUT_X_ATMANL: 4
LAYOUT_Y_ATMANL: 4
Expand Down
4 changes: 1 addition & 3 deletions ci/platforms/config.hera
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

export GFS_CI_ROOT=/scratch1/NCEPDEV/global/Terry.McGuinness/GFS_CI_ROOT
export ICSDIR_ROOT=/scratch1/NCEPDEV/global/glopara/data/ICSDIR
export STMP="/scratch1/NCEPDEV/stmp2/${USER}"
export PTMP="/scratch1/NCEPDEV/stmp2/${USER}"
export SLURM_ACCOUNT=nems
export HPC_ACCOUNT=nems
export max_concurrent_cases=5
export max_concurrent_pr=4
4 changes: 1 addition & 3 deletions ci/platforms/config.hercules
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

export GFS_CI_ROOT=/work2/noaa/stmp/GFS_CI_ROOT/HERCULES
export ICSDIR_ROOT=/work/noaa/global/glopara/data/ICSDIR
export STMP="/work2/noaa/stmp/${USER}/HERCULES"
export PTMP="/work2/noaa/stmp/${USER}/HERCULES"
export SLURM_ACCOUNT=nems
export HPC_ACCOUNT=nems
export max_concurrent_cases=5
export max_concurrent_pr=4
4 changes: 1 addition & 3 deletions ci/platforms/config.orion
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

export GFS_CI_ROOT=/work2/noaa/stmp/GFS_CI_ROOT/ORION
export ICSDIR_ROOT=/work/noaa/global/glopara/data/ICSDIR
export STMP="/work2/noaa/stmp/${USER}/ORION"
export PTMP="/work2/noaa/stmp/${USER}/ORION"
export SLURM_ACCOUNT=nems
export HPC_ACCOUNT=nems
export max_concurrent_cases=5
export max_concurrent_pr=4
4 changes: 1 addition & 3 deletions ci/platforms/config.wcoss2
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

export GFS_CI_ROOT=/lfs/h2/emc/global/noscrub/globalworkflow.ci/GFS_CI_ROOT
export ICSDIR_ROOT=/lfs/h2/emc/global/noscrub/emc.global/data/ICSDIR
export STMP="/lfs/h2/emc/stmp/${USER}"
export PTMP="/lfs/h2/emc/ptmp/${USER}"
export SLURM_ACCOUNT=GFS-DEV
export HPC_ACCOUNT=GFS-DEV
export max_concurrent_cases=5
export max_concurrent_pr=4
2 changes: 1 addition & 1 deletion ci/scripts/run-check_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ while true; do
rocotorun -v "${ROCOTO_VERBOSE:-0}" -w "${xml}" -d "${db}"

# Wait before running rocotostat
sleep 10
sleep 60

# Get job statistics
echo "Gather Rocoto statistics"
Expand Down
156 changes: 125 additions & 31 deletions ci/scripts/utils/rocotostat.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,31 @@

import sys
import os
import copy
from time import sleep

from wxflow import Executable, which, Logger, CommandNotFoundError
from wxflow import which, Logger, CommandNotFoundError, ProcessError
from argparse import ArgumentParser, FileType

from collections import Counter

logger = Logger(level=os.environ.get("LOGGING_LEVEL", "DEBUG"), colored_log=False)


def attempt_multiple_times(expression, max_attempts, sleep_duration=0, exception_class=Exception):
attempt = 0
last_exception = None
while attempt < max_attempts:
try:
pass
return expression()
except exception_class as last_exception:
attempt += 1
sleep(sleep_duration)
else:
raise last_exception


def input_args():
"""
Parse command-line arguments.
Expand Down Expand Up @@ -39,66 +57,142 @@ def input_args():
return args


def rocoto_statcount():
"""
Run rocotostat and process its output.
def rocotostat_summary(rocotostat):
"""
rocoto_summary Run rocotostat and process its output.
args = input_args()
rocoto_summary(rocotostat) adds a default argument '--summary' to the rocotostat
command, runs it, and processes its output to return a dictionary with the total
number of cycles and the number of cycles marked as 'Done'.
try:
rocotostat = which("rocotostat")
except CommandNotFoundError:
logger.exception("rocotostat not found in PATH")
raise CommandNotFoundError("rocotostat not found in PATH")

rocotostat_all = which("rocotostat")
rocotostat.add_default_arg(['-w', os.path.abspath(args.w.name), '-d', os.path.abspath(args.d.name), '-s'])
rocotostat_all.add_default_arg(['-w', os.path.abspath(args.w.name), '-d', os.path.abspath(args.d.name), '-a'])
Input:
rocotostat - The rocotostat command.
rocotostat_output = rocotostat(output=str)
Output:
rocoto_status - A dictionary with the total number of cycles and the number of cycles marked as 'Done'.
"""
rocotostat = copy.deepcopy(rocotostat)
rocotostat.add_default_arg('--summary')
rocotostat_output = attempt_multiple_times(lambda: rocotostat(output=str), 3, 90, ProcessError)
rocotostat_output = rocotostat_output.splitlines()[1:]
rocotostat_output = [line.split()[0:2] for line in rocotostat_output]

rocotostat_output_all = rocotostat_all(output=str)
rocotostat_output_all = rocotostat_output_all.splitlines()[1:]
rocotostat_output_all = [line.split()[0:4] for line in rocotostat_output_all]
rocotostat_output_all = [line for line in rocotostat_output_all if len(line) != 1]

rocoto_status = {
'CYCLES_TOTAL': len(rocotostat_output),
'CYCLES_DONE': sum([sublist.count('Done') for sublist in rocotostat_output])
}
return rocoto_status


def rocoto_statcount(rocotostat):
"""
rocoto_statcount Run rocotostat and process its output.
rocoto_statcount(rocotostat) adds a default argument '--all' to the rocotostat
command, runs it, and processes its output to return a dictionary with the count
of each status case.
Input:
rocotostat - The rocotostat command.
Output:
rocoto_status - A dictionary with the count of each status case.
"""

rocotostat = copy.deepcopy(rocotostat)
rocotostat.add_default_arg('--all')

rocotostat_output = attempt_multiple_times(lambda: rocotostat(output=str), 4, 120, ProcessError)
rocotostat_output = rocotostat_output.splitlines()[1:]
rocotostat_output = [line.split()[0:4] for line in rocotostat_output]
rocotostat_output = [line for line in rocotostat_output if len(line) != 1]

status_cases = ['SUCCEEDED', 'FAIL', 'DEAD', 'RUNNING', 'SUBMITTING', 'QUEUED']

rocoto_status = {}
status_counts = Counter(case for sublist in rocotostat_output for case in sublist)
for case in status_cases:
rocoto_status[case] = sum([sublist.count(case) for sublist in rocotostat_output_all])
rocoto_status[case] = status_counts[case]

return rocoto_status


def is_done(rocoto_status):
"""
is_done Check if all cycles are done.
is_done(rocoto_status) checks if the total number of cycles equals the number of
done cycles in the rocoto_status dictionary.
Input:
rocoto_status - A dictionary with the count of each status case.
Output:
boolean - True if all cycles are done, False otherwise.
"""

if rocoto_status['CYCLES_TOTAL'] == rocoto_status['CYCLES_DONE']:
return True
else:
return False


def is_stalled(rocoto_status):
"""
is_stalled Check if all cycles are stalled.
is_stalled(rocoto_status) checks if all cycles are stalled by verifying if
there are no jobs that are RUNNING, SUBMITTING, or QUEUED.
Input:
rocoto_status - A dictionary with the count of each status case.
Output:
boolean - True if all cycles are stalled, False otherwise.
"""

if rocoto_status['RUNNING'] + rocoto_status['SUBMITTING'] + rocoto_status['QUEUED'] == 0:
return True
else:
return False


if __name__ == '__main__':
"""
main Execute the script.
main() parses the input arguments, checks if the rocotostat command is available,
adds default arguments to the rocotostat command, and runs it and reports
out to stdout spcific information of rocoto workflow.
"""

args = input_args()

error_return = 0
rocoto_status = rocoto_statcount()
try:
rocotostat = which("rocotostat")
except CommandNotFoundError:
logger.exception("rocotostat not found in PATH")
raise CommandNotFoundError("rocotostat not found in PATH")

if rocoto_status['CYCLES_TOTAL'] == rocoto_status['CYCLES_DONE']:
rocotostat.add_default_arg(['-w', os.path.abspath(args.w.name), '-d', os.path.abspath(args.d.name)])

rocoto_status = rocoto_statcount(rocotostat)
rocoto_status.update(rocotostat_summary(rocotostat))

error_return = 0
if is_done(rocoto_status):
rocoto_state = 'DONE'
elif rocoto_status['DEAD'] > 0:
error_return = rocoto_status['FAIL'] + rocoto_status['DEAD']
rocoto_state = 'FAIL'
elif 'UNKNOWN' in rocoto_status:
error_return = rocoto_status['UNKNOWN']
rocoto_state = 'UNKNOWN'
elif rocoto_status['RUNNING'] + rocoto_status['SUBMITTING'] + rocoto_status['QUEUED'] == 0:
#
# TODO for now a STALLED state will be just a warning as it can
# produce a false negative if there is a timestamp on a file dependency.
#
# error_return = -3
rocoto_state = 'STALLED'
elif is_stalled(rocoto_status):
rocoto_status = attempt_multiple_times(rocoto_statcount(rocotostat), 2, 120, ProcessError)
if is_stalled(rocoto_status):
error_return = 3
rocoto_state = 'STALLED'
else:
rocoto_state = 'RUNNING'

Expand Down
2 changes: 1 addition & 1 deletion env/AWSPW.env
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ if [[ $# -ne 1 ]]; then

echo "Must specify an input argument to set runtime environment variables!"
echo "argument can be any one of the following:"
echo "atmanlvar atmanlfv3inc atmensanlrun aeroanlrun snowanl"
echo "atmanlvar atmanlfv3inc atmensanlletkf atmensanlfv3inc aeroanlrun snowanl"
echo "anal sfcanl fcst post metp"
echo "eobs eupd ecen efcs epos"
echo "postsnd awips gempak"
Expand Down
2 changes: 1 addition & 1 deletion env/CONTAINER.env
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ if [[ $# -ne 1 ]]; then

echo "Must specify an input argument to set runtime environment variables!"
echo "argument can be any one of the following:"
echo "atmanlvar atmanlfv3inc atmensanlrun aeroanlrun snowanl"
echo "atmanlvar atmanlfv3inc atmensanlletkf atmensanlfv3inc aeroanlrun snowanl"
echo "anal sfcanl fcst post metp"
echo "eobs eupd ecen efcs epos"
echo "postsnd awips gempak"
Expand Down
39 changes: 39 additions & 0 deletions env/GAEA.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#! /usr/bin/env bash

if [[ $# -ne 1 ]]; then

echo "Must specify an input argument to set runtime environment variables!"
echo "argument can be any one of the following:"
echo "fcst atmos_products"
exit 1

fi

step=$1

export launcher="srun -l --export=ALL"
export mpmd_opt="--multi-prog --output=mpmd.%j.%t.out"

ulimit -s unlimited
ulimit -a

if [[ "${step}" = "fcst" ]]; then

if [[ "${CDUMP}" =~ "gfs" ]]; then
nprocs="npe_${step}_gfs"
ppn="npe_node_${step}_gfs" || ppn="npe_node_${step}"
else
nprocs="npe_${step}"
ppn="npe_node_${step}"
fi
(( nnodes = (${!nprocs}+${!ppn}-1)/${!ppn} ))
(( ntasks = nnodes*${!ppn} ))
# With ESMF threading, the model wants to use the full node
export APRUN_UFS="${launcher} -n ${ntasks}"
unset nprocs ppn nnodes ntasks

elif [[ "${step}" = "atmos_products" ]]; then

export USE_CFP="YES" # Use MPMD for downstream product generation

fi
20 changes: 14 additions & 6 deletions env/HERA.env
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ if [[ $# -ne 1 ]]; then

echo "Must specify an input argument to set runtime environment variables!"
echo "argument can be any one of the following:"
echo "atmanlvar atmensanlrun aeroanlrun snowanl atmanlfv3inc"
echo "atmanlvar atmanlfv3inc atmensanlletkf atmensanlfv3inc aeroanlrun snowanl"
echo "anal sfcanl fcst post metp"
echo "eobs eupd ecen efcs epos"
echo "postsnd awips gempak"
Expand Down Expand Up @@ -68,13 +68,21 @@ elif [[ "${step}" = "atmanlvar" ]]; then
[[ ${NTHREADS_ATMANLVAR} -gt ${nth_max} ]] && export NTHREADS_ATMANLVAR=${nth_max}
export APRUN_ATMANLVAR="${launcher} -n ${npe_atmanlvar} --cpus-per-task=${NTHREADS_ATMANLVAR}"

elif [[ "${step}" = "atmensanlrun" ]]; then
elif [[ "${step}" = "atmensanlletkf" ]]; then

nth_max=$((npe_node_max / npe_node_atmensanlrun))
nth_max=$((npe_node_max / npe_node_atmensanlletkf))

export NTHREADS_ATMENSANL=${nth_atmensanlrun:-${nth_max}}
[[ ${NTHREADS_ATMENSANL} -gt ${nth_max} ]] && export NTHREADS_ATMENSANL=${nth_max}
export APRUN_ATMENSANL="${launcher} -n ${npe_atmensanlrun} --cpus-per-task=${NTHREADS_ATMENSANL}"
export NTHREADS_ATMENSANLLETKF=${nth_atmensanlletkf:-${nth_max}}
[[ ${NTHREADS_ATMENSANLLETKF} -gt ${nth_max} ]] && export NTHREADS_ATMENSANLLETKF=${nth_max}
export APRUN_ATMENSANLLETKF="${launcher} -n ${npe_atmensanlletkf} --cpus-per-task=${NTHREADS_ATMENSANLLETKF}"

elif [[ "${step}" = "atmensanlfv3inc" ]]; then

nth_max=$((npe_node_max / npe_node_atmensanlfv3inc))

export NTHREADS_ATMENSANLFV3INC=${nth_atmensanlfv3inc:-${nth_max}}
[[ ${NTHREADS_ATMENSANLFV3INC} -gt ${nth_max} ]] && export NTHREADS_ATMENSANLFV3INC=${nth_max}
export APRUN_ATMENSANLFV3INC="${launcher} -n ${npe_atmensanlfv3inc} --cpus-per-task=${NTHREADS_ATMENSANLFV3INC}"

elif [[ "${step}" = "aeroanlrun" ]]; then

Expand Down
Loading

0 comments on commit 68d3e2c

Please sign in to comment.